This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse Age without Date of Birth #125

Merged

29 commits merged on May 23, 2019
Changes from 12 commits

Commits (29)
ad536b6
use first item of deceased_field for First Name
mscarey Apr 17, 2019
65c9e87
make missing name data throw IndexError
mscarey Apr 18, 2019
9fb41cc
add name with punctuation to parsing test
mscarey Apr 20, 2019
3698f8f
move split method within deceased field parser
mscarey Apr 23, 2019
7ffadbb
split deceased field on some slashes
mscarey Apr 23, 2019
510032e
Merge https://github.com/scrapd/scrapd
mscarey Apr 26, 2019
7f40cde
Merge remote-tracking branch 'origin'
mscarey May 15, 2019
4fa089c
change parsing tests to expect datetimes
mscarey May 16, 2019
304879e
change date string calculations to datetime
mscarey May 16, 2019
097aaf7
update more tests to expect datetime objects
mscarey May 16, 2019
9b7dc35
add function to parse deceased field without DOB
mscarey May 16, 2019
8ba1ae7
run yapf format
mscarey May 16, 2019
a2a1ec8
change datetime objects to date objects
mscarey May 20, 2019
efffc77
add tests for formatters
mscarey May 20, 2019
d61d162
import date_utils functions by name
mscarey May 20, 2019
cabdf73
add date serializing to formatters
mscarey May 20, 2019
fabe8cd
run yapf format
mscarey May 20, 2019
c400595
remove ValueError for failed age parsing.
mscarey May 20, 2019
306b6fd
slightly delay conversion of datetime to date
mscarey May 20, 2019
9ea122c
add line breaks for formatting tests
mscarey May 22, 2019
394fbed
rephrase first lines of docstrings for pydocstyle
mscarey May 22, 2019
6fbd7a8
use strftime, not isoformat, for date format
mscarey May 22, 2019
848debf
add yapf line break
mscarey May 22, 2019
b10aeaa
delete duplicate test
mscarey May 22, 2019
6b0cfbf
change CSV formatter to month-first date format
mscarey May 22, 2019
a27a9e4
change to module-level import of date_utils
mscarey May 22, 2019
ec32be9
change DOB to datetime.date earlier
mscarey May 23, 2019
730c83b
add yapf line breaks
mscarey May 23, 2019
ccd34d7
Merge branch 'master' of https://github.com/scrapd/scrapd into age
mscarey May 23, 2019
85 changes: 50 additions & 35 deletions scrapd/core/apd.py
@@ -174,19 +174,6 @@ def parse_twitter_description(twitter_description):
if tmp_dob and isinstance(tmp_dob, list):
d[Fields.DOB] = tmp_dob[0]

# Parse the Deceased field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(' '.join(d.get(Fields.DECEASED))))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No decease information to parse in Twitter description.')

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(' '.join(d.get(Fields.DATE)), d.get(Fields.DOB))

return sanitize_fatality_entity(d)


@@ -251,6 +238,7 @@ def parse_details_page_notes(details_page_notes):
def sanitize_fatality_entity(d):
"""
Clean up a fatality entity.
Performs parsing common to Twitter descriptions and page content.

Ensures that the values are all strings and removes the 'Deceased' field which does not contain
relevant information anymore.
@@ -259,21 +247,37 @@ def sanitize_fatality_entity(d):
:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""

# All values must be strings.
for k, v in d.items():
if isinstance(v, list):
d[k] = ' '.join(v)

Review comment (Member):

This part should go to the sanitize_fatality_entity() function. The goal of the sanitizing function was to ensure that the final values had the right format, or that unnecessary or invalid values (like None or empty ones) were discarded.

Reply (Contributor Author):

Currently that part needs to happen before the deceased field is parsed, while sanitize_fatality_entity() has to happen afterwards.

if d.get('Date'):
d['Date'] = date_utils.clean_date_string(d['Date'])

if d.get('DOB'):
d['DOB'] = date_utils.clean_date_string(d['DOB'], True)
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED)))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No deceased information to parse in fatality page.')

# The 'Deceased' field is unnecessary.
if d.get('Deceased'):
del d['Deceased']

# Parse the `Date` field.
if d.get(Fields.DATE):
d[Fields.DATE] = date_utils.parse_date(d[Fields.DATE])

# Parse the `DOB` field.
if d.get(Fields.DOB):
dob_guess = date_utils.parse_date(d[Fields.DOB])
d[Fields.DOB] = date_utils.check_dob(dob_guess)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return d


@@ -345,14 +349,38 @@ def parse_deceased_field(deceased_field):
except Exception:
pass

# Try to parse the deceased fields assuming it contains an age.
try:
return parse_age_deceased_field(deceased_field)
except Exception:
pass

raise ValueError(f'Cannot parse {Fields.DECEASED}: {deceased_field}')


def parse_age_deceased_field(deceased_field):
"""
Parse deceased field assuming it contains an age.

:param str deceased_field: the deceased field
:return: a dictionary representing the deceased field.
:rtype: dict
"""
age_pattern = re.compile(r'([0-9]+) years')
age = re.search(age_pattern, deceased_field).group(1)
if not age:
raise ValueError(f'Cannot find age in the deceased field: {deceased_field}')
split_deceased_field = age_pattern.split(deceased_field)
d = parse_fleg(split_deceased_field[0].split())
d[Fields.AGE] = int(age)
return d


def parse_comma_delimited_deceased_field(deceased_field):
"""
Parse deceased fields seperated with commas.

:param list split_deceased_field: a list representing the deceased field
:param str deceased_field: a list representing the deceased field
:return: a dictionary representing the deceased field.
:rtype: dict
"""
@@ -480,26 +508,13 @@ def parse_page_content(detail_page, notes_parsed=False):
# Parse the `Crashes` field.
d[Fields.CRASHES] = parse_crashes_field(normalized_detail_page)

# Parse the `Deceased` field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED)))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No deceased information to parse in fatality page.')

# Fill in Notes from Details page if not in twitter description.
search_notes = re.compile(r'>Deceased:.*\s{2,}(.|\n)*?<\/p>(.|\n)*?<\/p>')
match = re.search(search_notes, normalized_detail_page)
if match and not notes_parsed:
text_chunk = match.string[match.start(0):match.end(0)]
d[Fields.NOTES] = parse_details_page_notes(text_chunk)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return sanitize_fatality_entity(d)


@@ -688,16 +703,16 @@ async def async_retrieve(pages=-1, from_=None, to=None):
# If the page contains fatalities, ensure all of them happened within the specified time range.
if page_res:
entries_in_time_range = [
entry for entry in page_res if date_utils.is_in_range(entry[Fields.DATE], from_, to)
entry for entry in page_res
if date_utils.from_date(from_) <= entry[Fields.DATE] <= date_utils.to_date(to)
]

# If 2 pages in a row:
# 1) contain results
# 2) but none of them contain dates within the time range
# 3) and we did not collect any valid entries
# Then we can stop the operation.
if from_ and all([date_utils.is_posterior(entry[Fields.DATE], from_)
for entry in page_res]) and not has_entries:
if from_ and all([entry[Fields.DATE] < date_utils.from_date(from_) for entry in page_res]) and not has_entries:
no_date_within_range_count += 1
if no_date_within_range_count > 1:
logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')
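As a side note, here is a minimal standalone sketch of the age-only parsing path added to apd.py above, assuming a deceased string of the form 'Name, ethnicity gender, NN years of age'. fake_parse_fleg below is a hypothetical stand-in for the project's parse_fleg helper (which extracts first name, last name, ethnicity, and gender) and is not part of this PR:

import re

def fake_parse_fleg(tokens):
    # Hypothetical stand-in: the real parse_fleg maps tokens to First Name,
    # Last Name, Ethnicity, and Gender. Here we only pull the first two tokens.
    return {'First Name': tokens[0], 'Last Name': tokens[1].rstrip(',')}

def parse_age_only_deceased(deceased_field):
    # Find the age chunk ("NN years"), mirroring the regex used in the PR.
    age_pattern = re.compile(r'([0-9]+) years')
    match = age_pattern.search(deceased_field)
    if not match:
        raise ValueError(f'Cannot find age in the deceased field: {deceased_field}')
    # Everything before the age chunk carries the name/ethnicity/gender part.
    before_age = age_pattern.split(deceased_field)[0]
    d = fake_parse_fleg(before_age.split())
    d['Age'] = int(match.group(1))
    return d

print(parse_age_only_deceased('Jane Doe, White female, 45 years of age'))
# {'First Name': 'Jane', 'Last Name': 'Doe', 'Age': 45}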
37 changes: 3 additions & 34 deletions scrapd/core/date_utils.py
@@ -4,19 +4,6 @@
import dateparser


def is_posterior(d1, d2):
"""
Return True is d1 is posterior to d2 (i.e. it happened after).

:param str d1: date 1
:param str d2: date 2
:return: True is d1 is posterior to d2
:rtype: bool
"""

return parse_date(d1) < parse_date(d2)


def check_dob(dob):
"""
In case that a date only contains 2 digits, determine century.
@@ -103,34 +90,16 @@ def parse_date(date, default=None, settings=None):
raise Exception


def is_in_range(date, from_=None, to=None):
"""
Check whether a date is comprised between 2 others.

:param str date: date to vheck
:param str from_: start date, defaults to None
:param str to: end date, defaults to None
:return: `True` if the date is between `from_` and `to`
:rtype: bool
"""
current_date = parse_date(date)
from_date_ = from_date(from_)
to_date_ = to_date(to)

return from_date_ <= current_date <= to_date_


def compute_age(date, dob):
"""
Compute a victim's age.

:param str date: crash date
:param str dob: date of birth
:param datetime.datetime date: crash date
:param datetime.datetime dob: date of birth
:return: the victim's age.
:rtype: int
"""
DAYS_IN_YEAR = 365
dob_ = parse_date(dob)

# Compute the age.
return (parse_date(date) - check_dob(dob_)).days // DAYS_IN_YEAR
return (date - dob).days // DAYS_IN_YEAR
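For reference, a worked example of the new compute_age contract, under the assumption that both arguments are already parsed date objects (the 365-day integer division mirrors the implementation above):

from datetime import date

DAYS_IN_YEAR = 365

def compute_age(crash_date, dob):
    # Both arguments are assumed to be date objects; no string parsing here.
    return (crash_date - dob).days // DAYS_IN_YEAR

print(compute_age(date(2018, 12, 30), date(1980, 2, 9)))  # 38, matching the test fixture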
29 changes: 15 additions & 14 deletions tests/core/test_apd.py
@@ -1,4 +1,5 @@
"""Test the APD module."""
from datetime import datetime
from unittest import mock

import aiohttp
@@ -46,8 +47,8 @@ def news_page(scope='session'):
Fields.AGE: 38,
Fields.CASE: '18-3640187',
Fields.CRASHES: '73',
Fields.DOB: '02/09/1980',
Fields.DATE: '12/30/2018',
Fields.DOB: datetime(1980, 2, 9),
Fields.DATE: datetime(2018, 12, 30),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Corbin',
Fields.GENDER: 'male',
@@ -63,14 +64,14 @@ def news_page(scope='session'):
'traffic-fatality-72-1': {
Fields.CASE: '18-3551763',
Fields.CRASHES: '72',
Fields.DATE: '12/21/2018',
Fields.DATE: datetime(2018, 12, 21),
Fields.LOCATION: '9500 N Mopac SB',
Fields.TIME: '8:20 p.m.',
},
'traffic-fatality-71-2': {
Fields.CASE: '18-3381590',
Fields.CRASHES: '71',
Fields.DATE: '12/04/2018',
Fields.DATE: datetime(2018, 12, 4),
Fields.LOCATION: '183 service road westbound and Payton Gin Rd.',
Fields.TIME: '8:39 p.m.',
},
@@ -81,8 +82,8 @@ def news_page(scope='session'):
**parse_twitter_fields_scenarios['traffic-fatality-2-3'],
Fields.AGE: 58,
Fields.CRASHES: '2',
Fields.DOB: '02/15/1960',
Fields.DATE: '01/16/2019',
Fields.DOB: datetime(1960, 2, 15),
Fields.DATE: datetime(2019, 1, 16),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Ann',
Fields.GENDER: 'female',
@@ -94,8 +95,8 @@ def news_page(scope='session'):
Fields.AGE: 38,
Fields.CASE: '18-3640187',
Fields.CRASHES: '73',
Fields.DOB: '02/09/1980',
Fields.DATE: '12/30/2018',
Fields.DOB: datetime(1980, 2, 9),
Fields.DATE: datetime(2018, 12, 30),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Corbin',
Fields.GENDER: 'male',
@@ -106,15 +107,15 @@ def news_page(scope='session'):
'traffic-fatality-72-1': {
**parse_twitter_fields_scenarios['traffic-fatality-72-1'],
Fields.AGE: 22,
Fields.DOB: '03/29/1996',
Fields.DOB: datetime(1996, 3, 29),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Elijah',
Fields.GENDER: 'male',
Fields.LAST_NAME: 'Perales',
},
'traffic-fatality-71-2': {
**parse_twitter_fields_scenarios['traffic-fatality-71-2'],
Fields.DOB: '06/01/1964',
Fields.DOB: datetime(1964, 6, 1),
Fields.FIRST_NAME: 'Barkat',
Fields.LAST_NAME: 'Umatia',
Fields.ETHNICITY: 'Other',
@@ -159,10 +160,10 @@ def test_parse_twitter_title_00(input_, expected):
mock_data.twitter_description_00,
{
'Case': '18-3640187',
'Date': '12/30/2018',
'Date': datetime(2018, 12, 30),
'Time': '2:24 a.m.',
'Location': '1400 E. Highway 71 eastbound',
'DOB': '02/09/1980',
'DOB': datetime(1980, 2, 9),
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was '
'traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. '
'The truck went across the E. Highway 71 and US Highway 183 ramp, rolled '
@@ -197,8 +198,8 @@ def test_parse_twitter_description_02():
expected = {
'Age': 57,
'Case': '18-160882',
'DOB': '01/22/1961',
'Date': '01/16/2018',
'DOB': datetime(1961, 1, 22),
'Date': datetime(2018, 1, 16),
'Location': '1500 W. Slaughter Lane',
'Time': '5:14 p.m.',
}
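The fixture changes above follow from the parsing layer now returning datetime objects rather than the raw strings; a simplified sketch of that conversion using dateparser (already imported by date_utils), not the project's exact parse_date signature:

import dateparser

def parse_date_sketch(date_string):
    # Simplified: the real parse_date also takes default/settings arguments.
    parsed = dateparser.parse(date_string)
    if parsed is None:
        raise ValueError(f'Cannot parse date: {date_string}')
    return parsed

print(parse_date_sketch('12/30/2018'))  # datetime.datetime(2018, 12, 30, 0, 0)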
2 changes: 1 addition & 1 deletion tests/core/test_date_utils.py
@@ -11,7 +11,7 @@
])
def test_is_in_range_00(current, from_, to, expected):
"""Ensure a date is in range."""
assert date_utils.is_in_range(current, from_, to) == expected
assert (date_utils.from_date(from_) <= date_utils.parse_date(current) <= date_utils.to_date(to)) == expected


@pytest.mark.parametrize('date, default, settings, expected', [
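With is_in_range gone, the test now spells out the comparison inline. Below is a hypothetical simplification of what that assertion checks, assuming from_date and to_date fall back to the widest possible bounds when given None (in the project these helpers also accept raw strings):

from datetime import datetime

def from_date(from_=None):
    # Assumed fallback: no lower bound means the earliest representable datetime.
    return datetime.min if from_ is None else from_

def to_date(to=None):
    # Assumed fallback: no upper bound means the latest representable datetime.
    return datetime.max if to is None else to

def in_range(current, from_=None, to=None):
    # Inline equivalent of the removed is_in_range helper.
    return from_date(from_) <= current <= to_date(to)

print(in_range(datetime(2019, 1, 16), from_=datetime(2019, 1, 1)))  # True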