Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse more deceased field formats #90

Merged
merged 1 commit into from Apr 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
129 changes: 111 additions & 18 deletions scrapd/core/apd.py
Expand Up @@ -68,7 +68,6 @@ async def fetch_detail_page(session, url):
:return: the page content.
:rtype: str
"""

return await fetch_text(session, url)


Expand Down Expand Up @@ -286,46 +285,140 @@ def parse_name(name):
return d


def dob_search(split_deceased_field):
    """
    Search for the DOB in a deceased field.

    :param list split_deceased_field: a list representing the deceased field
    :return: the DOB index within the split deceased field, or -1 if no DOB token was found.
    :rtype: int
    """
    # The DOB label appears in many flavors in the reports; try each known spelling.
    dob_tokens = [Fields.DOB, '(D.O.B', '(D.O.B.', '(D.O.B:', '(DOB', '(DOB:', 'D.O.B.', 'DOB:']
    # Iterate in reverse to preserve the original pop()-from-the-end search order.
    for dob_token in reversed(dob_tokens):
        try:
            return split_deceased_field.index(dob_token)
        except ValueError:
            # Token not present; try the next spelling.
            continue
    return -1


def parse_deceased_field(deceased_field):
    """
    Parse the deceased field.

    At this point the deceased field, if it exists, is garbage as it contains First Name, Last Name,
    Ethnicity, Gender, D.O.B. and Notes. We need to explode this data into the appropriate fields.

    :param str deceased_field: the deceased field from the fatality report
    :return: a dictionary representing a deceased field.
    :rtype: dict
    :raises ValueError: if none of the known formats matches the field.
    """
    # Attempt each known layout in turn: comma separated, pipe separated, then space separated.
    # Any failure simply means "wrong layout, try the next one".
    parsers = (
        parse_comma_delimited_deceased_field,
        parse_pipe_delimited_deceased_field,
        parse_space_delimited_deceased_field,
    )
    for parser in parsers:
        try:
            return parser(deceased_field)
        except Exception:
            continue

    raise ValueError(f'Cannot parse {Fields.DECEASED}: {deceased_field}')


def parse_comma_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with commas.

    :param str deceased_field: the deceased field from the fatality report.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    :raises ValueError: if no DOB token can be located in the field.
    """
    d = {}
    # Split on spaces, and on '/' only when preceded by a letter (e.g. 'W/F'),
    # so that dates such as 01/22/1961 are kept intact.
    split_deceased_field = re.split(r' |(?<=[A-Za-z])/', deceased_field)
    dob_index = dob_search(split_deceased_field)
    if dob_index < 0:
        raise ValueError(f'Cannot find DOB in the deceased field: {deceased_field}')
    raw_dob = split_deceased_field[dob_index + 1]
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # Everything after the DOB is considered notes.
    notes = split_deceased_field[dob_index + 2:]
    if notes:
        d[Fields.NOTES] = ' '.join(notes)

    # `fleg` stands for First, Last, Ethnicity, Gender. It represents the info stored before the DOB.
    fleg = split_deceased_field[:dob_index]
    d.update(parse_fleg(fleg))
    return d


def parse_pipe_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with pipes.

    :param str deceased_field: the deceased field as a string.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    """
    d = {}
    split_deceased_field = deceased_field.split('|')
    # The last pipe-delimited segment is the DOB.
    raw_dob = split_deceased_field[-1].strip()
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # `fleg` stands for First, Last, Ethnicity, Gender. Join every segment before the DOB with a
    # space so the tokens stay separated even when a segment has no whitespace around the pipe,
    # and so no segment is silently dropped when there are more than two of them.
    fleg = ' '.join(split_deceased_field[:-1]).split()
    d.update(parse_fleg(fleg))
    return d


def parse_space_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with spaces.

    :param str deceased_field: the deceased field as a string.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    """
    d = {}
    # Split on spaces and on '/' (e.g. 'W/M' -> ['W', 'M']).
    split_deceased_field = re.split(r' |/', deceased_field)
    # The last token is the DOB.
    raw_dob = split_deceased_field[-1].strip()
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # `fleg` stands for First, Last, Ethnicity, Gender. It is everything before the DOB.
    fleg = split_deceased_field[:-1]
    d.update(parse_fleg(fleg))
    return d


def parse_fleg(fleg):
"""
Parse FLEG.

:param list fleg: [description]
:return: [description]
:rtype: dict
"""
# Try to pop out the results one by one. If pop fails, it means there is nothing left to retrieve,
# For example, there is no first name and last name.
d = {}
try:
d[Fields.GENDER] = fleg.pop().replace(',', '').lower()
if d.get(Fields.GENDER) == 'f':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't even realize there were f's and m's...good catch

d[Fields.GENDER] = 'female'
elif d.get(Fields.GENDER) == 'm':
d[Fields.GENDER] = 'male'

d[Fields.ETHNICITY] = fleg.pop().replace(',', '')
if d.get(Fields.ETHNICITY) == 'W':
d[Fields.ETHNICITY] = 'White'
except IndexError:
pass

Expand All @@ -350,7 +443,7 @@ def parse_page_content(detail_page, notes_parsed=False):
(Fields.CASE, re.compile(r'Case:.*\s([0-9\-]+)<')),
(Fields.CRASHES, re.compile(r'Traffic Fatality #(\d{1,3})')),
(Fields.DATE, re.compile(r'>Date:.*\s{2,}([^<]*)</')),
(Fields.DECEASED, re.compile(r'>Deceased:.*\s{2,}([^<]*\d)\)?<')),
(Fields.DECEASED, re.compile(r'>Deceased:\s*(?:</span>)?(?:</strong>)?\s*>?([^<]*\d)\s*.*\)?<')),
(Fields.LOCATION, re.compile(r'>Location:.*>\s{2,}([^<]+)')),
(Fields.TIME, re.compile(r'>Time:.*>\s{2,}([^<]+)')),
]
Expand Down
65 changes: 60 additions & 5 deletions tests/core/test_apd.py
Expand Up @@ -238,22 +238,69 @@ def test_extract_traffic_fatalities_page_details_link_00(news_page):

@pytest.mark.parametrize('deceased,expected', (
    ("Rosbel “Rudy” Tamez, Hispanic male (D.O.B. 10-10-54)", {
        Fields.LAST_NAME: "Tamez",
        Fields.FIRST_NAME: "Rosbel",
        Fields.ETHNICITY: "Hispanic",
        Fields.GENDER: "male",
        Fields.DOB: '10/10/1954',
    }),
    ("Eva Marie Gonzales, W/F, DOB: 01-22-1961 (passenger)", {
        Fields.FIRST_NAME: "Eva",
        Fields.LAST_NAME: "Gonzales",
        Fields.ETHNICITY: "White",
        Fields.GENDER: 'female',
        Fields.DOB: '01/22/1961',
    }),
    (
        'DOB: 01-01-99',
        {
            Fields.DOB: '01/01/1999',
        },
    ),
    (
        'Wing Cheung Chou | Asian male | 08/01/1949',
        {
            Fields.FIRST_NAME: "Wing",
            Fields.LAST_NAME: "Chou",
            Fields.ETHNICITY: "Asian",
            Fields.GENDER: "male",
            Fields.DOB: '08/01/1949',
        },
    ),
    (
        'Christopher M Peterson W/M 10-8-1981',
        {
            Fields.FIRST_NAME: "Christopher",
            Fields.LAST_NAME: "Peterson",
            Fields.ETHNICITY: "White",
            Fields.GENDER: "male",
            Fields.DOB: '10/08/1981',
        },
    ),
    (
        'Luis Angel Tinoco, Hispanic male (11-12-07',
        {
            Fields.FIRST_NAME: "Luis",
            Fields.LAST_NAME: "Tinoco",
            Fields.ETHNICITY: "Hispanic",
            Fields.GENDER: "male",
            Fields.DOB: '11/12/2007',
        },
    ),
    (
        'Ronnie Lee Hall, White male, 8-28-51',
        {
            Fields.FIRST_NAME: "Ronnie",
            Fields.LAST_NAME: "Hall",
            Fields.ETHNICITY: "White",
            Fields.GENDER: "male",
            Fields.DOB: '08/28/1951',
        },
    ),
))
def test_parse_deceased_field_00(deceased, expected):
    """Ensure a deceased field is parsed correctly."""
    d = apd.parse_deceased_field(deceased)
    for key in expected:
        assert d[key] == expected[key]
Expand Down Expand Up @@ -341,6 +388,14 @@ def test_parse_page_content_00(filename, expected):
assert actual == expected


def test_parse_page_content_01(mocker):
    """Ensure a `parse_deceased_field` exception is caught and does not propagate."""
    page_text = (TEST_DATA_DIR / 'traffic-fatality-2-3').read_text()
    mocker.patch('scrapd.core.apd.parse_deceased_field', side_effect=ValueError)
    apd.parse_page_content(page_text)


@pytest.mark.parametrize('filename,expected', [(k, v) for k, v in parse_twitter_fields_scenarios.items()])
def test_parse_twitter_fields_00(filename, expected):
"""Ensure information are properly extracted from the twitter fields on detail page."""
Expand Down
20 changes: 15 additions & 5 deletions tests/core/test_date_utils.py
Expand Up @@ -29,10 +29,20 @@ def test_parse_date_01():
date_utils.parse_date('Not a date')


@pytest.mark.parametrize('date, expected', [
('Jan 10 2019', '01/10/2019'),
('2019-01-10', '01/10/2019'),
@pytest.mark.parametrize('date, dob, expected', [
('Jan 10 2019', False, '01/10/2019'),
('2019-01-10', False, '01/10/2019'),
('10-10-54', True, '10/10/1954'),
])
def test_clean_date_string_00(date, expected):
def test_clean_date_string_00(date, dob, expected):
"""Ensure date string is properly formatted."""
assert date_utils.clean_date_string(date) == expected
assert date_utils.clean_date_string(date, dob) == expected


@pytest.mark.parametrize('date, expected', [
    pytest.param(datetime.datetime(2019, 1, 10), datetime.datetime(2019, 1, 10), id='past-dob-unchanged'),
    pytest.param(datetime.datetime(2054, 10, 10), datetime.datetime(1954, 10, 10), id='future-dob-moved-back'),
])
def test_check_dob_00(date, expected):
    """Ensure a DOB is valid."""
    assert date_utils.check_dob(date) == expected