Extract deceased parsing to its own function

Add a new function, parse_deceased_field(), that extracts the content of the `Deceased` field from the fatality page. An existing function already had that name; it has been renamed process_deceased_field() so the name can be used for the new extraction function. Test cases have been added for the new parse_deceased_field(). Fixes #106
scrapd · Jun 7, 2019 · 75dcb69 · 75dcb69
1 parent 555c3b9
commit 75dcb69
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 7 deletions.
diff --git a/scrapd/core/apd.py b/scrapd/core/apd.py
@@ -264,7 +264,7 @@ def common_fatality_parsing(d):
             deceased_field = ' '.join(deceased_field)
 
         try:
-            d.update(parse_deceased_field(deceased_field))
+            d.update(process_deceased_field(deceased_field))
         except ValueError as e:
             logger.trace(e)
     else:
@@ -341,7 +341,7 @@ def dob_search(split_deceased_field):
     return dob_index
 
 
-def parse_deceased_field(deceased_field):
+def process_deceased_field(deceased_field):
     """
     Parse the deceased field.
 
@@ -511,7 +511,6 @@ def parse_page_content(detail_page, notes_parsed=False):
     """
     d = {}
     searches = [
-        (Fields.DECEASED, re.compile(r'>Deceased:\s*(?:</span>)?(?:</strong>)?\s*>?([^<]*\d)\s*.*\)?<')),
         (Fields.LOCATION, re.compile(r'>Location:.*>\s{2,}(?:</strong>)?([^<]+)')),
     ]
     normalized_detail_page = unicodedata.normalize("NFKD", detail_page)
@@ -533,6 +532,9 @@ def parse_page_content(detail_page, notes_parsed=False):
     if date_field_str:
         d[Fields.DATE] = date_utils.parse_date(date_field_str)
 
+    # Parse the `Deceased` field.
+    d[Fields.DECEASED] = parse_deceased_field(normalized_detail_page)
+
     # Parse the `Time` field.
     d[Fields.TIME] = parse_time_field(normalized_detail_page)
 
@@ -600,6 +602,31 @@ def parse_date_field(page):
     return date[0][1].strftime("%m/%d/%Y") if date else ''
 
 
+def parse_deceased_field(page):
+    """
+    Extract content from deceased field on the fatality page.
+
+    :param str page: the content of the fatality page
+    :return: a string representing the deceased field content.
+    :rtype: str
+    """
+    deceased_pattern = re.compile(
+        r'''
+        >Deceased:      # The name of the desired field.
+        \s*             # Any whitespace character.
+        (?:</span>)?    # Non-capture (literal match).
+        (?:</strong>)?  # Non-capture (literal match).
+        \s*             # Any whitespace character.
+        >?              # Literal match.
+        ([^<]*\d)       # Capture any character/digit except '<'.
+        \s*.*           # Any character/whitespace.
+        \)?<            # Literal match ')' and '<'
+        ''',
+        re.VERBOSE,
+    )
+    return match_pattern(page, deceased_pattern)
+
+
 def parse_time_field(page):
     """
     Extract the time from the content of the fatality page.

diff --git a/tests/core/test_apd.py b/tests/core/test_apd.py
@@ -313,10 +313,10 @@ def test_extract_traffic_fatalities_page_details_link_00(news_page):
         },
     ),
 ))
-def test_parse_deceased_field_00(deceased, expected):
+def test_process_deceased_field_00(deceased, expected):
     """Ensure a deceased field is parsed correctly."""
     d = {}
-    d = apd.parse_deceased_field(deceased)
+    d = apd.process_deceased_field(deceased)
     for key in expected:
         assert d[key] == expected[key]
 
@@ -419,10 +419,10 @@ def test_parse_page_content_00(filename, expected):
 
 
 def test_parse_page_content_01(mocker):
-    """Ensure a `parse_deceased_field` exception is caught and does not propagate."""
+    """Ensure a `process_deceased_field` exception is caught and does not propagate."""
     page_fd = TEST_DATA_DIR / 'traffic-fatality-2-3'
     page = page_fd.read_text()
-    mocker.patch('scrapd.core.apd.parse_deceased_field', side_effect=ValueError)
+    mocker.patch('scrapd.core.apd.process_deceased_field', side_effect=ValueError)
     result = apd.parse_page_content(page)
     assert len(result) == 6
 
@@ -659,3 +659,18 @@ def test_parse_date_field_00(input_, expected):
     """Ensure a date field gets parsed correctly."""
     actual = apd.parse_date_field(input_)
     assert actual == expected
+
+
+@pytest.mark.parametrize('input_,expected', (
+    ('>Deceased: </strong> Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994</p>', \
+    'Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994'),
+    ('>Deceased: </strong> Cecil Wade Walker, White male, D.O.B. 3-7-70<', \
+    'Cecil Wade Walker, White male, D.O.B. 3-7-70'),
+    ('>Deceased: </span></strong> Halbert Glen Hendricks - Black male - 9-24-78<', \
+    'Halbert Glen Hendricks - Black male - 9-24-78'),
+    ('', ''),
+))
+def test_parse_deceased_field_00(input_, expected):
+    """Ensure the deceased field gets parsed correctly."""
+    actual = apd.parse_deceased_field(input_)
+    assert actual == expected