diff --git a/scrapd/core/apd.py b/scrapd/core/apd.py index f8f662c..9842d70 100644 --- a/scrapd/core/apd.py +++ b/scrapd/core/apd.py @@ -264,7 +264,7 @@ def common_fatality_parsing(d): deceased_field = ' '.join(deceased_field) try: - d.update(parse_deceased_field(deceased_field)) + d.update(process_deceased_field(deceased_field)) except ValueError as e: logger.trace(e) else: @@ -341,7 +341,7 @@ def dob_search(split_deceased_field): return dob_index -def parse_deceased_field(deceased_field): +def process_deceased_field(deceased_field): """ Parse the deceased field. @@ -511,7 +511,6 @@ def parse_page_content(detail_page, notes_parsed=False): """ d = {} searches = [ - (Fields.DECEASED, re.compile(r'>Deceased:\s*(?:)?(?:)?\s*>?([^<]*\d)\s*.*\)?<')), (Fields.LOCATION, re.compile(r'>Location:.*>\s{2,}(?:)?([^<]+)')), ] normalized_detail_page = unicodedata.normalize("NFKD", detail_page) @@ -533,6 +532,9 @@ def parse_page_content(detail_page, notes_parsed=False): if date_field_str: d[Fields.DATE] = date_utils.parse_date(date_field_str) + # Parse the `Deceased` field. + d[Fields.DECEASED] = parse_deceased_field(normalized_detail_page) + # Parse the `Time` field. d[Fields.TIME] = parse_time_field(normalized_detail_page) @@ -600,6 +602,31 @@ def parse_date_field(page): return date[0][1].strftime("%m/%d/%Y") if date else '' +def parse_deceased_field(page): + """ + Extract content from deceased field on the fatality page. + + :param str page: the content of the fatality page + :return: a string representing the deceased field content. + :rtype: str + """ + deceased_pattern = re.compile( + r''' + >Deceased: # The name of the desired field. + \s* # Any whitespace character. + (?:)? # Non-capture (literal match). + (?:)? # Non-capture (literal match). + \s* # Any whitespace character. + >? # Literal match. + ([^<]*\d) # Capture any character/digit except '<'. + \s*.* # Any character/whitespace. + \)?< # Literal match ')' and '<' + ''', + re.VERBOSE, + ) + return match_pattern(page, deceased_pattern) + + def parse_time_field(page): """ Extract the time from the content of the fatality page. diff --git a/tests/core/test_apd.py b/tests/core/test_apd.py index c1f2dcd..bd5c39b 100644 --- a/tests/core/test_apd.py +++ b/tests/core/test_apd.py @@ -313,10 +313,10 @@ def test_extract_traffic_fatalities_page_details_link_00(news_page): }, ), )) -def test_parse_deceased_field_00(deceased, expected): +def test_process_deceased_field_00(deceased, expected): """Ensure a deceased field is parsed correctly.""" d = {} - d = apd.parse_deceased_field(deceased) + d = apd.process_deceased_field(deceased) for key in expected: assert d[key] == expected[key] @@ -419,10 +419,10 @@ def test_parse_page_content_00(filename, expected): def test_parse_page_content_01(mocker): - """Ensure a `parse_deceased_field` exception is caught and does not propagate.""" + """Ensure a `process_deceased_field` exception is caught and does not propagate.""" page_fd = TEST_DATA_DIR / 'traffic-fatality-2-3' page = page_fd.read_text() - mocker.patch('scrapd.core.apd.parse_deceased_field', side_effect=ValueError) + mocker.patch('scrapd.core.apd.process_deceased_field', side_effect=ValueError) result = apd.parse_page_content(page) assert len(result) == 6 @@ -659,3 +659,18 @@ def test_parse_date_field_00(input_, expected): """Ensure a date field gets parsed correctly.""" actual = apd.parse_date_field(input_) assert actual == expected + + +@pytest.mark.parametrize('input_,expected', ( + ('>Deceased: Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994

', \ + 'Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994'), + ('>Deceased: Cecil Wade Walker, White male, D.O.B. 3-7-70<', \ + 'Cecil Wade Walker, White male, D.O.B. 3-7-70'), + ('>Deceased: Halbert Glen Hendricks - Black male - 9-24-78<', \ + 'Halbert Glen Hendricks - Black male - 9-24-78'), + ('', ''), +)) +def test_parse_deceased_field_00(input_, expected): + """Ensure the deceased field gets parsed correctly.""" + actual = apd.parse_deceased_field(input_) + assert actual == expected