Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Commit

Permalink
Extract deceased parsing to its own function
Browse files Browse the repository at this point in the history
The function parse_deceased_field() was created to pull content from the
corresponding field on the fatality page. There was an existing function
of the same name, which has now been renamed process_deceased_field().
Test cases have been made for the new parse_deceased_field().

Fixes #106
  • Loading branch information
anthonybaulo committed Jun 7, 2019
1 parent 555c3b9 commit 75dcb69
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 7 deletions.
33 changes: 30 additions & 3 deletions scrapd/core/apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def common_fatality_parsing(d):
deceased_field = ' '.join(deceased_field)

try:
d.update(parse_deceased_field(deceased_field))
d.update(process_deceased_field(deceased_field))
except ValueError as e:
logger.trace(e)
else:
Expand Down Expand Up @@ -341,7 +341,7 @@ def dob_search(split_deceased_field):
return dob_index


def parse_deceased_field(deceased_field):
def process_deceased_field(deceased_field):
"""
Parse the deceased field.
Expand Down Expand Up @@ -511,7 +511,6 @@ def parse_page_content(detail_page, notes_parsed=False):
"""
d = {}
searches = [
(Fields.DECEASED, re.compile(r'>Deceased:\s*(?:</span>)?(?:</strong>)?\s*>?([^<]*\d)\s*.*\)?<')),
(Fields.LOCATION, re.compile(r'>Location:.*>\s{2,}(?:</strong>)?([^<]+)')),
]
normalized_detail_page = unicodedata.normalize("NFKD", detail_page)
Expand All @@ -533,6 +532,9 @@ def parse_page_content(detail_page, notes_parsed=False):
if date_field_str:
d[Fields.DATE] = date_utils.parse_date(date_field_str)

# Parse the `Deceased` field.
d[Fields.DECEASED] = parse_deceased_field(normalized_detail_page)

# Parse the `Time` field.
d[Fields.TIME] = parse_time_field(normalized_detail_page)

Expand Down Expand Up @@ -600,6 +602,31 @@ def parse_date_field(page):
return date[0][1].strftime("%m/%d/%Y") if date else ''


def parse_deceased_field(page):
    """
    Extract content from deceased field on the fatality page.

    The pattern looks for the ``>Deceased:`` label, skips optional closing
    ``</span>``/``</strong>`` tags and surrounding whitespace, then captures
    the text that follows. The capture cannot contain ``<`` and must end on
    a digit; because the quantifier is greedy it normally stops at the last
    digit before the next ``<``.

    The actual search is delegated to ``match_pattern``; the value returned
    when nothing matches is decided there (presumably an empty string --
    confirm against ``match_pattern``).

    :param str page: the content of the fatality page
    :return: a string representing the deceased field content.
    :rtype: str
    """
    deceased_pattern = re.compile(
        r'''
        >Deceased:      # Field label, right after the > that closes the previous tag.
        \s*             # Optional whitespace.
        (?:</span>)?    # Optional closing span tag (non-capturing group).
        (?:</strong>)?  # Optional closing strong tag (non-capturing group).
        \s*             # Optional whitespace.
        >?              # Optional stray > character.
        ([^<]*\d)       # Capture: any characters except <, ending on a digit (greedy).
        \s*.*           # Trailing whitespace, then the rest of the line (. stops at a newline).
        \)?<            # Optional closing parenthesis, then the < that opens the next tag.
        ''',
        re.VERBOSE,
    )
    return match_pattern(page, deceased_pattern)


def parse_time_field(page):
"""
Extract the time from the content of the fatality page.
Expand Down
23 changes: 19 additions & 4 deletions tests/core/test_apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,10 +313,10 @@ def test_extract_traffic_fatalities_page_details_link_00(news_page):
},
),
))
def test_parse_deceased_field_00(deceased, expected):
def test_process_deceased_field_00(deceased, expected):
"""Ensure a deceased field is parsed correctly."""
d = {}
d = apd.parse_deceased_field(deceased)
d = apd.process_deceased_field(deceased)
for key in expected:
assert d[key] == expected[key]

Expand Down Expand Up @@ -419,10 +419,10 @@ def test_parse_page_content_00(filename, expected):


def test_parse_page_content_01(mocker):
"""Ensure a `parse_deceased_field` exception is caught and does not propagate."""
"""Ensure a `process_deceased_field` exception is caught and does not propagate."""
page_fd = TEST_DATA_DIR / 'traffic-fatality-2-3'
page = page_fd.read_text()
mocker.patch('scrapd.core.apd.parse_deceased_field', side_effect=ValueError)
mocker.patch('scrapd.core.apd.process_deceased_field', side_effect=ValueError)
result = apd.parse_page_content(page)
assert len(result) == 6

Expand Down Expand Up @@ -659,3 +659,18 @@ def test_parse_date_field_00(input_, expected):
"""Ensure a date field gets parsed correctly."""
actual = apd.parse_date_field(input_)
assert actual == expected


@pytest.mark.parametrize('input_,expected', (
    (
        '>Deceased: </strong> Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994</p>',
        'Luis Fernando Martinez-Vertiz | Hispanic male | 04/03/1994',
    ),
    (
        '>Deceased: </strong> Cecil Wade Walker, White male, D.O.B. 3-7-70<',
        'Cecil Wade Walker, White male, D.O.B. 3-7-70',
    ),
    (
        '>Deceased: </span></strong> Halbert Glen Hendricks - Black male - 9-24-78<',
        'Halbert Glen Hendricks - Black male - 9-24-78',
    ),
    # An empty page is expected to yield an empty string.
    ('', ''),
))
def test_parse_deceased_field_00(input_, expected):
    """Check that the raw deceased text is extracted from each HTML snippet."""
    assert apd.parse_deceased_field(input_) == expected

0 comments on commit 75dcb69

Please sign in to comment.