Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse notes from detail page #60

Merged
merged 19 commits into from
Mar 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 56 additions & 6 deletions scrapd/core/apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,55 @@ def parse_twitter_description(twitter_description):
return sanitize_fatality_entity(d)


def parse_details_page_notes(details_page_notes):
    """
    Clean up a details page notes section.

    The purpose of this function is to attempt to extract the sentences about
    the crash with some level of fidelity, but does not always return
    a perfectly parsed sentence as the HTML syntax varies widely.

    :param str details_page_notes: the HTML chunk following the Deceased information
    :return: A paragraph containing the details of the fatality in sentence form.
    :rtype: str
    """
    # Named tag constants replace the magic string/offset pairs used when
    # slicing the HTML snippet.
    P_TAG = '<p>'
    P_END_TAG = '</p>'
    # NOTE(review): the scraped pages apparently contain this malformed break
    # tag (backslash, not slash) — confirm against the APD site markup.
    BR_TAG = r'<br \>'

    # Ideally the Notes will be contained in a paragraph tag.
    start_tag = details_page_notes.find(P_TAG) + len(P_TAG)
    end_tag = details_page_notes.find(P_END_TAG, start_tag)

    # `.upper().isupper()` is True only when the substring contains at least
    # one letter. If it contains none, the Notes may be located after a break
    # tag instead of inside the paragraph.
    if not details_page_notes[start_tag:end_tag].upper().isupper():
        start_tag = details_page_notes.find(BR_TAG) + len(BR_TAG)

    snippet = details_page_notes[start_tag:end_tag]

    # Update the snippet if the following tag is an image: skip past the
    # break tag that follows it.
    if snippet.startswith('<img'):
        snippet = details_page_notes[details_page_notes.find(BR_TAG) + len(BR_TAG):end_tag]

    # Remove the end of line characters.
    squished = snippet.replace('\n', ' ')

    # Look for the first capital letter and start from there; fall back to the
    # start of the string when no capital letter is present.
    first_cap = next((index for index, c in enumerate(squished) if c.isupper()), 0)

    return squished[first_cap:]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a bunch of magic numbers in this function, they could be replaced with something more readable.

An idea would be to define constants and then use their length with len().

BR_TAG = r'<br \>'
start_tag = details_description.find(BR_TAG) + len(BR_TAG)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea


def sanitize_fatality_entity(d):
"""
Clean up a fatality entity.

Ensures that the values are all strings and removes the 'Deceased' field which does not contain
relevant information anymore.

:param dict d: the fatality to sanatize
:param dict d: the fatality to sanitize
:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""
Expand Down Expand Up @@ -255,7 +296,7 @@ def parse_deceased_field(deceased_field):
return d


def parse_page_content(detail_page):
def parse_page_content(detail_page, notes_parsed=False):
"""
Parse the detail page to extract fatality information.

Expand All @@ -277,15 +318,21 @@ def parse_page_content(detail_page):
match = re.search(search[1], normalized_detail_page)
if match:
d[search[0]] = match.groups()[0]

# Parse the Deceased field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED).split()))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No decease information to parse in fatality page.')
logger.trace('No deceased information to parse in fatality page.')

# Fill in Notes from Details page if not in twitter description.
search_notes = re.compile(r'>Deceased:.*\s{2,}(.|\n)*?<\/p>(.|\n)*?<\/p>')
match = re.search(search_notes, normalized_detail_page)
if match and not notes_parsed:
text_chunk = match.string[match.start(0):match.end(0)]
d[Fields.NOTES] = parse_details_page_notes(text_chunk)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
Expand Down Expand Up @@ -327,10 +374,13 @@ def parse_page(page):
"""
# Parse the page.
twitter_d = parse_twitter_fields(page)
page_d = parse_page_content(page)
if twitter_d.get(Fields.NOTES):
page_d = parse_page_content(page, True)
else:
page_d = parse_page_content(page, False)

# Merge the results, from right to left.
# (i.e. the rightmost object will overiide the object just before it, etc.)
# (i.e. the rightmost object will override the object just before it, etc.)
d = {**page_d, **twitter_d}
return d

Expand Down
22 changes: 19 additions & 3 deletions tests/core/test_apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,10 @@ def test_parse_twitter_description_00():
'Time': '2:24 a.m.',
'Location': '1400 E. Highway 71 eastbound',
'DOB': '02/09/1980',
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.',
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was '
'traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. '
'The truck went across the E. Highway 71 and US Highway 183 ramp, rolled '
'and came to a stop north of the roadway.',
'Gender': 'male',
'Ethnicity': 'White',
'Last Name': 'Sabillon-Garcia',
Expand Down Expand Up @@ -183,6 +186,13 @@ def test_parse_twitter_description_03():
assert actual == expected


def test_parse_details_page_notes_01():
    """Ensure a malformed entry is not parsed.

    A tag-only string contains no sentence text, so the notes parser must
    return an empty result.
    """
    # Bug fix: the test previously called apd.parse_twitter_description, so
    # the function named by this test was never exercised.
    actual = apd.parse_details_page_notes(mock_data.details_page_notes_01)
    expected = ''
    assert actual == expected


def test_extract_traffic_fatalities_page_details_link_00(news_page):
"""Ensure page detail links are extracted from news page."""
actual = apd.extract_traffic_fatalities_page_details_link(news_page)
Expand Down Expand Up @@ -236,10 +246,13 @@ def test_has_next_01():

@pytest.mark.parametrize('filename,expected', list(parse_page_content_scenarios.items()))
def test_parse_page_content_00(filename, expected):
    """Ensure information are properly extracted from the content detail page.

    The Notes field is ignored when it was parsed from the details page and
    the scenario does not define an expected value for it.
    """
    page = (TEST_DATA_DIR / filename).read_text()
    actual = apd.parse_page_content(page)
    # Drop a details-page-derived Notes entry that the scenario does not cover.
    if 'Notes' not in expected:
        actual.pop('Notes', None)
    assert actual == expected


Expand All @@ -254,10 +267,13 @@ def test_parse_twitter_fields_00(filename, expected):

@pytest.mark.parametrize('filename,expected', list(parse_page_scenarios.items()))
def test_parse_page_00(filename, expected):
    """Ensure information are properly extracted from the page.

    The Notes field is ignored when it was parsed from the details page and
    the scenario does not define an expected value for it.
    """
    page = (TEST_DATA_DIR / filename).read_text()
    actual = apd.parse_page(page)
    # Drop a details-page-derived Notes entry that the scenario does not cover.
    if 'Notes' not in expected:
        actual.pop('Notes', None)
    assert actual == expected


Expand Down
7 changes: 4 additions & 3 deletions tests/mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
twitter_title_00 = "Traffic Fatality #73"
twitter_description_00 = """
Case: 18-3640187 Date: December 30, 2018 Time: 2:24 a.m. Location: 1400 E. Highway
71 eastbound Deceased: Corbin Sabillon-Garcia, White male, DOB 02/09/80 The preliminary investigation shows that a
2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck
went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.
71 eastbound Deceased: Corbin Sabillon-Garcia, White male, DOB 02/09/80 The preliminary investigation shows that a
2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck
went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.
"""
twitter_description_01 = "Case: 19-0161105"
twitter_description_02 = "Case: 18-160882 Date: Tuesday, January 16, 2018 Time: 5:14 p.m. Location: 1500 W. Slaughter Lane Deceased: Eva Marie Gonzales, W/F, DOB: 01-22-1961 (passenger)"
twitter_description_03 = "APD is asking any businesses in the area of East Cesar Chavez and Adam L. Chapa Sr. streets to check their surveillance cameras between 2 and 2:10 a.m. on Oct. 10, 2018, for this suspect vehicle. See video of suspect vehicle here --&gt; https://youtu.be/ezxaRW79PnI"
details_page_notes_01 = "<><>"
3 changes: 2 additions & 1 deletion tests/step_defs/test_retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def time_range(from_date, to_date):
@then('the generated file must contain <entry_count> entries')
@pytest.mark.asyncio
def ensure_results(mocker, event_loop, output_format, time_range, entry_count):
    """Ensure we get the right amount of entries.

    BDD step: retrieves the entries for the scenario's time range and checks
    only the entry count — the parsed Notes are not compared to the entries.
    """
    # Run the async retrieval to completion on the test event loop.
    # NOTE(review): pages=-1 presumably means "no page limit" — confirm
    # against apd.async_retrieve.
    result, _ = event_loop.run_until_complete(apd.async_retrieve(pages=-1, **time_range))
    assert result is not None
    assert len(result) == entry_count