Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse more deceased field formats #90

Merged
merged 1 commit into from Apr 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
129 changes: 111 additions & 18 deletions scrapd/core/apd.py
Expand Up @@ -68,7 +68,6 @@ async def fetch_detail_page(session, url):
:return: the page content.
:rtype: str
"""

return await fetch_text(session, url)


Expand Down Expand Up @@ -286,46 +285,140 @@ def parse_name(name):
return d


def dob_search(split_deceased_field):
    """
    Search for the DOB in a deceased field.

    :param list split_deceased_field: a list representing the deceased field
    :return: the DOB index within the split deceased field, or -1 if no DOB token was found.
    :rtype: int
    """
    # The DOB label appears in many flavors in the reports; try each known spelling.
    dob_tokens = [Fields.DOB, '(D.O.B', '(D.O.B.', '(D.O.B:', '(DOB', '(DOB:', 'D.O.B.', 'DOB:']
    # Iterate in reverse to preserve the original pop()-from-the-end search order.
    for dob_token in reversed(dob_tokens):
        try:
            return split_deceased_field.index(dob_token)
        except ValueError:
            # Token not present; try the next spelling.
            continue
    return -1


def parse_deceased_field(deceased_field):
    """
    Parse the deceased field.

    At this point the deceased field, if it exists, is garbage as it contains First Name, Last Name,
    Ethnicity, Gender, D.O.B. and Notes. We need to explode this data into the appropriate fields.

    :param str deceased_field: the deceased field from the fatality report
    :return: a dictionary representing a deceased field.
    :rtype: dict
    :raises ValueError: if none of the known formats matches the field.
    """
    # Attempt each known layout in turn: comma separated, pipe separated, then space separated.
    # Any failure simply means "wrong layout, try the next one".
    parsers = (
        parse_comma_delimited_deceased_field,
        parse_pipe_delimited_deceased_field,
        parse_space_delimited_deceased_field,
    )
    for parser in parsers:
        try:
            return parser(deceased_field)
        except Exception:
            continue

    raise ValueError(f'Cannot parse {Fields.DECEASED}: {deceased_field}')


def parse_comma_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with commas.

    :param str deceased_field: the deceased field from the fatality report.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    :raises ValueError: if no DOB token can be located in the field.
    """
    d = {}
    # Split on spaces, and on '/' only when preceded by a letter (e.g. 'W/F'),
    # so that dates such as 01/22/1961 are kept intact.
    split_deceased_field = re.split(r' |(?<=[A-Za-z])/', deceased_field)
    dob_index = dob_search(split_deceased_field)
    if dob_index < 0:
        raise ValueError(f'Cannot find DOB in the deceased field: {deceased_field}')
    raw_dob = split_deceased_field[dob_index + 1]
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # Everything after the DOB is considered notes.
    notes = split_deceased_field[dob_index + 2:]
    if notes:
        d[Fields.NOTES] = ' '.join(notes)

    # `fleg` stands for First, Last, Ethnicity, Gender. It represents the info stored before the DOB.
    fleg = split_deceased_field[:dob_index]
    d.update(parse_fleg(fleg))
    return d


def parse_pipe_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with pipes.

    :param str deceased_field: the deceased field as a string.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    """
    d = {}
    split_deceased_field = deceased_field.split('|')
    # The last pipe-delimited segment is the DOB.
    raw_dob = split_deceased_field[-1].strip()
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # `fleg` stands for First, Last, Ethnicity, Gender. Join every segment before the DOB with a
    # space so the tokens stay separated even when a segment has no whitespace around the pipe,
    # and so no segment is silently dropped when there are more than two of them.
    fleg = ' '.join(split_deceased_field[:-1]).split()
    d.update(parse_fleg(fleg))
    return d


def parse_space_delimited_deceased_field(deceased_field):
    """
    Parse deceased fields separated with spaces.

    :param str deceased_field: the deceased field as a string.
    :return: a dictionary representing the deceased field.
    :rtype: dict
    """
    d = {}
    # Split on spaces and on '/' (e.g. 'W/M' -> ['W', 'M']).
    split_deceased_field = re.split(r' |/', deceased_field)
    # The last token is the DOB.
    raw_dob = split_deceased_field[-1].strip()
    d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)

    # `fleg` stands for First, Last, Ethnicity, Gender. It is everything before the DOB.
    fleg = split_deceased_field[:-1]
    d.update(parse_fleg(fleg))
    return d


def parse_fleg(fleg):
"""
Parse FLEG.

:param list fleg: [description]
:return: [description]
:rtype: dict
"""
# Try to pop out the results one by one. If pop fails, it means there is nothing left to retrieve,
# For example, there is no first name and last name.
d = {}
try:
d[Fields.GENDER] = fleg.pop().replace(',', '').lower()
if d.get(Fields.GENDER) == 'f':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't even realize there were f's and m's...good catch

d[Fields.GENDER] = 'female'
elif d.get(Fields.GENDER) == 'm':
d[Fields.GENDER] = 'male'

d[Fields.ETHNICITY] = fleg.pop().replace(',', '')
if d.get(Fields.ETHNICITY) == 'W':
d[Fields.ETHNICITY] = 'White'
except IndexError:
pass

Expand All @@ -350,7 +443,7 @@ def parse_page_content(detail_page, notes_parsed=False):
(Fields.CASE, re.compile(r'Case:.*\s([0-9\-]+)<')),
(Fields.CRASHES, re.compile(r'Traffic Fatality #(\d{1,3})')),
(Fields.DATE, re.compile(r'>Date:.*\s{2,}([^<]*)</')),
(Fields.DECEASED, re.compile(r'>Deceased:.*\s{2,}([^<]*\d)\)?<')),
(Fields.DECEASED, re.compile(r'>Deceased:\s*(?:</span>)?(?:</strong>)?\s*>?([^<]*\d)\s*.*\)?<')),
(Fields.LOCATION, re.compile(r'>Location:.*>\s{2,}([^<]+)')),
(Fields.TIME, re.compile(r'>Time:.*>\s{2,}([^<]+)')),
]
Expand Down
65 changes: 60 additions & 5 deletions tests/core/test_apd.py
Expand Up @@ -238,22 +238,69 @@ def test_extract_traffic_fatalities_page_details_link_00(news_page):

@pytest.mark.parametrize('deceased,expected', (
    ("Rosbel “Rudy” Tamez, Hispanic male (D.O.B. 10-10-54)", {
        Fields.LAST_NAME: "Tamez",
        Fields.FIRST_NAME: "Rosbel",
        Fields.ETHNICITY: "Hispanic",
        Fields.GENDER: "male",
        Fields.DOB: '10/10/1954',
    }),
    ("Eva Marie Gonzales, W/F, DOB: 01-22-1961 (passenger)", {
        Fields.FIRST_NAME: "Eva",
        Fields.LAST_NAME: "Gonzales",
        Fields.ETHNICITY: "White",
        Fields.GENDER: 'female',
        Fields.DOB: '01/22/1961',
    }),
    (
        'DOB: 01-01-99',
        {
            Fields.DOB: '01/01/1999',
        },
    ),
    (
        'Wing Cheung Chou | Asian male | 08/01/1949',
        {
            Fields.FIRST_NAME: "Wing",
            Fields.LAST_NAME: "Chou",
            Fields.ETHNICITY: "Asian",
            Fields.GENDER: "male",
            Fields.DOB: '08/01/1949',
        },
    ),
    (
        'Christopher M Peterson W/M 10-8-1981',
        {
            Fields.FIRST_NAME: "Christopher",
            Fields.LAST_NAME: "Peterson",
            Fields.ETHNICITY: "White",
            Fields.GENDER: "male",
            Fields.DOB: '10/08/1981',
        },
    ),
    (
        'Luis Angel Tinoco, Hispanic male (11-12-07',
        {
            Fields.FIRST_NAME: "Luis",
            Fields.LAST_NAME: "Tinoco",
            Fields.ETHNICITY: "Hispanic",
            Fields.GENDER: "male",
            Fields.DOB: '11/12/2007',
        },
    ),
    (
        'Ronnie Lee Hall, White male, 8-28-51',
        {
            Fields.FIRST_NAME: "Ronnie",
            Fields.LAST_NAME: "Hall",
            Fields.ETHNICITY: "White",
            Fields.GENDER: "male",
            Fields.DOB: '08/28/1951',
        },
    ),
))
def test_parse_deceased_field_00(deceased, expected):
    """Ensure a deceased field is parsed correctly."""
    d = apd.parse_deceased_field(deceased)
    for key in expected:
        assert d[key] == expected[key]
Expand Down Expand Up @@ -341,6 +388,14 @@ def test_parse_page_content_00(filename, expected):
assert actual == expected


def test_parse_page_content_01(mocker):
    """Ensure a `parse_deceased_field` exception is caught and does not propagate."""
    page_text = (TEST_DATA_DIR / 'traffic-fatality-2-3').read_text()
    mocker.patch('scrapd.core.apd.parse_deceased_field', side_effect=ValueError)
    apd.parse_page_content(page_text)


@pytest.mark.parametrize('filename,expected', [(k, v) for k, v in parse_twitter_fields_scenarios.items()])
def test_parse_twitter_fields_00(filename, expected):
"""Ensure information are properly extracted from the twitter fields on detail page."""
Expand Down
20 changes: 15 additions & 5 deletions tests/core/test_date_utils.py
Expand Up @@ -29,10 +29,20 @@ def test_parse_date_01():
date_utils.parse_date('Not a date')


@pytest.mark.parametrize('date, expected', [
('Jan 10 2019', '01/10/2019'),
('2019-01-10', '01/10/2019'),
@pytest.mark.parametrize('date, dob, expected', [
('Jan 10 2019', False, '01/10/2019'),
('2019-01-10', False, '01/10/2019'),
('10-10-54', True, '10/10/1954'),
])
def test_clean_date_string_00(date, expected):
def test_clean_date_string_00(date, dob, expected):
"""Ensure date string is properly formatted."""
assert date_utils.clean_date_string(date) == expected
assert date_utils.clean_date_string(date, dob) == expected


@pytest.mark.parametrize('date, expected', [
    pytest.param(datetime.datetime(2019, 1, 10), datetime.datetime(2019, 1, 10), id='past-dob-unchanged'),
    pytest.param(datetime.datetime(2054, 10, 10), datetime.datetime(1954, 10, 10), id='future-dob-moved-back'),
])
def test_check_dob_00(date, expected):
    """Ensure a DOB is valid."""
    assert date_utils.check_dob(date) == expected