This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse Age without Date of Birth #125

Merged

29 commits merged on May 23, 2019
Changes from 12 commits

Commits (29)
ad536b6
use first item of deceased_field for First Name
mscarey Apr 17, 2019
65c9e87
make missing name data throw IndexError
mscarey Apr 18, 2019
9fb41cc
add name with punctuation to parsing test
mscarey Apr 20, 2019
3698f8f
move split method within deceased field parser
mscarey Apr 23, 2019
7ffadbb
split deceased field on some slashes
mscarey Apr 23, 2019
510032e
Merge https://github.com/scrapd/scrapd
mscarey Apr 26, 2019
7f40cde
Merge remote-tracking branch 'origin'
mscarey May 15, 2019
4fa089c
change parsing tests to expect datetimes
mscarey May 16, 2019
304879e
change date string calculations to datetime
mscarey May 16, 2019
097aaf7
update more tests to expect datetime objects
mscarey May 16, 2019
9b7dc35
add function to parse deceased field without DOB
mscarey May 16, 2019
8ba1ae7
run yapf format
mscarey May 16, 2019
a2a1ec8
change datetime objects to date objects
mscarey May 20, 2019
efffc77
add tests for formatters
mscarey May 20, 2019
d61d162
import date_utils functions by name
mscarey May 20, 2019
cabdf73
add date serializing to formatters
mscarey May 20, 2019
fabe8cd
run yapf format
mscarey May 20, 2019
c400595
remove ValueError for failed age parsing.
mscarey May 20, 2019
306b6fd
slightly delay conversion of datetime to date
mscarey May 20, 2019
9ea122c
add line breaks for formatting tests
mscarey May 22, 2019
394fbed
rephrase first lines of docstrings for pydocstyle
mscarey May 22, 2019
6fbd7a8
use strftime, not isoformat, for date format
mscarey May 22, 2019
848debf
add yapf line break
mscarey May 22, 2019
b10aeaa
delete duplicate test
mscarey May 22, 2019
6b0cfbf
change CSV formatter to month-first date format
mscarey May 22, 2019
a27a9e4
change to module-level import of date_utils
mscarey May 22, 2019
ec32be9
change DOB to datetime.date earlier
mscarey May 23, 2019
730c83b
add yapf line breaks
mscarey May 23, 2019
ccd34d7
Merge branch 'master' of https://github.com/scrapd/scrapd into age
mscarey May 23, 2019
85 changes: 50 additions & 35 deletions scrapd/core/apd.py
@@ -174,19 +174,6 @@ def parse_twitter_description(twitter_description):
if tmp_dob and isinstance(tmp_dob, list):
d[Fields.DOB] = tmp_dob[0]

# Parse the Deceased field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(' '.join(d.get(Fields.DECEASED))))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No decease information to parse in Twitter description.')

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(' '.join(d.get(Fields.DATE)), d.get(Fields.DOB))

return sanitize_fatality_entity(d)


@@ -251,6 +238,7 @@ def parse_details_page_notes(details_page_notes):
def sanitize_fatality_entity(d):
"""
Clean up a fatality entity.
Performs parsing common to Twitter descriptions and page content.

Ensures that the values are all strings and removes the 'Deceased' field which does not contain
relevant information anymore.
@@ -259,21 +247,37 @@ def sanitize_fatality_entity(d):
:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""

# All values must be strings.
for k, v in d.items():
if isinstance(v, list):
d[k] = ' '.join(v)

Review comment (Member):

This part should go to the sanitize_fatality_entity() function. The goal of the sanitizing function was to ensure that the final values had the right format, or that unnecessary or invalid values (like None or empty ones) were discarded.

Reply (Contributor Author):

Currently that part needs to happen before the deceased field is parsed, while sanitize_fatality_entity() has to happen afterwards.

if d.get('Date'):
d['Date'] = date_utils.clean_date_string(d['Date'])

if d.get('DOB'):
d['DOB'] = date_utils.clean_date_string(d['DOB'], True)
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED)))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No deceased information to parse in fatality page.')

# The 'Deceased' field is unnecessary.
if d.get('Deceased'):
del d['Deceased']

# Parse the `Date` field.
if d.get(Fields.DATE):
d[Fields.DATE] = date_utils.parse_date(d[Fields.DATE])

# Parse the `DOB` field.
if d.get(Fields.DOB):
dob_guess = date_utils.parse_date(d[Fields.DOB])
d[Fields.DOB] = date_utils.check_dob(dob_guess)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return d


@@ -345,14 +349,38 @@ def parse_deceased_field(deceased_field):
except Exception:
pass

# Try to parse the deceased fields assuming it contains an age.
try:
return parse_age_deceased_field(deceased_field)
except Exception:
pass

raise ValueError(f'Cannot parse {Fields.DECEASED}: {deceased_field}')


def parse_age_deceased_field(deceased_field):
"""
Parse deceased field assuming it contains an age.

:param str deceased_field: the deceased field
:return: a dictionary representing the deceased field.
:rtype: dict
"""
age_pattern = re.compile(r'([0-9]+) years')
age = re.search(age_pattern, deceased_field).group(1)
if not age:
raise ValueError(f'Cannot find age in the deceased field: {deceased_field}')
split_deceased_field = age_pattern.split(deceased_field)
d = parse_fleg(split_deceased_field[0].split())
d[Fields.AGE] = int(age)
return d


def parse_comma_delimited_deceased_field(deceased_field):
"""
Parse deceased fields seperated with commas.

:param list split_deceased_field: a list representing the deceased field
:param str deceased_field: a list representing the deceased field
:return: a dictionary representing the deceased field.
:rtype: dict
"""
@@ -480,26 +508,13 @@ def parse_page_content(detail_page, notes_parsed=False):
# Parse the `Crashes` field.
d[Fields.CRASHES] = parse_crashes_field(normalized_detail_page)

# Parse the `Deceased` field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED)))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No deceased information to parse in fatality page.')

# Fill in Notes from Details page if not in twitter description.
search_notes = re.compile(r'>Deceased:.*\s{2,}(.|\n)*?<\/p>(.|\n)*?<\/p>')
match = re.search(search_notes, normalized_detail_page)
if match and not notes_parsed:
text_chunk = match.string[match.start(0):match.end(0)]
d[Fields.NOTES] = parse_details_page_notes(text_chunk)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return sanitize_fatality_entity(d)


@@ -688,16 +703,16 @@ async def async_retrieve(pages=-1, from_=None, to=None):
# If the page contains fatalities, ensure all of them happened within the specified time range.
if page_res:
entries_in_time_range = [
entry for entry in page_res if date_utils.is_in_range(entry[Fields.DATE], from_, to)
entry for entry in page_res
if date_utils.from_date(from_) <= entry[Fields.DATE] <= date_utils.to_date(to)
]

# If 2 pages in a row:
# 1) contain results
# 2) but none of them contain dates within the time range
# 3) and we did not collect any valid entries
# Then we can stop the operation.
if from_ and all([date_utils.is_posterior(entry[Fields.DATE], from_)
for entry in page_res]) and not has_entries:
if from_ and all([entry[Fields.DATE] < date_utils.from_date(from_) for entry in page_res]) and not has_entries:
no_date_within_range_count += 1
if no_date_within_range_count > 1:
logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')
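As a side note, here is a minimal standalone sketch of the age-only parsing path added to apd.py above, assuming a deceased string of the form 'Name, ethnicity gender, NN years of age'. fake_parse_fleg below is a hypothetical stand-in for the project's parse_fleg helper (which extracts first name, last name, ethnicity, and gender) and is not part of this PR:

import re

def fake_parse_fleg(tokens):
    # Hypothetical stand-in: the real parse_fleg maps tokens to First Name,
    # Last Name, Ethnicity, and Gender. Here we only pull the first two tokens.
    return {'First Name': tokens[0], 'Last Name': tokens[1].rstrip(',')}

def parse_age_only_deceased(deceased_field):
    # Find the age chunk ("NN years"), mirroring the regex used in the PR.
    age_pattern = re.compile(r'([0-9]+) years')
    match = age_pattern.search(deceased_field)
    if not match:
        raise ValueError(f'Cannot find age in the deceased field: {deceased_field}')
    # Everything before the age chunk carries the name/ethnicity/gender part.
    before_age = age_pattern.split(deceased_field)[0]
    d = fake_parse_fleg(before_age.split())
    d['Age'] = int(match.group(1))
    return d

print(parse_age_only_deceased('Jane Doe, White female, 45 years of age'))
# {'First Name': 'Jane', 'Last Name': 'Doe', 'Age': 45}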
37 changes: 3 additions & 34 deletions scrapd/core/date_utils.py
@@ -4,19 +4,6 @@
import dateparser


def is_posterior(d1, d2):
"""
Return True is d1 is posterior to d2 (i.e. it happened after).

:param str d1: date 1
:param str d2: date 2
:return: True is d1 is posterior to d2
:rtype: bool
"""

return parse_date(d1) < parse_date(d2)


def check_dob(dob):
"""
In case that a date only contains 2 digits, determine century.
@@ -103,34 +90,16 @@ def parse_date(date, default=None, settings=None):
raise Exception


def is_in_range(date, from_=None, to=None):
"""
Check whether a date is comprised between 2 others.

:param str date: date to vheck
:param str from_: start date, defaults to None
:param str to: end date, defaults to None
:return: `True` if the date is between `from_` and `to`
:rtype: bool
"""
current_date = parse_date(date)
from_date_ = from_date(from_)
to_date_ = to_date(to)

return from_date_ <= current_date <= to_date_


def compute_age(date, dob):
"""
Compute a victim's age.

:param str date: crash date
:param str dob: date of birth
:param datetime.datetime date: crash date
:param datetime.datetime dob: date of birth
:return: the victim's age.
:rtype: int
"""
DAYS_IN_YEAR = 365
dob_ = parse_date(dob)

# Compute the age.
return (parse_date(date) - check_dob(dob_)).days // DAYS_IN_YEAR
return (date - dob).days // DAYS_IN_YEAR
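For reference, a worked example of the new compute_age contract, under the assumption that both arguments are already parsed date objects (the 365-day integer division mirrors the implementation above):

from datetime import date

DAYS_IN_YEAR = 365

def compute_age(crash_date, dob):
    # Both arguments are assumed to be date objects; no string parsing here.
    return (crash_date - dob).days // DAYS_IN_YEAR

print(compute_age(date(2018, 12, 30), date(1980, 2, 9)))  # 38, matching the test fixture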
29 changes: 15 additions & 14 deletions tests/core/test_apd.py
@@ -1,4 +1,5 @@
"""Test the APD module."""
from datetime import datetime
from unittest import mock

import aiohttp
@@ -46,8 +47,8 @@ def news_page(scope='session'):
Fields.AGE: 38,
Fields.CASE: '18-3640187',
Fields.CRASHES: '73',
Fields.DOB: '02/09/1980',
Fields.DATE: '12/30/2018',
Fields.DOB: datetime(1980, 2, 9),
Fields.DATE: datetime(2018, 12, 30),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Corbin',
Fields.GENDER: 'male',
@@ -63,14 +64,14 @@ def news_page(scope='session'):
'traffic-fatality-72-1': {
Fields.CASE: '18-3551763',
Fields.CRASHES: '72',
Fields.DATE: '12/21/2018',
Fields.DATE: datetime(2018, 12, 21),
Fields.LOCATION: '9500 N Mopac SB',
Fields.TIME: '8:20 p.m.',
},
'traffic-fatality-71-2': {
Fields.CASE: '18-3381590',
Fields.CRASHES: '71',
Fields.DATE: '12/04/2018',
Fields.DATE: datetime(2018, 12, 4),
Fields.LOCATION: '183 service road westbound and Payton Gin Rd.',
Fields.TIME: '8:39 p.m.',
},
@@ -81,8 +82,8 @@ def news_page(scope='session'):
**parse_twitter_fields_scenarios['traffic-fatality-2-3'],
Fields.AGE: 58,
Fields.CRASHES: '2',
Fields.DOB: '02/15/1960',
Fields.DATE: '01/16/2019',
Fields.DOB: datetime(1960, 2, 15),
Fields.DATE: datetime(2019, 1, 16),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Ann',
Fields.GENDER: 'female',
@@ -94,8 +95,8 @@ def news_page(scope='session'):
Fields.AGE: 38,
Fields.CASE: '18-3640187',
Fields.CRASHES: '73',
Fields.DOB: '02/09/1980',
Fields.DATE: '12/30/2018',
Fields.DOB: datetime(1980, 2, 9),
Fields.DATE: datetime(2018, 12, 30),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Corbin',
Fields.GENDER: 'male',
@@ -106,15 +107,15 @@ def news_page(scope='session'):
'traffic-fatality-72-1': {
**parse_twitter_fields_scenarios['traffic-fatality-72-1'],
Fields.AGE: 22,
Fields.DOB: '03/29/1996',
Fields.DOB: datetime(1996, 3, 29),
Fields.ETHNICITY: 'White',
Fields.FIRST_NAME: 'Elijah',
Fields.GENDER: 'male',
Fields.LAST_NAME: 'Perales',
},
'traffic-fatality-71-2': {
**parse_twitter_fields_scenarios['traffic-fatality-71-2'],
Fields.DOB: '06/01/1964',
Fields.DOB: datetime(1964, 6, 1),
Fields.FIRST_NAME: 'Barkat',
Fields.LAST_NAME: 'Umatia',
Fields.ETHNICITY: 'Other',
@@ -159,10 +160,10 @@ def test_parse_twitter_title_00(input_, expected):
mock_data.twitter_description_00,
{
'Case': '18-3640187',
'Date': '12/30/2018',
'Date': datetime(2018, 12, 30),
'Time': '2:24 a.m.',
'Location': '1400 E. Highway 71 eastbound',
'DOB': '02/09/1980',
'DOB': datetime(1980, 2, 9),
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was '
'traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. '
'The truck went across the E. Highway 71 and US Highway 183 ramp, rolled '
@@ -197,8 +198,8 @@ def test_parse_twitter_description_02():
expected = {
'Age': 57,
'Case': '18-160882',
'DOB': '01/22/1961',
'Date': '01/16/2018',
'DOB': datetime(1961, 1, 22),
'Date': datetime(2018, 1, 16),
'Location': '1500 W. Slaughter Lane',
'Time': '5:14 p.m.',
}
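The fixture changes above follow from the parsing layer now returning datetime objects rather than the raw strings; a simplified sketch of that conversion using dateparser (already imported by date_utils), not the project's exact parse_date signature:

import dateparser

def parse_date_sketch(date_string):
    # Simplified: the real parse_date also takes default/settings arguments.
    parsed = dateparser.parse(date_string)
    if parsed is None:
        raise ValueError(f'Cannot parse date: {date_string}')
    return parsed

print(parse_date_sketch('12/30/2018'))  # datetime.datetime(2018, 12, 30, 0, 0)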
2 changes: 1 addition & 1 deletion tests/core/test_date_utils.py
@@ -11,7 +11,7 @@
])
def test_is_in_range_00(current, from_, to, expected):
"""Ensure a date is in range."""
assert date_utils.is_in_range(current, from_, to) == expected
assert (date_utils.from_date(from_) <= date_utils.parse_date(current) <= date_utils.to_date(to)) == expected


@pytest.mark.parametrize('date, default, settings, expected', [
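With is_in_range gone, the test now spells out the comparison inline. Below is a hypothetical simplification of what that assertion checks, assuming from_date and to_date fall back to the widest possible bounds when given None (in the project these helpers also accept raw strings):

from datetime import datetime

def from_date(from_=None):
    # Assumed fallback: no lower bound means the earliest representable datetime.
    return datetime.min if from_ is None else from_

def to_date(to=None):
    # Assumed fallback: no upper bound means the latest representable datetime.
    return datetime.max if to is None else to

def in_range(current, from_=None, to=None):
    # Inline equivalent of the removed is_in_range helper.
    return from_date(from_) <= current <= to_date(to)

print(in_range(datetime(2019, 1, 16), from_=datetime(2019, 1, 1)))  # True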