Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse Age without Date of Birth #125

Merged
merged 29 commits into from
May 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ad536b6
use first item of deceased_field for First Name
mscarey Apr 17, 2019
65c9e87
make missing name data throw IndexError
mscarey Apr 18, 2019
9fb41cc
add name with punctuation to parsing test
mscarey Apr 20, 2019
3698f8f
move split method within deceased field parser
mscarey Apr 23, 2019
7ffadbb
split deceased field on some slashes
mscarey Apr 23, 2019
510032e
Merge https://github.com/scrapd/scrapd
mscarey Apr 26, 2019
7f40cde
Merge remote-tracking branch 'origin'
mscarey May 15, 2019
4fa089c
change parsing tests to expect datetimes
mscarey May 16, 2019
304879e
change date string calculations to datetime
mscarey May 16, 2019
097aaf7
update more tests to expect datetime objects
mscarey May 16, 2019
9b7dc35
add function to parse deceased field without DOB
mscarey May 16, 2019
8ba1ae7
run yapf format
mscarey May 16, 2019
a2a1ec8
change datetime objects to date objects
mscarey May 20, 2019
efffc77
add tests for formatters
mscarey May 20, 2019
d61d162
import date_utils functions by name
mscarey May 20, 2019
cabdf73
add date serializing to formatters
mscarey May 20, 2019
fabe8cd
run yapf format
mscarey May 20, 2019
c400595
remove ValueError for failed age parsing.
mscarey May 20, 2019
306b6fd
slightly delay conversion of datetime to date
mscarey May 20, 2019
9ea122c
add line breaks for formatting tests
mscarey May 22, 2019
394fbed
rephrase first lines of docstrings for pydocstyle
mscarey May 22, 2019
6fbd7a8
use strftime, not isoformat, for date format
mscarey May 22, 2019
848debf
add yapf line break
mscarey May 22, 2019
b10aeaa
delete duplicate test
mscarey May 22, 2019
6b0cfbf
change CSV formatter to month-first date format
mscarey May 22, 2019
a27a9e4
change to module-level import of date_utils
mscarey May 22, 2019
ec32be9
change DOB to datetime.date earlier
mscarey May 23, 2019
730c83b
add yapf line breaks
mscarey May 23, 2019
ccd34d7
Merge branch 'master' of https://github.com/scrapd/scrapd into age
mscarey May 23, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 64 additions & 32 deletions scrapd/core/apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,22 +172,9 @@ def parse_twitter_description(twitter_description):
# Handle special case where Date of birth is a token `DOB:`.
tmp_dob = d.get(Fields.DOB)
if tmp_dob and isinstance(tmp_dob, list):
d[Fields.DOB] = tmp_dob[0]
d[Fields.DOB] = date_utils.parse_date(tmp_dob[0])

# Parse the Deceased field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(' '.join(d.get(Fields.DECEASED))))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No decease information to parse in Twitter description.')

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(' '.join(d.get(Fields.DATE)), d.get(Fields.DOB))

return sanitize_fatality_entity(d)
return common_fatality_parsing(d)


def parse_details_page_notes(details_page_notes):
Expand Down Expand Up @@ -248,29 +235,53 @@ def parse_details_page_notes(details_page_notes):
return final


def sanitize_fatality_entity(d):
def common_fatality_parsing(d):
"""
Clean up a fatality entity.
Perform parsing common to Twitter descriptions and page content.

Ensures that the values are all strings and removes the 'Deceased' field which does not contain
relevant information anymore.

:param dict d: the fatality to sanitize
:param dict d: the fatality to finish parsing
:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""

# All values must be strings.
for k, v in d.items():
if isinstance(v, list):
d[k] = ' '.join(v)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part should go to the sanitize_fatality_entity() function. The goal of the sanitizing function was to ensure that the final values had the right format, or that unnecessary or invalid values (like None or empty ones) were discarded.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently that part needs to happen before the deceased field is parsed, while sanitize_fatality_entity() has to happen afterwards.

if d.get('Date'):
d['Date'] = date_utils.clean_date_string(d['Date'])
# Extracting other fields from 'Deceased' field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED)))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No deceased information to parse in fatality page.')

# Parse the `Date` field.
if d.get(Fields.DATE):
d[Fields.DATE] = date_utils.parse_date(d[Fields.DATE])

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return sanitize_fatality_entity(d)


if d.get('DOB'):
d['DOB'] = date_utils.clean_date_string(d['DOB'], True)
def sanitize_fatality_entity(d):
"""
Clean up a fatality entity.

Removes the 'Deceased' field which does not contain relevant information anymore.

:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""

# The 'Deceased' field is unnecessary.
if d.get('Deceased'):
del d['Deceased']

Expand Down Expand Up @@ -345,14 +356,37 @@ def parse_deceased_field(deceased_field):
except Exception:
pass

# Try to parse the deceased fields assuming it contains an age.
try:
return parse_age_deceased_field(deceased_field)
except Exception:
pass

raise ValueError(f'Cannot parse {Fields.DECEASED}: {deceased_field}')


def parse_age_deceased_field(deceased_field):
    """
    Parse a deceased field that states an age instead of a date of birth.

    The age is the number directly preceding the word ``years``. The text located before
    that number is split on whitespace and handed to `parse_fleg`; the age is then added
    to the dictionary `parse_fleg` returns.

    :param str deceased_field: the deceased field
    :return: a dictionary representing the deceased field.
    :rtype: dict
    :raises AttributeError: if the field contains no ``<number> years`` expression.
    """
    age_match = re.search(r'([0-9]+) years', deceased_field)

    # Without a match `age_match` is None, so this lookup raises AttributeError.
    age_text = age_match.group(1)

    # Only the text ahead of the first age expression is passed on for parsing.
    leading_text = deceased_field[:age_match.start()]
    parsed = parse_fleg(leading_text.split())
    parsed[Fields.AGE] = int(age_text)
    return parsed


def parse_comma_delimited_deceased_field(deceased_field):
"""
Parse deceased fields seperated with commas.

:param list split_deceased_field: a list representing the deceased field
:param str deceased_field: a list representing the deceased field
:return: a dictionary representing the deceased field.
:rtype: dict
"""
Expand Down Expand Up @@ -415,7 +449,8 @@ def parse_deceased_field_common(split_deceased_field, fleg):

# Extract and clean up DOB.
raw_dob = split_deceased_field[-1].strip()
d[Fields.DOB] = date_utils.clean_date_string(raw_dob, True)
dob_guess = date_utils.parse_date(raw_dob)
d[Fields.DOB] = date_utils.check_dob(dob_guess)

return d

Expand Down Expand Up @@ -498,11 +533,7 @@ def parse_page_content(detail_page, notes_parsed=False):
text_chunk = match.string[match.start(0):match.end(0)]
d[Fields.NOTES] = parse_details_page_notes(text_chunk)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
d[Fields.AGE] = date_utils.compute_age(d.get(Fields.DATE), d.get(Fields.DOB))

return sanitize_fatality_entity(d)
return common_fatality_parsing(d)


def parse_case_field(page):
Expand Down Expand Up @@ -713,15 +744,16 @@ async def async_retrieve(pages=-1, from_=None, to=None):
# If the page contains fatalities, ensure all of them happened within the specified time range.
if page_res:
entries_in_time_range = [
entry for entry in page_res if date_utils.is_between(entry[Fields.DATE], from_, to)
entry for entry in page_res
if date_utils.from_date(from_) <= entry[Fields.DATE] <= date_utils.to_date(to)
]

# If 2 pages in a row:
# 1) contain results
# 2) but none of them contain dates within the time range
# 3) and we did not collect any valid entries
# Then we can stop the operation.
if from_ and all([date_utils.is_before(entry[Fields.DATE], from_)
if from_ and all([entry[Fields.DATE] < date_utils.from_date(from_)
for entry in page_res]) and not has_entries:
no_date_within_range_count += 1
if no_date_within_range_count > 1:
Expand Down
37 changes: 18 additions & 19 deletions scrapd/core/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ def check_dob(dob):
"""
In case that a date only contains 2 digits, determine century.

:param datetime.datetime dob: DOB
:param datetime.date dob: DOB
:return: DOB with 19xx or 20xx as appropriate
:rtype: datetime.datetime
:rtype: datetime.date
"""

now = datetime.datetime.now()
now = datetime.date.today()
if dob.year > now.year:
dob = datetime.datetime(dob.year - 100, dob.month, dob.day)
dob = datetime.date(dob.year - 100, dob.month, dob.day)
return dob


Expand All @@ -44,37 +44,37 @@ def clean_date_string(date, is_dob=False):
dt = parse_date(date)
if is_dob:
dt = check_dob(dt)
return datetime.datetime.strftime(dt, "%m/%d/%Y")
return datetime.date.strftime(dt, "%m/%d/%Y")


def from_date(date):
"""
Parse the date from a human readable format, with options for the from date.

* If the date cannot be parsed, `datetime.datetime.min` is returned.
* If the date cannot be parsed, `datetime.date.min` is returned.
* If the day of the month is not specified, the first day is used.

:param str date: date
:return: a date object representing the date.
:rtype: datetime.datetime
:rtype: datetime.date
"""

return parse_date(date, datetime.datetime.min, settings={'PREFER_DAY_OF_MONTH': 'first'})
return parse_date(date, datetime.date.min, settings={'PREFER_DAY_OF_MONTH': 'first'})


def to_date(date):
"""
Parse the date from a human readable format, with options for the to date.

* If the date cannot be parsed, `datetime.datetime.max` is returned.
* If the date cannot be parsed, `datetime.date.max` is returned.
* If the day of the month is not specified, the last day is used.

:param str date: date
:return: a date object representing the date.
:rtype: datetime.datetime
:rtype: datetime.date
"""

return parse_date(date, datetime.datetime.max, settings={'PREFER_DAY_OF_MONTH': 'last'})
return parse_date(date, datetime.date.max, settings={'PREFER_DAY_OF_MONTH': 'last'})


def parse_date(date, default=None, settings=None):
Expand All @@ -85,17 +85,17 @@ def parse_date(date, default=None, settings=None):
returned.

:param str date: date
:param datetime default: default value in case the date cannot be parsed.
:param datetime.date default: default value in case the date cannot be parsed.
:param dict settings: a dictionary containing the parsing options. All the available options are defined here:
https://dateparser.readthedocs.io/en/latest/dateparser.html#dateparser.conf.Settings.
:return: a date object representing the date.
:rtype: datetime.datetime
:rtype: datetime.date
"""

try:
d = dateparser.parse(date, settings=settings)
if d:
return d
return d.date()
raise ValueError(f'Cannot parse date: {date}')
except Exception:
if default:
Expand All @@ -107,7 +107,7 @@ def is_between(date, from_=None, to=None):
"""
Check whether a date is comprised between 2 others.

:param str date: date to vheck
:param str date: date to check
:param str from_: start date, defaults to None
:param str to: end date, defaults to None
:return: `True` if the date is between `from_` and `to`
Expand All @@ -124,13 +124,12 @@ def compute_age(date, dob):
"""
Compute a victim's age.

:param str date: crash date
:param str dob: date of birth
:param datetime.date date: crash date
:param datetime.date dob: date of birth
:return: the victim's age.
:rtype: int
"""
DAYS_IN_YEAR = 365
dob_ = parse_date(dob)

# Compute the age.
return (parse_date(date) - check_dob(dob_)).days // DAYS_IN_YEAR
return (date - dob).days // DAYS_IN_YEAR
27 changes: 26 additions & 1 deletion scrapd/core/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,17 @@ def print(self, results, **kwargs):
formatter = self._get_formatter()
formatter.printer(results, **kwargs)

def date_serialize(self, obj):
    """
    Render a date or datetime object as a month-first string for serialization.

    :param obj: the object to serialize
    :return: the date formatted as ``MM/DD/YYYY``
    :rtype: str
    :raises TypeError: if `obj` is neither a date nor a datetime.
    """

    # `datetime.datetime` is a subclass of `datetime.date`, so one check covers both.
    if not isinstance(obj, datetime.date):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.strftime("%m/%d/%Y")

# pylint: disable=unused-argument
def printer(self, results, **kwargs):
"""
Expand All @@ -72,6 +83,17 @@ def printer(self, results, **kwargs):
"""
print(results, file=self.output)

def to_json_string(self, results):
    """
    Serialize the parsed fields to an indented JSON string with sorted keys.

    Values the JSON encoder cannot handle natively are passed to `date_serialize`.

    :param results: results of scraping APD news site
    :return: the JSON representation of `results`
    :rtype: str
    """

    dump_options = {
        'sort_keys': True,
        'indent': 2,
        'default': self.date_serialize,
    }
    return json.dumps(results, **dump_options)


class PythonFormatter(Formatter):
"""
Expand All @@ -97,7 +119,8 @@ class JSONFormatter(Formatter):
__format_name__ = 'json'

def printer(self, results, **kwargs): # noqa: D102
print(json.dumps(results, sort_keys=True, indent=2), file=self.output)
json_string = self.to_json_string(results)
print(json_string, file=self.output)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to create an extra function here. to_json_string() is not really necessary; you can simply put:

print(json.dumps(results, sort_keys=True, indent=2, default=self.date_serialize), file=self.output)

Copy link
Contributor Author

@mscarey mscarey May 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted that line to be in a method with a return value for easier testing. I made more changes for the CSV formatter.



class CSVFormatter(Formatter):
Expand All @@ -110,6 +133,8 @@ class CSVFormatter(Formatter):
__format_name__ = 'csv'

def printer(self, results, **kwargs): # noqa: D102
results = self.to_json_string(results)
results = json.loads(results)
writer = csv.DictWriter(self.output, fieldnames=CSVFIELDS, extrasaction='ignore')
writer.writeheader()
writer.writerows(results)
Expand Down
Loading