Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Commit

Permalink
Fix unidentified deceased
Browse files Browse the repository at this point in the history
Drive-by:
* Adds tasks to dump the errors and the data set in JSON and CSV.

Fixes #201
  • Loading branch information
rgreinho committed Oct 22, 2019
1 parent a348a62 commit 67284e1
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 96 deletions.
4 changes: 2 additions & 2 deletions .github/CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ least one parsing error either in the twitter fields or the article itself. The
Workflow
^^^^^^^^

Start by running `scrapd` at the root of this project::
Run the `dump-json` task from the root of the project::

scrapd -vvv --dump 1>.dump/dump.json 2>.dump/dump.json.log
inv dump-json

In addition to the dumps, this will also create 2 files to help you debug:

Expand Down
9 changes: 7 additions & 2 deletions scrapd/core/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,16 @@ def parse_deceased_field(soup):
for deceased_tag in deceased_tags:
deceased_field = parse_deceased_tag(deceased_tag)
try:
fatality, err = deceased.process_deceased_field(deceased_field)
fatality = []
err = []
for processed_deceased in deceased.process_deceased_field(deceased_field):
f, e = processed_deceased
fatality.append(f)
err += e
except ValueError as e: # pragma: no cover
errors.append(str(e))
else:
fatalities.append(fatality)
fatalities.extend(fatality)
errors.extend(err)

return fatalities, errors
Expand Down
42 changes: 40 additions & 2 deletions scrapd/core/deceased.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,17 +218,24 @@ def process_deceased_field(deceased_field):
parse_pipe_delimited_deceased_field,
parse_space_delimited_deceased_field,
parse_age_deceased_field,
parse_unidentified,
]

# Execute the parsing methods in order.
for m in parse_methods:
try:
d = m(deceased_field)
return to_fatality(d)
if isinstance(d, dict):
return [to_fatality(d)]
if isinstance(d, list):
return [to_fatality(entry) for entry in d]
except (ValueError, IndexError):
pass

raise ValueError(f'cannot parse {Fields.DECEASED}: {deceased_field}')
# This line is never hit, because `parse_unidentified()` acts as a catch all method,
# and turns unparsable deceased fields into unidentified fatalities.
# This line is only here for consistency (all the paths return a list).
return [] # pragma: no cover


def parse_age_deceased_field(deceased_field):
Expand Down Expand Up @@ -303,6 +310,37 @@ def parse_space_delimited_deceased_field(deceased_field):
return parse_deceased_field_common(split_deceased_field, fleg)


def parse_unidentified(deceased_field):
    """
    Parse deceased field with unidentified victims.

    Every occurrence of ``Unidentified[,] [<ethnicity> ]<gender>`` found in the field
    produces one entry. Matching is case-sensitive: the keyword must be spelled
    ``Unidentified`` and the gender must be the lowercase ``female`` or ``male``.

    :param str deceased_field: the deceased field as a string.
    :return: a list of dictionaries representing the deceased field, one per match;
        the list is empty when the field contains no match.
    :rtype: list of dicts
    """
    unidentified_deceased_pattern = re.compile(
        r'''
        (
            Unidentified                # The "Unidentified" keyword
            ,?                          # Potentially a comma
            \s                          # A whitespace
            (?P<ethnicity>[^\s]+\s)?    # The ethnicity
            (?P<gender>female|male)     # The gender
        )
        ''',
        re.VERBOSE,
    )

    def normalize(value):
        # An optional group that did not participate in the match is `None`;
        # report it as 'Undefined' instead.
        return (value or 'Undefined').strip().capitalize()

    return [
        {
            Fields.GENDER: normalize(match.group('gender')),
            Fields.ETHNICITY: normalize(match.group('ethnicity')),
        }
        for match in unidentified_deceased_pattern.finditer(deceased_field)
    ]


def parse_deceased_field_common(split_deceased_field, fleg):
"""
Parse the deceased field.
Expand Down
7 changes: 1 addition & 6 deletions scrapd/core/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,7 @@ def normalize_tokens(d):

# Process each fatality.
for fatality in tmp_fatalities:
try:
f, errors = deceased.process_deceased_field(fatality)
except ValueError as e:
err.append(str(e))
continue
else:
for f, errors in deceased.process_deceased_field(fatality):
d.setdefault('fatalities', []).append(f)
err.extend(errors)

Expand Down
15 changes: 15 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from nox.virtualenv import VirtualEnv

# Configuration values.
DUMP_DIR = '.dump'
VENV = 'venv'
project_name = 'scrapd'
docker_org = 'scrapd'
Expand Down Expand Up @@ -38,6 +39,20 @@ def clean_repo(c):
c.run('git reset --hard')


@task
def dump_json(c):
    """
    Dump errors and create JSON data set.

    Creates `DUMP_DIR` if needed, then runs `scrapd` with its standard output redirected to
    `dump.json` and its standard error to `dump.json.log`, both inside `DUMP_DIR`.

    :param c: the task context; its `run` method is used to execute the commands.
    """
    c.run(f'mkdir -p {DUMP_DIR}')
    # Build the output paths from DUMP_DIR so they always land in the directory created above.
    c.run(f'scrapd -vvv --dump 1>{DUMP_DIR}/dump.json 2>{DUMP_DIR}/dump.json.log')


@task
def dump_csv(c):
    """
    Dump errors and create CSV data set.

    Creates `DUMP_DIR` if needed, then runs `scrapd` in CSV format with its standard output
    redirected to `dump.csv` and its standard error to `dump.csv.log`, both inside `DUMP_DIR`.

    :param c: the task context; its `run` method is used to execute the commands.
    """
    c.run(f'mkdir -p {DUMP_DIR}')
    # Build the output paths from DUMP_DIR so they always land in the directory created above.
    c.run(f'scrapd -vvv --dump --format csv 1>{DUMP_DIR}/dump.csv 2>{DUMP_DIR}/dump.csv.log')


@task
def flame_graph(c):
"""Create an interactive CPU flame graph."""
Expand Down
Loading

0 comments on commit 67284e1

Please sign in to comment.