Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Parse notes from detail page #60

Merged
merged 19 commits into from
Mar 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 56 additions & 6 deletions scrapd/core/apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,55 @@ def parse_twitter_description(twitter_description):
return sanitize_fatality_entity(d)


def parse_details_page_notes(details_page_notes):
    """
    Clean up a details page notes section.

    The purpose of this function is to attempt to extract the sentences about
    the crash with some level of fidelity, but does not always return
    a perfectly parsed sentence as the HTML syntax varies widely.

    :param str details_page_notes: the HTML chunk following the Deceased information
    :return: A paragraph containing the details of the fatality in sentence form.
    :rtype: str
    """
    # Named tag constants replace the magic string/offset pairs used when
    # slicing the HTML snippet.
    P_TAG = '<p>'
    P_END_TAG = '</p>'
    # NOTE(review): the scraped pages apparently contain this malformed break
    # tag (backslash, not slash) — confirm against the APD site markup.
    BR_TAG = r'<br \>'

    # Ideally the Notes will be contained in a paragraph tag.
    start_tag = details_page_notes.find(P_TAG) + len(P_TAG)
    end_tag = details_page_notes.find(P_END_TAG, start_tag)

    # `.upper().isupper()` is True only when the substring contains at least
    # one letter. If it contains none, the Notes may be located after a break
    # tag instead of inside the paragraph.
    if not details_page_notes[start_tag:end_tag].upper().isupper():
        start_tag = details_page_notes.find(BR_TAG) + len(BR_TAG)

    snippet = details_page_notes[start_tag:end_tag]

    # Update the snippet if the following tag is an image: skip past the
    # break tag that follows it.
    if snippet.startswith('<img'):
        snippet = details_page_notes[details_page_notes.find(BR_TAG) + len(BR_TAG):end_tag]

    # Remove the end of line characters.
    squished = snippet.replace('\n', ' ')

    # Look for the first capital letter and start from there; fall back to the
    # start of the string when no capital letter is present.
    first_cap = next((index for index, c in enumerate(squished) if c.isupper()), 0)

    return squished[first_cap:]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a bunch of magic numbers in this function, they could be replaced with something more readable.

An idea would be to define constants and then use their length with len().

BR_TAG = r'<br \>'
start_tag = details_description.find(BR_TAG) + len(BR_TAG)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea


def sanitize_fatality_entity(d):
"""
Clean up a fatality entity.

Ensures that the values are all strings and removes the 'Deceased' field which does not contain
relevant information anymore.

:param dict d: the fatality to sanatize
:param dict d: the fatality to sanitize
:return: A dictionary containing the details information about the fatality with sanitized entries.
:rtype: dict
"""
Expand Down Expand Up @@ -255,7 +296,7 @@ def parse_deceased_field(deceased_field):
return d


def parse_page_content(detail_page):
def parse_page_content(detail_page, notes_parsed=False):
"""
Parse the detail page to extract fatality information.

Expand All @@ -277,15 +318,21 @@ def parse_page_content(detail_page):
match = re.search(search[1], normalized_detail_page)
if match:
d[search[0]] = match.groups()[0]

# Parse the Deceased field.
if d.get(Fields.DECEASED):
try:
d.update(parse_deceased_field(d.get(Fields.DECEASED).split()))
except ValueError as e:
logger.trace(e)
else:
logger.trace('No decease information to parse in fatality page.')
logger.trace('No deceased information to parse in fatality page.')

# Fill in Notes from Details page if not in twitter description.
search_notes = re.compile(r'>Deceased:.*\s{2,}(.|\n)*?<\/p>(.|\n)*?<\/p>')
match = re.search(search_notes, normalized_detail_page)
if match and not notes_parsed:
text_chunk = match.string[match.start(0):match.end(0)]
d[Fields.NOTES] = parse_details_page_notes(text_chunk)

# Compute the victim's age.
if d.get(Fields.DATE) and d.get(Fields.DOB):
Expand Down Expand Up @@ -327,10 +374,13 @@ def parse_page(page):
"""
# Parse the page.
twitter_d = parse_twitter_fields(page)
page_d = parse_page_content(page)
if twitter_d.get(Fields.NOTES):
page_d = parse_page_content(page, True)
else:
page_d = parse_page_content(page, False)

# Merge the results, from right to left.
# (i.e. the rightmost object will overiide the object just before it, etc.)
# (i.e. the rightmost object will override the object just before it, etc.)
d = {**page_d, **twitter_d}
return d

Expand Down
22 changes: 19 additions & 3 deletions tests/core/test_apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,10 @@ def test_parse_twitter_description_00():
'Time': '2:24 a.m.',
'Location': '1400 E. Highway 71 eastbound',
'DOB': '02/09/1980',
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.',
'Notes': 'The preliminary investigation shows that a 2003 Ford F150 was '
'traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. '
'The truck went across the E. Highway 71 and US Highway 183 ramp, rolled '
'and came to a stop north of the roadway.',
'Gender': 'male',
'Ethnicity': 'White',
'Last Name': 'Sabillon-Garcia',
Expand Down Expand Up @@ -183,6 +186,13 @@ def test_parse_twitter_description_03():
assert actual == expected


def test_parse_details_page_notes_01():
    """Ensure a malformed entry is not parsed.

    A tag-only string contains no sentence text, so the notes parser must
    return an empty result.
    """
    # Bug fix: the test previously called apd.parse_twitter_description, so
    # the function named by this test was never exercised.
    actual = apd.parse_details_page_notes(mock_data.details_page_notes_01)
    expected = ''
    assert actual == expected


def test_extract_traffic_fatalities_page_details_link_00(news_page):
"""Ensure page detail links are extracted from news page."""
actual = apd.extract_traffic_fatalities_page_details_link(news_page)
Expand Down Expand Up @@ -236,10 +246,13 @@ def test_has_next_01():

@pytest.mark.parametrize('filename,expected', list(parse_page_content_scenarios.items()))
def test_parse_page_content_00(filename, expected):
    """Ensure information are properly extracted from the content detail page.

    The Notes field is ignored when it was parsed from the details page and
    the scenario does not define an expected value for it.
    """
    page = (TEST_DATA_DIR / filename).read_text()
    actual = apd.parse_page_content(page)
    # Drop a details-page-derived Notes entry that the scenario does not cover.
    if 'Notes' not in expected:
        actual.pop('Notes', None)
    assert actual == expected


Expand All @@ -254,10 +267,13 @@ def test_parse_twitter_fields_00(filename, expected):

@pytest.mark.parametrize('filename,expected', list(parse_page_scenarios.items()))
def test_parse_page_00(filename, expected):
    """Ensure information are properly extracted from the page.

    The Notes field is ignored when it was parsed from the details page and
    the scenario does not define an expected value for it.
    """
    page = (TEST_DATA_DIR / filename).read_text()
    actual = apd.parse_page(page)
    # Drop a details-page-derived Notes entry that the scenario does not cover.
    if 'Notes' not in expected:
        actual.pop('Notes', None)
    assert actual == expected


Expand Down
7 changes: 4 additions & 3 deletions tests/mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
twitter_title_00 = "Traffic Fatality #73"
twitter_description_00 = """
Case: 18-3640187 Date: December 30, 2018 Time: 2:24 a.m. Location: 1400 E. Highway
71 eastbound Deceased: Corbin Sabillon-Garcia, White male, DOB 02/09/80 The preliminary investigation shows that a
2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck
went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.
71 eastbound Deceased: Corbin Sabillon-Garcia, White male, DOB 02/09/80 The preliminary investigation shows that a
2003 Ford F150 was traveling northbound on the US Highway 183 northbound ramp to E. Highway 71, eastbound. The truck
went across the E. Highway 71 and US Highway 183 ramp, rolled and came to a stop north of the roadway.
"""
twitter_description_01 = "Case: 19-0161105"
twitter_description_02 = "Case: 18-160882 Date: Tuesday, January 16, 2018 Time: 5:14 p.m. Location: 1500 W. Slaughter Lane Deceased: Eva Marie Gonzales, W/F, DOB: 01-22-1961 (passenger)"
twitter_description_03 = "APD is asking any businesses in the area of East Cesar Chavez and Adam L. Chapa Sr. streets to check their surveillance cameras between 2 and 2:10 a.m. on Oct. 10, 2018, for this suspect vehicle. See video of suspect vehicle here --&gt; https://youtu.be/ezxaRW79PnI"
details_page_notes_01 = "<><>"
3 changes: 2 additions & 1 deletion tests/step_defs/test_retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def time_range(from_date, to_date):
@then('the generated file must contain <entry_count> entries')
@pytest.mark.asyncio
def ensure_results(mocker, event_loop, output_format, time_range, entry_count):
    """Ensure we get the right amount of entries.

    BDD step: retrieves the entries for the scenario's time range and checks
    only the entry count — the parsed Notes are not compared to the entries.
    """
    # Run the async retrieval to completion on the test event loop.
    # NOTE(review): pages=-1 presumably means "no page limit" — confirm
    # against apd.async_retrieve.
    result, _ = event_loop.run_until_complete(apd.async_retrieve(pages=-1, **time_range))
    assert result is not None
    assert len(result) == entry_count