scrapd · mergify · Jan 11, 2020 · Jan 10, 2020
diff --git a/noxfile.py b/noxfile.py
@@ -165,7 +165,7 @@ def run_pytest_units(session):
 
 
 def run_pytest_integrations(session):
-    run_pytest(session, '-m', 'integrations', '--reruns', '3', '--reruns-delay', '5', '-r', 'R')
+    run_pytest(session, '-m', 'integrations and not dump', '--reruns', '3', '--reruns-delay', '5', '-r', 'R')
 
 
 def run_sphinx(session):

diff --git a/scrapd/core/apd.py b/scrapd/core/apd.py
@@ -36,6 +36,7 @@ async def fetch_text(session, url, params=None):
         params = {}
     try:
         async with session.get(url, params=params) as response:
+            logger.debug(response.url)
             return await response.text()
     except (
             aiohttp.ClientError,
@@ -82,10 +83,36 @@ def extract_traffic_fatalities_page_details_link(news_page):
     :return: a list of links.
     :rtype: list or `None`
     """
-    PATTERN = r'<a href="(/news/traffic-fatality-\d{1,3}-\d|\S*)">(Traffic Fatality #(\d{1,3})).*\s*</a>'
-    regex = re.compile(PATTERN)
+    regex = re.compile(
+        r'''
+        <a\shref="
+        (?:
+            (?:
+                (/news/traffic-fatality-\d{1,3}-\d|\S*)
+                ">
+                (Traffic\sFatality\s\#(\d{1,3}))
+            )
+            |
+            (?:
+                (/news/fatality-crash-\d{1,3}-\d)
+                ">
+                (Fatality\sCrash\s\#(\d{1,3}))
+            )
+        )
+        .*\s*
+        </a>
+        ''',
+        re.VERBOSE | re.MULTILINE,
+    )
-    matches = regex.findall(news_page, re.MULTILINE)
-    return matches
+    # The flags are already compiled into the pattern. The second positional
+    # argument of Pattern.findall() is `pos`, so passing re.MULTILINE (== 8)
+    # there would silently skip the first 8 characters of the page.
+    matches = regex.findall(news_page)
+    compact_matches = []
+    for match in matches:
+        parts = tuple(part for part in match if part != '')
+        compact_matches.append(parts)
+    return compact_matches
 
 
 def generate_detail_page_urls(titles):

diff --git a/scrapd/core/model.py b/scrapd/core/model.py
@@ -1,6 +1,7 @@
 """Define the ScrAPD models."""
 import datetime
 from enum import Enum
+import re
 from typing import List
 
 from pydantic import BaseModel
@@ -130,7 +131,8 @@ def update(self, other, strict=False):
     @validator('case')
     def valid_case_number(cls, v):  # pylint: disable=no-self-argument
         """Ensure a case number is valid."""
-        pattern = r"(\d{2}-\d{3,7})"
+        pattern = re.compile(r"(\d{2}-\d{3,7})")
         if not regex.match_pattern(v, pattern):
-            raise ValueError('invalid format: "{v}"')
+            # f-string so the rejected value is actually interpolated into the message.
+            raise ValueError(f'invalid format: "{v}"')
         return v

diff --git a/scrapd/core/regex.py b/scrapd/core/regex.py
@@ -32,12 +32,12 @@ def match_pattern(text, pattern, group_number=0):
     Match a pattern.
 
     :param str text: the text to match the pattern against
-    :param compiled regex pattern: the pattern to look for
+    :param compiled regex pattern: the regex to look for
     :param int group_number: the capturing group number
     :return: a string representing the captured group.
     :rtype: str
     """
-    match = re.search(pattern, text)
+    match = pattern.search(text)
     return match.groups()[group_number] if match else ''
 
 
@@ -94,8 +94,23 @@ def match_crash_field(page):
-    :return: a string representing the crash number.
-    :rtype: str
+    :return: a string representing the crash number, or `None` if no crash title is found.
+    :rtype: str or `None`
     """
-    crashes_pattern = re.compile(r'Traffic Fatality #(\d{1,3})')
-    return match_pattern(page, crashes_pattern)
+    crashes_pattern = re.compile(
+        r'''
+        (?:
+        (?:Traffic\sFatality\s\#(\d{1,3}))
+        |
+        (?:Fatality\sCrash\s\#(\d{1,3}))
+        )
+        ''',
+        re.VERBOSE,
+    )
+    matches = crashes_pattern.search(page)
+    if not matches:
+        return None
+
+    # Only one alternative can match, so exactly one group is non-empty.
+    non_empty_match = [match for match in matches.groups() if match]
+    return non_empty_match[0]
 
 
 def match_date_field(page):

diff --git a/tests/core/test_regex.py b/tests/core/test_regex.py
@@ -23,7 +23,12 @@ def test_parse_case_field_00(input_, expected):
 
 @pytest.mark.parametrize(
     'input_, expected',
-    (('<span property="dc:title" content="Traffic Fatality #12" class="rdf-meta element-hidden"></span>', '12'), ))
+    (
+        ('<span property="dc:title" content="Traffic Fatality #12" class="rdf-meta element-hidden"></span>', '12'),
+        ('<title>Fatality Crash #1 | AustinTexas.gov - The Official Website of the City of Austin</title>', '1'),
+        ('There is no title here', None),
+    ),
+)
 def test_parse_crashes_field_00(input_, expected):
     """Ensure the crashes field gets parsed correctly."""
     actual = regex.match_crash_field(input_)

diff --git a/tests/features/retrieve.feature b/tests/features/retrieve.feature
@@ -11,5 +11,5 @@ Feature: Retrieve
     Examples:
       | format | from_date   | to_date     | crash_count | fatality_count |
       | csv    | Jan 15 2019 | Jan 18 2019 | 2           | 2              |
-      | json   | Jan 2018    | Dec 2018    | 72          | 72             |
-      | json   | Jan 15 2018 | Jan 18 2018 | 1           | 1              |
+      | json   | Jan 2019    | Dec 2019    | 86          | 87             |
+      | json   | Jan 15 2019 | Jan 18 2019 | 2           | 2              |