Skip to content
This repository has been archived by the owner on Feb 2, 2022. It is now read-only.

Handle new page title #230

Merged
merged 1 commit into from
Jan 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def run_pytest_units(session):


def run_pytest_integrations(session):
    """
    Run the integration tests, excluding the ones also marked `dump`.

    The marker expression `integrations and not dump` replaces the former plain `integrations`
    selection. Failing tests are rerun up to 3 times with a 5 second delay between attempts, and
    `-r R` asks pytest to list the rerun tests in its summary.

    :param session: the session object, forwarded untouched to `run_pytest`.
    """
    run_pytest(session, '-m', 'integrations and not dump', '--reruns', '3', '--reruns-delay', '5', '-r', 'R')


def run_sphinx(session):
Expand Down
30 changes: 27 additions & 3 deletions scrapd/core/apd.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ async def fetch_text(session, url, params=None):
params = {}
try:
async with session.get(url, params=params) as response:
logger.debug(response.url)
return await response.text()
except (
aiohttp.ClientError,
Expand Down Expand Up @@ -82,10 +83,33 @@ def extract_traffic_fatalities_page_details_link(news_page):
:return: a list of links.
:rtype: list or `None`
"""
PATTERN = r'<a href="(/news/traffic-fatality-\d{1,3}-\d|\S*)">(Traffic Fatality #(\d{1,3})).*\s*</a>'
regex = re.compile(PATTERN)
regex = re.compile(
r'''
<a\shref="
(?:
(?:
(/news/traffic-fatality-\d{1,3}-\d|\S*)
">
(Traffic\sFatality\s\#(\d{1,3}))
)
|
(?:
(/news/fatality-crash-\d{1,3}-\d)
">
(Fatality\sCrash\s\#(\d{1,3}))
)
)
.*\s*
</a>
''',
re.VERBOSE | re.MULTILINE,
)
matches = regex.findall(news_page, re.MULTILINE)
return matches
compact_matches = []
for match in matches:
parts = tuple(part for part in match if part != '')
compact_matches.append(parts)
return compact_matches


def generate_detail_page_urls(titles):
Expand Down
3 changes: 2 additions & 1 deletion scrapd/core/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Define the ScrAPD models."""
import datetime
from enum import Enum
import re
from typing import List

from pydantic import BaseModel
Expand Down Expand Up @@ -130,7 +131,7 @@ def update(self, other, strict=False):
@validator('case')
def valid_case_number(cls, v):  # pylint: disable=no-self-argument
    """
    Ensure a case number is valid.

    A value is accepted when it contains two digits, a dash, then three to seven digits.

    :param str v: the case number to validate
    :return: the case number, unchanged.
    :rtype: str
    :raises ValueError: if the expected sequence cannot be found in `v`.
    """
    # The pattern is compiled here because `regex.match_pattern` calls `.search()` on it directly.
    pattern = re.compile(r"(\d{2}-\d{3,7})")
    if not regex.match_pattern(v, pattern):
        # f-string so the rejected value actually shows up in the error message.
        raise ValueError(f'invalid format: "{v}"')
    return v
Expand Down
22 changes: 18 additions & 4 deletions scrapd/core/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ def match_pattern(text, pattern, group_number=0):
Match a pattern.

:param str text: the text to match the pattern against
:param compiled regex pattern: the pattern to look for
:param compiled regex pattern: the regex to look for
:param int group_number: the capturing group number
:return: a string representing the captured group.
:rtype: str
"""
match = re.search(pattern, text)
match = pattern.search(text)
return match.groups()[group_number] if match else ''


Expand Down Expand Up @@ -94,8 +94,22 @@ def match_crash_field(page):
:return: a string representing the crash number.
:rtype: str
"""
crashes_pattern = re.compile(r'Traffic Fatality #(\d{1,3})')
return match_pattern(page, crashes_pattern)
crashes_pattern = re.compile(
r'''
(?:
(?:Traffic\sFatality\s\#(\d{1,3}))
|
(?:Fatality\sCrash\s\#(\d{1,3}))
)
''',
re.VERBOSE,
)
matches = crashes_pattern.search(page)
if not matches:
return None

non_empty_match = [match for match in matches.groups() if match]
return non_empty_match[0]


def match_date_field(page):
Expand Down
7 changes: 6 additions & 1 deletion tests/core/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ def test_parse_case_field_00(input_, expected):

@pytest.mark.parametrize(
'input_, expected',
(('<span property="dc:title" content="Traffic Fatality #12" class="rdf-meta element-hidden"></span>', '12'), ))
(
('<span property="dc:title" content="Traffic Fatality #12" class="rdf-meta element-hidden"></span>', '12'),
('<title>Fatality Crash #1 | AustinTexas.gov - The Official Website of the City of Austin</title>', '1'),
('There is no title here', None),
),
)
def test_parse_crashes_field_00(input_, expected):
"""Ensure the crashes field gets parsed correctly."""
actual = regex.match_crash_field(input_)
Expand Down
4 changes: 2 additions & 2 deletions tests/features/retrieve.feature
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ Feature: Retrieve
Examples:
| format | from_date | to_date | crash_count | fatality_count |
| csv | Jan 15 2019 | Jan 18 2019 | 2 | 2 |
| json | Jan 2018 | Dec 2018 | 72 | 72 |
| json | Jan 15 2018 | Jan 18 2018 | 1 | 1 |
| json | Jan 2019 | Dec 2019 | 86 | 87 |
| json | Jan 15 2019 | Jan 18 2019 | 2 | 2 |