/
justice_glossary.py
55 lines (44 loc) · 1.92 KB
/
justice_glossary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from typing import Any, Iterable
from scrapy.http.response.html import HtmlResponse
from ...metadata import Metadata, Subject
from ...models.glossary import GlossaryEntry, GlossaryParseResult
from ...text import URL, LoCSubject, NonemptyString as String
from ...text import Sentence, ensure_ends_with_period, make_soup, normalize_nonempty
def parse_glossary(html: HtmlResponse) -> GlossaryParseResult:
    """Parse the glossary page into a GlossaryParseResult.

    The entries are extracted from ``html``; the accompanying metadata is
    hard-coded here (title, language, coverage, source URL, creator and
    subject headings).
    """
    # Materialize the entries up front so parsing happens inside this call.
    entries = tuple(__parse_entries(html))

    subjects = (
        Subject(
            uri=LoCSubject("sh85071120"),
            rdfs_label=String("Justice, Administration of"),
        ),
        Subject(
            uri=URL("https://www.wikidata.org/wiki/Q16514399"),
            rdfs_label=String("Administration of justice"),
        ),
    )

    metadata = Metadata(
        dcterms_title=String("Glossary"),
        dcterms_language="en",
        dcterms_coverage="NZL",
        # Info about original source
        dcterms_source=String("https://www.justice.govt.nz/about/glossary/"),
        publiclaw_sourceModified="unknown",
        publiclaw_sourceCreator=String("New Zealand Ministry of Justice"),
        dcterms_subject=subjects,
    )

    return GlossaryParseResult(metadata=metadata, entries=entries)
def __parse_entries(html: HtmlResponse) -> Iterable[GlossaryEntry]:
    """Yield a GlossaryEntry for each raw (phrase, definition) pair in ``html``.

    The phrase text is passed through ``normalize_nonempty``. The definition
    text is first passed through ``ensure_ends_with_period``, then
    ``normalize_nonempty``, and finally wrapped in ``Sentence``.

    TODO: Refactor into a parent class
    """
    for term_node, definition_node in __raw_entries(html):
        term = normalize_nonempty(term_node.text)
        terminated_text = ensure_ends_with_period(definition_node.text)
        yield GlossaryEntry(
            phrase=term,
            definition=Sentence(normalize_nonempty(terminated_text)),
        )
def __raw_entries(html: HtmlResponse) -> Iterable[tuple[Any, Any]]:
    """
    The core of this parser.

    Pairs every ``<strong>`` element found in the page with the node that
    immediately follows that element's parent (``parent.next_sibling``).
    The first item of each pair is treated by the caller as the phrase and
    the second as its definition.

    TODO: Refactor all the glossary parsers to need only this function.
    """
    document = make_soup(html)
    strong_tags = document.find_all("strong")
    return ((tag, tag.parent.next_sibling) for tag in strong_tags)