misc tidying
staffanm committed Feb 1, 2018
1 parent cb2d53e commit 29f4ce1
Showing 6 changed files with 31 additions and 18 deletions.
8 changes: 2 additions & 6 deletions ferenda/manager.py
@@ -1384,12 +1384,8 @@ def _queue_jobs(manager, iterable, inst, classname, command):
                 continue
             signal.alarm(timeout_length)
             if r['basefile'] not in processing:
-                log.info("%s not found in processing (%s)" % (r['basefile'], ", ".join(processing)))
-            processing.remove(r['basefile']) # or .discard()? but if a
-                                             # recieved job is not in the
-                                             # processing set, something
-                                             # is probably wrong
-
+                log.warning("%s not found in processing (%s)" % (r['basefile'], ", ".join(processing)))
+            processing.discard(r['basefile'])
             if isinstance(r['result'], tuple) and r['result'][0] == _WrappedKeyboardInterrupt:
                 raise KeyboardInterrupt()
             elif isinstance(r['result'], tuple) and isinstance(r['result'][0], Exception):
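The switch from remove() to discard() (together with info → warning) makes the anomaly visible in the log without crashing the job loop: on a Python set, remove() raises KeyError for a missing element, while discard() is a no-op. A minimal sketch with hypothetical basefiles, not from the commit:

    processing = {"doc1", "doc2"}

    processing.discard("doc3")       # missing element: silently ignored
    try:
        processing.remove("doc3")    # missing element: raises KeyError
    except KeyError:
        print("remove() would have crashed the job loop")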
7 changes: 6 additions & 1 deletion ferenda/sources/legal/se/ds.py
@@ -16,7 +16,9 @@
 from . import Regeringen, Offtryck, RPUBL

 def ds_sanitize_identifier(identifier):
-    if not re.match("Ds (19|20)\d{2}:[1-9]\d*"):
+    if identifier.startswith("DS "):
+        identifier = identifier.replace("DS ", "Ds ")
+    if not re.match("Ds (19|20)\d{2}:[1-9]\d*", identifier):
         raise ValueError("Irregular identifier %s (after mangling)" % identifier)
     return Literal(identifier)

@@ -106,3 +108,6 @@ def metadata_from_basefile(self, basefile):
         a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
         a["rpubl:utrSerie"] = self.lookup_resource("Ds", SKOS.altLabel)
         return a
+
+    def sanitize_identifier(self, identifier):
+        return ds_sanitize_identifier(identifier)
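ds_sanitize_identifier() now normalizes a "DS " prefix to "Ds " before validating against the series pattern, and the previously missing identifier argument to re.match() is supplied. A usage sketch, assuming ferenda is importable; the example identifiers are made up:

    from ferenda.sources.legal.se.ds import ds_sanitize_identifier

    print(ds_sanitize_identifier("DS 2004:12"))  # -> Ds 2004:12 (prefix normalized)
    print(ds_sanitize_identifier("Ds 1998:14"))  # -> Ds 1998:14 (already regular)
    try:
        ds_sanitize_identifier("Ds 98:14")       # two-digit year fails the pattern
    except ValueError as e:
        print(e)                                 # Irregular identifier Ds 98:14 (after mangling)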
4 changes: 4 additions & 0 deletions ferenda/sources/legal/se/regeringen.py
@@ -136,6 +136,10 @@ def download(self, basefile=None, url=None):
             "http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/2000/01/ds-2000681/", # Ds 2000:68
             "http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/1999/01/ds-1999241/", # Ds 1999:24 -- in english
             "http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/1998/01/ds-1998141/", # Ds 1998:14
+            "http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/2015/12/"
+            "andringar-i-rennaringsforordningen-1993384/", # mistaken for a Ds when it's really an unpublished PM
+            "http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/2015/12/"
+            "andring-av-bestammelserna-om-ratt-till-bistand-i-lagen-1994137-om-mottagande-av-asylsokande-m.fl/" # same
         ])

     def attribs_from_url(self, url):
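The two new entries extend the hardcoded set of URLs that download() skips: pages filed under departementsserien on regeringen.se that are not actually Ds documents. A generic sketch of the skip pattern such a blacklist supports (names and URLs are illustrative, not the repo's actual loop):

    def download_all(urls, blacklist):
        downloaded = []
        for url in urls:
            if url in blacklist:
                continue               # known false positive: skip it
            downloaded.append(url)     # stand-in for the real fetch-and-store step
        return downloaded

    urls = ["http://example.org/ds-2000681/", "http://example.org/not-a-ds/"]
    print(download_all(urls, blacklist={"http://example.org/not-a-ds/"}))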
10 changes: 7 additions & 3 deletions ferenda/sources/legal/se/sou.py
@@ -26,7 +26,7 @@


 def sou_sanitize_identifier(identifier):
-    if not re.match("SOU (19|20)\d{2}:[1-9]\d*"):
+    if not re.match("SOU (19|20)\d{2}:[1-9]\d*", identifier):
         raise ValueError("Irregular identifier %s (after mangling)" % identifier)
     return Literal(identifier)

@@ -139,6 +139,9 @@ def canonical_uri(self, basefile):
         resource = self.attributes_to_resource(attrib)
         return self.minter.space.coin_uri(resource)

+    def sanitize_identifier(self, identifier):
+        return sou_sanitize_identifier(identifier)
+

 class SOUKB(Offtryck, PDFDocumentRepository):
     alias = "soukb"

@@ -323,16 +326,17 @@ def sanitize_metadata(self, props, doc):
         if props.get('dcterms:title') and " : betänkande" in props['dcterms:title']:
             props['dcterms:title'] = props['dcterms:title'].rsplit(" : ")[0]
         return props
-

+    def sanitize_identifier(self, identifier):
+        return sou_sanitize_identifier(identifier)
+
     def extract_body(self, fp, basefile):
         reader = StreamingPDFReader()
         parser = "ocr" if self.config.ocr else "xml"
         reader.read(fp, parser=parser)
         for page in reader:
             page.src = "index.pdf" # FIXME: don't hardcode the filename
         return reader
-

     def sanitize_body(self, rawbody):
         sanitized = super(SOUKB, self).sanitize_body(rawbody)
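The fix in sou_sanitize_identifier() supplies the string argument that re.match() was missing: before it, every call raised TypeError before any validation could happen. With the identifier argument in place, the pattern works as intended; a standalone check (example identifiers made up):

    import re

    pattern = r"SOU (19|20)\d{2}:[1-9]\d*"
    print(bool(re.match(pattern, "SOU 2017:102")))  # True
    print(bool(re.match(pattern, "SOU 17:102")))    # False: two-digit year
    print(bool(re.match(pattern, "SOU 2017:0")))    # False: number may not start with 0
    # Pre-fix, re.match(pattern) alone raises:
    # TypeError: match() missing 1 required positional argument: 'string'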
3 changes: 3 additions & 0 deletions lagen/nu/regeringenlegacy.py
@@ -173,6 +173,9 @@ class DirRegeringenLegacy(RegeringenLegacy, SameAs, DirRegeringen):
 class SOURegeringenLegacy(RegeringenLegacy, SameAs, SOURegeringen):
     alias = "souregeringen.legacy"

+    def sanitize_identifier(self, identifier):
+        from ferenda.sources.legal.se.sou import sou_sanitize_identifier
+        return sou_sanitize_identifier(identifier)

 class DsRegeringenLegacy(RegeringenLegacy, SameAs, Ds):
     alias = "dsregeringen.legacy"
17 changes: 9 additions & 8 deletions test/integrationLagen.py
@@ -971,11 +971,9 @@ def test_format(self):

     def test_missing_pages(self):
         # issue 5: "Pages 18–30 are missing from prop. 1992/93:30; prop. 1996/97:106 is incomplete (has only two pages)"
-        for urlseg in ("prop/1992/93:30",
-                       "prop/1996/97:106",
-                       # "prop/1988/89:150", # NB: These 2 are budget
-                       # "prop/1991/92:100"
-                       ): # propositions, left out by design
+        for urlseg, expected_missing in (
+                ("prop/1992/93:30", []),
+                ("prop/1996/97:106", [3,])):
             res = self.get(self.baseurl + urlseg)
             res.raise_for_status()
             soup = BeautifulSoup(res.text, "lxml")
@@ -987,7 +985,10 @@ def test_missing_pages(self):
                 # intentionally)
                 pagenum = 1
                 for page in pages:
-                    self.assertEqual(str(pagenum), page.get("id")[3:], urlseg)
+                    if pagenum in expected_missing:
+                        pagenum += 1
+                    else:
+                        self.assertEqual(str(pagenum), page.get("id")[3:], urlseg)
                     pagenum += 1

     def test_missing_docs(self):
@@ -1018,7 +1019,7 @@ def test_toc(self):
         # issue 8
         errors = []
         for doctype, startyear, regex in (("dir", 1987, "^Dir\. (19|20)\d{2}:[1-9]\d*$"),
-                                          ("ds", 1993, "^Ds (19|20)\d{2}:[1-9]\d*$"),
+                                          ("ds", 1995, "^Ds (19|20)\d{2}:[1-9]\d*$"),
                                           ("sou", 1922, "^SOU (19|20)\d{2}:[1-9]\d*$"),
                                           ("prop", 1971, "^Prop\. (19|20)\d{2}(|/\d{2}|/2000):[1-9]\d*$")):
             for year in range(startyear, 2018):
@@ -1031,7 +1032,7 @@ def test_toc(self):
                 for link in soup.find("article").find_all("a"):
                     # self.assertRegexpMatches(link.text, regex)
                     if not re.match(regex, link.text):
-                        errors.append("%s/%s: %s" % (doctype, year, link.text))
+                        errors.append("%s/%s: %s (%s)" % (doctype, year, link.text, link.get("href")))
                 self.maxDiff = None
                 self.assertEqual([], errors)
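To see how the reworked test_missing_pages loop tolerates a known gap, here is a standalone trace with toy page ids (hypothetical data mirroring the loop above):

    expected_missing = [3]
    page_ids = ["pg1", "pg2", "pg4", "pg5"]  # page 3 absent by design
    pagenum = 1
    for page in page_ids:
        if pagenum in expected_missing:
            pagenum += 1                     # hop over the known-missing number
        else:
            assert str(pagenum) == page[2:], (pagenum, page)
        pagenum += 1
    # Note: the page right after the gap ("pg4") is consumed without being
    # asserted, exactly as in the committed loop; later pages line up again.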
