Commit

make sure list_basefiles_for also returns those SOUs that have parseoption=metadataonly

staffanm committed Aug 19, 2018
1 parent 9d7c190 commit ab7995d
Showing 2 changed files with 24 additions and 16 deletions.
ferenda/sources/legal/se/myndfskr.py (14 changes: 9 additions & 5 deletions)
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 from __future__ import (absolute_import, division,
                         print_function, unicode_literals)
@@ -1167,7 +1168,7 @@ def download_post_form(self, form, url):
         resp = self.session.send(req, allow_redirects=True)
         return resp

-    def maintext_from_soup(self, soup):
+    def main_from_soup(self, soup):
         main = soup.find("div", id="readme")
         if main:
             main.find("div", "rs_skip").decompose()
@@ -1189,11 +1190,14 @@ def maintext_from_soup(self, soup):
             e.dummyfile = self.store.parsed_path(basefile)
             raise e

+    def maintext_from_soup(self, soup):
+        main = self.main_from_soup(soup)
+        return main.get_text("\n\n", strip=True)
+
     def textreader_from_basefile(self, basefile, force_ocr=False, attachment=None):
         infile = self.store.downloaded_path(basefile)
         soup = BeautifulSoup(util.readfile(infile), "lxml")
-        main = self.maintext_from_soup(soup)
-        text = main.get_text("\n\n", strip=True)
+        text = self.maintext_from_soup(soup)
         text = self.sanitize_text(text, basefile)
         return TextReader(string=text)

@@ -1204,7 +1208,7 @@ def parse_open(self, basefile):
         return self.store.open_downloaded(basefile)

     def parse_body(self, fp, basefile):
-        main = BeautifulSoup(self.maintext_from_soup(BeautifulSoup(fp, "lxml")), "lxml")
+        main = self.main_from_soup(BeautifulSoup(fp, "lxml"))
         return Body([elements_from_soup(main)],
                     uri=None)

@@ -2056,7 +2060,7 @@ def consolidation_basis(self, soup):
     def maintext_from_soup(self, soup):
         main = soup.find("div", id="socextPageBody").find("div", "ms-rtestate-field")
         assert main
-        return main
+        return str(main)

     def parse_open(self, basefile):
         if basefile.startswith("konsolidering"):
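
The myndfskr.py change splits the old maintext_from_soup into two steps: main_from_soup returns the content element itself (a BeautifulSoup Tag), and a new maintext_from_soup flattens that element to plain text. parse_body previously round-tripped the flattened text through BeautifulSoup again, which threw away the markup before elements_from_soup could use it; it now receives the element directly. A minimal sketch of the two return types (the toy HTML below is illustrative, not from the repo):

    from bs4 import BeautifulSoup

    # Stand-in for a downloaded agency page; only the div id matches the real code.
    soup = BeautifulSoup("<div id='readme'><p>First</p><p>Second</p></div>", "lxml")

    main = soup.find("div", id="readme")      # main_from_soup: a Tag with markup
                                              # intact, usable by elements_from_soup
    text = main.get_text("\n\n", strip=True)  # maintext_from_soup: flat text for
                                              # sanitize_text and TextReader
    print(text)  # "First\n\nSecond"
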
ferenda/sources/legal/se/sou.py (26 changes: 15 additions & 11 deletions)
@@ -20,7 +20,7 @@
 from cached_property import cached_property

 from ferenda import (PDFAnalyzer, CompositeRepository, DocumentEntry,
-                     PDFDocumentRepository, CompositeStore, Facet)
+                     PDFDocumentRepository, CompositeStore, Facet, DocumentStore)
 from ferenda import util, decorators, errors
 from ferenda.pdfreader import StreamingPDFReader
 from . import Regeringen, SwedishLegalSource, FixedLayoutSource, SwedishLegalStore, Offtryck, RPUBL
@@ -145,6 +145,8 @@ def canonical_uri(self, basefile):
     def sanitize_identifier(self, identifier):
         return sou_sanitize_identifier(identifier)

+class SOUKBStore(SwedishLegalStore):
+    downloaded_suffixes = [".pdf", ".rdf"]

 class SOUKB(Offtryck, PDFDocumentRepository):
     alias = "soukb"
@@ -158,7 +160,8 @@ class SOUKB(Offtryck, PDFDocumentRepository):
     # A bit nonsensical, but required for SwedishLegalSource.get_parser
     document_type = SOU = True
     PROPOSITION = DS = KOMMITTEDIREKTIV = False
-
+    documentstore_class = SOUKBStore
+
     @classmethod
     def get_default_options(cls):
         opts = super(SOUKB, cls).get_default_options()
@@ -210,7 +213,7 @@ def download_get_basefiles(self, source):
     def download_single(self, basefile, url):
         if self.get_parse_options(basefile) == "skip":
             raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
-        rdffilename = self.store.downloaded_path(basefile, attachment="metadata.rdf")
+        rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
         if self.get_parse_options(basefile) == "metadataonly" and os.path.exists(rdffilename) and (not self.config.refresh):
             # it is kind of bad that we can even get here in these
             # cases (if a rdffile exists, and an empty index.pdf
@@ -234,14 +237,15 @@ def download_single(self, basefile, url):

         # download rdf metadata before actual content
         try:
-            # it appears that certain URLs (like curl
-            # http://data.libris.kb.se/open/bib/8351225.rdf)
-            # sometimes return an empty response. We should check
-            # and warn for this (and infer a minimal RDF by
-            # hand from what we can, eg dc:title from the link
-            # text)
+            # it appears that URLs like
+            # http://data.libris.kb.se/open/bib/8351225.rdf now
+            # return empty responses. Until we find out the proper
+            # RDF endpoint URLs, we should check and warn for this
+            # (and infer a minimal RDF by hand from what we can, eg
+            # dc:title from the link text)
             self.download_if_needed(rdfurl, basefile,
-                                    filename=rdffilename)
+                                    filename=rdffilename,
+                                    archive=False)
             if os.path.getsize(rdffilename) == 0:
                 self.log.warning("%s: %s returned 0 response, infer RDF" %
                                  (basefile, rdfurl))
@@ -326,7 +330,7 @@ def extract_head(self, fp, basefile):

     def extract_metadata(self, rawhead, basefile):
         metadata = util.readfile(self.store.downloaded_path(
-            basefile, attachment="metadata.rdf"))
+            basefile, attachment="index.rdf"))
         # For some reason these RDF files might use canonical
         # decomposition form (NFD) which is less optimal. Fix this.
         metadata = unicodedata.normalize("NFC", metadata)
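
The new SOUKBStore is what the commit message is about: the store presumably enumerates basefiles by their downloaded files, so an SOU fetched with parse option metadataonly, which has an index.rdf but no usable PDF, stayed invisible to list_basefiles_for until ".rdf" joined the accepted suffixes. A rough sketch of the enumeration idea, assuming a directory-per-basefile layout as the attachment="index.rdf" calls suggest; ferenda's real DocumentStore.list_basefiles_for differs in detail:

    from pathlib import Path

    # Sketch only, not ferenda's implementation: a basefile counts as
    # downloaded if any index.<suffix> file exists in its directory.
    def list_basefiles(downloaded_dir, suffixes=(".pdf", ".rdf")):
        for d in sorted(Path(downloaded_dir).iterdir()):
            if d.is_dir() and any((d / ("index" + s)).exists() for s in suffixes):
                yield d.name

With suffixes=(".pdf",) a metadataonly SOU is never yielded and therefore never parsed; adding ".rdf" makes it show up.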

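The rewritten comment notes that the libris .rdf URLs now return empty responses and floats the idea of inferring a minimal RDF file by hand, e.g. a dc:title taken from the link text. That fallback is not part of this commit; a hypothetical sketch of it with rdflib, where the function name and arguments are invented for illustration:

    from rdflib import Graph, Literal, URIRef
    from rdflib.namespace import DC

    # Hypothetical fallback (not in the commit): write a one-triple RDF/XML
    # file when the endpoint returned an empty response.
    def infer_minimal_rdf(rdffilename, docuri, linktext):
        g = Graph()
        g.add((URIRef(docuri), DC.title, Literal(linktext, lang="sv")))
        g.serialize(destination=rdffilename, format="xml")
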
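The NFC normalization in extract_metadata is worth a note: in NFD form, Swedish characters such as "ä" are stored as two code points (base letter plus combining diaeresis), so naive string comparisons against NFC literals fail even though both render identically. A self-contained illustration:

    import unicodedata

    nfd = "a\u0308r"  # "är" in decomposed (NFD) form: 'a' + combining diaeresis
    print("är" == nfd)                                # False: different code points
    print("är" == unicodedata.normalize("NFC", nfd))  # True after composing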