updated a bunch of repos to not download more than absolutely needed when parse option 'metadataonly' is set
staffanm committed Jun 10, 2018
1 parent 5ea4903 commit 79750da
Showing 4 changed files with 149 additions and 85 deletions.
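
The common thread across all four files: download work is now gated on the per-basefile parse option from options.py. A minimal sketch of that pattern, written against the repo API the diffs themselves use (get_parse_options, errors.DownloadSkippedError); the standalone helper and the .pdf-to-.xml rewrite here are illustrative, not code from the commit:

# --- illustrative sketch, not part of the commit ---
from ferenda import errors

def gated_download_single(repo, basefile, url, full_download):
    """Download only as much as the parse option for this basefile requires."""
    opt = repo.get_parse_options(basefile)
    if opt == "skip":
        # documents marked "skip" in options.py are never fetched at all
        raise errors.DownloadSkippedError(
            "%s should not be downloaded according to options.py" % basefile)
    if opt == "metadataonly":
        # fetch the smallest representation that still yields metadata,
        # e.g. OCR XML instead of the scanned PDF (cf. propositioner.py below)
        url = url.replace(".pdf", ".xml")
    return full_download(basefile, url)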
89 changes: 65 additions & 24 deletions ferenda/sources/legal/se/propositioner.py
@@ -32,7 +32,7 @@
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL, Offtryck)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource
from .swedishlegalsource import lazyread
from .swedishlegalsource import lazyread, SwedishLegalStore
from .elements import Sidbrytning

def prop_sanitize_identifier(identifier):
@@ -635,6 +635,8 @@ class PropRiksdagen(Riksdagen):
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)

class PropKBStore(SwedishLegalStore):
downloaded_suffixes = [".pdf", ".xml"]

class PropKB(Offtryck, PDFDocumentRepository):
alias = "propkb"
@@ -644,7 +646,8 @@ class PropKB(Offtryck, PDFDocumentRepository):
basefile_regex = "prop_(?P<year>\d{4})(?P<type>_urtima|_höst|_a|_b|)__+(?P<no>\d+)(?:_(?P<part>\d+)|)"
document_type = PROPOSITION = True
SOU = DS = KOMMITTEDIREKTIV = False

documentstore_class = PropKBStore

@classmethod
def get_default_options(cls):
opts = super(PropKB, cls).get_default_options()
@@ -701,7 +704,7 @@ def download_get_basefiles(self, source):
continue
if self.get_parse_options(basefile) == "skip":
continue
if part and int(part) > 1:
if part and int(part) > 1 and self.get_parse_options(basefile) != "metadataonly":
# Download attachments ourselves -- not
# really what download_get_basefile should
# do, but hey....
@@ -722,14 +725,30 @@ def download_single(self, basefile, url=None):
if not url:
entry = DocumentEntry(self.store.documententry_path(basefile))
url = entry.orig_url
if self.get_parse_options(basefile) == "metadataonly":
# in these cases, to save space, get
# the smaller XML OCR data, not the
# actual scanned images-in-PDF
url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
# make store.downloaded_path return .xml suffixes (and set
# the timestamp to the beginning of epoch so that the
# resulting if-modified-since header doesn't contain the
# current date/time)
downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
if not os.path.exists(downloaded_path):
util.writefile(downloaded_path, "")
os.utime(downloaded_path, (0,0))
return super(PropKB, self).download_single(basefile, url)


@lazyread
# @lazyread
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)
if downloaded_path.endswith(".xml"):
return open(downloaded_path)
else:
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)

def convert_pdf(self, downloaded_path, intermediate_path):
intermediate_dir = os.path.dirname(intermediate_path)
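
The metadataonly branch in download_single above does two things: it swaps the PDF URL for the much smaller ABBYY OCR XML, and it pre-creates an empty .xml downloaded_path with its mtime set to the epoch, presumably so that the If-Modified-Since header download_if_needed derives from the placeholder's timestamp never suppresses the actual fetch. A sketch of just that priming step, assuming ferenda's util.writefile and store paths behave as used elsewhere in this diff:

# --- illustrative sketch, not part of the commit ---
import os
from ferenda import util

def prime_xml_placeholder(store, basefile):
    """Point downloaded_path at an .xml placeholder old enough to always re-fetch."""
    downloaded_path = store.downloaded_path(basefile).replace(".pdf", ".xml")
    if not os.path.exists(downloaded_path):
        util.writefile(downloaded_path, "")   # empty placeholder file
        os.utime(downloaded_path, (0, 0))     # mtime = 1970-01-01
    return downloaded_path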
@@ -744,12 +763,41 @@ def convert_pdf(self, downloaded_path, intermediate_path):
return reader.convert(**kwargs)

def extract_head(self, fp, basefile):
return None # "rawhead" is never used
if self.get_parse_options(basefile) == "metadataonly":
tree = etree.parse(fp)
firstpage = tree.find("//{http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml}page")
return firstpage
else:
return None # "rawhead" is never used

def extract_metadata(self, rawhead, basefile):
res = self.metadata_from_basefile(basefile)
# extracting title and other metadata (dep, publication date
# etc) requires parsing of the body)
return self.metadata_from_basefile(basefile)
# etc) requires parsing of the body (and subsequent processing
# in postprocess_doc). For documents marked as metadataonly in
# options.py, the body is never parsed. Therefore, we do a
# very limited parsing of the first page here.
if self.get_parse_options(basefile) == "metadataonly":
text = util.normalize_space(etree.tostring(rawhead, method="text", encoding="utf-8").decode("utf-8"))
res.update(self.find_firstpage_metadata(text, basefile))
return res

def find_firstpage_metadata(self, firstpage, basefile):
res = {}
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:title"] = m.groups(1)
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
return res

def extract_body(self, fp, basefile):
reader = StreamingPDFReader()
@@ -773,27 +821,20 @@ def extract_body(self, fp, basefile):
return reader

def postprocess_doc(self, doc):
if self.get_parse_options(doc.basefile) == "metadataonly":
return
# the first thing will be a Sidbrytning; continue scanning text until next sidbrytning
firstpage = ""
for thing in doc.body[1:]:
if isinstance(thing, Sidbrytning):
break
elif isinstance(thing, Textbox):
firstpage += str(thing) + "\n\n"
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(doc.basefile, len(firstpage)))
else:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(m.group(1), lang=self.lang)))
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(doc.basefile, len(firstpage)))
else:
d = self.parse_swedish_date(m.group(1).lower())
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(d)))
firstpage += util.normalize_space(str(thing)) + "\n\n"
metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
if "dcterms:title" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(metadata["dcterms:title"], lang=self.lang)))
if "dcterms:issued" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(metadata["dcterms:issued"])))


# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
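
The new find_firstpage_metadata is what makes the metadataonly path useful: extract_metadata feeds it the OCR text of the first page, and postprocess_doc reuses it for fully parsed documents. A rough, self-contained sketch of what those regexes pick up from a made-up first-page text; the real method also runs the date through parse_swedish_date and logs a warning when a pattern misses:

# --- illustrative sketch, not part of the commit ---
import re

def find_firstpage_metadata(firstpage):
    """Pull dcterms:title and dcterms:issued out of a proposition's first page."""
    res = {}
    m = re.search(r"proposition till riksdagen *,? *(.*?); gif?ven",
                  firstpage, flags=re.I)
    if m:
        res["dcterms:title"] = m.group(1)
    m = re.search(r"gif?ven stockholms slott den (\d+ \w+ \d{4})",
                  firstpage, flags=re.I)
    if m:
        res["dcterms:issued"] = m.group(1)  # the real code parses this into a date
    return res

sample = ("Kungl. Maj:ts proposition till riksdagen med förslag till lag om "
          "ändring i viss lagstiftning; gifven Stockholms slott den 4 januari 1901")
print(find_firstpage_metadata(sample))
# {'dcterms:title': 'med förslag till lag om ändring i viss lagstiftning',
#  'dcterms:issued': '4 januari 1901'}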
41 changes: 23 additions & 18 deletions ferenda/sources/legal/se/regeringen.py
@@ -253,6 +253,8 @@ def download_get_basefiles(self, params):


def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
if not url:
url = self.remote_url(basefile)
if not url: # remote_url failed
@@ -283,25 +285,28 @@ def download_single(self, basefile, url=None):
else:
self.log.info("%s: downloaded from %s" % (basefile, url))

soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
cnt = 0
selected_files = self.find_doc_links(soup, basefile)
if selected_files:
for (filename, filetype,label) in selected_files:
fileurl = urljoin(url, filename)
basepath = filename.split("/")[-1]
filename = self.store.downloaded_path(basefile, attachment=basepath)
if not filename.lower().endswith(".pdf"):
filename += ".%s" % filetype
if self.download_if_needed(fileurl, basefile, filename=filename):
filesupdated = True
self.log.debug(
" %s is new or updated" % filename)
else:
self.log.debug(" %s is unchanged" % filename)
if self.get_parse_options(basefile) == "metadataonly":
self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
else:
self.log.warning(
"%s (%s) has no downloadable files" % (basefile, url))
soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
cnt = 0
selected_files = self.find_doc_links(soup, basefile)
if selected_files:
for (filename, filetype,label) in selected_files:
fileurl = urljoin(url, filename)
basepath = filename.split("/")[-1]
filename = self.store.downloaded_path(basefile, attachment=basepath)
if not filename.lower().endswith(".pdf"):
filename += ".%s" % filetype
if self.download_if_needed(fileurl, basefile, filename=filename):
filesupdated = True
self.log.debug(
" %s is new or updated" % filename)
else:
self.log.debug(" %s is unchanged" % filename)
else:
self.log.warning(
"%s (%s) has no downloadable files" % (basefile, url))
if updated or filesupdated:
pass
else:
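
In regeringen.py the landing page is still fetched even for metadataonly documents (its HTML carries the metadata); only the loop that follows links to the attached PDFs is skipped. A condensed sketch of that attachment loop, using urljoin from the standard library and the find_doc_links/download_if_needed calls visible in the hunk above; the standalone function is illustrative:

# --- illustrative sketch, not part of the commit ---
from urllib.parse import urljoin

def download_attachments(repo, basefile, pageurl, soup):
    """Follow document links on the landing page unless metadata alone is enough."""
    if repo.get_parse_options(basefile) == "metadataonly":
        repo.log.debug("%s: Marked as 'metadataonly', not downloading actual "
                       "PDF file" % basefile)
        return False
    updated = False
    for (link, filetype, label) in repo.find_doc_links(soup, basefile):
        fileurl = urljoin(pageurl, link)
        filename = repo.store.downloaded_path(basefile,
                                              attachment=link.split("/")[-1])
        if not filename.lower().endswith(".pdf"):
            filename += ".%s" % filetype
        if repo.download_if_needed(fileurl, basefile, filename=filename):
            updated = True
    return updated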
102 changes: 60 additions & 42 deletions ferenda/sources/legal/se/riksdagen.py
@@ -149,6 +149,8 @@ def remote_url(self, basefile):
base36year, doctypecode, pnr)

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
attachment = None
if isinstance(basefile, tuple):
basefile, attachment = basefile
@@ -176,57 +178,73 @@
(basefile, xmlfile))
else:
self.log.info("%s: downloaded from %s" % (basefile, url))
fileupdated = False
r = None

# for some reason, using an XML parser ("xml" or
# "lxml-xml") causes only the first ~70 kb of the file
# being parsed... But the lxml parser should work well
# enough for our needs, even if it uses non-html tags.
docsoup = BeautifulSoup(open(xmlfile), "lxml")
dokid = docsoup.find('dok_id').text
if docsoup.find('dokument_url_html'):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
if r:
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
for b in docsoup.findAll('bilaga'):
# self.log.debug("Looking for %s, found %s", dokid, b.dok_id.text)
if b.dok_id.text != dokid:
continue
if b.filtyp is None:
# apparently this can happen sometimes? Very intermittently, though.
self.log.warning(
"Couldn't find filtyp for bilaga %s in %s" %
(b.dok_id.text, xmlfile))
continue
filetype = "." + b.filtyp.text
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
with open(xmlfile) as fp:
docsoup = BeautifulSoup(fp, "lxml-xml")

# At this point, 99% of the contents of xmlfile is a
# totally unnecessary html element, containing a bad
# representation of the document content (we want the
# better pdf file anyway). Maybe we should open it and
# just zap the html element? NOTE: This means that the
# content on disk is no longer a true copy of the remote
# resource, but we'll just accept that in this case.
docsoup.html.decompose()
with open(xmlfile, "w") as fp:
fp.write(str(docsoup))

fileupdated = False
if self.get_parse_options(basefile) == "metadataonly":
self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF/HTML files" % basefile)
else:
r = None
dokid = docsoup.find('dok_id').text
if docsoup.find('dokument_url_html'):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
# occasionally we get a 404 even though we shouldn't. Report and hope it
# goes better next time.
self.log.error(" Failed: %s" % e)
continue
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
break
for b in docsoup.findAll('bilaga'):
# self.log.debug("Looking for %s, found %s", dokid, b.dok_id.text)
if b.dok_id.text != dokid:
continue
if b.filtyp is None:
# apparently this can happen sometimes? Very intermittently, though.
self.log.warning(
"Couldn't find filtyp for bilaga %s in %s" %
(b.dok_id.text, xmlfile))
continue
filetype = "." + b.filtyp.text
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
# occasionally we get a 404 even though we shouldn't. Report and hope it
# goes better next time.
self.log.error(" Failed: %s" % e)
continue
fileupdated = fileupdated or r
break
except requests.exceptions.HTTPError as e:
self.log.error("%s: Failed: %s" % (basefile, e))
return False

if updated or fileupdated:
return True # Successful download of new or changed file
else:
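
Besides the option gating, this hunk rewrites the stored status XML in place: the huge embedded html element (a poor rendition of content that is fetched separately anyway) is dropped before the file is written back, so the on-disk copy is deliberately no longer a byte-for-byte copy of the remote resource. A minimal sketch of that step, assuming BeautifulSoup with the lxml-xml parser as in the hunk:

# --- illustrative sketch, not part of the commit ---
from bs4 import BeautifulSoup

def strip_html_payload(xmlfile):
    """Drop the bulky html element from a downloaded Riksdagen status XML."""
    with open(xmlfile) as fp:
        docsoup = BeautifulSoup(fp, "lxml-xml")
    if docsoup.html is not None:
        docsoup.html.decompose()  # content is duplicated by the html/pdf attachments
    with open(xmlfile, "w") as fp:
        fp.write(str(docsoup))
    return docsoup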
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/swedishlegalsource.py
@@ -598,7 +598,7 @@ def parse(self, doc):
# in fact not found.
doc.body = Body([PreambleSection([
P(["Detta dokument har bedömts ha begränsad juridisk betydelse, så dess innehåll har inte tagits med här. Du kan hitta originaldokumentet från dess källa genom länken till höger."]),
P(["Om du tycker att dokumentet bör tas med, ", A("hör gärna av dig!", href="/om/kontakt")])
P(["Om du tycker att dokumentet bör tas med, ", A(["hör gärna av dig!"], href="/om/kontakt")])
], title='Dokumenttext saknas')])
else:
doc.body = self.parse_body(fp, doc.basefile)
