updated a bunch of repos to not download more than absolutely needed when parse option 'metadataonly' is set
staffanm committed Jun 10, 2018
1 parent 5ea4903 commit 79750da
Showing 4 changed files with 149 additions and 85 deletions.
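
The common thread across all four files: download work is now gated on the per-basefile parse option from options.py. A minimal sketch of that pattern, written against the repo API the diffs themselves use (get_parse_options, errors.DownloadSkippedError); the standalone helper and the .pdf-to-.xml rewrite here are illustrative, not code from the commit:

# --- illustrative sketch, not part of the commit ---
from ferenda import errors

def gated_download_single(repo, basefile, url, full_download):
    """Download only as much as the parse option for this basefile requires."""
    opt = repo.get_parse_options(basefile)
    if opt == "skip":
        # documents marked "skip" in options.py are never fetched at all
        raise errors.DownloadSkippedError(
            "%s should not be downloaded according to options.py" % basefile)
    if opt == "metadataonly":
        # fetch the smallest representation that still yields metadata,
        # e.g. OCR XML instead of the scanned PDF (cf. propositioner.py below)
        url = url.replace(".pdf", ".xml")
    return full_download(basefile, url)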
89 changes: 65 additions & 24 deletions ferenda/sources/legal/se/propositioner.py
@@ -32,7 +32,7 @@
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL, Offtryck)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource
from .swedishlegalsource import lazyread
from .swedishlegalsource import lazyread, SwedishLegalStore
from .elements import Sidbrytning

def prop_sanitize_identifier(identifier):
@@ -635,6 +635,8 @@ class PropRiksdagen(Riksdagen):
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)

class PropKBStore(SwedishLegalStore):
downloaded_suffixes = [".pdf", ".xml"]

class PropKB(Offtryck, PDFDocumentRepository):
alias = "propkb"
@@ -644,7 +646,8 @@ class PropKB(Offtryck, PDFDocumentRepository):
basefile_regex = "prop_(?P<year>\d{4})(?P<type>_urtima|_höst|_a|_b|)__+(?P<no>\d+)(?:_(?P<part>\d+)|)"
document_type = PROPOSITION = True
SOU = DS = KOMMITTEDIREKTIV = False

documentstore_class = PropKBStore

@classmethod
def get_default_options(cls):
opts = super(PropKB, cls).get_default_options()
@@ -701,7 +704,7 @@ def download_get_basefiles(self, source):
continue
if self.get_parse_options(basefile) == "skip":
continue
if part and int(part) > 1:
if part and int(part) > 1 and self.get_parse_options(basefile) != "metadataonly":
# Download attachments ourselves -- not
# really what download_get_basefile should
# do, but hey....
@@ -722,14 +725,30 @@ def download_single(self, basefile, url=None):
if not url:
entry = DocumentEntry(self.store.documententry_path(basefile))
url = entry.orig_url
if self.get_parse_options(basefile) == "metadataonly":
# in these cases, to save space, get
# the smaller XML OCR data, not the
# actual scanned images-in-PDF
url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
# make store.downloaded_path return .xml suffixes (and set
# the timestamp to the beginning of epoch so that the
# resulting if-modified-since header doesn't contain the
# current date/time)
downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
if not os.path.exists(downloaded_path):
util.writefile(downloaded_path, "")
os.utime(downloaded_path, (0,0))
return super(PropKB, self).download_single(basefile, url)


@lazyread
# @lazyread
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)
if downloaded_path.endswith(".xml"):
return open(downloaded_path)
else:
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)

def convert_pdf(self, downloaded_path, intermediate_path):
intermediate_dir = os.path.dirname(intermediate_path)
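
The metadataonly branch in download_single above does two things: it swaps the PDF URL for the much smaller ABBYY OCR XML, and it pre-creates an empty .xml downloaded_path with its mtime set to the epoch, presumably so that the If-Modified-Since header download_if_needed derives from the placeholder's timestamp never suppresses the actual fetch. A sketch of just that priming step, assuming ferenda's util.writefile and store paths behave as used elsewhere in this diff:

# --- illustrative sketch, not part of the commit ---
import os
from ferenda import util

def prime_xml_placeholder(store, basefile):
    """Point downloaded_path at an .xml placeholder old enough to always re-fetch."""
    downloaded_path = store.downloaded_path(basefile).replace(".pdf", ".xml")
    if not os.path.exists(downloaded_path):
        util.writefile(downloaded_path, "")   # empty placeholder file
        os.utime(downloaded_path, (0, 0))     # mtime = 1970-01-01
    return downloaded_path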
@@ -744,12 +763,41 @@ def convert_pdf(self, downloaded_path, intermediate_path):
return reader.convert(**kwargs)

def extract_head(self, fp, basefile):
return None # "rawhead" is never used
if self.get_parse_options(basefile) == "metadataonly":
tree = etree.parse(fp)
firstpage = tree.find("//{http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml}page")
return firstpage
else:
return None # "rawhead" is never used

def extract_metadata(self, rawhead, basefile):
res = self.metadata_from_basefile(basefile)
# extracting title and other metadata (dep, publication date
# etc) requires parsing of the body)
return self.metadata_from_basefile(basefile)
# etc) requires parsing of the body (and subsequent processing
# in postprocess_doc). For documents marked as metadataonly in
# options.py, the body is never parsed. Therefore, we do a
# very limited parsing of the first page here.
if self.get_parse_options(basefile) == "metadataonly":
text = util.normalize_space(etree.tostring(rawhead, method="text", encoding="utf-8").decode("utf-8"))
res.update(self.find_firstpage_metadata(text, basefile))
return res

def find_firstpage_metadata(self, firstpage, basefile):
res = {}
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:title"] = m.groups(1)
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
return res

def extract_body(self, fp, basefile):
reader = StreamingPDFReader()
@@ -773,27 +821,20 @@ def extract_body(self, fp, basefile):
return reader

def postprocess_doc(self, doc):
if self.get_parse_options(doc.basefile) == "metadataonly":
return
# the first thing will be a Sidbrytning; continue scanning text until next sidbrytning
firstpage = ""
for thing in doc.body[1:]:
if isinstance(thing, Sidbrytning):
break
elif isinstance(thing, Textbox):
firstpage += str(thing) + "\n\n"
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(doc.basefile, len(firstpage)))
else:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(m.group(1), lang=self.lang)))
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(doc.basefile, len(firstpage)))
else:
d = self.parse_swedish_date(m.group(1).lower())
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(d)))
firstpage += util.normalize_space(str(thing)) + "\n\n"
metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
if "dcterms:title" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(metadata["dcterms:title"], lang=self.lang)))
if "dcterms:issued" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(metadata["dcterms:issued"])))


# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
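
The new find_firstpage_metadata is what makes the metadataonly path useful: extract_metadata feeds it the OCR text of the first page, and postprocess_doc reuses it for fully parsed documents. A rough, self-contained sketch of what those regexes pick up from a made-up first-page text; the real method also runs the date through parse_swedish_date and logs a warning when a pattern misses:

# --- illustrative sketch, not part of the commit ---
import re

def find_firstpage_metadata(firstpage):
    """Pull dcterms:title and dcterms:issued out of a proposition's first page."""
    res = {}
    m = re.search(r"proposition till riksdagen *,? *(.*?); gif?ven",
                  firstpage, flags=re.I)
    if m:
        res["dcterms:title"] = m.group(1)
    m = re.search(r"gif?ven stockholms slott den (\d+ \w+ \d{4})",
                  firstpage, flags=re.I)
    if m:
        res["dcterms:issued"] = m.group(1)  # the real code parses this into a date
    return res

sample = ("Kungl. Maj:ts proposition till riksdagen med förslag till lag om "
          "ändring i viss lagstiftning; gifven Stockholms slott den 4 januari 1901")
print(find_firstpage_metadata(sample))
# {'dcterms:title': 'med förslag till lag om ändring i viss lagstiftning',
#  'dcterms:issued': '4 januari 1901'}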
41 changes: 23 additions & 18 deletions ferenda/sources/legal/se/regeringen.py
@@ -253,6 +253,8 @@ def download_get_basefiles(self, params):


def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
if not url:
url = self.remote_url(basefile)
if not url: # remote_url failed
@@ -283,25 +285,28 @@ def download_single(self, basefile, url=None):
else:
self.log.info("%s: downloaded from %s" % (basefile, url))

soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
cnt = 0
selected_files = self.find_doc_links(soup, basefile)
if selected_files:
for (filename, filetype,label) in selected_files:
fileurl = urljoin(url, filename)
basepath = filename.split("/")[-1]
filename = self.store.downloaded_path(basefile, attachment=basepath)
if not filename.lower().endswith(".pdf"):
filename += ".%s" % filetype
if self.download_if_needed(fileurl, basefile, filename=filename):
filesupdated = True
self.log.debug(
" %s is new or updated" % filename)
else:
self.log.debug(" %s is unchanged" % filename)
if self.get_parse_options(basefile) == "metadataonly":
self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
else:
self.log.warning(
"%s (%s) has no downloadable files" % (basefile, url))
soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
cnt = 0
selected_files = self.find_doc_links(soup, basefile)
if selected_files:
for (filename, filetype,label) in selected_files:
fileurl = urljoin(url, filename)
basepath = filename.split("/")[-1]
filename = self.store.downloaded_path(basefile, attachment=basepath)
if not filename.lower().endswith(".pdf"):
filename += ".%s" % filetype
if self.download_if_needed(fileurl, basefile, filename=filename):
filesupdated = True
self.log.debug(
" %s is new or updated" % filename)
else:
self.log.debug(" %s is unchanged" % filename)
else:
self.log.warning(
"%s (%s) has no downloadable files" % (basefile, url))
if updated or filesupdated:
pass
else:
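
In regeringen.py the landing page is still fetched even for metadataonly documents (its HTML carries the metadata); only the loop that follows links to the attached PDFs is skipped. A condensed sketch of that attachment loop, using urljoin from the standard library and the find_doc_links/download_if_needed calls visible in the hunk above; the standalone function is illustrative:

# --- illustrative sketch, not part of the commit ---
from urllib.parse import urljoin

def download_attachments(repo, basefile, pageurl, soup):
    """Follow document links on the landing page unless metadata alone is enough."""
    if repo.get_parse_options(basefile) == "metadataonly":
        repo.log.debug("%s: Marked as 'metadataonly', not downloading actual "
                       "PDF file" % basefile)
        return False
    updated = False
    for (link, filetype, label) in repo.find_doc_links(soup, basefile):
        fileurl = urljoin(pageurl, link)
        filename = repo.store.downloaded_path(basefile,
                                              attachment=link.split("/")[-1])
        if not filename.lower().endswith(".pdf"):
            filename += ".%s" % filetype
        if repo.download_if_needed(fileurl, basefile, filename=filename):
            updated = True
    return updated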
102 changes: 60 additions & 42 deletions ferenda/sources/legal/se/riksdagen.py
@@ -149,6 +149,8 @@ def remote_url(self, basefile):
base36year, doctypecode, pnr)

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
attachment = None
if isinstance(basefile, tuple):
basefile, attachment = basefile
@@ -176,57 +178,73 @@
(basefile, xmlfile))
else:
self.log.info("%s: downloaded from %s" % (basefile, url))
fileupdated = False
r = None

# for some reason, using an XML parser ("xml" or
# "lxml-xml") causes only the first ~70 kb of the file
# being parsed... But the lxml parser should work well
# enough for our needs, even if it uses non-html tags.
docsoup = BeautifulSoup(open(xmlfile), "lxml")
dokid = docsoup.find('dok_id').text
if docsoup.find('dokument_url_html'):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
if r:
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
for b in docsoup.findAll('bilaga'):
# self.log.debug("Looking for %s, found %s", dokid, b.dok_id.text)
if b.dok_id.text != dokid:
continue
if b.filtyp is None:
# apparently this can happen sometimes? Very intermittently, though.
self.log.warning(
"Couldn't find filtyp for bilaga %s in %s" %
(b.dok_id.text, xmlfile))
continue
filetype = "." + b.filtyp.text
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
with open(xmlfile) as fp:
docsoup = BeautifulSoup(fp, "lxml-xml")

# At this point, 99% of the contents of xmlfile is a
# totally unnecessary html element, containing a bad
# representation of the document content (we want the
# better pdf file anyway). Maybe we should open it and
# just zap the html element? NOTE: This means that the
# content on disk is no longer a true copy of the remote
# resource, but we'll just accept that in this case.
docsoup.html.decompose()
with open(xmlfile, "w") as fp:
fp.write(str(docsoup))

fileupdated = False
if self.get_parse_options(basefile) == "metadataonly":
self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF/HTML files" % basefile)
else:
r = None
dokid = docsoup.find('dok_id').text
if docsoup.find('dokument_url_html'):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
# occasionally we get a 404 even though we shouldn't. Report and hope it
# goes better next time.
self.log.error(" Failed: %s" % e)
continue
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
break
for b in docsoup.findAll('bilaga'):
# self.log.debug("Looking for %s, found %s", dokid, b.dok_id.text)
if b.dok_id.text != dokid:
continue
if b.filtyp is None:
# apparently this can happen sometimes? Very intermittently, though.
self.log.warning(
"Couldn't find filtyp for bilaga %s in %s" %
(b.dok_id.text, xmlfile))
continue
filetype = "." + b.filtyp.text
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
# occasionally we get a 404 even though we shouldn't. Report and hope it
# goes better next time.
self.log.error(" Failed: %s" % e)
continue
fileupdated = fileupdated or r
break
except requests.exceptions.HTTPError as e:
self.log.error("%s: Failed: %s" % (basefile, e))
return False

if updated or fileupdated:
return True # Successful download of new or changed file
else:
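
Besides the option gating, this hunk rewrites the stored status XML in place: the huge embedded html element (a poor rendition of content that is fetched separately anyway) is dropped before the file is written back, so the on-disk copy is deliberately no longer a byte-for-byte copy of the remote resource. A minimal sketch of that step, assuming BeautifulSoup with the lxml-xml parser as in the hunk:

# --- illustrative sketch, not part of the commit ---
from bs4 import BeautifulSoup

def strip_html_payload(xmlfile):
    """Drop the bulky html element from a downloaded Riksdagen status XML."""
    with open(xmlfile) as fp:
        docsoup = BeautifulSoup(fp, "lxml-xml")
    if docsoup.html is not None:
        docsoup.html.decompose()  # content is duplicated by the html/pdf attachments
    with open(xmlfile, "w") as fp:
        fp.write(str(docsoup))
    return docsoup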
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/swedishlegalsource.py
@@ -598,7 +598,7 @@ def parse(self, doc):
# in fact not found.
doc.body = Body([PreambleSection([
P(["Detta dokument har bedömts ha begränsad juridisk betydelse, så dess innehåll har inte tagits med här. Du kan hitta originaldokumentet från dess källa genom länken till höger."]),
P(["Om du tycker att dokumentet bör tas med, ", A("hör gärna av dig!", href="/om/kontakt")])
P(["Om du tycker att dokumentet bör tas med, ", A(["hör gärna av dig!"], href="/om/kontakt")])
], title='Dokumenttext saknas')])
else:
doc.body = self.parse_body(fp, doc.basefile)
