Commit

make sure list_basefiles_for also returns those SOUs that have parseoption=metadataonly

staffanm committed Aug 19, 2018
1 parent 9d7c190 commit ab7995d
Showing 2 changed files with 24 additions and 16 deletions.
ferenda/sources/legal/se/myndfskr.py (14 changes: 9 additions & 5 deletions)
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 from __future__ import (absolute_import, division,
                         print_function, unicode_literals)
@@ -1167,7 +1168,7 @@ def download_post_form(self, form, url):
         resp = self.session.send(req, allow_redirects=True)
         return resp

-    def maintext_from_soup(self, soup):
+    def main_from_soup(self, soup):
         main = soup.find("div", id="readme")
         if main:
             main.find("div", "rs_skip").decompose()
@@ -1189,11 +1190,14 @@ def maintext_from_soup(self, soup):
             e.dummyfile = self.store.parsed_path(basefile)
             raise e

+    def maintext_from_soup(self, soup):
+        main = self.main_from_soup(soup)
+        return main.get_text("\n\n", strip=True)
+
     def textreader_from_basefile(self, basefile, force_ocr=False, attachment=None):
         infile = self.store.downloaded_path(basefile)
         soup = BeautifulSoup(util.readfile(infile), "lxml")
-        main = self.maintext_from_soup(soup)
-        text = main.get_text("\n\n", strip=True)
+        text = self.maintext_from_soup(soup)
         text = self.sanitize_text(text, basefile)
         return TextReader(string=text)

@@ -1204,7 +1208,7 @@ def parse_open(self, basefile):
         return self.store.open_downloaded(basefile)

     def parse_body(self, fp, basefile):
-        main = BeautifulSoup(self.maintext_from_soup(BeautifulSoup(fp, "lxml")), "lxml")
+        main = self.main_from_soup(BeautifulSoup(fp, "lxml"))
         return Body([elements_from_soup(main)],
                     uri=None)

@@ -2056,7 +2060,7 @@ def consolidation_basis(self, soup):
     def maintext_from_soup(self, soup):
         main = soup.find("div", id="socextPageBody").find("div", "ms-rtestate-field")
         assert main
-        return main
+        return str(main)

     def parse_open(self, basefile):
         if basefile.startswith("konsolidering"):
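
The myndfskr.py change splits the old maintext_from_soup into two steps: main_from_soup returns the content element itself (a BeautifulSoup Tag), and a new maintext_from_soup flattens that element to plain text. parse_body previously round-tripped the flattened text through BeautifulSoup again, which threw away the markup before elements_from_soup could use it; it now receives the element directly. A minimal sketch of the two return types (the toy HTML below is illustrative, not from the repo):

    from bs4 import BeautifulSoup

    # Stand-in for a downloaded agency page; only the div id matches the real code.
    soup = BeautifulSoup("<div id='readme'><p>First</p><p>Second</p></div>", "lxml")

    main = soup.find("div", id="readme")      # main_from_soup: a Tag with markup
                                              # intact, usable by elements_from_soup
    text = main.get_text("\n\n", strip=True)  # maintext_from_soup: flat text for
                                              # sanitize_text and TextReader
    print(text)  # "First\n\nSecond"
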
ferenda/sources/legal/se/sou.py (26 changes: 15 additions & 11 deletions)
@@ -20,7 +20,7 @@
 from cached_property import cached_property

 from ferenda import (PDFAnalyzer, CompositeRepository, DocumentEntry,
-                     PDFDocumentRepository, CompositeStore, Facet)
+                     PDFDocumentRepository, CompositeStore, Facet, DocumentStore)
 from ferenda import util, decorators, errors
 from ferenda.pdfreader import StreamingPDFReader
 from . import Regeringen, SwedishLegalSource, FixedLayoutSource, SwedishLegalStore, Offtryck, RPUBL
@@ -145,6 +145,8 @@ def canonical_uri(self, basefile):
     def sanitize_identifier(self, identifier):
         return sou_sanitize_identifier(identifier)

+class SOUKBStore(SwedishLegalStore):
+    downloaded_suffixes = [".pdf", ".rdf"]

 class SOUKB(Offtryck, PDFDocumentRepository):
     alias = "soukb"
@@ -158,7 +160,8 @@ class SOUKB(Offtryck, PDFDocumentRepository):
     # A bit nonsensical, but required for SwedishLegalSource.get_parser
     document_type = SOU = True
     PROPOSITION = DS = KOMMITTEDIREKTIV = False
-
+    documentstore_class = SOUKBStore
+
     @classmethod
     def get_default_options(cls):
         opts = super(SOUKB, cls).get_default_options()
@@ -210,7 +213,7 @@ def download_get_basefiles(self, source):
     def download_single(self, basefile, url):
         if self.get_parse_options(basefile) == "skip":
             raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
-        rdffilename = self.store.downloaded_path(basefile, attachment="metadata.rdf")
+        rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
         if self.get_parse_options(basefile) == "metadataonly" and os.path.exists(rdffilename) and (not self.config.refresh):
             # it is kind of bad that we can even get here in these
             # cases (if a rdffile exists, and an empty index.pdf
@@ -234,14 +237,15 @@ def download_single(self, basefile, url):

         # download rdf metadata before actual content
         try:
-            # it appears that certain URLs (like curl
-            # http://data.libris.kb.se/open/bib/8351225.rdf)
-            # sometimes return an empty response. We should check
-            # and warn for this (and infer a minimal RDF by
-            # hand from what we can, eg dc:title from the link
-            # text)
+            # it appears that URLs like
+            # http://data.libris.kb.se/open/bib/8351225.rdf now
+            # return empty responses. Until we find out the proper
+            # RDF endpoint URLs, we should check and warn for this
+            # (and infer a minimal RDF by hand from what we can, eg
+            # dc:title from the link text)
             self.download_if_needed(rdfurl, basefile,
-                                    filename=rdffilename)
+                                    filename=rdffilename,
+                                    archive=False)
             if os.path.getsize(rdffilename) == 0:
                 self.log.warning("%s: %s returned 0 response, infer RDF" %
                                  (basefile, rdfurl))
@@ -326,7 +330,7 @@ def extract_head(self, fp, basefile):

     def extract_metadata(self, rawhead, basefile):
         metadata = util.readfile(self.store.downloaded_path(
-            basefile, attachment="metadata.rdf"))
+            basefile, attachment="index.rdf"))
         # For some reason these RDF files might use canonical
         # decomposition form (NFD) which is less optimal. Fix this.
         metadata = unicodedata.normalize("NFC", metadata)
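
The new SOUKBStore is what the commit message is about: the store presumably enumerates basefiles by their downloaded files, so an SOU fetched with parse option metadataonly, which has an index.rdf but no usable PDF, stayed invisible to list_basefiles_for until ".rdf" joined the accepted suffixes. A rough sketch of the enumeration idea, assuming a directory-per-basefile layout as the attachment="index.rdf" calls suggest; ferenda's real DocumentStore.list_basefiles_for differs in detail:

    from pathlib import Path

    # Sketch only, not ferenda's implementation: a basefile counts as
    # downloaded if any index.<suffix> file exists in its directory.
    def list_basefiles(downloaded_dir, suffixes=(".pdf", ".rdf")):
        for d in sorted(Path(downloaded_dir).iterdir()):
            if d.is_dir() and any((d / ("index" + s)).exists() for s in suffixes):
                yield d.name

With suffixes=(".pdf",) a metadataonly SOU is never yielded and therefore never parsed; adding ".rdf" makes it show up.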

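The rewritten comment notes that the libris .rdf URLs now return empty responses and floats the idea of inferring a minimal RDF file by hand, e.g. a dc:title taken from the link text. That fallback is not part of this commit; a hypothetical sketch of it with rdflib, where the function name and arguments are invented for illustration:

    from rdflib import Graph, Literal, URIRef
    from rdflib.namespace import DC

    # Hypothetical fallback (not in the commit): write a one-triple RDF/XML
    # file when the endpoint returned an empty response.
    def infer_minimal_rdf(rdffilename, docuri, linktext):
        g = Graph()
        g.add((URIRef(docuri), DC.title, Literal(linktext, lang="sv")))
        g.serialize(destination=rdffilename, format="xml")
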
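The NFC normalization in extract_metadata is worth a note: in NFD form, Swedish characters such as "ä" are stored as two code points (base letter plus combining diaeresis), so naive string comparisons against NFC literals fail even though both render identically. A self-contained illustration:

    import unicodedata

    nfd = "a\u0308r"  # "är" in decomposed (NFD) form: 'a' + combining diaeresis
    print("är" == nfd)                                # False: different code points
    print("är" == unicodedata.normalize("NFC", nfd))  # True after composing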