Skip to content

Commit

Permalink
Rename DownloadSkippedError to DocumentSkippedError; detect repeated pagination URLs in riksdagen download, pass download_archive through download_if_needed, and fetch document lists in descending date order
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Jun 10, 2018
1 parent 79750da commit 2d45762
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 15 deletions.
3 changes: 0 additions & 3 deletions ferenda/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ class DownloadFileNotFoundError(DownloadError):
reason."""
pass

class DownloadSkippedError(DownloadError):
"""Raised when asked to download a document that the options file indicates should be skipped."""
pass
class ParseError(FerendaException):

"""Raised when :py:meth:`~ferenda.DocumentRepository.parse` fails in
Expand Down
4 changes: 2 additions & 2 deletions ferenda/sources/legal/se/regeringen.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def download(self, basefile=None, url=None):
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/10/noter-med-egypten-angaende-forlangning-av-de-blandade-domstolarnas-verksamhet-m.-m/", # SÖ, not SOU
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/07/ministeriella-noter-vaxlade-med-italien-angaende-omsesidighet-rorande-ersattning-for-olycksfall-i-arbete/", # SÖ, not SOU
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/10/konvention-angaende-faststallande-av-minimialder-for-barns-anvandande-i-arbete-till-sjoss/", # SÖ, not SOU

"https://www.regeringen.se/rattsliga-dokument/proposition/2018/01/sou-2071883" # missing a 1, leading to the interpretation prop. 2071/88:3 instead of 2017/18:83

])

Expand Down Expand Up @@ -254,7 +254,7 @@ def download_get_basefiles(self, params):

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
if not url:
url = self.remote_url(basefile)
if not url: # remote_url failed
Expand Down
25 changes: 16 additions & 9 deletions ferenda/sources/legal/se/riksdagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class Riksdagen(Offtryck, FixedLayoutSource):
documentstore_class = RiksdagenStore
document_type = None
start_url = None
start_url_template = "http://data.riksdagen.se/dokumentlista/?sort=datum&sortorder=asc&utformat=xml&doktyp=%(doctype)s"
start_url_template = "http://data.riksdagen.se/dokumentlista/?sort=datum&sortorder=desc&utformat=xml&doktyp=%(doctype)s"


@property
Expand All @@ -97,7 +97,9 @@ def download_get_basefiles(self, start_url):
url = start_url
done = False
pagecount = 1
seenurls = set()
while not done:
seenurls.add(url)
resp = requests.get(url)
soup = BeautifulSoup(resp.text, features="xml")

Expand All @@ -119,9 +121,14 @@ def download_get_basefiles(self, start_url):
attachment = doc.tempbeteckning.text
yield (basefile, attachment), doc.dokumentstatus_url_xml.text
try:
url = soup.dokumentlista['nasta_sida']
pagecount += 1
self.log.debug("Getting page #%d" % pagecount)
nexturl = soup.dokumentlista['nasta_sida']
if nexturl in seenurls:
self.log.warning("Saw %s for the second time, probably means that we've got what we can" % nexturl)
done = True
else:
url = nexturl
pagecount += 1
self.log.debug("Getting page #%d" % pagecount)
except KeyError:
self.log.debug("That was the last page")
done = True
Expand Down Expand Up @@ -150,7 +157,7 @@ def remote_url(self, basefile):

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
attachment = None
if isinstance(basefile, tuple):
basefile, attachment = basefile
Expand All @@ -169,7 +176,7 @@ def download_single(self, basefile, url=None):
existed = os.path.exists(xmlfile)
self.log.debug(" %s: Downloading to %s" % (basefile, xmlfile))
try:
updated = self.download_if_needed(url, basefile)
updated = self.download_if_needed(url, basefile, archive=self.download_archive)
if existed:
if updated:
self.log.info(" %s: updated from %s" % (basefile, url))
Expand Down Expand Up @@ -207,14 +214,14 @@ def download_single(self, basefile, url=None):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile, archive=self.download_archive)
if r:
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
r = self.download_if_needed(texturl, basefile, filename=textfile, archive=self.download_archive)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
Expand All @@ -232,7 +239,7 @@ def download_single(self, basefile, url=None):
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename, archive=self.download_archive)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/sou.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def download_get_basefiles(self, source):

def download_single(self, basefile, url):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
# url is really a 2-tuple
url, title = url
resp = self.session.get(url)
Expand Down

0 comments on commit 2d45762

Please sign in to comment.