Skip to content

Commit

Permalink
Rename DownloadSkippedError to DocumentSkippedError; detect repeated pagination URLs in riksdagen download, pass download_archive through download_if_needed, and fetch document lists in descending date order
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Jun 10, 2018
1 parent 79750da commit 2d45762
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 15 deletions.
3 changes: 0 additions & 3 deletions ferenda/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ class DownloadFileNotFoundError(DownloadError):
reason."""
pass

class DownloadSkippedError(DownloadError):
"""Raised when asked to download a document that the options file indicates should be skipped."""
pass
class ParseError(FerendaException):

"""Raised when :py:meth:`~ferenda.DocumentRepository.parse` fails in
Expand Down
4 changes: 2 additions & 2 deletions ferenda/sources/legal/se/regeringen.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def download(self, basefile=None, url=None):
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/10/noter-med-egypten-angaende-forlangning-av-de-blandade-domstolarnas-verksamhet-m.-m/", # SÖ, not SOU
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/07/ministeriella-noter-vaxlade-med-italien-angaende-omsesidighet-rorande-ersattning-for-olycksfall-i-arbete/", # SÖ, not SOU
"http://www.regeringen.se/rattsdokument/statens-offentliga-utredningar/1921/10/konvention-angaende-faststallande-av-minimialder-for-barns-anvandande-i-arbete-till-sjoss/", # SÖ, not SOU

"https://www.regeringen.se/rattsliga-dokument/proposition/2018/01/sou-2071883" # missing a 1, leading to the interpretation prop. 2071/88:3 instead of 2017/18:83

])

Expand Down Expand Up @@ -254,7 +254,7 @@ def download_get_basefiles(self, params):

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
if not url:
url = self.remote_url(basefile)
if not url: # remote_url failed
Expand Down
25 changes: 16 additions & 9 deletions ferenda/sources/legal/se/riksdagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class Riksdagen(Offtryck, FixedLayoutSource):
documentstore_class = RiksdagenStore
document_type = None
start_url = None
start_url_template = "http://data.riksdagen.se/dokumentlista/?sort=datum&sortorder=asc&utformat=xml&doktyp=%(doctype)s"
start_url_template = "http://data.riksdagen.se/dokumentlista/?sort=datum&sortorder=desc&utformat=xml&doktyp=%(doctype)s"


@property
Expand All @@ -97,7 +97,9 @@ def download_get_basefiles(self, start_url):
url = start_url
done = False
pagecount = 1
seenurls = set()
while not done:
seenurls.add(url)
resp = requests.get(url)
soup = BeautifulSoup(resp.text, features="xml")

Expand All @@ -119,9 +121,14 @@ def download_get_basefiles(self, start_url):
attachment = doc.tempbeteckning.text
yield (basefile, attachment), doc.dokumentstatus_url_xml.text
try:
url = soup.dokumentlista['nasta_sida']
pagecount += 1
self.log.debug("Getting page #%d" % pagecount)
nexturl = soup.dokumentlista['nasta_sida']
if nexturl in seenurls:
self.log.warning("Saw %s for the second time, probably means that we've got what we can" % nexturl)
done = True
else:
url = nexturl
pagecount += 1
self.log.debug("Getting page #%d" % pagecount)
except KeyError:
self.log.debug("That was the last page")
done = True
Expand Down Expand Up @@ -150,7 +157,7 @@ def remote_url(self, basefile):

def download_single(self, basefile, url=None):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
attachment = None
if isinstance(basefile, tuple):
basefile, attachment = basefile
Expand All @@ -169,7 +176,7 @@ def download_single(self, basefile, url=None):
existed = os.path.exists(xmlfile)
self.log.debug(" %s: Downloading to %s" % (basefile, xmlfile))
try:
updated = self.download_if_needed(url, basefile)
updated = self.download_if_needed(url, basefile, archive=self.download_archive)
if existed:
if updated:
self.log.info(" %s: updated from %s" % (basefile, url))
Expand Down Expand Up @@ -207,14 +214,14 @@ def download_single(self, basefile, url=None):
htmlurl = docsoup.find('dokument_url_html').text
htmlfile = self.store.downloaded_path(basefile, attachment=docname + ".html")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile)
r = self.download_if_needed(htmlurl, basefile, filename=htmlfile, archive=self.download_archive)
if r:
self.log.debug(" Downloaded html ver to %s" % htmlfile)
elif docsoup.find('dokument_url_text'):
texturl = docsoup.find('dokument_url_text').text
textfile = self.store.downloaded_path(basefile, attachment=docname + ".txt")
#self.log.debug(" Downloading to %s" % htmlfile)
r = self.download_if_needed(texturl, basefile, filename=textfile)
r = self.download_if_needed(texturl, basefile, filename=textfile, archive=self.download_archive)
if r:
self.log.debug(" Downloaded text ver to %s" % textfile)
fileupdated = fileupdated or r
Expand All @@ -232,7 +239,7 @@ def download_single(self, basefile, url=None):
filename = self.store.downloaded_path(basefile, attachment=docname + filetype)
# self.log.debug(" Downloading to %s" % filename)
try:
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename)
r = self.download_if_needed(b.fil_url.text, basefile, filename=filename, archive=self.download_archive)
if r:
self.log.debug(" Downloaded attachment as %s" % filename)
except requests.exceptions.HTTPError as e:
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/sou.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def download_get_basefiles(self, source):

def download_single(self, basefile, url):
if self.get_parse_options(basefile) == "skip":
raise errors.DownloadSkippedError("%s should not be downloaded according to options.py" % basefile)
raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
# url is really a 2-tuple
url, title = url
resp = self.session.get(url)
Expand Down

0 comments on commit 2d45762

Please sign in to comment.