
Commit

wip
staffanm committed Apr 1, 2018
1 parent 436a5da commit bff31f3
Showing 5 changed files with 17 additions and 11 deletions.
2 changes: 1 addition & 1 deletion ferenda/manager.py
@@ -1249,7 +1249,7 @@ def _build_worker(jobqueue, resultqueue, clientname):
except EOFError as e:
print("%s: Result of %s %s %s couldn't be put on resultqueue" % (
os.getpid(), job['classname'], job['command'], job['basefile']))
- except TypeError, AttributeError, RemoteError as e:
+ except (TypeError, AttributeError, RemoteError) as e:
# * TypeError: Has happened with a "can't pickle
# pyexpat.xmlparser objects". Still not sure what was
# the cause of that.
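
The fix above switches to the tuple form of a multi-exception handler, which is the only form Python 3 accepts ("except A, B as e" is a syntax error). A minimal, self-contained sketch of the pattern, using illustrative names rather than ferenda's own worker code:

def handle(job):
    # Deliberately fails with AttributeError when basefile is None.
    return job["basefile"].upper()

try:
    handle({"basefile": None})
except (TypeError, AttributeError) as e:  # one handler catches any of the listed exception types
    print("job failed: %r" % e)
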
16 changes: 10 additions & 6 deletions ferenda/sources/legal/se/myndfskr.py
@@ -9,6 +9,7 @@
from io import BytesIO
import os
import re
+ from collections import OrderedDict

from rdflib import URIRef, Literal, Namespace
from bs4 import BeautifulSoup
@@ -18,6 +19,7 @@
from rdflib.namespace import DCTERMS, SKOS

from . import RPUBL, RINFOEX, SwedishLegalSource, FixedLayoutSource
+ from .fixedlayoutsource import FixedLayoutStore
from .swedishlegalsource import SwedishCitationParser, SwedishLegalStore
from ferenda import TextReader, Describer, Facet, PDFReader, DocumentEntry
from ferenda import util, decorators, errors, fulltextindex
@@ -35,6 +37,10 @@

class RequiredTextMissing(errors.ParseError): pass

+ class MyndFskrStore(FixedLayoutStore):
+     doctypes = OrderedDict([(".pdf", b'%PDF'),
+                             (".html", b'<!DO'),  # HTML can start basically in any way. This is the HTML5 way, which might be common for our targets
+     ])
class MyndFskrBase(FixedLayoutSource):
"""A abstract base class for fetching and parsing regulations from
various swedish government agencies. These documents often have a
@@ -65,14 +71,12 @@ class MyndFskrBase(FixedLayoutSource):

nextpage_regex = None
nextpage_url_regex = None
- download_rewrite_url = False
- # iff True, use remote_url to rewrite download links instead of
+ download_rewrite_url = False # iff True, use remote_url to rewrite download links instead of
# accepting found links as-is. If it's a callable, call that with
# basefile, URL and expect a rewritten URL.


- download_formid = None # if the paging uses forms, POSTs and other
- # forms of insanity
+ download_formid = None # if the paging uses forms, POSTs and other forms of insanity
+ documentstore_class = MyndFskrStore

@classmethod
def get_default_options(cls):
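
The comment on download_rewrite_url describes a small protocol: the attribute may be False, True, or a callable taking (basefile, url) and returning the URL to actually download. A hedged sketch of the callable contract only; the function name, the rewrite rule and the calling convention shown here are illustrative assumptions, not ferenda's code:

def rewrite_url(basefile, url):
    # Illustrative rule: point the download at a mirror keyed on basefile.
    return "https://example.org/mirror/%s.pdf" % basefile

# The downloader is then expected to do roughly this when the attribute is callable:
found_url = "https://www.example-agency.se/fs/2018-1.html"
download_url = rewrite_url("2018:1", found_url)
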
@@ -1443,7 +1447,7 @@ def download_single(self, basefile, url):
# this also updates the docentry
html_downloaded = super(SKVFS, self).download_single(basefile, url)
# try to find link to a PDF in what was just downloaded
- soup = BeautifulSoup(util.readfile(self.store.downloaded_path(basefile, suffix=".html")), "lxml")
+ soup = BeautifulSoup(util.readfile(self.store.downloaded_path(basefile)), "lxml")
pdffilename = self.store.downloaded_path(basefile,
attachment="index.pdf")
if (self.config.refresh or not(os.path.exists(pdffilename))):
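
The new MyndFskrStore above maps each file suffix to the leading bytes a downloaded file of that type is expected to start with (b'%PDF' for PDF, b'<!DO' for an HTML5 doctype declaration). A hedged sketch of the kind of content sniffing such a mapping supports; guess_suffix below is illustrative and not FixedLayoutStore's actual API:

from collections import OrderedDict

doctypes = OrderedDict([(".pdf", b'%PDF'),
                        (".html", b'<!DO')])

def guess_suffix(path, default=".pdf"):
    # Compare the file's first four bytes against each known signature,
    # in the order the OrderedDict declares them.
    with open(path, "rb") as fp:
        head = fp.read(4)
    for suffix, magic in doctypes.items():
        if head.startswith(magic):
            return suffix
    return default

This would also explain the SKVFS change just above, where downloaded_path(basefile) no longer hard-codes suffix=".html".
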
3 changes: 2 additions & 1 deletion ferenda/sources/legal/se/regeringen.py
@@ -139,7 +139,8 @@ def download(self, basefile=None, url=None):
"http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/2015/12/"
"andringar-i-rennaringsforordningen-1993384/", # mistaken for a DS when it's really a unpublished PM
"http://www.regeringen.se/rattsdokument/departementsserien-och-promemorior/2015/12/"
"andring-av-bestammelserna-om-ratt-till-bistand-i-lagen-1994137-om-mottagande-av-asylsokande-m.fl/" # same
"andring-av-bestammelserna-om-ratt-till-bistand-i-lagen-1994137-om-mottagande-av-asylsokande-m.fl/", # same
"http://www.regeringen.se/rattsdokument/proposition/2018/01/sou-2071883/" # looks like 2071/88:3, but should be 2017/18:83 (and also not SOU!)
])

def attribs_from_url(self, url):
5 changes: 3 additions & 2 deletions test/integrationLagen.py
@@ -944,7 +944,8 @@ def test_sfs_source(self):

def test_facsimiles(self):
# issue 3
- for urlseg, pages in (("prop/2004/05:147", [36, 48]),
+ for urlseg, pages in (("prop/1915:83", [29]), # new kind of repo
+ ("prop/2004/05:147", [36, 48]),
("prop/1997/98:177", [18, 30, 32]),
("prop/1997/98:179", [57, 58, 43]),
("prop/2007/08:95", [56, 295, 296]),
@@ -957,7 +958,7 @@ def test_facsimiles(self):
("prop/2008/09:14", [15]),
("prop/2011/12:115", [15]),
("prop/1980/81:44", [15]),
("prop/1996/97:9", [1])
("prop/1996/97:9", [1]),
):
for page in pages:
url = self.baseurl + urlseg + "/sid%s.png" % page
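
test_facsimiles iterates over (urlseg, pages) pairs and requests baseurl + urlseg + "/sid<N>.png" for every listed page. A rough sketch of one such per-page check; the use of requests, the example base URL and the exact assertions are assumptions, since the real test presumably goes through the test class's own HTTP helpers:

import requests

baseurl = "https://lagen.nu/"  # assumed; the test reads self.baseurl
url = baseurl + "prop/1915:83" + "/sid%s.png" % 29
resp = requests.get(url)
assert resp.status_code == 200
assert resp.headers.get("Content-Type") == "image/png"
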
2 changes: 1 addition & 1 deletion tools/fabfile.py
@@ -74,7 +74,7 @@ def copy_elastic():
@hosts('colo.tomtebo.org')
def copy_files():
# NOTE: This includes themes etc in data/rsrc
- rsync_project(local_dir="/mnt/ferenda/tng.lagen.nu/data/",
+ rsync_project(local_dir="/home/staffan/wds/ferenda/tng.lagen.nu/data/",
remote_dir="/home/staffan/www/ferenda.lagen.nu/data",
exclude=["*downloaded*", "*archive*"],
delete=True,
