Skip to content

Commit

Permalink
misc fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Feb 4, 2018
1 parent 9716634 commit 5a4d0d9
Show file tree
Hide file tree
Showing 14 changed files with 95 additions and 71 deletions.
3 changes: 2 additions & 1 deletion ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,8 @@ def lookup_resource(self, label, predicate=FOAF.name, cutoff=0.8, warn=True):

fuzz = difflib.get_close_matches(label, resources.keys(), 1, cutoff)
if fuzz:
if warn:
# even if we want warnings, we don't want warnings for case changes
if warn and label.lower() != fuzz[0].lower():
self.log.warning("Assuming that '%s' should be '%s'?" %
(label, fuzz[0]))
return URIRef(resources[fuzz[0]])
Expand Down
35 changes: 20 additions & 15 deletions ferenda/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,7 @@ def _build_worker(jobqueue, resultqueue, clientname):
setproctitle(proctitle)
log.debug("Client: [pid %s] %s finished: %s" % (os.getpid(), job['basefile'], res))
outdict = {'basefile': job['basefile'],
'alias': job['alias'],
'result': res,
'log': list(logrecords),
'client': clientname}
Expand Down Expand Up @@ -1335,6 +1336,8 @@ def __queue_jobs_nomanager(jobqueue, iterable, inst, classname, command):


def _queue_jobs(manager, iterable, inst, classname, command):
def format_tupleset(s):
return ", ".join(("%s:%s" % (t[0], t[1])) for t in s)
jobqueue = manager.jobqueue()
resultqueue = manager.resultqueue()
log = getlog()
Expand All @@ -1359,33 +1362,35 @@ def _queue_jobs(manager, iterable, inst, classname, command):
'config': client_config}
# print("putting %r into jobqueue" % job)
jobqueue.put(job)
processing.add(basefile)
number_of_jobs = idx + 1
processing.add((inst.alias,basefile))
res = []
if number_of_jobs == 0:
numres = 0
if len(processing) == 0:
return res
log.info("Server: Put %s (%s) jobs into job queue" % (number_of_jobs, len(processing)))
log.info("%s: Put %s jobs into job queue" % (inst.alias, len(processing)))
# FIXME: only one of the clients will read this DONE package, and
# we have no real way of knowing how many clients there will be
# (they can come and go at will). Didn't think this one through...
jobqueue.put("DONE")
numres = 0
# jobqueue.put("DONE")
res = []
clients = Counter()
signal.signal(signal.SIGALRM, _resultqueue_get_timeout)
# FIXME: be smart about how long we wait before timing out the resultqueue.get() call
timeout_length = 180
while numres < number_of_jobs:
timeout_length = 300
while len(processing) > 0:
try:
r = resultqueue.get()
except TimeoutError:
log.critical("Timeout: %s jobs not processed (%s)" % (len(processing), ", ".join(processing)))
numres = number_of_jobs
log.critical("Timeout: %s jobs not processed (%s)" % (len(processing), format_tupleset(processing)))
processing.clear()
continue
signal.alarm(timeout_length)
if r['basefile'] not in processing:
log.warning("%s not found in processing (%s)" % (r['basefile'], ", ".join(processing)))
processing.discard(r['basefile'])
if (r['alias'], r['basefile']) not in processing:
if r['alias'] == inst.alias:
log.warning("%s not found in processing (%s)" % (r['basefile'], format_tupleset(processing)))
else:
log.warning("%s from repo %s was straggling, better late than never" % (r['basefile'], r['alias']))
processing.discard((r['alias'], r['basefile']))
if isinstance(r['result'], tuple) and r['result'][0] == _WrappedKeyboardInterrupt:
raise KeyboardInterrupt()
elif isinstance(r['result'], tuple) and isinstance(r['result'][0], Exception):
Expand All @@ -1407,15 +1412,15 @@ def _queue_jobs(manager, iterable, inst, classname, command):
r)
if 'client' in r:
clients[r['client']] += 1
if 'result' in r:
if 'result' in r and r['alias'] == inst.alias:
res.append(r['result'])
numres += 1

# ok, now we don't need to worry about timeouts anymore
signal.alarm(0)
# sort clients on name, not number of jobs
clientstats = ", ".join(["%s: %s jobs" % (k, v) for k,v in sorted(clients.items())])
log.info("Server: %s jobs processed. %s" % (numres, clientstats))
log.info("%s: %s jobs processed. %s" % (inst.alias, numres, clientstats))
return res
# sleep(1)
# don't shut this down --- the toplevel manager.run call must do
Expand Down
4 changes: 3 additions & 1 deletion ferenda/sources/legal/se/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,8 +639,10 @@ def as_xhtml(self, uri, parent_uri=None):
# do not add @property='dcterms:title' as we don't want to
# create a RDF triple out of this. But we kind of have to set
# rdf:type = bibo:DocumentPart (to make xsl/forarbete.xsl
# create proper TOC)
# create proper TOC). And _relate_fulltext_value_label is
# surprised if we lack title
element.set('content', self.title)
element.set('property', 'dcterms:title')
element.set('typeof', 'bibo:DocumentPart')
return element

Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/offtryck.py
Original file line number Diff line number Diff line change
Expand Up @@ -1523,7 +1523,7 @@ def is_appendix_header(chunk):
if txt.startswith("Bilaga "):
# assume that whatever follows is a number -- if
# not, this is not a proper appendix header anyway
return int(re.split(r"[ \:]")[1])
return int(re.split(r"[ \:]", txt)[1])
else:
return True

Expand Down
6 changes: 6 additions & 0 deletions ferenda/sources/legal/se/propositioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,14 @@ def prop_sanitize_identifier(identifier):
identifier = identifier.replace("PROP", "Prop")
if identifier.startswith("Prop "):
identifier = identifier.replace("Prop ", "Prop. ")
if "\xa0" in identifier: # Non-breakable space
identifier = identifier.replace("\xa0", " ")
if not identifier.startswith("Prop. "):
identifier = "Prop. " + identifier
# identify and correct the not-uncommon "2009/2010:87" pattern (should be 2009/10:87)
m = re.search(r"(\d{4})/(\d{4}):(\d+)$", identifier)
if m and m.group(2) != "2000" and int(m.group(1)) == int(m.group(2)) - 1:
identifier = identifier.replace(m.group(2), m.group(2)[-2:])
if not re.match(r"^Prop\. (19|20)\d{2}(|/\d{2}|/2000):[1-9]\d*$", identifier):
raise ValueError("Irregular identifier %s" % identifier)
return Literal(identifier)
Expand Down
9 changes: 8 additions & 1 deletion ferenda/sources/legal/se/regeringen.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,14 @@ def extract_metadata(self, rawhead, basefile):
# the longest possible id, "Prop. 1999/2000:123", is 19 chars
if len(title) < 20 and title.endswith(basefile):
identifier = title
title = "" # FIXME: hunt for title amongst the PDF file links
title = ""
# maybe the real title is hiding in the ingress of the page?
alttitle = content.find("p", "ingress")
if alttitle:
alttitle = alttitle.text.strip()
# some basic heuristics to determine if this is likely to be a title
if alttitle.startswith("Tilläggsdirektiv") or len(alttitle) > 120:
title = alttitle
else:
identifier_node = content.find("span", "h1-vignette")
if identifier_node:
Expand Down
16 changes: 2 additions & 14 deletions ferenda/sources/legal/se/riksdagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .fixedlayoutsource import FixedLayoutSource, FixedLayoutStore
from . import Offtryck
from .legalref import LegalRef
from .swedishlegalsource import Lazyfile


class RiksdagenStore(FixedLayoutStore):
Expand Down Expand Up @@ -284,20 +285,7 @@ def lazy_downloaded_to_intermediate(basefile):
os.path.getsize(intermediate_path) / (1024*1024)))
return res

class lazyfile(object):
def __init__(self, constructor):
self.constructor = constructor
self.fp = None
self.patchdescription = None
self.closed = True

def __getattr__(self, name):
if self.fp is None:
self.fp = self.constructor()
self.patchdescription = self.fp.patchdescription
return getattr(self.fp, name)

return lazyfile(partial(lazy_downloaded_to_intermediate, basefile))
return Lazyfile(partial(lazy_downloaded_to_intermediate, basefile))


def metadata_from_basefile(self, basefile):
Expand Down
28 changes: 18 additions & 10 deletions ferenda/sources/legal/se/sfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,8 +754,13 @@ def extract_metadata_register(self, soup, basefile):
identifier = util.ucfirst(str(node))
if identifier.startswith("Prop"):
from .propositioner import prop_sanitize_identifier
identifier = prop_sanitize_identifier(identifier)
d[node.uri] = {"dcterms:identifier": identifier}
try:
identifier = prop_sanitize_identifier(identifier)
except ValueError:
self.log.warning("%s: Could not sanitize irregular identifier %s" % (basefile, identifier))
identifier = None
if identifier:
d[node.uri] = {"dcterms:identifier": identifier}
elif key == 'CELEX-nr':
for celex in re.findall('3\d{2,4}[LR]\d{4}', val):
b = BNode()
Expand Down Expand Up @@ -1014,14 +1019,17 @@ def postprocess_doc(self, doc):
graph.add(triple)
doc.meta.remove(triple)
if p.identifier == RPUBL.forarbete:
triple = (o, DCTERMS.identifier,
doc.meta.value(o, DCTERMS.identifier))
graph.add(triple)
trash.add(triple)
triple = (o, RDF.type,
doc.meta.value(o, RDF.type))
graph.add(triple)
trash.add(triple)
forarb_identifier = doc.meta.value(o, DCTERMS.identifier)
if forarb_identifier: # not always the case, eg if the forarbete had an irregular identifier
triple = (o, DCTERMS.identifier,
forarb_identifier)
graph.add(triple)
trash.add(triple)
forarb_type = doc.meta.value(o, RDF.type)
if forarb_type:
triple = (o, RDF.type, forarb_type)
graph.add(triple)
trash.add(triple)
elif p.identifier == RPUBL.genomforDirektiv:
triple = (o, RPUBL.celexNummer,
doc.meta.value(o, RPUBL.celexNummer))
Expand Down
5 changes: 3 additions & 2 deletions ferenda/sources/legal/se/sou.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
from ferenda import util, decorators
from ferenda.pdfreader import StreamingPDFReader
from . import Regeringen, SwedishLegalSource, FixedLayoutSource, SwedishLegalStore, Offtryck, RPUBL

from .swedishlegalsource import lazyread

def sou_sanitize_identifier(identifier):
if not re.match("SOU (19|20)\d{2}:[1-9]\d*", identifier):
if not re.match("SOU (19|20)\d{2}:[1-9]\d*$", identifier):
raise ValueError("Irregular identifier %s (after mangling)" % identifier)
return Literal(identifier)

Expand Down Expand Up @@ -282,6 +282,7 @@ def metadata_from_basefile(self, basefile):
attrib["rpubl:utrSerie"] = self.lookup_resource("SOU", SKOS.altLabel)
return attrib

@lazyread
def downloaded_to_intermediate(self, basefile):
intermediate_path = self.store.intermediate_path(basefile)
intermediate_dir = os.path.dirname(intermediate_path)
Expand Down
24 changes: 24 additions & 0 deletions ferenda/sources/legal/se/swedishlegalsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from bz2 import BZ2File
from datetime import datetime, date
from functools import partial, wraps
from io import BytesIO, StringIO, BufferedIOBase
from urllib.parse import quote, unquote
from wsgiref.util import request_uri
Expand Down Expand Up @@ -80,7 +81,30 @@ def __new__(cls, *args, **kwargs):
def __bool__(self):
return False

class Lazyfile(object):
    """File-like proxy that defers creating the real file object.

    ``constructor`` is a zero-argument callable producing the real
    file-like object.  It is only invoked on the first attribute access
    that falls through to ``__getattr__`` -- until then no work is done.
    """

    def __init__(self, constructor):
        # Zero-arg callable that creates the real fp on demand.
        self.constructor = constructor
        # The real file object; stays None until the first delegated
        # attribute access triggers lazy construction.
        self.fp = None
        # Mirrored from the real fp once it is opened (the real fp is
        # expected to carry a .patchdescription attribute -- TODO confirm
        # against the constructors passed in by callers).
        self.patchdescription = None
        # NOTE(review): set here and never flipped when the real fp is
        # opened; attributes assigned in __init__ shadow __getattr__, so
        # .closed always reads True -- confirm this is intentional.
        self.closed = True

    def __getattr__(self, name):
        # __getattr__ only fires for attributes NOT already set on the
        # instance, so accessing e.g. .read lazily builds the real file.
        if self.fp is None:
            self.fp = self.constructor()
            self.patchdescription = self.fp.patchdescription
        return getattr(self.fp, name)


def lazyread(f):
    """Decorator deferring the wrapped call until its result is used.

    The wrapped method is not executed when called; instead a
    ``Lazyfile`` proxy is returned, and ``f(self, basefile)`` only runs
    once someone accesses an attribute (e.g. reads) on that proxy.
    """
    @wraps(f)
    def wrapper(self, basefile):
        return Lazyfile(partial(f, self, basefile))
    return wrapper

class SwedishLegalHandler(RequestHandler):
def supports(self, environ):
pathinfo = environ['PATH_INFO']
Expand Down
10 changes: 0 additions & 10 deletions lagen/nu/res/patches/sfs/patches/1974/152.patch

This file was deleted.

20 changes: 6 additions & 14 deletions lagen/nu/res/patches/sfs/patches/1999/1229.patch
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
--- data/sfs/intermediate/1999/1229.txt.bz2
+++
@@ -15354,6 +15354,7 @@ Korrigerad formattering av strecksatslistor i 44:12-13, styckeindelning i 44:21 samt tabell i 63:3
@@ -15176,6 +15176,7 @@ Korrigerad formattering av strecksatslistor i 44:12-13, styckeindelning i 44:21 samt tabell i 63:3
underliggande tillgångar eller av kursindex eller liknande.

12 § Med option avses en rätt för innehavaren att
+
- förvärva eller avyttra aktier, obligationer eller andra
tillgångar till ett bestämt pris, eller

@@ -15365,6 +15366,7 @@
@@ -15187,6 +15188,7 @@
Huvudregel

13 § Kapitalvinsten ska beräknas som skillnaden mellan
+
- ersättningen för den avyttrade tillgången eller för den
utfärdade förpliktelsen minskad med utgifterna för avyttringen
eller utfärdandet, och
@@ -15433,6 +15435,7 @@
@@ -15255,6 +15257,7 @@
avyttrats vid tidpunkten för utnyttjandet. Lag (2007:1419).

Tillgångar som förvärvas genom arv, gåva m.m.
+
21 § Om en tillgång förvärvas genom arv, testamente, gåva,
bodelning eller på liknande sätt, inträder förvärvaren i den
tidigare ägarens skattemässiga situation.
@@ -20879,15 +20882,19 @@
@@ -20610,15 +20613,19 @@
procent av det belopp med vilket
den fastställda förvärvsinkomsten
överstiger 0,99 prisbasbelopp
Expand All @@ -43,16 +43,8 @@
+
Lag (2011:1256).

3 a § /Upphör att gälla U:2018-01-01/
@@ -20933,6 +20940,7 @@

överstiger 12,43
prisbasbelopp 0,422 prisbasbelopp
+
Lag (2015:775).

3 a § /Träder i kraft I:2018-01-01/
@@ -20987,6 +20995,7 @@
3 a § För dem som vid beskattningsårets ingång har fyllt 65
@@ -20672,6 +20679,7 @@

överstiger 12,43
prisbasbelopp 0,422 prisbasbelopp
Expand Down
2 changes: 1 addition & 1 deletion test/integrationLagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TestLagen(unittest.TestCase, FerendaTestCase):

def assert_status(self, url, code):
res = requests.get(url, headers={'Accept': 'text/html'})
self.assertEqual(res.status_code, code)
self.assertEqual(res.status_code, code, url)
return res

def assert200(self, url):
Expand Down
2 changes: 1 addition & 1 deletion tools/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ server {
rewrite_log on;
charset utf-8;
charset_types text/html text/xml;

client_max_body_size 4m;
location @uwsgi {
uwsgi_pass unix:///tmp/ferenda.sock;
include /etc/nginx/uwsgi_params;
Expand Down

0 comments on commit 5a4d0d9

Please sign in to comment.