diff --git a/ferenda/pdfanalyze.py b/ferenda/pdfanalyze.py
index 4a1e132a..1edcc72a 100644
--- a/ferenda/pdfanalyze.py
+++ b/ferenda/pdfanalyze.py
@@ -129,13 +129,7 @@ def paginate(self, paginatepath=None, force=False):
physical = "%s#page=%s" % (page.src.split(os.sep)[-1], page.number)
pageskip = page.number - lastpagenumber
lastpagenumber = page.number
- if isinstance(currentpage, int):
- currentpage += pageskip
- elif util.is_roman(currentpage):
- lower = currentpage.islower()
- currentpage = util.to_roman(util.from_roman(currentpage)+pageskip, lower=lower)
- if lower:
- currentpage = currentpage.lower()
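+ # util.increment handles both arabic (int) and roman-numeral (str)
+ # page numbers (see ferenda/util.py below in this change)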
+ currentpage = util.increment(currentpage, pageskip)
pageguess = self.guess_pagenumber(page, currentpage)
if pageguess is None:
if len(page) > 0:
diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py
index 26c996a0..cf1997c5 100644
--- a/ferenda/pdfreader.py
+++ b/ferenda/pdfreader.py
@@ -643,7 +643,7 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
nextfont = self.fontspec[int(nextelement.get('font'))] if nextelement is not None and nextelement.get('font') else None
if self.detect_footnotes:
if (len(textelements) and
- textelements[0].strip().isdigit() and
+ (textelements[0].strip().isdigit() and
# check both previous box and next (for catching footnote markers in the foooter)
(lastfont and
lastfont.family == thisfont['family'] and
@@ -658,7 +658,7 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
thisfont['size'] < nextfont['size'] and
-5 < int(nextelement.get("left")) - (int(element.get("left")) + int(element.get("width"))) < 10 and # is really close to
0 < (int(nextelement.get("top")) + int(nextelement.get("height"))) - (int(element.get("top")) + int(element.get("height"))) < 20) # is slightly lower than
- ):
+ )):
# this must be a footnote -- alter tag to show that it
# should be rendered with superscript
if textelements[0].tag is None:
@@ -670,12 +670,12 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
# is it in the main text, ie immediately
# following the last textbox? Then append it to that textbox
- if abs(lastbox.right - int(attribs['left'])) < 3:
+ if lastbox and abs(lastbox.right - int(attribs['left'])) < 3:
# return a Box that the caller will merge with current
attribs['fontid'] = attribs.pop('font')
attribs['merge-with-current'] = True
return Textbox(textelements, **attribs)
- elif min([x.left for x in page]) - int(attribs['left']) < 3:
+ elif min([x.left for x in page] or [0]) - int(attribs['left']) < 3: # "or [0]" guards pages with no textboxes
# then create a new textbox and let
# the after_footnote magic append more
# textboxes to it. Note: we don't use
diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py
index a580aa87..a20cce6c 100644
--- a/ferenda/sources/legal/se/offtryck.py
+++ b/ferenda/sources/legal/se/offtryck.py
@@ -8,6 +8,7 @@
import re
import json
import difflib
+import logging
import collections
from math import sqrt, pi, e, floor
# 3rd party
@@ -22,7 +23,7 @@
from ferenda import util, errors
from ferenda import PDFReader, FSMParser, Describer, Facet
from ferenda.elements import (Link, Body, CompoundElement,
- Preformatted, UnorderedList, ListItem)
+ Preformatted, UnorderedList, ListItem, serialize)
from ferenda.elements.html import P
from ferenda.pdfreader import BaseTextDecoder, Page, Textbox
from ferenda.decorators import newstate
@@ -235,7 +236,6 @@ def normalize_family(fontfamily):
return prevbox.font.family in ("Symbol", nextbox.font.family)
def offtryck_gluefunc(textbox, nextbox, prevbox):
- # linespacing = nextbox.font.size / 2
linespacing = nextbox.font.size / 1.2 # bboxes for scanned
# material seem very tight,
# so that lines appear to
@@ -349,8 +349,8 @@ def offtryck_gluefunc(textbox, nextbox, prevbox):
(valignmatch(prevbox, nextbox) or # compare baseline, not topline
alignmatch(prevbox, nextbox) or # compare previous line to next
alignmatch(textbox, nextbox) or # compare entire glued box so far to next FIXME -- is this a good idea? Tends to glue rows in tables...
- (parindent * 2 >= (prevbox.left - nextbox.left) >= parindent) or
- (parindent * 2 >= (textbox.left - nextbox.left) >= parindent) or
+ (parindent * 2 >= (prevbox.left - nextbox.left) >= parindent / 2) or
+ (parindent * 2 >= (textbox.left - nextbox.left) >= parindent / 2) or
(re.match(r"[\d\.]+\s+[A-ZÅÄÖ]", strtextbox) and nextbox.left - textbox.left < parindent * 5) # hanging indent (numbered) heading -- FIXME: we'd like to increase the parindent multiplier depending on the len of the initial number
)):
# if the two boxes are on the same line, but have a
@@ -491,7 +491,8 @@ def parse_body(self, fp, basefile):
# positioned textboxes that don't reflow etc
s = VerbatimSection()
for relidx, page in enumerate(sanitized[startpage:startpage+pagecount]):
- sb = Sidbrytning(ordinal=initialstate['pageno']+relidx,
+ sb = Sidbrytning(ordinal=util.increment(initialstate['pageno'],
+ relidx),
width=page.width,
height=page.height,
src=page.src)
@@ -504,7 +505,7 @@ def parse_body(self, fp, basefile):
if lastpagebreak is None:
initialstate['pageno'] = 1
else:
- initialstate['pageno'] = lastpagebreak.ordinal + 1
+ initialstate['pageno'] = util.increment(lastpagebreak.ordinal, 1)
allbody += body[:]
self.validate_body(allbody, basefile) # Throws exception if invalid
return allbody
@@ -830,72 +831,6 @@ def find_kommittebetankande(self, node, state):
return None
def find_commentary(self, node, state):
-
- def plot(filename, linespacings, linespacing_threshold, gaps, gap_threshold):
- try:
- import matplotlib
- matplotlib.use('Agg')
- import matplotlib.pyplot as plt
- except ImportError:
- raise ImportError("You need matplotlib installed")
- plot = plt.subplot2grid((2,1), (0, 0))
- plot.set_title("linespacings")
- y, x, _ = plot.hist(linespacings, bins=50)
- plot.plot([linespacing_threshold, linespacing_threshold], [0, y.max()])
- if gaps:
- plot = plt.subplot2grid((2,1), (1, 0))
- plot.set_title("gaps")
- y, x, _ = plot.hist(gaps, bins=max(gaps))
- plot.plot([gap_threshold, gap_threshold], [0, y.max()])
- util.ensure_dir(filename)
- plt.savefig(filename, dpi=150)
- self.log.debug("wrote %s" % filename)
-
- def threshold(series, resolution=1000, bandwidth=200):
- # do a pseudo-KDE (but using discrete, high-resolution
- # bins instead of a continous curve because math
- start = min(series)
- stop = max(series)
- width = stop - start
- binsize = width / resolution
- bins = [0] * (resolution + bandwidth)
- scale = [0] * (resolution + bandwidth)
-
- # a bandwidth wide array with values forming a normal
- # (gaussian) distribution
- kernel = [0] * bandwidth
- s = bandwidth / 10
- m = 0
- kernelrange = list(range(int(-bandwidth/2)+1, int(bandwidth/2+1)))
- kernel = [1/(sqrt(2*pi)*s)*e**(-0.5*(float(x-m)/s)**2) for x in kernelrange]
- for val in series:
- normval = val - start
- fraction = normval / width
- binidx = floor(fraction * resolution) + int(bandwidth/2)
- for kernidx, offset in enumerate(kernelrange):
- bins[binidx+offset-1] += kernel[kernidx]
- for idx, bin in enumerate(bins):
- scale[idx] = ((idx - int(bandwidth/2))/resolution * width) + start
-
- # find the valley after the first peak
- peak = False
- best = 0
- for idx, val in enumerate(bins):
- if not peak:
- # walk til we find the peak
- if val >= best:
- best = val
- else:
- peak = True
- else:
- # walk til we find the valley
- if val <= best:
- best = val
- else:
- break
- # now the valley is at idx - 1
- return scale[idx-1]
-
if not isinstance(node, Avsnitt) or (node.title not in ("Författningskommentar",
"Författningskommentarer",
"Specialmotivering")):
@@ -905,409 +840,22 @@ def threshold(series, resolution=1000, bandwidth=200):
return state
else:
return None # visit_node won't call any subnode
- commentary = []
- # parser = SwedishLegalSource.forfattningskommentar_parser()
- for subsection in node:
- if hasattr(subsection, 'title'):
- # find out which laws this proposition proposes to
- # change (can be new or existing)
- if re.match("Förslag(|et) (till lag om|om lag till) ändring i", subsection.title):
- uri = self._parse_uri_from_text(subsection.title, state['basefile'])
- lawname = subsection.title.split(" ", 6)[-1]
- elif re.match("Förslag(|et) till", subsection.title):
- # create a reference that could pass for a real
- # SFS-id, but with the name (the only identifying
- # information we have at this point) encoded into
- # it. FIXME: the numslug could be shorter if we'd
- # make sure to only allow lower-case a-z and to a
- # base26 conversion into an integer
- lawname = subsection.title.split(" ", 2)[-1]
- uri = self.temp_sfs_uri(lawname)
- else:
- uri = None
- if uri:
- commentary.append((uri, lawname, subsection))
-
- if commentary == []: # no subsecs, ie the prop changes a single law
+ cf = CommentaryFinder(state['basefile'], self._parse_uri_from_text, self.temp_sfs_uri)
+ commentaries = []
+ found = False
+ for subsection in node: # nb: node is the "Författningskommentar" chapter
+ if cf.is_commentary_section(subsection):
+ found = True
+ commentaries.append((subsection, *cf.identify_law(subsection.title)))
+ if not found: # no subsections, ie the prop changes a single law
if 'primarylaw' in state:
- commentary.append((state['primarylaw'], state['primarylawname'], node))
+ commentaries.append((node, state['primarylaw'], state['primarylawname']))
else:
self.log.warning("%s: Författningskommentar does not specify name of law and find_primary_law didn't find it either" % state['basefile'])
- # first, analyze gaps and linespacing constants using all sections
- linespacings = []
- gaps = []
- detect_singleline_spacing = False
- for law, lawname, section in commentary:
- for idx, subnode in enumerate(section):
- if isinstance(subnode, Sidbrytning):
- continue
- if subnode.linespacing:
- linespacings.append(subnode.linespacing)
- elif detect_singleline_spacing:
- # a single line paragraph has no easily discernable
- # line height, but we can approximate by checking the
- # nearest paragraph above and below
- candidates = []
- if (idx > 0 and
- not isinstance(section[idx-1], Sidbrytning) and
- subnode.bottom > section[idx-1].bottom):
- candidates.append(subnode.bottom - section[idx-1].bottom)
- if (idx +1 < len(section) and
- not isinstance(section[idx+1], Sidbrytning) and
- section[idx+1].bottom > subnode.bottom):
- candidates.append(section[idx+1].bottom - subnode.bottom)
- if candidates:
- linespacings.append(min(candidates) / subnode.font.size)
- if idx and subnode.top > prevnode.bottom:
- gaps.append(subnode.top - prevnode.bottom)
- prevnode = subnode
-
- gap_threshold = threshold(gaps, resolution=1000, bandwidth=400)
- linespacing_threshold = threshold(linespacings, resolution=1000, bandwidth=500)
-
- if os.environ.get("FERENDA_PLOTANALYSIS"):
- #datadir = self.store.datadir
- #self.store.datadir = "plots/%s" % self.alias
- plot_path = self.store.path(state['basefile'], 'intermediate',
- '.commentary.plot.png')
- plot(plot_path, linespacings, linespacing_threshold, gaps, gap_threshold)
- #self.store.datadir = datadir
-
- for law, lawname, section in commentary:
- textnodes = self._find_commentary_for_law(law, section, state, lawname, linespacing_threshold, gap_threshold)
- section[:] = textnodes[:]
-
-
- def _find_commentary_for_law(self, law, section, state, lawname, linespacing_threshold, gap_threshold):
-
- def probable_header(para):
- # headers are less than 100 chars and do not end with a period
- # or other non-hederish thing
- text = str(para).strip()
- if text == 'Bestämmelse Kommentarerna finns i avsnitt':
- # This is a table heading (not real header) type of thing
- # occurring in SOU 2017:66, but similar constructs might
- # appear elsewhere.
- return False
- return (len(text) < 100 and
- (len(text) < 2 or
- (text[-1] not in (".", ")") and text[-2:] not in (" i", " §"))))
-
- def probable_comment(para):
- text = str(para).strip()
- if re.match("(Av p|P)aragrafen (framgår|innehåller|har behandlats|är ny|, som är ny|avgränsar|innebär)", text):
- return True
- # elif re.match("(I f|F)örsta stycket", text): # this overmatches, eg ÅRL 7:31 2 st
- elif re.match("I första stycket", text):
- return True
- elif re.match("\((Jfr|Paragrafen)", text):
- return True
- elif (subnode.linespacing or 0) > linespacing_threshold and text[0].isupper():
- return True
- return False
-
-
- def probable_acttext(para):
- # returns True iff this text is probably acttext
- # returns False iff it's probably not acctext
- # returns None if we don't have enough data
- # (maybe because it's a single line or a Sidbrytning)
-
- if isinstance(para, Sidbrytning):
- return None
-
- # 2 clear indicators of acttext: font size is smaller
- if state['defaultsize'] >= para.font.size + 2:
- return True
- elif para.lines > 1:
- # or linespacing is tighter than average
- return bool(linespacing_threshold and
- para.linespacing and
- para.linespacing < linespacing_threshold)
- else:
- return None
-
-
- # Then try to find what is what. FIXME: this is basically a
- # ad-hoc statemachine, with a lot of ill-understood
- # conditionals and flag settings. Luckily there's a decent
- # test harness in the functionalSources.TestPropRegeringen
- # suite
- textnodes = []
- reexamine_state = False
- skipheader = False # whether we should skip adding a subnode
- # to current_comment since it's only a
- # header (eg "53 §" on a line by itself)
- comment_on = None
- current_comment = None
- comment_start = False
- parsestate = "commenttext"
- prevnode = None
-
- for idx, subnode in enumerate(section):
- if not isinstance(subnode, (Textbox, Sidbrytning, UnorderedList)):
- raise ValueError("_find_commentary_for_law: Got a %s instead of a Textbox/Sidbrytning/UnorderedList, this indicates broken parsing" % type(subnode))
- if isinstance(subnode, (Page, Sidbrytning)):
- # self.log.debug("...Setting reexamine_state flag")
- reexamine_state = True
- if parsestate == "commenttext":
- current_comment.append(subnode)
- else:
- textnodes.append(subnode)
- continue
- text = str(subnode).strip()
- # self.log.debug("Examining %s..." % text[:60])
- if reexamine_state: # meaning the previous node was
- # on the previous page, so any
- # text gap that might have
- # signalled a change from acttext
- # to commenttext was lost.
- prev_state = parsestate
- # indicates section starting with eg "Första
- # stycket innehåller..." FIXME: this should be
- # detected by probable_comment now.
- # if isinstance(subnode, Textbox) and hasattr(subnode, '__getitem__') and (subnode[0].tag == "i"):
- # parsestate = "commenttext"
- if (probable_header(subnode) and
- idx < len(section) - 2 and
- not str(section[idx+1]).strip()[0].islower()):
- # FIXME: the above check that a header is followed
- # by something that looks like a start of a
- # sentence should be rolled into probable_header
- parsestate = "acttext"
- elif (re.match("\d+(| \w) §", text) and
- len(section) > idx+1 and
- not probable_comment(section[idx+1])):
- parsestate = "acttext"
- elif probable_comment(text):
- parsestate = "commenttext"
- else:
- pass # keep parsestate as-is
- if prev_state == "acttext" and parsestate == "commenttext":
- comment_start = True
- reexamine_state = False
-
- # elif len(text) < 20 and (text.endswith(" kap.") or text.endswith(" kap")):
- if len(text) < 20 and (text.endswith(" kap.") or text.endswith(" kap")):
- # subsection heading indicating the start of a new
- # chapter. alter the parsing context from law to
- # chapter in law
- # self.log.debug("...detecting chapter header w/o acttext")
- newlaw = self._parse_uri_from_text(text, state['basefile'], law)
- if newlaw:
- law = newlaw
- skipheader = True
- textnodes.append(subnode)
- subnode = None
- reftext = text
-
- elif len(text) < 20 and text.endswith("§"):
- # self.log.debug("...detecting section header w/o acttext")
- comment_on = self._parse_uri_from_text(text, state['basefile'], law)
- skipheader = True
- offset = 1
- reftext = text
- if len(section) > idx+offset:
- acttext = None
- # now look at following paras until we know
- # whether or not this is acctext or commenttext
- while acttext is None or idx+offset-1 >= len(section):
-
- acttext = probable_acttext(section[idx+offset])
- offset += 1
- if acttext is True:
- parsestate = "acttext"
- comment_start = False
- skipheader = False
- else:
- comment_start = True
-
- elif re.match("\d+ kap. +[^\d]", text): # eg "4 kap. Om domare"
- # self.log.debug("...detecting chapter header with title, no section")
- newlaw = self._parse_uri_from_text(text, state['basefile'], law)
- if newlaw:
- law = newlaw
- skipheader = True # really depends on whether the _next_ subnode is acttext or not
- textnodes.append(subnode)
- parsestate = "acttext"
- subnode = None
-
- elif re.match("\d+(| \w) §", text):
- # self.log.debug("...detecting section header with acttext")
- reftext = text[:text.index("§")+ 1]
- comment_on = self._parse_uri_from_text(reftext, state['basefile'], law)
- comment_start = False
- parsestate = "acttext"
- skipheader = False
-
- elif text in ('Ikraftträdande- och övergångsbestämmelser',
- 'Ikraftträdandebestämmelser'
- 'Övergångsbestämmelser'):
- # ideally, we'd like URIs of the form
- # https://lagen.nu/1942:740#L2018:324, but at this
- # stage we don't have the change SFS URI. Create a
- # fake URI instead with just a #L fragment.
- comment_on = law.split("#")[0] + "#L"
- # this whole crap just tries to find out whether the
- # following subnode is part of accttext or
- # commenttext. We have the exact same test above --
- # this needs refactoring.
- offset = 1
- if len(section) > idx+offset:
- acttext = None
- # now look at following paras until we know
- # whether or not this is acctext or commenttext
- while acttext is None or idx+offset-1 >= len(section):
- acttext = probable_acttext(section[idx+offset])
- offset += 1
- if acttext is True:
- parsestate = "acttext"
- comment_start = False
- skipheader = False
- else:
- comment_start = True
-
- # any big space might signal a switch from acttext ->
- # commenttext or vice versa (if some other obscure
- # conditions are met).
- elif (prevnode and
- subnode.top - prevnode.bottom >= gap_threshold):
- # self.log.debug("...node spacing is %s, switching from parsestate %s" % (subnode.top - prevnode.bottom, parsestate))
- if (re.match("\d+(| \w) §$", str(prevnode).strip())):
- comment_start = True
- parsestate == "commenttext"
- elif probable_header(subnode) or parsestate == "commenttext":
- if current_comment is not None and len(current_comment) == 0:
- # this means we created a
- # Forfattningskommentar and then never added
- # any text to it. Since we're switching into
- # acttext state, replace that object with just
- # the title
- comment_on = current_comment.comment_on
- assert current_comment.title, "Expected current_comment to have a .title"
- titlenode = P([current_comment.title])
- if current_comment in textnodes:
- textnodes[textnodes.index(current_comment)] = titlenode
- del state['commented_paras'][comment_on]
- else:
- self.log.warning("Failed to replace Forfattningskommentar for %s failed" %
- (current_comment.comment_on))
- # at this point, the current_comment is not valid
- # anymore. Any new comment subnodes should go into
- # a new (possibly unnamed) Forfattningskommentar
- parsestate = "acttext"
- current_comment = None
- elif parsestate == "acttext" and not probable_acttext(subnode):
- parsestate = "commenttext"
- skipheader = False
- comment_start = True
- # self.log.debug("...new parsestate is %s" % parsestate)
-
- # FIXME: This gives too many false positives right now --
- # need to check distance to prevbox and/or nextbox. Once
- # header detection works better we can enable it
- # everywhere, not just at the start of the commentary for
- # this act.
- elif current_comment is None and probable_header(subnode):
- # self.log.debug("...seems like a header part of acttext")
- parsestate = "acttext"
- elif probable_acttext(subnode):
- parsestate = "acttext"
- elif probable_comment(subnode):
- parsestate = "commenttext"
- elif (subnode.lines <= 1 and
- len(section) > idx+1 and
- hasattr(section[idx+1], 'top') and
- section[idx+1].top - subnode.bottom < gap_threshold and
- probable_acttext(section[idx+1])):
- # the current subnode is not acttext, but it's not a
- # multiline section so it might be hard to tell. Take
- # a guess by checking the following section, unless
- # the distance to next is too big.
- parsestate = "acttext"
- else:
- # self.log.debug("...will just keep on (parsestate %s)" % parsestate)
- pass
- # if comment_on and parsestate == "commenttext":
- if comment_start:
- # self.log.debug("Starting new Forfattningskommentar for %s" % comment_on)
- # OK, a new comment. Let's record which page we found it on
- page = self._find_subnode(section[idx:], Sidbrytning, reverse=False)
- if page:
- pageno = page.ordinal - 1
- else:
- pageno = None
- if comment_on not in state['commented_paras'] or comment_on is None:
- if not skipheader: # means we have a section header
- # with acttext. that acttext
- # should already have been added
- # to textnodes, so current subnode
- # must contain first box of the
- # comment
- title = ""
- else:
- title = text
- if comment_on:
- current_comment = Forfattningskommentar(title=title,
- comment_on=comment_on,
- uri=None,
- label="Författningskommentar till %s %s" % (reftext, lawname))
- else:
- # this is clearly a comment, but we cannot
- # pinpoint what it comments. Maybe it's a
- # comment following a inline heading (which
- # doesn't have URIs)
- self.log.warning("%s: Creating un-anchored comment '%s...'" % (state['basefile'], text[:40]))
- current_comment = Forfattningskommentar(title=title,
- comment_on=None,
- uri=None,
- label="Författningskommentar i %s" % lawname)
-
- if parsestate != "commenttext":
- #self.log.debug("%s, comment on %s, parsestate was '%s', "
- # "setting to 'commenttext'" %
- # (state['basefile'], comment_on, parsestate))
- parsestate = "commenttext"
- # the URI to the above Forfattningskommentar is
- # dynamically constructed in
- # Forfattningskommentar.as_xhtml
- textnodes.append(current_comment)
- if comment_on:
- state['commented_paras'][comment_on] = pageno
- elif comment_on:
- self.log.warning("Dupe comment on %s at p %s (previous at %s), ignoring" % (comment_on, pageno, state['commented_paras'][comment_on]))
- comment_on = None
- comment_start = False
-
- if parsestate == "commenttext":
- assert subnode
- if current_comment is None:
- if "#" not in law:
- # if the law URI is really a chapter URI, this is
- # hardly the first comment in this section
- current_comment = Forfattningskommentar(title="",
- comment_on=law,
- uri=None,
- label="Författningskommentar till %s" % lawname)
- textnodes.append(current_comment)
- else:
- from pudb import set_trace; set_trace()
- print("This should never happen")
- if not skipheader:
- current_comment.append(subnode)
- else:
- skipheader = False
- else:
- if subnode:
- textnodes.append(subnode)
-
- if isinstance(subnode, (Page, Sidbrytning)):
- prevnode = None
- else:
- prevnode = subnode
- return textnodes
-
+ metrics = cf.analyze(commentaries)
+ metrics["defaultsize"] = state["defaultsize"]
+ for section, uri, name in commentaries:
+ cf.markup_commentary(section, uri, name, metrics)
re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
def _parse_uri_from_text(self, text, basefile, baseuri=""):
@@ -1443,7 +991,397 @@ def tabs(self):
else:
return []
+class CommentaryFinder(object):
+
+ def __init__(self, basefile, uriparser, uriminter):
+ self.basefile = basefile
+ self._parse_uri_from_text = uriparser
+ self.temp_sfs_uri = uriminter
+ self.debug = os.environ.get("FERENDA_FSMDEBUG_COMMENTARY")
+ self.log = logging.getLogger("commentary")
+
+
+ def is_commentary_section(self, subsection):
+ if not hasattr(subsection, 'title'):
+ return False
+ return bool(re.match(r"Förslag(|et) (till lag om|om lag till) ändring i", subsection.title) or
+ re.match(r"Förslag(|et) till", subsection.title))
+
+ def identify_law(self, title):
+ # find out which law this section proposes to
+ # change (can be new or existing)
+ if "ändring i" in title:
+ lawname = title.split(" ", 6)[-1]
+ uri = self._parse_uri_from_text(title, self.basefile) # FIXME: does _parse_uri_from_text really need basefile?
+ else:
+ # create a reference that could pass for a real
+ # SFS-id, but with the name (the only identifying
+ # information we have at this point) encoded into
+ # it.
+ lawname = title.split(" ", 2)[-1]
+ uri = self.temp_sfs_uri(lawname)
+ return uri, lawname
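+ # illustrative (hypothetical titles): "Förslag till lag om ändring i
+ # personuppgiftslagen (1998:204)" resolves the existing law's URI,
+ # while "Förslag till djurskyddslag" mints a temporary URI from the name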
+
+ def plot(self, filename, linespacings, linespacing_threshold, gaps, gap_threshold):
+ try:
+ import matplotlib
+ matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ except ImportError:
+ raise ImportError("You need matplotlib installed")
+ plot = plt.subplot2grid((2,1), (0, 0))
+ plot.set_title("linespacings")
+ y, x, _ = plot.hist(linespacings, bins=50)
+ plot.plot([linespacing_threshold, linespacing_threshold], [0, y.max()])
+ if gaps:
+ plot = plt.subplot2grid((2,1), (1, 0))
+ plot.set_title("gaps")
+ y, x, _ = plot.hist(gaps, bins=max(gaps))
+ plot.plot([gap_threshold, gap_threshold], [0, y.max()])
+ util.ensure_dir(filename)
+ plt.savefig(filename, dpi=150)
+ self.log.debug("wrote %s" % filename)
+
+ def estimate_density(self, series, resolution, bandwidth):
+ # do a pseudo-KDE (using discrete, high-resolution bins
+ # instead of a continuous curve, because the math is simpler)
+ start = min(series)
+ stop = max(series)
+ width = stop - start
+ bins = [0] * (resolution + bandwidth)
+ scale = [0] * (resolution + bandwidth)
+
+ # a bandwidth-wide array with values forming a normal
+ # (gaussian) distribution
+ s = bandwidth / 10
+ m = 0
+ kernelrange = list(range(int(-bandwidth/2)+1, int(bandwidth/2+1)))
+ kernel = [1/(sqrt(2*pi)*s)*e**(-0.5*(float(x-m)/s)**2) for x in kernelrange]
+ for val in series:
+ normval = val - start
+ fraction = normval / width
+ binidx = floor(fraction * resolution) + int(bandwidth/2)
+ for kernidx, offset in enumerate(kernelrange):
+ bins[binidx+offset-1] += kernel[kernidx]
+ for idx in range(len(bins)):
+ scale[idx] = ((idx - int(bandwidth/2))/resolution * width) + start
+ return bins, scale
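+ # bins[i] is the estimated density at the value scale[i]; the scale
+ # covers [min(series), max(series)] padded by half a bandwidth per side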
+
+ def threshold(self, series, resolution=1000, bandwidth=200):
+ bins, scale = self.estimate_density(series, resolution, bandwidth)
+
+ # find the valley after the first peak
+ peak = False
+ best = 0
+ for idx, val in enumerate(bins):
+ if not peak:
+ # walk until we find the peak
+ if val >= best:
+ best = val
+ else:
+ peak = True
+ else:
+ # walk until we find the valley
+ if val <= best:
+ best = val
+ else:
+ break
+ # now the valley is at idx - 1
+ return scale[idx-1]
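+ # e.g. for a series of vertical gaps clustering around line spacing
+ # and around paragraph spacing, this returns a cutoff in the valley
+ # between the two peaks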
+
+ def collect_features(self, commentaries):
+ features = {'linespacings': [],
+ 'gaps': []}
+ detect_singleline_spacing = False
+ for section, law, lawname in commentaries:
+ prevnode = None
+ for idx, subnode in enumerate(section):
+ if isinstance(subnode, Sidbrytning):
+ continue
+ if subnode.linespacing:
+ features['linespacings'].append(subnode.linespacing)
+ elif detect_singleline_spacing:
+ # a single line paragraph has no easily discernable
+ # line height, but we can approximate by checking the
+ # nearest paragraph above and below
+ candidates = []
+ if (idx > 0 and
+ not isinstance(section[idx-1], Sidbrytning) and
+ subnode.bottom > section[idx-1].bottom):
+ candidates.append(subnode.bottom - section[idx-1].bottom)
+ if (idx +1 < len(section) and
+ not isinstance(section[idx+1], Sidbrytning) and
+ section[idx+1].bottom > subnode.bottom):
+ candidates.append(section[idx+1].bottom - subnode.bottom)
+ if candidates:
+ features['linespacings'].append(min(candidates) / subnode.font.size)
+ if prevnode and subnode.top > prevnode.bottom:
+ features['gaps'].append(subnode.top - prevnode.bottom)
+ prevnode = subnode
+ return features
+
+ def analyze(self, commentaries):
+ # first, analyze gaps and linespacing constants using all sections
+ features = self.collect_features(commentaries)
+ gap_threshold = self.threshold(features['gaps'], resolution=1000, bandwidth=400)
+ linespacing_threshold = self.threshold(features['linespacings'], resolution=1000, bandwidth=500)
+
+ if os.environ.get("FERENDA_PLOTANALYSIS"):
+ #datadir = self.store.datadir
+ #self.store.datadir = "plots/%s" % self.alias
+ # FIXME: We don't have access to a store object yet
+ plot_path = self.store.path(state['basefile'], 'intermediate',
+ '.commentary.plot.png')
+ self.plot(plot_path, linespacings, linespacing_threshold, gaps, gap_threshold)
+ #self.store.datadir = datadir
+ return {'linespacing_threshold': linespacing_threshold,
+ 'gap_threshold': gap_threshold}
+
+
+
+ def markup_commentary(self, section, uri, name, metrics):
+ section[:] = self.find_commentary(section, uri, name, metrics)
+
+ def make_commentary_parser(self, metrics, lawname, lawuri):
+ # recognizers
+ # "3 kap." or "3 kap. Om domare"
+ def is_chapter_header(parser):
+ text = str(parser.reader.peek())
+ return bool((len(text) < 20 and text.endswith((" kap.", " kap"))) or
+ re.match(r"\d+ kap. +[^\d]", text))
+
+ # "4 §" or "4 kap. 4 §"
+ def is_section_header(parser):
+ text = str(parser.reader.peek())
+ return len(text) < 20 and text.endswith("§")
+
+ # "4 § Lagtext lagtext och mera lagtext"
+ def is_section_start(parser):
+ text = str(parser.reader.peek())
+ return bool(re.match("\d+(| \w) §", text))
+
+
+ def is_header(parser):
+ return probable_header(parser.reader.peek())
+
+ def is_comment(parser):
+ comment = probable_comment(parser.reader.peek())
+ # if we're not in a commentary section we should not
+ # assume commentary unless probable_comment returns True
+ if comment is True:
+ return True
+ elif comment is False:
+ return False
+ else:
+ # do extra work if we have no assumptions about
+ # whether this is comment or not -- take a look at the
+ # following para, if not separated by a gap.
+ if (state["assume"] is None and
+ parser.reader.peek(2).top - parser.reader.peek().bottom < metrics['gap_threshold'] and
+ probable_comment(parser.reader.peek(2)) is True):
+ return True
+ return state["assume"] == "comment"
+
+ def is_acttext(parser):
+ acttext = probable_acttext(parser.reader.peek())
+ if acttext is True:
+ return True
+ elif acttext is False:
+ return False
+ else:
+ return state["assume"] == "acttext"
+
+ def is_pagebreak(parser):
+ para = parser.reader.peek()
+ if not isinstance(para,
+ (Textbox, Sidbrytning, UnorderedList)):
+ raise ValueError("Got a %s instead of a Textbox/Sidbrytning/UnorderedList, this indicates broken parsing" % type(para))
+ return isinstance(para, Sidbrytning)
+
+ def is_paragraph(parser):
+ return True
+
+ # constructors
+ @newstate('body')
+ def make_body(parser):
+ return p.make_children(Body())
+
+ @newstate('comment')
+ def make_comment(parser):
+ state["assume"] = "comment"
+ text = str(parser.reader.peek())
+ if not state["comment_on"]:
+ if state["beginning"]:
+ state["comment_on"] = lawuri
+ state["beginning"] = False
+ label = "Författningskommentar till %s" % lawname
+ else:
+ self.log.warning("%s: Creating un-anchored comment '%s...'" % (self.basefile, text[:40]))
+ label = "Författningskommentar i %s" % lawname
+ else:
+ label = "Författningskommentar till %s %s" % (state['reftext'], lawname)
+ if not state["skipheader"]:
+ title = ""
+ else:
+ title = text
+
+ f = Forfattningskommentar(title=title,
+ comment_on=state["comment_on"],
+ uri=None,
+ label=label)
+ comment = parser.make_children(f)
+ state["comment_on"] = None
+ state["reftext"] = None
+ return comment
+
+ def make_acttext(parser):
+ state["assume"] = "acttext"
+ return make_paragraph(parser)
+
+ def make_header(parser):
+ state["assume"] = "acttext"
+ return make_paragraph(parser)
+
+ def make_paragraph(parser):
+ ret = parser.reader.next()
+ try:
+ nextchunk = parser.reader.peek()
+ except StopIteration:
+ return ret
+ # determine whether we need to change our assumption about the
+ # following paragraph: a large gap may signal a switch from
+ # acttext to comment or vice versa
+ if (not isinstance(nextchunk, Sidbrytning) and
+ nextchunk.top - ret.bottom > metrics["gap_threshold"]):
+ if state["assume"] == "acttext":
+ state["assume"] = "comment"
+ elif state["assume"] == "acttext":
+ state["assume"] = "comment"
+ else:
+ pass
+ return ret
+
+ def handle_pagebreak(parser):
+ state["assume"] = None
+ return parser.reader.next()
+
+ def setup_section_header(parser):
+ state["assume"] = "comment"
+ return make_section(parser)
+
+ def setup_section_start(parser):
+ state["assume"] = "acttext"
+ return make_section(parser)
+
+ def make_section(parser):
+ text = str(parser.reader.peek())
+ state["reftext"] = text[:text.index("§")+ 1]
+ state["comment_on"] = self._parse_uri_from_text(state["reftext"], self.basefile, state["law"])
+ state["comment_start"] = False
+ state["skipheader"] = False # maybe should be true if ?
+ return make_paragraph(parser)
+
+
+ def setup_chapter_start(parser):
+ text = str(parser.reader.peek())
+ newlaw = self._parse_uri_from_text(text, self.basefile, state["law"])
+ if newlaw:
+ state["law"] = newlaw
+ state["skipheader"] = True
+ state["reftext"] = text
+ # emit the chapter header itself as an ordinary paragraph so the
+ # parser consumes the chunk and advances
+ return make_paragraph(parser)
+
+
+ # helpers
+
+ # The helpers are tristate functions:
+ # True: This is probably
+ # False: This is most likely not
+ # None: I have no idea whether this is or not
+ def probable_header(para):
+ # headers are less than 100 chars and do not end with a period
+ # or other non-headerish thing
+ text = str(para).strip()
+ if text == 'Bestämmelse Kommentarerna finns i avsnitt':
+ # This is a table heading (not real header) type of thing
+ # occurring in SOU 2017:66, but similar constructs might
+ # appear elsewhere.
+ return False
+ return (len(text) < 100 and
+ (len(text) < 2 or
+ (text[-1] not in (".", ")") and text[-2:] not in (" i", " §"))))
+
+ def probable_comment(para):
+ text = str(para).strip()
+ if re.match("(Av p|P)aragrafen (framgår|innehåller|har behandlats|är ny|, som är ny|avgränsar|innebär)", text):
+ return True
+ # elif re.match("(I f|F)örsta stycket", text): # this overmatches, eg ÅRL 7:31 2 st
+ elif re.match("I första stycket", text):
+ return True
+ elif re.match("\((Jfr|Paragrafen)", text):
+ return True
+ elif (para.linespacing or 0) > metrics['linespacing_threshold'] and text[0].isupper():
+ return True
+ return None
+
+ def probable_acttext(para):
+ # returns True iff this text is probably acttext
+ # returns False iff it's probably not acttext
+ # returns None if we don't have enough data
+ # (maybe because it's a single line or a Sidbrytning)
+
+ if isinstance(para, Sidbrytning):
+ return None
+
+ # 2 clear indicators of acttext: font size is smaller
+ if metrics['defaultsize'] >= para.font.size + 2:
+ return True
+ elif para.lines > 1:
+ # or linespacing is tighter than average
+ return bool(metrics['linespacing_threshold'] and
+ para.linespacing and
+ para.linespacing < metrics['linespacing_threshold'])
+ else:
+ return None
+
+ # setup
+ state = {"skipheader": False,
+ "comment_on": None,
+ "beginning": True,
+ "assume": "comment",
+ "law": lawuri}
+ p = FSMParser()
+ p.set_recognizers(is_pagebreak,
+ is_chapter_header,
+ is_section_header,
+ is_section_start,
+ is_header,
+ is_comment,
+ is_acttext,
+ is_paragraph)
+ commonstates = "body", "comment"
+ p.set_transitions({(commonstates, is_pagebreak): (handle_pagebreak, None),
+ ("body", is_header): (make_header, None),
+ ("body", is_chapter_header): (setup_chapter_start, None),
+ ("body", is_section_header): (setup_section_header, None),
+ ("body", is_section_start): (setup_section_start, None),
+ ("body", is_comment): (make_comment, "comment"),
+ ("body", is_acttext): (make_acttext, None),
+ ("comment", is_section_start): (False, None),
+ ("comment", is_header): (False, None),
+ ("comment", is_chapter_header): (False, None),
+ ("comment", is_section_header): (False, None),
+ ("comment", is_acttext): (False, None),
+ ("comment", is_paragraph): (make_paragraph, None),
+ })
+ p.initial_state = "body"
+ p.initial_constructor = make_body
+ p.debug = self.debug
+ return p
+
+ def find_commentary(self, section, uri, name, metrics):
+ textnodes = self.make_commentary_parser(metrics, name, uri).parse(section)
+ return textnodes
def offtryck_parser(basefile="0", metrics=None, preset=None,
diff --git a/ferenda/sources/legal/se/propositioner.py b/ferenda/sources/legal/se/propositioner.py
index 0b25ff0b..2db263d3 100644
--- a/ferenda/sources/legal/se/propositioner.py
+++ b/ferenda/sources/legal/se/propositioner.py
@@ -28,9 +28,11 @@
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader, PDFAnalyzer
from ferenda import DocumentEntry, Facet, PDFDocumentRepository
+from ferenda.pdfreader import StreamingPDFReader
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL, Offtryck)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource
+from .swedishlegalsource import lazyread
def prop_sanitize_identifier(identifier):
if not identifier:
@@ -635,17 +637,17 @@ class PropKB(Offtryck, PDFDocumentRepository):
start_url = "https://riksdagstryck.kb.se/tvakammarriksdagen.html"
rdf_type = RPUBL.Proposition
basefile_regex = "prop_(?P\d{4})__+(?P\d+)(?:_(?P\d+)|)"
+ document_type = PROPOSITION = True
+ SOU = DS = KOMMITTEDIREKTIV = False
@classmethod
def get_default_options(cls):
opts = super(PropKB, cls).get_default_options()
- opts['ocr'] = True
+ opts['ocr'] = False
return opts
@decorators.downloadmax
def download_get_basefiles(self, source):
- # source will be an iterator of links to individual collections of things
- from pudb import set_trace; set_trace()
yielded = set()
if self.download_reverseorder:
source = reversed(list(source))
@@ -678,6 +680,60 @@ def download_get_basefiles(self, source):
yield basefile, sublink
yielded.add((basefile,part))
+ def metadata_from_basefile(self, basefile):
+ attrib = super(PropKB, self).metadata_from_basefile(basefile)
+ year, ordinal = basefile.split(":")
+ attrib["rpubl:arsutgava"] = year
+ attrib["rpubl:lopnummer"] = ordinal
+ return attrib
+
+ @lazyread
+ def downloaded_to_intermediate(self, basefile):
+ downloaded_path = self.store.downloaded_path(basefile)
+ intermediate_path = self.store.intermediate_path(basefile)
+ return self.convert_pdf(downloaded_path, intermediate_path)
+
+ def convert_pdf(self, downloaded_path, intermediate_path):
+ intermediate_dir = os.path.dirname(intermediate_path)
+ keep_xml = "bz2" if self.config.compress == "bz2" else True
+ reader = StreamingPDFReader()
+ kwargs = {'filename': downloaded_path,
+ 'workdir': intermediate_dir,
+ 'images': self.config.pdfimages,
+ 'keep_xml': keep_xml}
+ if self.config.ocr:
+ kwargs['ocr_lang'] = 'swe'
+ return reader.convert(**kwargs)
+
+ def extract_head(self, fp, basefile):
+ return None # "rawhead" is never used
+
+ def extract_metadata(self, rawhead, basefile):
+ # extracting title and other metadata (dep, publication date,
+ # etc.) requires parsing of the body
+ return self.metadata_from_basefile(basefile)
+
+ def extract_body(self, fp, basefile):
+ reader = StreamingPDFReader()
+ parser = "ocr" if self.config.ocr else "xml"
+ intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
+ if self.config.compress:
+ intermediate_suffix += "." + self.config.compress
+ reader.read(fp, parser=parser)
+ for attachment in [x for x in sorted(self.store.list_attachments(basefile, "downloaded")) if x.endswith(".pdf")]:
+ downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
+ iattachment = attachment.replace(".pdf", intermediate_suffix)
+ intermediate_path = self.store.intermediate_path(basefile, attachment=iattachment)
+ if not os.path.exists(intermediate_path):
+ fp = self.convert_pdf(downloaded_path, intermediate_path)
+ else:
+ fp = self.store.open_intermediate(basefile, attachment=iattachment)
+ reader += StreamingPDFReader().read(fp, parser=parser)
+
+ for page in reader:
+ page.src = "index.pdf" # FIXME: don't hardcode the filename
+ return reader
+
# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
# from SwedishLegalStore)
@@ -686,7 +742,7 @@ class PropositionerStore(CompositeStore, SwedishLegalStore):
class Propositioner(CompositeRepository, FixedLayoutSource):
- subrepos = PropRegeringen, PropTrips, PropRiksdagen
+ subrepos = PropRegeringen, PropTrips, PropRiksdagen, PropKB
alias = "prop"
xslt_template = "xsl/forarbete.xsl"
storage_policy = "dir"
diff --git a/ferenda/util.py b/ferenda/util.py
index 711cf950..dc8f724b 100755
--- a/ferenda/util.py
+++ b/ferenda/util.py
@@ -688,6 +688,18 @@ def is_roman(s):
return roman.romanNumeralPattern.match(s.upper()) is not None
+def increment(s, amount=1):
+ """Increment a number, regardless of whether it is an arabic numeral (int) or a roman numeral (str)."""
+ if isinstance(s, int):
+ return s + amount
+ assert is_roman(s), "%s is neither an int nor a roman numeral" % s
+ lower = s.islower()
+ s = to_roman(from_roman(s) + amount, lower=lower)
+ if lower:
+ s = s.lower()
+ return s
+
def title_sortkey(s):
"""Transform a document title into a key useful for sorting and partitioning documents.
diff --git a/test/integrationOfftryck.py b/test/integrationOfftryck.py
index 2451d9f9..db296d0b 100644
--- a/test/integrationOfftryck.py
+++ b/test/integrationOfftryck.py
@@ -127,3 +127,11 @@ def test_hanging_indent_paragraphs_with_italics(self):
nextbox = self._p('åstadkomma ett fritt flöde av personuppgifter mellan medlemsstaterna i')
self.assertTrue(self.gluefunc(prevbox, nextbox, prevbox))
+
+from ferenda.sources.legal.se.offtryck import CommentaryFinder
+
+class TestIdentifyLaw(unittest.TestCase):
+
+ def test_basic(self):
+ # for a brand-new law, only the uriminter stub is exercised
+ cf = CommentaryFinder("2017/18:42", None,
+ lambda lawname: "https://example.org/tmp/" + lawname)
+ uri, lawname = cf.identify_law("Förslag till personuppgiftslag")
+ self.assertEqual("personuppgiftslag", lawname)
+
+
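+from ferenda import util
+
+class TestIncrement(unittest.TestCase):
+ # a quick sketch of util.increment's contract: ints increment
+ # arithmetically, roman-numeral strings keep their case
+
+ def test_arabic(self):
+ self.assertEqual(6, util.increment(5))
+
+ def test_roman(self):
+ self.assertEqual("v", util.increment("iv"))
+ self.assertEqual("XI", util.increment("IX", 2))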