Skip to content

Commit

Permalink
CommentaryFinder now passes all tests
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Mar 10, 2018
1 parent 3c4dfca commit 104d537
Show file tree
Hide file tree
Showing 9 changed files with 689 additions and 783 deletions.
104 changes: 81 additions & 23 deletions ferenda/sources/legal/se/offtryck.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,11 @@ def offtryck_gluefunc(textbox, nextbox, prevbox):
# allow for more if prevbox starts with a bullet and
# nextbox startswith lowercase, allow for a large indent
# (but not hanging indent)
if strtextbox.startswith(("\u2022", "\uf0b7", "−")) and strnextbox[0].islower():
alignmatch = lambda p, n: n.left - p.left < 30
ul = False
if strtextbox.startswith(("\u2022", "\uf0b7", "−")):
ul = True
if strnextbox[0].islower():
alignmatch = lambda p, n: n.left - p.left < 30
if strtextbox.startswith("\uf0b7"):
# U+F0B7 is Private use -- probably using symbol font
# for bullet. Just accept any font family or size change
Expand Down Expand Up @@ -319,7 +322,9 @@ def offtryck_gluefunc(textbox, nextbox, prevbox):
# Any line that ONLY contains a section reference should
# probably be interpreted as a header. (also handle dual
# refs "4 kap. 9 c och 10 §§", prop 1997/98:44 s 148)
if re.match("(\d+ kap. |)\d+( \w och \d+| \w| och \d+|) §§?$", strprevbox) and prevbox.bottom <= nextbox.top:
sectionref = re.compile("(\d+ kap. |)\d+( \w och \d+| \w| och \d+|) §§?$")
if ((sectionref.match(strprevbox) or sectionref.match(strnextbox))
and prevbox.bottom <= nextbox.top):
return False

# these text locutions indicate a new paragraph (normally, this is
Expand Down Expand Up @@ -350,7 +355,7 @@ def offtryck_gluefunc(textbox, nextbox, prevbox):
alignmatch(prevbox, nextbox) or # compare previous line to next
alignmatch(textbox, nextbox) or # compare entire glued box so far to next FIXME -- is this a good idea? Tends to glue rows in tables...
(parindent * 2 >= (prevbox.left - nextbox.left) >= parindent / 2) or
(parindent * 2 >= (textbox.left - nextbox.left) >= parindent / 2) or
(not ul and (parindent * 2 >= (textbox.left - nextbox.left) >= parindent / 2)) or # Too permitting when processing unordered lists
(re.match(r"[\d\.]+\s+[A-ZÅÄÖ]", strtextbox) and nextbox.left - textbox.left < parindent * 5) # hanging indent (numbered) heading -- FIXME: we'd like to increase the parindent multiplier depending on the len of the initial number
)):
# if the two boxes are on the same line, but have a
Expand Down Expand Up @@ -842,13 +847,14 @@ def find_commentary(self, node, state):
return None # visit_node won't call any subnode
cf = CommentaryFinder(state['basefile'], self._parse_uri_from_text, self.temp_sfs_uri)
commentaries = []
found = False
for subsection in node: # nb: Node is the "Författningskommentar" chapter
if cf.is_commentary_section(subsection):
found = True
commentaries.append((subsection, *cf.identify_law(subsection.title)))
if not found: # # no subsecs, ie the prop changes a single law
if 'primarylaw' in state:
commentaries.append((subsection, state['primarylaw'], state['primarylawname']))
commentaries.append((node, state['primarylaw'], state['primarylawname']))
else:
self.log.warning("%s: Författningskommentar does not specify name of law and find_primary_law didn't find it either" % state['basefile'])

Expand Down Expand Up @@ -1072,15 +1078,17 @@ def estimate_density(self, series, resolution, bandwidth):
def threshold(self, series, resolution=1000, bandwidth=200):
bins, scale = self.estimate_density(series, resolution, bandwidth)

# find the valley after the first peak
# find the valley after the first (significant, not less than
# 25% of the highest) peak
minpeak = max(bins) * 0.25
peak = False
best = 0
for idx, val in enumerate(bins):
if not peak:
# walk til we find the peak
if val >= best:
best = val
else:
elif val >= minpeak:
peak = True
else:
# walk til we find the valley
Expand Down Expand Up @@ -1125,7 +1133,17 @@ def analyze(self, commentaries):
# first, analyze gaps and linespacing constants using all sections
features = self.collect_features(commentaries)
gap_threshold = self.threshold(features['gaps'], resolution=1000, bandwidth=400)
linespacing_threshold = self.threshold(features['linespacings'], resolution=1000, bandwidth=500)

# if all we have are paragraphs with equal-ish linespacing
# (say, because there is only comments, no acttext), minor
# accidentally differences between paragraphs might fall on
# different sides of this threshold. require that the spread is at least 20% (ie max ls must be at least 20% larger than min ls)
max_ls = max(features['linespacings'])
min_ls = min(features['linespacings'])
if (max_ls - min_ls) / min_ls < 0.20:
linespacing_threshold = min_ls # ie interpret everything as comment linespacing
else:
linespacing_threshold = self.threshold(features['linespacings'], resolution=1000, bandwidth=500)

if os.environ.get("FERENDA_PLOTANALYSIS"):
#datadir = self.store.datadir
Expand All @@ -1147,22 +1165,23 @@ def make_commentary_parser(self, metrics, lawname, lawuri):
# recognizers
# "3 kap." or "3 kap. Om domare"
def is_chapter_header(parser):
text = str(parser.reader.peek())
text = str(parser.reader.peek()).strip()
return bool(len(text) < 20 and text.endswith((" kap.", " kap")) or
re.match("\d+ kap. +[^\d]", text))

# "4 §" or "4 kap. 4 §"
def is_section_header(parser):
text = str(parser.reader.peek())
text = str(parser.reader.peek()).strip()
return len(text) < 20 and text.endswith("§")

# "4 § Lagtext lagtext och mera lagtext"
def is_section_start(parser):
text = str(parser.reader.peek())
return bool(re.match("\d+(| \w) §", text))
text = str(parser.reader.peek()).strip()
return bool(re.match("\d+(| \w) § +[A-ZÅÄÖ]", text))

def is_transition_regs(parser):
return str(parser.reader.peek()).strip() in (
'Ikraftträdande- och övergångsbestämmelse',
'Ikraftträdande- och övergångsbestämmelser',
'Ikraftträdandebestämmelser'
'Övergångsbestämmelser')
Expand Down Expand Up @@ -1230,11 +1249,11 @@ def make_comment(parser):
title = ""
else:
title = state["reftext"]

f = Forfattningskommentar(title=title,
comment_on=state["comment_on"],
uri=None,
label=label)
f.append(make_paragraph(parser))
comment = parser.make_children(f)
state["comment_on"] = None
state["reftext"] = None
Expand Down Expand Up @@ -1268,8 +1287,19 @@ def make_paragraph(parser):
return ret

def handle_pagebreak(parser):
state["assume"] = None
return parser.reader.next()
pagebreak = parser.reader.next()
try:
nextbox = parser.reader.peek()
nextbox.font # trigger an AttributeError if nextbox is also a Sidbrytning
if probable_acttext(nextbox):
state["assume"] = "acttext"
elif probable_comment(nextbox):
state["assume"] = "comment"
else:
state["assume"] = None
except (StopIteration, AttributeError):
state["assume"] = None
return pagebreak

def setup_transition_header(parser):
# ideally, we'd like URIs of the form
Expand All @@ -1281,9 +1311,31 @@ def setup_transition_header(parser):
state["skipheader"] = True

def setup_section_header(parser):
state["assume"] = "comment"
state["skipheader"] = True
make_section(parser) # throw away the result, which will be exactly the header we want to skip
# a header might be followed by acttext or commmenttext --
# it's not easy to tell. examine each box until we get a
# definite answer (or run out of adjacent boxes)
idx = 2
prevbox = None
acttext = False
while True:
box = parser.reader.peek(idx)
if isinstance(box, Sidbrytning) or (prevbox and box.top - prevbox.bottom > metrics['gap_threshold']):
break
acttext = probable_acttext(box)
if acttext in (True, False):
break
prevbox = box
idx += 1

if acttext:
ret = make_section(parser)
state["assume"] = "acttext"
state["skipheader"] = False
return ret
else:
make_section(parser) # throw away the result, which will be exactly the header we want to skip
state["assume"] = "comment"
state["skipheader"] = True

def setup_section_start(parser):
state["assume"] = "acttext"
Expand All @@ -1303,8 +1355,10 @@ def setup_chapter_start(parser):
newlaw = self._parse_uri_from_text(text, self.basefile, state["law"])
if newlaw:
state["law"] = newlaw
state["comment_on"] = state["law"]
state["skipheader"] = True
reftext = text
state["reftext"] = text
return parser.reader.next()


# helpers
Expand Down Expand Up @@ -1337,17 +1391,21 @@ def probable_comment(para):
return True
elif re.match("\((Jfr|Paragrafen)", text):
return True
elif (para.linespacing or 0) > metrics['linespacing_threshold'] and text[0].isupper():
return True
return None
elif metrics['defaultsize'] >= para.font.size + 2:
return False
elif para.lines > 1:
return bool(metrics['linespacing_threshold'] and
para.linespacing and
para.linespacing >= metrics['linespacing_threshold'])
else:
return None


def probable_acttext(para):
# returns True iff this text is probably acttext
# returns False iff it's probably not acctext
# returns None if we don't have enough data
# (maybe because it's a single line or a Sidbrytning)

if isinstance(para, Sidbrytning):
return None

Expand Down

0 comments on commit 104d537

Please sign in to comment.