Skip to content

Commit

Permalink
Be more forgiving when constructing bullet lists
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Jun 13, 2017
1 parent c15bae8 commit 75430f5
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 31 deletions.
22 changes: 9 additions & 13 deletions ferenda/sources/legal/se/offtryck.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,6 @@ def metadata_from_basefile(self, basefile):
a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
return a

blacklist = set([(SOU, "2008:35"), # very atypical report
(DS, "2002:34"), # 2-column report, uninteresting
(SOU, "2002:11"), # -""-
(DS, "2007:30"), # atypical report in english
(DS, "2014:32"), # -""-
(DS, "2008:73"), # -""-
(DS, "2008:82"), # -""- in swedish
# (DS, "2004:46"), # -""- in swedish
])

def sanitize_body(self, rawbody):
sanitized = super(Offtryck, self).sanitize_body(rawbody)
if isinstance(sanitized, PDFReader):
Expand Down Expand Up @@ -246,6 +236,9 @@ def offtryck_gluefunc(textbox, nextbox, prevbox):
# U+F0B7 is Private use -- probably using symbol font
# for bullet. Just accept any font family or size change
sizematch = lambda p, n: True
# also accept a slight mismatch in vertical alignment (bottoms may differ by at most 1)
valignmatch = lambda p, n: abs(p.bottom - n.bottom) <= 1

# numbered section headings can have large space between
# the leading number and the rest of the heading, and the
# top/bottom of the leading number box might differ from
Expand Down Expand Up @@ -1527,9 +1520,12 @@ def make_bulletlist(parser):

def make_listitem(parser):
    """Build a ListItem from the next chunk delivered by the parser's reader.

    The chunk is converted to a string and its leading bullet is removed:

    * if the string contains a space, everything up to and including
      the first space is treated as the bullet and dropped;
    * otherwise the bullet is assumed to be a single character and only
      that first character is dropped.

    A chunk without a space after the bullet is accepted rather than
    rejected with an assertion, so such list items no longer abort
    parsing.

    :param parser: the parser whose ``reader.next()`` yields the chunk
    :returns: a ListItem wrapping the text that follows the bullet
    """
    s = str(parser.reader.next())
    # partition never raises; sep is empty when there is no space at all
    bullet, sep, rest = s.partition(" ")
    if sep:
        # assume text before first space is the bullet
        s = rest
    else:
        # assume the bullet is a single char
        s = s[1:]
    return ListItem(s)

@newstate('appendix')
Expand Down
12 changes: 7 additions & 5 deletions lagen/nu/res/options/options.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
{
("utr/sou", "2008:35"): "skip", # very atypical report
("utr/ds", "2002:34"): "skip", # 2-column report, uninteresting
("utr/sou", "2002:11"): "skip", # -""-
("utr/sou", "2002:11"): "skip", # -""-
("utr/ds", "2007:30"): "skip", # atypical report in english
("utr/ds", "2014:32"): "skip", # -""-
("utr/ds", "2008:73"): "skip", # -""-
("utr/ds", "2008:82"): "skip", # -""- in swedish
("utr/ds", "2014:111"): "skip", # -""- in swedish
("utr/ds", "2014:32"): "skip", # -""-
("utr/ds", "2009:631"): "skip", # -""-
("utr/ds", "2008:73"): "skip", # -""-
("utr/ds", "2008:82"): "skip", # -""- in swedish
("utr/ds", "2014:111"): "skip", # -""-
("utr/ds", "2005:55"): "skip", # -""-
# some ridiculously large documents (statsbudget) have little legal
# importance. Just process the metadata
("prop", "1971:1"): "metadataonly",
Expand Down
26 changes: 13 additions & 13 deletions test/files/repo/proptrips/parsed/2002-03/84.xhtml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version='1.0' encoding='utf-8'?>
<html xmlns:bibo="http://purl.org/ontology/bibo/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:rpubl="http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="sv" xsi:schemaLocation="http://www.w3.org/1999/xhtml http://www.w3.org/MarkUp/SCHEMA/xhtml-rdfa-2.xsd" version="XHTML+RDFa 1.1">
<html xmlns:bibo="http://purl.org/ontology/bibo/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:rpubl="http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#" xmlns="http://www.w3.org/1999/xhtml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/1999/xhtml http://www.w3.org/MarkUp/SCHEMA/xhtml-rdfa-2.xsd" version="XHTML+RDFa 1.1" xml:lang="sv">
<head about="https://lagen.nu/prop/2002/03:84">
<meta property="dcterms:identifier" content="Prop. 2002/03:84" xml:lang=""/>
<meta property="dcterms:issued" content="2003-03-13" datatype="xsd:date"/>
Expand All @@ -17,30 +17,30 @@
</head>
<body about="https://lagen.nu/prop/2002/03:84">
<div class="frontmatter">
<span class="sidbrytning" height="1263" id="sid1" src="https://lagen.nu/prop/2002/03:84/sid1.png" width="892"/>
<span id="sid1" class="sidbrytning" src="https://lagen.nu/prop/2002/03:84/sid1.png" width="892" height="1263"/>
<h1 class="prophuvudrubrik">Regeringens proposition 2002/03:84</h1>
<h1 class="proprubrik">Ändring i avtalet om Sveriges exportråd</h1>
<p class="textbox fontspec2" style="top: 387px; left: 85px; height: 21px; width: 422px">Regeringen överlämnar denna proposition till riksdagen.</p>
<p class="textbox fontspec2" style="top: 430px; left: 85px; height: 21px; width: 219px">Stockholm den 13 mars 2003</p>
<p class="textbox fontspec2" style="top: 387px; left: 85px; height: 21px; width: 424px">Regeringen överlämnar denna proposition till riksdagen.</p>
<p class="textbox fontspec2" style="top: 430px; left: 85px; height: 21px; width: 220px">Stockholm den 13 mars 2003</p>
<p class="textbox fontspec3" style="top: 497px; left: 85px; height: 21px; width: 114px">
<i>Göran Persson</i>
</p>
<p class="textbox fontspec3" style="top: 546px; left: 287px; height: 21px; width: 109px">
<p class="textbox fontspec3" style="top: 545px; left: 287px; height: 21px; width: 109px">
<i>Leif Pagrotsky</i>
</p>
<p class="textbox fontspec2" style="top: 567px; left: 287px; height: 21px; width: 173px">(Utrikesdepartementet)</p>
<p class="textbox fontspec2" style="top: 567px; left: 287px; height: 21px; width: 174px">(Utrikesdepartementet)</p>
</div>
<div class="preamblesection" about="https://lagen.nu/prop/2002/03:84#PS1" property="dcterms:title" content="Propositionens huvudsakliga innehåll" typeof="bibo:DocumentPart">
<span rel="dcterms:isPartOf" href="https://lagen.nu/prop/2002/03:84"/>
<p class="textbox fontspec2" style="top: 706px; left: 85px; height: 150px; width: 553px">Exportrådets verksamhet grundar sig på ett avtal mellan staten och Sveriges Allmänna Exportförening. Ändringar i avtalet förutsätter riksdagens bemyndigande. På grundval av en principöverenskommelse mellan staten och Sveriges Allmänna Exportförening föreslås ändringar i avtalet om Sveriges exportråd, för att ersätta nuvarande individuella medlemsavgifter med ett kollektivt bidrag från näringslivet till den del av Exportrådets verksamhet som inte bedrivs på marknadsmässiga villkor. </p>
<span class="sidbrytning" height="1263" id="sid2" src="https://lagen.nu/prop/2002/03:84/sid2.png" width="892"/>
<p class="textbox fontspec2" style="top: 706px; left: 85px; height: 149px; width: 552px">Exportrådets verksamhet grundar sig på ett avtal mellan staten och Sveriges Allmänna Exportförening. Ändringar i avtalet förutsätter riksdagens bemyndigande. På grundval av en principöverenskommelse mellan staten och Sveriges Allmänna Exportförening föreslås ändringar i avtalet om Sveriges exportråd, för att ersätta nuvarande individuella medlemsavgifter med ett kollektivt bidrag från näringslivet till den del av Exportrådets verksamhet som inte bedrivs på marknadsmässiga villkor. </p>
<span id="sid2" class="sidbrytning" src="https://lagen.nu/prop/2002/03:84/sid2.png" width="892" height="1263"/>
</div>
<div class="protokollsutdrag" content="Utrikesdepartementet">
<p class="textbox fontspec2" style="top: 155px; left: 85px; height: 21px; width: 485px">Utdrag ur protokoll vid regeringssammanträde den 13 mars 2003</p>
<p class="textbox fontspec2" style="top: 198px; left: 85px; height: 86px; width: 553px">Närvarande: statsministern Persson, ordförande, och statsråden Winberg, Ulvskog, Lindh, Pagrotsky, Östros, Messing, Engqvist, Lövdén, Ringholm, Bodström, J.O. Karlsson, Sommestad, H. Karlsson, Lund, Andnor, Johansson, Hallengren, Björklund</p>
<p class="textbox fontspec2" style="top: 327px; left: 85px; height: 21px; width: 263px">Föredragande: statsrådet Pagrotsky</p>
<p class="textbox fontspec2" style="top: 391px; left: 85px; height: 21px; width: 102px">___________</p>
<p class="textbox fontspec2" style="top: 434px; left: 85px; height: 43px; width: 550px">Regeringen beslutar proposition 2002/03:84 Ändring i avtalet om Sveriges exportråd.</p>
<p class="textbox fontspec2" style="top: 155px; left: 85px; height: 21px; width: 488px">Utdrag ur protokoll vid regeringssammanträde den 13 mars 2003</p>
<p class="textbox fontspec2" style="top: 198px; left: 85px; height: 86px; width: 552px">Närvarande: statsministern Persson, ordförande, och statsråden Winberg, Ulvskog, Lindh, Pagrotsky, Östros, Messing, Engqvist, Lövdén, Ringholm, Bodström, J.O. Karlsson, Sommestad, H. Karlsson, Lund, Andnor, Johansson, Hallengren, Björklund</p>
<p class="textbox fontspec2" style="top: 327px; left: 85px; height: 21px; width: 264px">Föredragande: statsrådet Pagrotsky</p>
<p class="textbox fontspec2" style="top: 391px; left: 85px; height: 21px; width: 104px">___________</p>
<p class="textbox fontspec2" style="top: 434px; left: 85px; height: 43px; width: 552px">Regeringen beslutar proposition 2002/03:84 Ändring i avtalet om Sveriges exportråd.</p>
</div>
</body>
</html>
32 changes: 32 additions & 0 deletions test/testPDFReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,38 @@ def test_whitespace_normalization(self):
self.assertEqual("Document title ", str(pdf[0][0]))


def test_multiple_textelements(self):
# Checks how a single <text> element that holds several inline-tagged
# children (<b>, <i>) is parsed and serialized.
# First case: two <b> elements separated by a single space.
pdf = self._parse_xml("""
<fontspec id="1" size="5" family="X" color="#00000"/>
<text top="0" left="0" width="23" height="13" font="1"><b>foo</b> <b>bar</b></text>
""")
# The space between the two <b> elements is expected to be absent from
# the string value of pdf[0][0] (presumably the first textbox on the
# first page -- confirm against _parse_xml).
self.assertEqual("foobar", str(pdf[0][0]))
# test that Textelement.__add__ inserts a space correctly
self.assertEqual('<Textelement tag="b">foo bar</Textelement>',
serialize(pdf[0][0][0] + pdf[0][0][1]).strip())
# Serialized as a whole, the textbox is expected to keep the two
# elements separate.
want = """
<Textbox bottom="13" fontid="1" height="13" left="0" lines="0" right="23" top="0" width="23">
<Textelement tag="b">foo</Textelement>
<Textelement tag="b">bar</Textelement>
</Textbox>
"""
# want[1:] skips the newline that directly follows the opening triple quote
self.assertEqual(want[1:], serialize(pdf[0][0]))

# 2nd test, with leading non-tagged Textelement
pdf = self._parse_xml("""
<fontspec id="0" size="5" family="X" color="#00000"/>
<text top="374" left="508" width="211" height="14" font="0">näringsidkaren <i>en</i> <i>varning. En var-</i></text>
""")
# The untagged leading text is expected to keep its trailing space and
# to be serialized as a Textelement without a tag attribute.
want = """
<Textbox bottom="388" fontid="0" height="14" left="508" lines="0" right="719" top="374" width="211">
<Textelement>näringsidkaren </Textelement>
<Textelement tag="i">en</Textelement>
<Textelement tag="i">varning. En var-</Textelement>
</Textbox>
"""
self.assertEqual(want[1:], serialize(pdf[0][0]))


def test_footnote(self):
pdf = self._parse_xml("""
<fontspec id="7" size="14" family="TROYEM+OriginalGaramondBT-Roman" color="#000000"/>
Expand Down

0 comments on commit 75430f5

Please sign in to comment.