Skip to content

Commit

Permalink
handle superscript + bold/italic formatting, and more
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 6, 2017
1 parent 133ebd3 commit c1fe656
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 42 deletions.
1 change: 0 additions & 1 deletion ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2362,7 +2362,6 @@ def generate(self, basefile, otherrepos=[]):
{'basefile': basefile}):
conffile = os.path.abspath(
os.sep.join([self.config.datadir, 'rsrc', 'resources.xml']))

if self.xslt_template.startswith("/"):
templatedir = "."
elif "/" in self.xslt_template:
Expand Down
50 changes: 28 additions & 22 deletions ferenda/pdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,10 +417,11 @@ def dimensions(s):
# find any previous definition of this fontspec
fontid = None
for specid, spec in self.fontspec.items():
if fontspec == spec:
if (fontspec['size'] == spec['size'] and
fontspec['family'] == spec['family']):
fontid = specid

# None was found, create a new
# None was found, create a new
if not fontid:
fontid = str(len(self.fontspec)) # start at 0
fontspec['id'] = fontid
Expand Down Expand Up @@ -611,7 +612,6 @@ def txt(element_text):
page.append(box)
# done reading the page
self.append(page)
self.fontspec = self._textdecoder.fontspecs(self.fontspec)
self.log.debug("PDFReader initialized: %d pages, %d fontspecs" %
(len(self), len(self.fontspec)))

Expand All @@ -631,7 +631,7 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
# should be rendered with superscript
if textelements[0].tag is None:
textelements[0].tag = ""
if isinstance(textelements[0], LinkedTextelement):
if isinstance(textelements[0], LinkedTextelement) or textelements[0].tag:
textelements[0].tag += "s"
else:
textelements[0].tag = "sup"
Expand Down Expand Up @@ -776,7 +776,7 @@ def _parse_xml_add_fontspec(self, element, fontinfo, fontspec):
fspec['encoding'] = fontinfo[fspec['family']]['encoding']
if "+" in fspec['family']:
fspec['family'] = fspec['family'].split("+", 1)[1]
fontspec[fontid] = fspec
fontspec[fontid] = self._textdecoder.fontspec(fspec)


def _analyze_font_encodings(self, root, fontinfo):
Expand Down Expand Up @@ -1360,9 +1360,20 @@ def _get_tagname(self):
return "span"

def as_xhtml(self, uri, parent_uri=None):
if self.tag in ("ib", "bi"):
return E(self.tag[0], {},
E(self.tag[1], {}, self.clean_string()))
if self.tag and len(self.tag) > 1 and self.tag != "sup":
# first create a list of elements
tagmap = {"s": "sup",
"b": "b",
"i": "i",
"a": "a"}
tags = [E(tagmap[x]) for x in self.tag]
# then place the text content in the last one
tags[-1].text = self.clean_string()
# then nest them
for idx, tag in enumerate(tags):
if idx < len(tags) - 1:
tag.append(tags[idx+1])
return tags[0]
else:
return super(Textelement, self).as_xhtml(uri, parent_uri)

Expand Down Expand Up @@ -1424,18 +1435,13 @@ def _get_tagname(self):
tagname = property(_get_tagname)

def as_xhtml(self, uri, parent_uri=None):
if self.tag:
taglist = "a" + self.tag
prevtag = self.tag
if self.tag is None:
self.tag = "a"
else:
taglist = "a"
element = None
for tag in reversed(taglist):
if tag == "s": # since every subtag must a single char...
tag = "sup"
if element is None:
element = E(tag, {}, str(self))
else:
element = E(tag, {}, element)
self.tag = "a" + self.tag
element = super(LinkedTextelement, self).as_xhtml(uri, parent_uri)
self.tag = prevtag
element.set("href", self.uri)
return element

Expand All @@ -1453,5 +1459,5 @@ def __init__(self, dummy=None):
def __call__(self, textbox, fontspecs):
# Default decoding step: hands the textbox back unchanged; the
# fontspecs argument is accepted but not used here.
return textbox

def fontspecs(self, fontspecs):
return fontspecs
def fontspec(self, fontspec):
# Per-fontspec hook: receives a single fontspec dict and returns it
# unchanged. The decoder in ferenda/sources/legal/se/decoders.py
# defines a method of the same name that rewrites the 'family' value.
return fontspec
34 changes: 19 additions & 15 deletions ferenda/sources/legal/se/decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __call__(self, textbox, fontspecs):
tag=subpart.tag)
return textbox

def fontspecs(self, fontspecs):
def fontspec(self, fontspec):
# Fonts in Propositioner get handled weirdly by pdf2xml --
# sometimes they come out as "Times New Roman,Italic",
# sometimes they come out as "TimesNewRomanPS-ItalicMT". Might
Expand All @@ -75,20 +75,24 @@ def fontspecs(self, fontspecs):
# be more consistent. NOTE: This might be totally unnecessary
# now that we use PDFAnalyzer to determine likely fonts for
# headers etc.
for key, val in fontspecs.items():
if 'family' in val:
# Times New Roman => TimesNewRomanPSMT
# Times New Roman,Italic => TimesNewRomanPS-ItalicMT
if val['family'] == "Times New Roman":
val['family'] = "TimesNewRomanPSMT"
if val['family'] == "Times New Roman,Italic":
val['family'] = "TimesNewRomanPS-ItalicMT"
# Not 100% sure abt these last two
if val['family'] == "Times New Roman,Bold":
val['family'] = "TimesNewRomanPS-BoldMT"
if val['family'] == "Times New Roman,BoldItalic":
val['family'] = "TimesNewRomanPS-BoldItalicMT"
return fontspecs
if 'family' in fontspec:
# Times New Roman => TimesNewRomanPSMT
# Times New Roman,Italic => TimesNewRomanPS-ItalicMT
if fontspec['family'] == "Times New Roman":
fontspec['family'] = "TimesNewRomanPSMT"
if fontspec['family'] == "Times New Roman,Italic":
fontspec['family'] = "TimesNewRomanPS-ItalicMT"
# Not 100% sure abt these last two
if fontspec['family'] == "Times New Roman,Bold":
fontspec['family'] = "TimesNewRomanPS-BoldMT"
if fontspec['family'] == "Times New Roman,BoldItalic":
fontspec['family'] = "TimesNewRomanPS-BoldItalicMT"
# only found in sou 2003:129 -- uses totally different
# family name for superscripts, but in reality is same
# font.
if fontspec['family'] == "TTA1o00":
fontspec['family'] = "TT5Eo00"
return fontspec



Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/res/xsl/forarbete.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ really tested with direktiv, utredningar (SOU/Ds) and propositioner.


<!-- remove these empty elements (often <i/> or <span/> tags) -->
<xsl:template match="xhtml:span|xhtml:i[not(text())]">
<xsl:template match="xhtml:span|xhtml:i[not(string())]">
</xsl:template>
<!-- default template: translate everything from whatever namespace
it's in (usually the XHTML1.1 NS) into the default namespace
Expand Down
47 changes: 44 additions & 3 deletions test/testPDFReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,15 @@ def test_autodetect_encoding(self):
self.assertEqual("Skälen för regeringens bedömning och förslag",
str(page[2])) # other encoding (0x20




class ParseXML(unittest.TestCase):
maxDiff = None

def _parse_xml(self, xmlfrag):
def _parse_xml(self, xmlfrag, decoding_class=BaseTextDecoder):
pdf = PDFReader(pages=True)
pdf.fontspec = {}
pdf._textdecoder = BaseTextDecoder()
pdf._textdecoder = decoding_class()
xml = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">
<pdf2xml producer="poppler" version="0.24.3">
Expand All @@ -342,6 +343,7 @@ def _parse_xml(self, xmlfrag):
</pdf2xml>""" % xmlfrag
xmlfp = BytesIO(xml.encode("utf-8"))
xmlfp.name = "dummy.xml"

pdf._parse_xml(xmlfp)
return pdf

Expand Down Expand Up @@ -493,6 +495,35 @@ def test_footnote_footer(self):
serialize(pdf[0]))


def test_italic_superscript_unreliable_font(self):
# Font 2 and font 7 really have the same font family, even though
# the PDF reports different family names for them (TT5Eo00 and
# TTA1o00).
# ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
# since the alias is hard-coded in its fontspec() method.
# PDFReader passes every fontspec it parses through the text
# decoder's fontspec() method, so the alias is already in effect
# while the text elements are read. The expected result is that
# the italic superscript "3" lands in the same Textbox as the
# surrounding text, tagged "is" (italic + superscript).
from ferenda.sources.legal.se.decoders import OffsetDecoder1d
pdf = self._parse_xml("""
<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
""", OffsetDecoder1d)
want = """
<Page height="750" number="1" width="500">
<Textbox bottom="569" fontid="2" height="20" left="340" lines="-2" right="815" top="549" width="475">
<Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
<Textelement tag="is">3</Textelement>
<Textelement> får fortsätta </Textelement>
</Textbox>
</Page>
"""
# want[1:] drops the newline that follows the opening triple quote
self.assertEqual(want[1:],
serialize(pdf[0]))

def test_links(self):
pdf = self._parse_xml("""
<fontspec id="6" size="14" family="CNMEID+TradeGothic,Bold" color="#000000"/>
Expand Down Expand Up @@ -658,6 +689,16 @@ def test_linkelements(self):
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">normal<a href="http://example.org/">link</a><sup>footnote marker</sup><a href="http://example.org/"><sup>linked footnote marker</sup></a></p>
"""
self._test_asxhtml(want, body)

def test_superscripts(self):
# Checks XHTML rendering of superscript tags: the plain "sup" tag
# gives a single <sup>, while a multi-character tag such as "is"
# or "bis" gives one nested element per character, outermost
# first ("b" -> <b>, "i" -> <i>, "s" -> <sup>).
body = Textbox([Textelement("1", tag="sup"),
Textelement("2", tag="is"),
Textelement("3", tag="bis")],
top=0, left=0, width=100, height=100, fontid=0)
want = """
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px"><sup>1</sup><i><sup>2</sup></i><b><i><sup>3</sup></i></b></p>
"""
self._test_asxhtml(want, body)



Expand Down

0 comments on commit c1fe656

Please sign in to comment.