handle superscript + bold/italic formatting, and more

staffanm · Aug 6, 2017 · c1fe656 · c1fe656
1 parent 133ebd3
commit c1fe656
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 42 deletions.
diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py
@@ -2362,7 +2362,6 @@ def generate(self, basefile, otherrepos=[]):
                               {'basefile': basefile}):
                 conffile = os.path.abspath(
                     os.sep.join([self.config.datadir, 'rsrc', 'resources.xml']))
-
                 if self.xslt_template.startswith("/"):
                     templatedir = "."
                 elif "/" in self.xslt_template:

diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py
@@ -417,10 +417,11 @@ def dimensions(s):
                 # find any previous definition of this fontspec
                 fontid = None
                 for specid, spec in self.fontspec.items():
-                    if fontspec == spec:
+                    if (fontspec['size'] == spec['size'] and
+                        fontspec['family'] == spec['family']):
                         fontid = specid
-
-                        # None was found, create a new
+                        
+                # None was found, create a new
                 if not fontid:
                     fontid = str(len(self.fontspec))  # start at 0
                     fontspec['id'] = fontid
@@ -611,7 +612,6 @@ def txt(element_text):
                     page.append(box)
             # done reading the page
             self.append(page)
-        self.fontspec = self._textdecoder.fontspecs(self.fontspec)
         self.log.debug("PDFReader initialized: %d pages, %d fontspecs" %
                        (len(self), len(self.fontspec)))
 
@@ -631,7 +631,7 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
                 # should be rendered with superscript
                 if textelements[0].tag is None:
                     textelements[0].tag = ""
-                if isinstance(textelements[0], LinkedTextelement):
+                if isinstance(textelements[0], LinkedTextelement) or textelements[0].tag:
                     textelements[0].tag += "s"
                 else:
                     textelements[0].tag = "sup"
@@ -776,7 +776,7 @@ def _parse_xml_add_fontspec(self, element, fontinfo, fontspec):
             fspec['encoding'] = fontinfo[fspec['family']]['encoding']
         if "+" in fspec['family']:
             fspec['family'] = fspec['family'].split("+", 1)[1]
-        fontspec[fontid] = fspec
+        fontspec[fontid] = self._textdecoder.fontspec(fspec)
 
 
     def _analyze_font_encodings(self, root, fontinfo):
@@ -1360,9 +1360,20 @@ def _get_tagname(self):
             return "span"
 
     def as_xhtml(self, uri, parent_uri=None):
-        if self.tag in ("ib", "bi"):
-            return E(self.tag[0], {},
-                     E(self.tag[1], {}, self.clean_string()))
+        if self.tag and len(self.tag) > 1 and self.tag != "sup":
+            # first create a list of elements
+            tagmap = {"s": "sup",
+                      "b": "b",
+                      "i": "i",
+                      "a": "a"}
+            tags = [E(tagmap[x]) for x in self.tag]
+            # then place the text content in the last one
+            tags[-1].text = self.clean_string()
+            # then nest them
+            for idx, tag in enumerate(tags):
+                if idx < len(tags) - 1:
+                    tag.append(tags[idx+1])
+            return tags[0]
         else:
             return super(Textelement, self).as_xhtml(uri, parent_uri)
 
@@ -1424,18 +1435,13 @@ def _get_tagname(self):
     tagname = property(_get_tagname)
 
     def as_xhtml(self, uri, parent_uri=None):
-        if self.tag:
-            taglist = "a" + self.tag
+        prevtag = self.tag
+        if self.tag is None:
+            self.tag = "a"
         else:
-            taglist = "a"
-        element = None
-        for tag in reversed(taglist):
-            if tag == "s":  # since every subtag must a single char...
-                tag = "sup"
-            if element is None:
-                element = E(tag, {}, str(self))
-            else:
-                element = E(tag, {}, element)
+            self.tag = "a" + self.tag
+        element = super(LinkedTextelement, self).as_xhtml(uri, parent_uri)
+        self.tag = prevtag
         element.set("href", self.uri)
         return element
 
@@ -1453,5 +1459,5 @@ def __init__(self, dummy=None):
     def __call__(self, textbox, fontspecs):
         return textbox
 
-    def fontspecs(self, fontspecs):
-        return fontspecs
+    def fontspec(self, fontspec):
+        return fontspec
diff --git a/ferenda/sources/legal/se/decoders.py b/ferenda/sources/legal/se/decoders.py
@@ -66,7 +66,7 @@ def __call__(self, textbox, fontspecs):
                                            tag=subpart.tag)
         return textbox
 
-    def fontspecs(self, fontspecs):
+    def fontspec(self, fontspec):
         # Fonts in Propositioner get handled weirdly by pdf2xml --
         # sometimes they come out as "Times New Roman,Italic",
         # sometimes they come out as "TimesNewRomanPS-ItalicMT". Might
@@ -75,20 +75,24 @@ def fontspecs(self, fontspecs):
         # be more consistent. NOTE: This might be totally unnecessary
         # now that we use PDFAnalyzer to determine likely fonts for
         # headers etc.
-        for key, val in fontspecs.items():
-            if 'family' in val:
-                # Times New Roman => TimesNewRomanPSMT
-                # Times New Roman,Italic => TimesNewRomanPS-ItalicMT
-                if val['family'] == "Times New Roman":
-                    val['family'] = "TimesNewRomanPSMT"
-                if val['family'] == "Times New Roman,Italic":
-                    val['family'] = "TimesNewRomanPS-ItalicMT"
-                # Not 100% sure abt these last two
-                if val['family'] == "Times New Roman,Bold":
-                    val['family'] = "TimesNewRomanPS-BoldMT"
-                if val['family'] == "Times New Roman,BoldItalic":
-                    val['family'] = "TimesNewRomanPS-BoldItalicMT"
-        return fontspecs
+        if 'family' in fontspec:
+            # Times New Roman => TimesNewRomanPSMT
+            # Times New Roman,Italic => TimesNewRomanPS-ItalicMT
+            if fontspec['family'] == "Times New Roman":
+                fontspec['family'] = "TimesNewRomanPSMT"
+            if fontspec['family'] == "Times New Roman,Italic":
+                fontspec['family'] = "TimesNewRomanPS-ItalicMT"
+            # Not 100% sure abt these last two
+            if fontspec['family'] == "Times New Roman,Bold":
+                fontspec['family'] = "TimesNewRomanPS-BoldMT"
+            if fontspec['family'] == "Times New Roman,BoldItalic":
+                fontspec['family'] = "TimesNewRomanPS-BoldItalicMT"
+            # only found in sou 2003:129 -- uses totally different
+            # family name for superscripts, but in reality is same
+            # font.
+            if fontspec['family'] == "TTA1o00":  
+                fontspec['family'] = "TT5Eo00"
+        return fontspec
 
 
 

diff --git a/ferenda/sources/legal/se/res/xsl/forarbete.xsl b/ferenda/sources/legal/se/res/xsl/forarbete.xsl
@@ -273,7 +273,7 @@ really tested with direktiv, utredningar (SOU/Ds) and propositioner.
 
 
   <!-- remove these empty elements (often <i/> or <span/> tags) -->
-  <xsl:template match="xhtml:span|xhtml:i[not(text())]">
+  <xsl:template match="xhtml:span|xhtml:i[not(string())]">
   </xsl:template>
   <!-- default template: translate everything from whatever namespace
        it's in (usually the XHTML1.1 NS) into the default namespace

diff --git a/test/testPDFReader.py b/test/testPDFReader.py
@@ -325,14 +325,15 @@ def test_autodetect_encoding(self):
         self.assertEqual("Skälen för regeringens bedömning och förslag",
                          str(page[2]))         # other encoding (0x20
 
-
+
+
 class ParseXML(unittest.TestCase):
     maxDiff = None
 
-    def _parse_xml(self, xmlfrag):
+    def _parse_xml(self, xmlfrag, decoding_class=BaseTextDecoder):
         pdf = PDFReader(pages=True)
         pdf.fontspec = {}
-        pdf._textdecoder = BaseTextDecoder()
+        pdf._textdecoder = decoding_class()
         xml = """<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">
 <pdf2xml producer="poppler" version="0.24.3">
@@ -342,6 +343,7 @@ def _parse_xml(self, xmlfrag):
 </pdf2xml>""" % xmlfrag
         xmlfp = BytesIO(xml.encode("utf-8"))
         xmlfp.name = "dummy.xml"
+
         pdf._parse_xml(xmlfp)
         return pdf
 
@@ -493,6 +495,35 @@ def test_footnote_footer(self):
                          serialize(pdf[0]))
 
 
+    def test_italic_superscript_unreliable_font(self):
+        # the thing here is that font 2 and font 7 really have the same
+        # font family.
+        # ferenda.sources.legal.se.decoders.OffsetDecoder1d knows this
+        # since it's hard-coded. The main problem is that the
+        # OffsetDecoder1d.fontspecs method (that aliases the fonts)
+        # is run after PDFReader._parse_xml. Maybe we need to make
+        # ._parse_xml call into the given textdecoder for each
+        # fontspec tag it encounters?
+        from ferenda.sources.legal.se.decoders import OffsetDecoder1d
+        pdf = self._parse_xml("""
+<fontspec id="2" size="14" family="MAMMBB+TT5Eo00" color="#000000"/>
+<fontspec id="7" size="7" family="MBAAAC+TTA1o00" color="#000000"/>
+<text top="552" left="340" width="372" height="17" font="2">intressant om 50 år föreslås att projektet Kulturarw</text>
+<text top="549" left="712" width="5" height="13" font="7"><i>3</i></text>
+<text top="552" left="717" width="98" height="17" font="2"> får fortsätta </text>
+        """, OffsetDecoder1d)
+        want = """
+<Page height="750" number="1" width="500">
+  <Textbox bottom="569" fontid="2" height="20" left="340" lines="-2" right="815" top="549" width="475">
+    <Textelement>intressant om 50 år föreslås att projektet Kulturarw</Textelement>
+    <Textelement tag="is">3</Textelement>
+    <Textelement> får fortsätta </Textelement>
+  </Textbox>
+</Page>
+"""
+        self.assertEqual(want[1:],
+                         serialize(pdf[0]))
+
     def test_links(self):
         pdf = self._parse_xml("""
 <fontspec id="6" size="14" family="CNMEID+TradeGothic,Bold" color="#000000"/>
@@ -658,6 +689,16 @@ def test_linkelements(self):
 <p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px">normal<a href="http://example.org/">link</a><sup>footnote marker</sup><a href="http://example.org/"><sup>linked footnote marker</sup></a></p>
 """
         self._test_asxhtml(want, body)
+
+    def test_superscripts(self):
+        body = Textbox([Textelement("1", tag="sup"),
+                        Textelement("2", tag="is"),
+                        Textelement("3", tag="bis")],
+                       top=0, left=0, width=100, height=100, fontid=0)
+        want = """
+<p xmlns="http://www.w3.org/1999/xhtml" class="textbox fontspec0" style="top: 0px; left: 0px; height: 100px; width: 100px"><sup>1</sup><i><sup>2</sup></i><b><i><sup>3</sup></i></b></p>
+"""
+        self._test_asxhtml(want, body)