Skip to content

Commit

Permalink
new pdfreader.asxhtml tests, fixed other regressions
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Mar 1, 2016
1 parent 92aa756 commit 6a6acd1
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 19 deletions.
10 changes: 6 additions & 4 deletions ferenda/pdfreader.py
Expand Up @@ -1029,17 +1029,19 @@ def as_xhtml(self, uri, parent_uri=None):
first = True
prevpart = None
for subpart in self:
if not first and getattr(subpart, 'tag', None) == getattr(prevpart, 'tag', None):
if not first and subpart.tag == prevpart.tag:
prevpart = prevpart + subpart
elif prevpart:
if not isinstance(prevpart, str):
# make sure Textelements w/o a tag don't render with
# as_xhtml, as this adds a meaningless <span>
if hasattr(prevpart, 'as_xhtml') and prevpart.tag:
prevpart = prevpart.as_xhtml(uri, parent_uri)
children.append(prevpart)
prevpart = subpart
else:
prevpart = subpart
first = False
if not isinstance(prevpart, str):
if hasattr(prevpart, 'as_xhtml') and prevpart.tag:
prevpart = prevpart.as_xhtml(uri, parent_uri)
children.append(prevpart)
element = E("p", {'class': 'textbox'}, *children)
Expand Down Expand Up @@ -1082,7 +1084,7 @@ class Textelement(UnicodeElement):
as a whole is bold (``'b'``), italic (``'i'``), bold + italic
(``'bi'``) or regular (``None``).
"""

def _get_tagname(self):
if self.tag:
return self.tag
Expand Down
11 changes: 6 additions & 5 deletions ferenda/sources/legal/se/propositioner.py
Expand Up @@ -29,11 +29,12 @@
class PropAnalyzer(PDFAnalyzer):
def documents(self):
for page in self.pdf:
determine dominant font:
if EUAlbertina:
currentdoc = 'eudok'
else:
currentdoc = 'main'
pass
# determine dominant font:
# if EUAlbertina:
# currentdoc = 'eudok'
# else:
# currentdoc = 'main'

def metrics(self, metricspath=None, plotpath=None, startpage=0,
pagecount=None, force=False):
Expand Down
1 change: 0 additions & 1 deletion ferenda/sources/legal/se/swedishlegalsource.py
Expand Up @@ -859,7 +859,6 @@ def infer_identifier(self, basefile):
def postprocess_doc(self, doc):
"""Do any last-minute postprocessing (mainly used to add extra
metadata from doc.body to doc.head)"""
from pudb import set_trace; set_trace()
pass

def get_url_transform_func(self, repos=None, basedir=None, develurl=None):
Expand Down
3 changes: 1 addition & 2 deletions ferenda/wsgiapp.py
Expand Up @@ -236,8 +236,7 @@ def static(self, environ, start_response):
fp = open(fullpath, "rb")
iterdata = FileWrapper(fp)
else:
msg = """
<h1>404</h1>
msg = """<h1>404</h1>
The path %s not found at %s.
Expand Down
2 changes: 1 addition & 1 deletion test/testPDFAnalyze.py
Expand Up @@ -25,7 +25,7 @@ def setUp(self):
self.analyzer = PDFAnalyzer(self.pdf)

def test_documents(self):
self.assertEquals([(0,3)], self.analyzer.documents())
self.assertEquals([(0,3, 'main')], self.analyzer.documents())

def test_vcounters(self):
vcounters = self.analyzer.count_vertical_margins(0, 3)
Expand Down
47 changes: 42 additions & 5 deletions test/testPDFReader.py
Expand Up @@ -12,12 +12,15 @@
import shutil
import tempfile

from lxml import etree

from ferenda.compat import unittest
from ferenda import errors, util
from ferenda.testutil import FerendaTestCase

# SUT
from ferenda import PDFReader

from ferenda.pdfreader import Textbox, Textelement

class Read(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -231,13 +234,47 @@ def test_custom_encoding(self):
class AsXHTML(unittest.TestCase, FerendaTestCase):

def _test_asxhtml(self, want, body):
uri = "http://localhost:8000/res/base/basefile"
got = etree.tostring(body.as_xhtml(uri), pretty_print=True)
got = etree.tostring(body.as_xhtml(None), pretty_print=True)
self.assertEqualXML(want, got)

def test_basic(self):
body = Textbox([Textelement(["test"])])
body = Textbox([Textelement("test", tag=None)],
top=0, left=0, width=100, height=100, fontid=0)
want = """
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox" style="top: 0px, left: 0px, height: 100px, width: 100px">test</p>
"""
self._test_asxhtml(want, body)

def test_elements_with_tags(self):
body = Textbox([Textelement("normal", tag=None),
Textelement("bold", tag="b"),
Textelement("italic", tag="i"),
Textelement("both", tag="bi")
], top=0, left=0, width=100, height=100, fontid=0)
want = """
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox" style="top: 0px, left: 0px, height: 100px, width: 100px">normal<b>bold</b><i>italic</i><b><i>both</i></b></p>
"""
self._test_asxhtml(want, body)


def test_leading_tag(self):
body = Textbox([Textelement("bold", tag="b"),
Textelement("normal", tag=None),
], top=0, left=0, width=100, height=100, fontid=0)
want = """
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox" style="top: 0px, left: 0px, height: 100px, width: 100px"><b>bold</b>normal</p>
"""
self._test_asxhtml(want, body)

def test_tag_merge(self):
body = Textbox([Textelement("identical ", tag=None),
Textelement("tags ", tag=None),
Textelement("should ", tag="b"),
Textelement("merge", tag="b"),
], top=0, left=0, width=100, height=100, fontid=0)
want = """
<p>Test</p>
<p xmlns="http://www.w3.org/1999/xhtml" class="textbox" style="top: 0px, left: 0px, height: 100px, width: 100px">identical tags <b>should merge</b></p>
"""
self._test_asxhtml(want, body)


2 changes: 1 addition & 1 deletion test/testWSGI.py
Expand Up @@ -147,7 +147,7 @@ def test_index_html(self):
def test_not_found(self):
self.env['PATH_INFO'] = '/nonexistent'
status, headers, content = self.call_wsgi(self.env)
msg = '<h1>404</h1>The path /nonexistent not found at %s/nonexistent' % self.datadir
msg = '<h1>404</h1>\n\nThe path /nonexistent not found at %s/nonexistent.\n\nExamined 1 repos.' % self.datadir
self.assertResponse("404 Not Found",
{'Content-Type': 'text/html'},
msg.encode(),
Expand Down

0 comments on commit 6a6acd1

Please sign in to comment.