Skip to content

Commit

Permalink
facsimile page generation now works when source document is a non-pdf …
Browse files Browse the repository at this point in the history
…document (e.g. a WPD file)
  • Loading branch information
staffanm committed Feb 1, 2018
1 parent d73cd6b commit cb2d53e
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 29 deletions.
13 changes: 8 additions & 5 deletions ferenda/pdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ def __init__(self,
else:
suffix = ".xml"
converter = self._pdftohtml
converter_extra = {'images': images}
converter_extra = {'images': images,
'keeppdffile': convert_to_pdf}
parser = self._parse_xml
convertedfile = os.sep.join([workdir, stem + suffix])
if keep_xml == "bz2":
Expand Down Expand Up @@ -275,7 +276,7 @@ def _tesseract(self, pdffile, workdir, lang, hocr=True):
"%(workdir)s/%(root)s%(suffix)s.html" % locals())
shutil.rmtree(tmpdir)

def _pdftohtml(self, tmppdffile, workdir, images):
def _pdftohtml(self, tmppdffile, workdir, images, keeppdffile):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
try:
if images:
Expand Down Expand Up @@ -342,8 +343,9 @@ def _pdftohtml(self, tmppdffile, workdir, images):
# print("3: ran %s (%s), stdout %r, stderr %r" % (cmd, returncode, stdout, stderr))
# print("contents of %s is now %r" % (workdir, os.listdir(workdir)))
finally:
os.unlink(tmppdffile)
assert not os.path.exists(tmppdffile), "tmppdffile still there:" + tmppdffile
if not keeppdffile:
os.unlink(tmppdffile)
assert not os.path.exists(tmppdffile), "tmppdffile still there:" + tmppdffile

dims = r"bbox (?P<left>\d+) (?P<top>\d+) (?P<right>\d+) (?P<bottom>\d+)"
re_dimensions = re.compile(dims).search
Expand Down Expand Up @@ -1010,7 +1012,8 @@ def convert(self, filename, workdir=None, images=True,
tmpfilename = filename
else:
converter = self._pdftohtml
converter_extra = {'images': images}
converter_extra = {'images': images,
'keeppdffile': convert_to_pdf}
tmpfilename = os.sep.join([workdir, os.path.basename(filename)])

# copying the filename to the workdir is only needed if we use
Expand Down
6 changes: 5 additions & 1 deletion ferenda/requesthandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
from wsgiref.util import request_uri
import re
import os
import sys
from io import BytesIO
from functools import partial
from urllib.parse import urlparse, unquote, parse_qsl
import mimetypes
import traceback

from rdflib import Graph
from ferenda.thirdparty import httpheader
Expand Down Expand Up @@ -254,6 +256,7 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):

if "dir" in params:
method = {'downloaded': repo.store.downloaded_path,
'intermediate': repo.store.intermediate_path,
'parsed': repo.store.parsed_path}[params["dir"]]
if "page" in params and "format" in params:
baseparam = "-size 400x300 -pointsize 12 -gravity center"
Expand Down Expand Up @@ -289,7 +292,8 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
if not baseattach:
baseattach = "page_error.png"
outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
errormsg = str(e).replace("\n", "\\n").replace("'", "\\'")
errormsg = "%s\n%s: %s" % ("".join(traceback.format_tb(sys.exc_info()[2])), e.__class__.__name__, str(e))
errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
cmdline = 'convert label:"%s" %s' % (errormsg, outfile)
util.runcmd(cmdline, require_success=True)
method = partial(repo.store.intermediate_path, attachment=baseattach)
Expand Down
7 changes: 6 additions & 1 deletion ferenda/sources/legal/se/fixedlayoutsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
else:
repo = self.repo
params['repo'] = repo.alias
params['dir'] = "downloaded"
pagemapping_path = repo.store.path(basefile, 'intermediate','.pagemapping.json')
with open(pagemapping_path) as fp:
pagemap = json.load(fp)
Expand All @@ -62,6 +61,12 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
invertedmap[v] = k
attachment, pp = invertedmap[pageno].split("#page=")
params['attachment'] = attachment
for candidatedir in ('downloaded', 'intermediate'):
if os.path.exists(repo.store.path(basefile, candidatedir, '.dummy', attachment=attachment)):
params['dir'] = candidatedir
break
else:
raise RequestHandlerError("%s: Cannot find %s in any %s directory" % (basefile, attachment, repo.alias))
params['page'] = str(int(pp) - 1) # pp is 1-based, but RequestHandler.get_pathfunc expects 0-based
params['format'] = 'png'
return super(FixedLayoutHandler, self).get_pathfunc(environ, basefile, params, contenttype, suffix)
Expand Down
41 changes: 22 additions & 19 deletions ferenda/sources/legal/se/trips.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,26 +120,29 @@ def download_get_basefiles_page(self, soup):
# return super(Trips, self).download_single(basefile)
#
def download_is_different(self, existing, new):
# load both existing and new into a BeautifulSoup object, then
# compare the first <pre> element
existing_soup = BeautifulSoup(
util.readfile(
existing,
encoding=self.source_encoding), "lxml")
new_soup = BeautifulSoup(util.readfile(new, encoding=self.source_encoding), "lxml")
# exactly how we determine difference depends on page
# type. SFS register pages must be handled differently.
if "/register/" in existing:
existing = existing_soup.find("div", "search-results-content")
new = new_soup.find("div", "search-results-content")
if existing.endswith(".html"):
# load both existing and new into a BeautifulSoup object, then
# compare the first <pre> element
existing_soup = BeautifulSoup(
util.readfile(
existing,
encoding=self.source_encoding), "lxml")
new_soup = BeautifulSoup(util.readfile(new, encoding=self.source_encoding), "lxml")
# exactly how we determine difference depends on page
# type. SFS register pages must be handled differently.
if "/register/" in existing:
existing = existing_soup.find("div", "search-results-content")
new = new_soup.find("div", "search-results-content")
else:
existing = existing_soup.find("div", "body-text")
new = new_soup.find("div", "body-text")
assert new, "new file (compared to %s) has no expected content" % existing
try:
return existing != new
except RuntimeError: # can happen with at least v4.4.1 of beautifulsoup
return True
else:
existing = existing_soup.find("div", "body-text")
new = new_soup.find("div", "body-text")
assert new, "new file (compared to %s) has no expected content" % existing
try:
return existing != new
except RuntimeError: # can happen with at least v4.4.1 of beautifulsoup
return True
return super(Trips, self).download_is_different(existing, new)

def remote_url(self, basefile):
return self.document_url_template % {'basefile': quote(basefile)}
Expand Down
1 change: 0 additions & 1 deletion test/integrationLagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,6 @@ def test_format(self):
self.assertTrue("<pre>" not in res.text)

def test_missing_pages(self):
import pudb; pu.db
# issue 5: "I prop. 1992/93:30 saknas s. 18–30. Prop. 1996/97:106 är ofullständig (har bara två sidor)"
for urlseg in ("prop/1992/93:30",
"prop/1996/97:106",
Expand Down
4 changes: 2 additions & 2 deletions tools/fix-fuseki.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sudo launchctl unload /Library/LaunchDaemons/org.apache.jena.fuskei.plist
sudo rm /opt/apache-jena-fuseki-2.3.1/run/databases/lagen/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.3.1/run/system/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.6.0/run/databases/lagen/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.6.0/run/system/tdb.lock
sudo launchctl load /Library/LaunchDaemons/org.apache.jena.fuskei.plist

0 comments on commit cb2d53e

Please sign in to comment.