Skip to content

Commit

Permalink
facsimile page generation now works when source document is a non-pdf …
Browse files Browse the repository at this point in the history
…document (e.g. a WPD file)
  • Loading branch information
staffanm committed Feb 1, 2018
1 parent d73cd6b commit cb2d53e
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 29 deletions.
13 changes: 8 additions & 5 deletions ferenda/pdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ def __init__(self,
else:
suffix = ".xml"
converter = self._pdftohtml
converter_extra = {'images': images}
converter_extra = {'images': images,
'keeppdffile': convert_to_pdf}
parser = self._parse_xml
convertedfile = os.sep.join([workdir, stem + suffix])
if keep_xml == "bz2":
Expand Down Expand Up @@ -275,7 +276,7 @@ def _tesseract(self, pdffile, workdir, lang, hocr=True):
"%(workdir)s/%(root)s%(suffix)s.html" % locals())
shutil.rmtree(tmpdir)

def _pdftohtml(self, tmppdffile, workdir, images):
def _pdftohtml(self, tmppdffile, workdir, images, keeppdffile):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
try:
if images:
Expand Down Expand Up @@ -342,8 +343,9 @@ def _pdftohtml(self, tmppdffile, workdir, images):
# print("3: ran %s (%s), stdout %r, stderr %r" % (cmd, returncode, stdout, stderr))
# print("contents of %s is now %r" % (workdir, os.listdir(workdir)))
finally:
os.unlink(tmppdffile)
assert not os.path.exists(tmppdffile), "tmppdffile still there:" + tmppdffile
if not keeppdffile:
os.unlink(tmppdffile)
assert not os.path.exists(tmppdffile), "tmppdffile still there:" + tmppdffile

dims = r"bbox (?P<left>\d+) (?P<top>\d+) (?P<right>\d+) (?P<bottom>\d+)"
re_dimensions = re.compile(dims).search
Expand Down Expand Up @@ -1010,7 +1012,8 @@ def convert(self, filename, workdir=None, images=True,
tmpfilename = filename
else:
converter = self._pdftohtml
converter_extra = {'images': images}
converter_extra = {'images': images,
'keeppdffile': convert_to_pdf}
tmpfilename = os.sep.join([workdir, os.path.basename(filename)])

# copying the filename to the workdir is only needed if we use
Expand Down
6 changes: 5 additions & 1 deletion ferenda/requesthandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
from wsgiref.util import request_uri
import re
import os
import sys
from io import BytesIO
from functools import partial
from urllib.parse import urlparse, unquote, parse_qsl
import mimetypes
import traceback

from rdflib import Graph
from ferenda.thirdparty import httpheader
Expand Down Expand Up @@ -254,6 +256,7 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):

if "dir" in params:
method = {'downloaded': repo.store.downloaded_path,
'intermediate': repo.store.intermediate_path,
'parsed': repo.store.parsed_path}[params["dir"]]
if "page" in params and "format" in params:
baseparam = "-size 400x300 -pointsize 12 -gravity center"
Expand Down Expand Up @@ -289,7 +292,8 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
if not baseattach:
baseattach = "page_error.png"
outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
errormsg = str(e).replace("\n", "\\n").replace("'", "\\'")
errormsg = "%s\n%s: %s" % ("".join(traceback.format_tb(sys.exc_info()[2])), e.__class__.__name__, str(e))
errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
cmdline = 'convert label:"%s" %s' % (errormsg, outfile)
util.runcmd(cmdline, require_success=True)
method = partial(repo.store.intermediate_path, attachment=baseattach)
Expand Down
7 changes: 6 additions & 1 deletion ferenda/sources/legal/se/fixedlayoutsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
else:
repo = self.repo
params['repo'] = repo.alias
params['dir'] = "downloaded"
pagemapping_path = repo.store.path(basefile, 'intermediate','.pagemapping.json')
with open(pagemapping_path) as fp:
pagemap = json.load(fp)
Expand All @@ -62,6 +61,12 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
invertedmap[v] = k
attachment, pp = invertedmap[pageno].split("#page=")
params['attachment'] = attachment
for candidatedir in ('downloaded', 'intermediate'):
if os.path.exists(repo.store.path(basefile, candidatedir, '.dummy', attachment=attachment)):
params['dir'] = candidatedir
break
else:
raise RequestHandlerError("%s: Cannot find %s in any %s directory" % (basefile, attachment, repo.alias))
params['page'] = str(int(pp) - 1) # pp is 1-based, but RequestHandler.get_pathfunc expects 0-based
params['format'] = 'png'
return super(FixedLayoutHandler, self).get_pathfunc(environ, basefile, params, contenttype, suffix)
Expand Down
41 changes: 22 additions & 19 deletions ferenda/sources/legal/se/trips.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,26 +120,29 @@ def download_get_basefiles_page(self, soup):
# return super(Trips, self).download_single(basefile)
#
def download_is_different(self, existing, new):
# load both existing and new into a BeautifulSoup object, then
# compare the first <pre> element
existing_soup = BeautifulSoup(
util.readfile(
existing,
encoding=self.source_encoding), "lxml")
new_soup = BeautifulSoup(util.readfile(new, encoding=self.source_encoding), "lxml")
# exactly how we determine difference depends on page
# type. SFS register pages must be handled differently.
if "/register/" in existing:
existing = existing_soup.find("div", "search-results-content")
new = new_soup.find("div", "search-results-content")
if existing.endswith(".html"):
# load both existing and new into a BeautifulSoup object, then
# compare the first <pre> element
existing_soup = BeautifulSoup(
util.readfile(
existing,
encoding=self.source_encoding), "lxml")
new_soup = BeautifulSoup(util.readfile(new, encoding=self.source_encoding), "lxml")
# exactly how we determine difference depends on page
# type. SFS register pages must be handled differently.
if "/register/" in existing:
existing = existing_soup.find("div", "search-results-content")
new = new_soup.find("div", "search-results-content")
else:
existing = existing_soup.find("div", "body-text")
new = new_soup.find("div", "body-text")
assert new, "new file (compared to %s) has no expected content" % existing
try:
return existing != new
except RuntimeError: # can happen with at least v4.4.1 of beautifulsoup
return True
else:
existing = existing_soup.find("div", "body-text")
new = new_soup.find("div", "body-text")
assert new, "new file (compared to %s) has no expected content" % existing
try:
return existing != new
except RuntimeError: # can happen with at least v4.4.1 of beautifulsoup
return True
return super(Trips, self).download_is_different(existing, new)

def remote_url(self, basefile):
return self.document_url_template % {'basefile': quote(basefile)}
Expand Down
1 change: 0 additions & 1 deletion test/integrationLagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,6 @@ def test_format(self):
self.assertTrue("<pre>" not in res.text)

def test_missing_pages(self):
import pudb; pu.db
# issue 5: "I prop. 1992/93:30 saknas s. 18–30. Prop. 1996/97:106 är ofullständig (har bara två sidor)"
for urlseg in ("prop/1992/93:30",
"prop/1996/97:106",
Expand Down
4 changes: 2 additions & 2 deletions tools/fix-fuseki.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sudo launchctl unload /Library/LaunchDaemons/org.apache.jena.fuskei.plist
sudo rm /opt/apache-jena-fuseki-2.3.1/run/databases/lagen/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.3.1/run/system/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.6.0/run/databases/lagen/tdb.lock
sudo rm /opt/apache-jena-fuseki-2.6.0/run/system/tdb.lock
sudo launchctl load /Library/LaunchDaemons/org.apache.jena.fuskei.plist

0 comments on commit cb2d53e

Please sign in to comment.