replaced the tangled mess that was DocumentRepository.transform_html (and friends) with a new Transformer class
staffanm committed Sep 13, 2013
1 parent a5e8a41 commit a7cb810
Showing 13 changed files with 314 additions and 483 deletions.
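For reference, a minimal sketch of the new call pattern, pieced together from the call sites in this diff (ferenda/manager.py). The stylesheet name and paths are illustrative, not taken from the commit:

import os
from ferenda import Transformer

# Hypothetical paths; only the constructor and transform_file signatures
# below come from this diff.
docroot = "data"
conffile = os.sep.join([docroot, "rsrc", "resources.xml"])
transformer = Transformer('XSLT', "res/xsl/generic.xsl", ["res/xsl"],
                          config=conffile,
                          documentroot=docroot)
# The directory depth of outfile relative to documentroot decides how many
# "../" prefixes end up on the CSS/JS links in the generated HTML.
transformer.transform_file("data/index.xhtml", "data/index.html")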
2 changes: 1 addition & 1 deletion ferenda/__init__.py
@@ -14,11 +14,11 @@
from .tocpage import TocPage
from .toccriteria import TocCriteria
from .newscriteria import NewsCriteria
from .transformer import Transformer
from .document import Document
from .documententry import DocumentEntry
from .documentstore import DocumentStore
from .documentrepository import DocumentRepository
from .pdfdocumentrepository import PDFDocumentRepository
from .compositerepository import CompositeRepository
from .transformer import Transformer
__version__ = "0.1.5-dev" #gets pulled into setup.py and docs/conf.py
2 changes: 2 additions & 0 deletions ferenda/devel.py
@@ -31,6 +31,8 @@ class Devel(object):
class DummyStore(object):
def __init__(self, path, **kwargs):
pass
def list_basefiles_for(self, action, basedir=None):
return []
downloaded_suffix = ".html"
storage_policy = "file"
documentstore_class = DummyStore
348 changes: 86 additions & 262 deletions ferenda/documentrepository.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ferenda/layeredconfig.py
@@ -6,6 +6,7 @@
import logging
from ferenda.compat import OrderedDict
from six.moves import configparser
from six import text_type as str

class LayeredConfig(object):
"""Provide unified access to nested configuration parameters. The
60 changes: 27 additions & 33 deletions ferenda/manager.py
@@ -13,47 +13,44 @@
from __future__ import unicode_literals
# system
import os
import time
import stat
import subprocess
import sys
import inspect
import itertools
import logging
import json
import mimetypes
from ast import literal_eval
from datetime import datetime
import xml.etree.cElementTree as ET
import cgi
from ferenda.compat import OrderedDict
from wsgiref.simple_server import make_server
from wsgiref.util import FileWrapper

import six
from six.moves.urllib_parse import urlsplit, parse_qsl, urlencode
from six.moves import configparser
input = six.moves.input

from wsgiref.simple_server import make_server
from wsgiref.util import FileWrapper
# from pprint import pprint

# 3rd party
import pkg_resources
import requests
import requests.exceptions
from rdflib import URIRef, Namespace, Literal
from bs4 import BeautifulSoup
from lxml import etree

# my modules
from ferenda import DocumentRepository
from ferenda import DocumentStore
from ferenda import FulltextIndex
from ferenda import LayeredConfig
from ferenda import Transformer
from ferenda import TripleStore
from ferenda import elements
from ferenda.elements import html
from ferenda import errors
from ferenda import util
from ferenda.elements import html

# NOTE: This is part of the published API and must be callable in
# scenarios without configfile or logger.
@@ -356,10 +353,16 @@ def frontpage(repos,
xhtml_path = os.path.splitext(path)[0] + ".xhtml"
with open(xhtml_path,"w") as fp:
fp.write(xhtml)

xsltdir = repos[0].setup_transform_templates(os.path.dirname(stylesheet), stylesheet)
params = repos[0].get_transform_configuration(xsltdir,xhtml_path)
repos[0].transform_html(xsltdir+"/"+os.path.basename(stylesheet), xhtml_path, path, params, otherrepos=repos)
# FIXME: We don't need to actually store the xhtml file on
# disk -- we could just keep it in memory as an lxml tree and
# call .transform(tree) just like
# DocumentRepository.toc_create_page does
docroot = os.path.dirname(path)
conffile = os.sep.join([docroot,'rsrc','resources.xml'])
transformer = Transformer('XSLT', stylesheet, ["res/xsl"],
config=conffile,
documentroot=docroot)
transformer.transform_file(xhtml_path, path)
return True


@@ -481,26 +484,13 @@ def _wsgi_search(environ, start_response, args):
'href':url}))
doc.body.append(html.Div(pages, **{'class':'pager'}))
# Transform that XHTML into HTML5

# FIXME: this way of transforming a etree to HTML5 is way too
# complicated, dependent on args['repo'][0], stores temporary
# files on disk for no good reason, abuses
# get_transform_configuration with a fake path and duplicates code
xsltfile = "res/xsl/search.xsl"
tmpfile = args['documentroot']+"/_searchtmp-%s.xhtml" % os.getpid()
outfile = args['documentroot']+"/_searchtmp-%s.html" % os.getpid()
# create a fake path so that get_transform_configuration selects a
# resources.xml with correct relative path
depth = len(list(filter(None,args['searchendpoint'].split("/"))))
fake_outfile = "%s/%s/search.html" % (args['documentroot'],
"/".join(["fake"]*depth))
xsltdir = repo.setup_transform_templates(os.path.dirname(xsltfile), xsltfile)
params = repo.get_transform_configuration(xsltdir,fake_outfile)

repo.render_xhtml(doc,tmpfile)
repo.transform_html(xsltdir + "/search.xsl",
tmpfile, outfile, params, args['repos'][1:])
data = util.readfile(outfile,"rb")
conffile = os.sep.join([args['documentroot'],'rsrc','resources.xml'])
transformer = Transformer('XSLT', "res/xsl/search.xsl", ["res/xsl"],
config=conffile)
depth = len(args['searchendpoint'].split("/")) - 2 # '/mysearch/' = depth 1
repo = DocumentRepository()
tree = transformer.transform(repo.render_xhtml_tree(doc), depth)
data = transformer.t.html5_doctype_workaround(etree.tostring(tree))
start_response("200 OK", [
("Content-Type", "text/html; charset=utf-8"),
("Content-Length", str(len(data)))
@@ -1127,7 +1117,11 @@ def _list_class_usage(cls):
for attrname in dir(cls):
attr = getattr(cls, attrname)
if hasattr(attr, "runnable"):
res[attr.__name__] = attr.__doc__.split("\n")[0]
doc = attr.__doc__
if doc:
res[attr.__name__] = doc.split("\n")[0]
else:
res[attr.__name__] = "(Undocumented)"
return res


2 changes: 1 addition & 1 deletion ferenda/res/xsl/base.xsl
@@ -23,7 +23,7 @@
<xsl:template match="/">
<!-- this is an ugly workaround required to get the proper html5
doctype *and* the pre-rootnode conditional IE comments needed
for the h5bp template. Strip the <fakeroot> start and end
for the h5bp template. Strip the <remove-this-tag> start and end
tags as a postprocessing step. -->
<remove-this-tag>
<xsl:apply-templates/>
124 changes: 89 additions & 35 deletions ferenda/transformer.py
@@ -4,23 +4,29 @@
from tempfile import mkdtemp
import os
import shutil
import re

import pkg_resources
from lxml import etree
from lxml.etree import XSLT

from ferenda import errors, util

# assumption: A transformer is initialized with a single template. If
# you want to use a different template, create a different
# transformer.
class Transformer(object):
def __init__(self, transformertype,
template,
templatedirs,
documentroot=None):
template,
templatedirs,
documentroot=None,
config=None):

cls = {'XSLT': XSLTTransform,
'JINJA': JinjaTransform}[transformertype]
self.t = cls(template, templatedirs)
self.documentroot = documentroot
self.config = config

# transform() always operate on the native datastructure -- this might
# be different depending on the transformer engine. For XSLT, which is
@@ -30,32 +36,53 @@ def __init__(self, transformertype,
# transform_file instead
#
# valid parameters
# - configurationfile: resources.xml -- cannot be calculated until
# we know the outfile
# - annotationfile: intermediate/basefile.grit.xml
def transform(self, indata, depth, parameters=None):
def transform(self, indata, depth, parameters=None, uritransform=None):
if parameters == None:
parameters = {}
configfile = self.t.getconfig(depth)
if configfile:
parameters['configfile'] = configfile
from pudb import set_trace; set_trace()
outdata = self.t.transform(indata, parameters)

# the provided configuration (might be a file or a python dict
# or anything else, depending on the transformer engine) will
# contain lists of JS/CSS resources. In order to make it
# possible to use relative links to these (needed for offline
# static HTML files), we first do a transformer
# engine-specific adaption of the configuration depending on
# the directory depth level of the outfile (as provided
# through the depth parameter), then we provide this adapted
# configuration to the transform call
if self.config:
adapted_config = self.t.getconfig(self.config, depth)
else:
adapted_config = None
outdata = self.t.transform(indata, adapted_config, parameters)
if uritransform:
self._transform_links(outdata.getroot(), uritransform)
return outdata

def _transform_links(self, tree, uritransform):
for part in tree:
# depth-first transformation seems the easiest
self._transform_links(part, uritransform)
if part.tag != "a": continue
uri = part.get("href")
if not uri: continue
part.set("href", uritransform(uri))
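
The uritransform argument is any callable that maps an href found in the transformed tree to the value that should appear in the output; a minimal hypothetical example (the URI scheme is made up):

# Hypothetical uritransform callable: rewrite canonical URIs to local
# relative .html links, and pass everything else through unchanged.
def localize(uri):
    if uri.startswith("http://example.org/doc/"):
        return uri.replace("http://example.org/doc/", "") + ".html"
    return uri

# outdata = transformer.transform(tree, depth=1, uritransform=localize)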


# accepts a file-like object, returns a file-like object
def transform_stream(self, instream,
parameters=None):
return self.t.native_to_stream(
self.transform(self.t.stream_to_native(instream),

))
self.transform(self.t.stream_to_native(instream)))

# accepts two filenames, reads from one, writes to the other
def transform_file(self, infile, outfile, parameters):
def transform_file(self, infile, outfile,
parameters=None, uritransform=None):
depth = self._depth(outfile, self.documentroot)
self.t.native_to_file(self.transform(self.t.file_to_native(infile),
depth,
parameters),
parameters,
uritransform),
outfile)

def _depth(self, outfile, root):
Expand All @@ -67,7 +94,7 @@ def __init__(self, template, templatedirs):
pass

class XSLTTransform(TransformerEngine):
def __init__(self, template, templatedirs):
def __init__(self, template, templatedirs, **kwargs):
self.format = True # FIXME: make configurable
self.templdir = self._setup_templates(template, templatedirs)
worktemplate = self.templdir + os.sep + os.path.basename(template)
@@ -102,40 +129,67 @@ def _setup_templates(self, template, templatedirs):
shutil.copy2(template, workdir)
return workdir

# getconfig may return different data depending on engine -- in this case
# it creates a xml file and returns the path for it
def getconfig(self, depth):
pass

# getconfig may return different data depending on engine -- in
# this case it creates a xml file and returns the path for it
def getconfig(self, configfile, depth):
filename = configfile
if depth != 0:
(base, ext) = os.path.splitext(configfile)
filename = "%(base)s-depth-%(depth)d%(ext)s" % locals()
if not util.outfile_is_newer([configfile], filename):
tree = etree.parse(configfile)
# adjust the relevant link attribute for some nodes
for xpath, attrib in (("stylesheets/link", "href"),
("javascripts/script", "src")):
for node in tree.findall(xpath):
# don't adjust absolute links
if not (re.match("(https?://|/)", node.get(attrib))):
node.set(attrib, "../"*depth + node.get(attrib))
tree.write(filename)
return filename
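
To illustrate the depth adaption (an assumed example, not taken from the commit): at depth 2, a relative href such as css/main.css in resources.xml is rewritten to ../../css/main.css in the derived resources-depth-2.xml, while absolute links (http://..., https://... or /...) are left alone. The prefixing rule, shown standalone:

import re

def adapt(href, depth):
    if re.match("(https?://|/)", href):   # absolute link: leave untouched
        return href
    return "../" * depth + href           # relative link: one "../" per level

adapt("css/main.css", 2)    # -> '../../css/main.css'
adapt("/css/main.css", 2)   # -> '/css/main.css'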

def transform(self, indata, config=None, parameters={}):
strparams = {}
if config:
strparams['configurationfile'] = XSLT.strparam(config)
for key, value in parameters.items():
if key.endswith("file"):
# relativize path of file relative to the XSL file
# we'll be using. The mechanism could be clearer...
value = os.path.relpath(value, self.templdir)
strparams[key] = XSLT.strparam(value)
try:
return self._transformer(indata,**strparams)
except etree.XSLTApplyError as e:
raise errors.TransformError(str(e.error_log))
if len(transform.error_log) > 0:
raise errors.TransformError(str(transform.error_log))

# nativedata = lxml.etree
def native_to_file(self, nativedata, outfile):
res = self.html5_doctype_workaround(
etree.tostring(nativedata, pretty_print=self.format))
util.ensure_dir(outfile)
with open(outfile,"wb") as fp:
fp.write(res)

@staticmethod
def html5_doctype_workaround(indata):
# FIXME: This is horrible
if indata.startswith(b"<remove-this-tag>"):
indata = b"<!DOCTYPE html>\n"+indata[17:-18].strip()
if indata[-1] == b"<" or indata[-1] == 60:
indata = indata[:-1]
found = False
endidx = -1
while not found:
if indata[endidx] == b"<" or indata[endidx] == 60:
found = True
else:
endidx -= 1
indata = b"<!DOCTYPE html>\n"+indata[17:endidx].strip()
return indata
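
A hypothetical before/after for the workaround, given the end-slicing logic above (the input is a made-up minimal document):

# The wrapper element emitted by base.xsl is stripped and replaced with a
# real HTML5 doctype.
dirty = b"<remove-this-tag><html><body/></html></remove-this-tag>"
clean = XSLTTransform.html5_doctype_workaround(dirty)
# clean == b"<!DOCTYPE html>\n<html><body/></html>"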

def file_to_native(self, infile):
return etree.parse(infile)

def transform(self, indata, parameters):
strparams = {}
for key, value in parameters.items():
strparams[key] = XSLT.strparam(value)
try:
return self._transformer(indata,**parameters)
except etree.XSLTApplyError as e:
raise errors.TransformError(str(e.error_log))
if len(transform.error_log) > 0:
raise errors.TransformError(str(transform.error_log))
# FIXME: hook in the transform_links step somehow?

class JinjaTransform(TransformerEngine):
12 changes: 9 additions & 3 deletions ferenda/triplestore.py
@@ -17,6 +17,7 @@
from six import text_type as str
from six import binary_type as bytes
from six.moves.urllib_parse import quote
import pyparsing

from ferenda.thirdparty import SQLite
from ferenda import util, errors
@@ -199,7 +200,10 @@ def triple_count(self, context=None):


def select(self, query, format="sparql"):
res = self.graph.query(query)
try:
res = self.graph.query(query)
except pyparsing.ParseException as e:
raise errors.SparqlError(e)
if format == "sparql":
return res.serialize(format="xml")
elif format == "json":
@@ -222,7 +226,10 @@ def construct(self, query):
:param query: A SPARQL query with all necessary prefixes defined.
:type query: str
"""
res = self.graph.query(query)
try:
res = self.graph.query(query)
except pyparsing.ParseException as e:
raise errors.SparqlError(e)
return res.graph

def clear(self, context=None):
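
With both call sites wrapped, a malformed query now surfaces as ferenda.errors.SparqlError instead of a raw pyparsing.ParseException. A hedged usage sketch, assuming store is an already-connected RDFLib-backed TripleStore:

from ferenda import errors

try:
    result = store.select("SELEKT ?s WHERE { ?s ?p ?o }", format="json")
except errors.SparqlError as e:
    print("query rejected:", e)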
@@ -418,7 +425,6 @@ def construct(self, query):
url = self._endpoint_url()
url += "?query=" + quote(query)
try:
r = requests.get(url)
format = "xml"
headers = {'Accept': self._contenttype[format]}
resp = requests.get(url, headers=headers, data=query)