From a7cb810a059822c2124fba341a2a58c39c2c2410 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 13 Sep 2013 21:59:58 +0200 Subject: [PATCH] replaced the tangled mess that was DocumentRepository.transform_html (and friends) with a new Transformer class --- ferenda/__init__.py | 2 +- ferenda/devel.py | 2 + ferenda/documentrepository.py | 348 +++++++++------------------------- ferenda/layeredconfig.py | 1 + ferenda/manager.py | 60 +++--- ferenda/res/xsl/base.xsl | 2 +- ferenda/transformer.py | 124 ++++++++---- ferenda/triplestore.py | 12 +- test/testDocRepo.py | 214 ++++++++------------- test/testManager.py | 1 + test/testTransform.py | 19 +- test/testTripleStore.py | 7 + tools/test.sh | 5 +- 13 files changed, 314 insertions(+), 483 deletions(-) diff --git a/ferenda/__init__.py b/ferenda/__init__.py index d7745340..d6e3d973 100644 --- a/ferenda/__init__.py +++ b/ferenda/__init__.py @@ -14,11 +14,11 @@ from .tocpage import TocPage from .toccriteria import TocCriteria from .newscriteria import NewsCriteria +from .transformer import Transformer from .document import Document from .documententry import DocumentEntry from .documentstore import DocumentStore from .documentrepository import DocumentRepository from .pdfdocumentrepository import PDFDocumentRepository from .compositerepository import CompositeRepository -from .transformer import Transformer __version__ = "0.1.5-dev" #gets pulled into setup.py and docs/conf.py diff --git a/ferenda/devel.py b/ferenda/devel.py index 1b4e97ce..5031b281 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -31,6 +31,8 @@ class Devel(object): class DummyStore(object): def __init__(self, path, **kwargs): pass + def list_basefiles_for(self, action, basedir=None): + return [] downloaded_suffix = ".html" storage_policy = "file" documentstore_class = DummyStore diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 67b33154..8b0df4d9 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -44,8 +44,12 @@ # mine from ferenda import util, errors, decorators -from ferenda import Describer, LayeredConfig, TripleStore, FulltextIndex, Document, DocumentEntry, NewsCriteria, TocCriteria, TocPageset, TocPage, DocumentStore -from ferenda.elements import AbstractElement, serialize, Body, Nav, Link, Section, Subsection, Subsubsection, Heading, UnorderedList, ListItem, Preformatted, Paragraph +from ferenda import (Describer, LayeredConfig, TripleStore, FulltextIndex, + Document, DocumentEntry, NewsCriteria, TocCriteria, + TocPageset, TocPage, DocumentStore, Transformer) +from ferenda.elements import (AbstractElement, serialize, Body, Nav, Link, + Section, Subsection, Subsubsection, Heading, + UnorderedList, ListItem, Preformatted, Paragraph) from ferenda.elements.html import elements_from_soup from ferenda.thirdparty import patch, httpheader @@ -250,10 +254,6 @@ def __init__(self, **kwargs): # assume that any standalone prefix is well known self.ns[prefix] = Namespace(util.ns[prefix]) - def __del__(self): - if self._transform_resourcedir: - shutil.rmtree(self._transform_resourcedir) - def get_default_options(self): """Returns the class' configuration default configuration properties. These can be overridden by a configution file, or @@ -997,7 +997,8 @@ def create_external_resources(self, doc): """ pass - def render_xhtml(self, doc, outfile): + + def render_xhtml(self, doc, outfile=None): """Renders the parsed object structure as a XHTML file with RDFa attributes (also returns the same XHTML as a string). @@ -1008,6 +1009,24 @@ def render_xhtml(self, doc, outfile): :returns: The XHTML document :rtype: str """ + xhtmldoc = self.render_xhtml_tree(doc) + doctype = ('') + res = etree.tostring(xhtmldoc, + pretty_print=True, + xml_declaration=True, + encoding='utf-8', + doctype=doctype) + fileno, tmpfile = mkstemp() + fp = os.fdopen(fileno) + fp.close() + with open(tmpfile, "wb") as fp: + fp.write(res) + util.replace_if_different(tmpfile, outfile) + return res + + + def render_xhtml_tree(self, doc): XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang" def render_head(g, uri, children=None): @@ -1025,7 +1044,8 @@ def render_head(g, uri, children=None): # we sort to get a predictable order (by predicate, then by object) for (subj, pred, obj) in sorted(g, key=lambda t:(t[1],t[2])): if str(subj) != uri and str(obj) != uri: - self.log.warning("%s != %s" % (subj, uri)) + # This isn't a triple we should serialize to RDFa, + # at least not in this iteration continue if g.qname(pred) == "dct:title" and revlink: @@ -1083,20 +1103,7 @@ def render_head(g, uri, children=None): headcontent, bodycontent, ) - doctype = ('') - res = etree.tostring(xhtmldoc, - pretty_print=True, - xml_declaration=True, - encoding='utf-8', - doctype=doctype) - fileno, tmpfile = mkstemp() - fp = os.fdopen(fileno) - fp.close() - with open(tmpfile, "wb") as fp: - fp.write(res) - util.replace_if_different(tmpfile, outfile) - return res + return xhtmldoc def parsed_url(self, basefile): @@ -1414,45 +1421,56 @@ def generate(self, basefile, otherrepos=[]): """ with util.logtime(self.log.info, "%(basefile)s OK (%(elapsed).3f sec)", {'basefile': basefile}): + # This dependency management could be abstracted away like + # the parseifneeded decorator does for parse(). But unlike + # parse(), noone is expected to override generate(), so + # the proper place to handle this complexity is probably + # here. infile = self.store.parsed_path(basefile) annotations = self.store.annotation_path(basefile) if os.path.exists(self.store.dependencies_path(basefile)): - dependencies = util.readfile(self.store.dependencies_path(basefile)).split("\n") + deptxt = util.readfile(self.store.dependencies_path(basefile)) + dependencies = deptxt.split("\n") else: dependencies = [] dependencies.extend((infile,annotations)) outfile = self.store.generated_path(basefile) - force = (self.config.force or - self.config.generateforce) - if not force and util.outfile_is_newer(dependencies, outfile): + if ((not self.config.force) and + util.outfile_is_newer(dependencies, outfile)): self.log.debug("%s: Skipped", basefile) return self.log.debug("%s: Starting", basefile) - xsltdir = self.setup_transform_templates("res/xsl", self.xslt_template) - xsltfile = xsltdir + os.sep + os.path.basename(self.xslt_template) - params = self.get_transform_configuration(xsltdir,outfile) - - assert 'configurationfile' in params, "No configurationfile found, did you run makeresources?" - - # The actual function code + # All bookkeping done, now lets prepare and transform! with util.logtime(self.log.debug, "%(basefile)s: prep_annotation_file in %(elapsed).3f sec", {'basefile': basefile}): + # annotation_file should be the same as annotations above? annotation_file = self.prep_annotation_file(basefile) - + + params = {} if annotation_file: - relpath = os.path.relpath(annotation_file, - os.path.dirname(xsltfile)) - # NOTE: Even on Win32, lxml needs to have this path using - # unix separators, i.e. / instead of the native \ - relpath = relpath.replace("\\","/") - params['annotationfile'] = XSLT.strparam(relpath) + params['annotationfile'] = annotation_file + with util.logtime(self.log.debug, - "%(basefile)s: transform_html in %(elapsed).3f", + "%(basefile)s: transform in %(elapsed).3f", {'basefile': basefile}): - self.transform_html(xsltfile, infile, outfile, params, otherrepos=otherrepos) + conffile = os.sep.join([self.config.datadir,'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', self.xslt_template, + ["res/xsl"], config=conffile, + documentroot=self.config.datadir) + urltransform = None + if self.config.staticsite: + repos = list(otherrepos) + if self not in repos: + repos.append(self) + urltransform = self.get_url_transform_func(repos, os.path.dirname(outfile)) + + transformer.transform_file(infile, outfile, + params, urltransform) + # At this point, outfile may appear untouched if it already # existed and wasn't actually changed. But this will cause the @@ -1467,110 +1485,15 @@ def generate(self, basefile, otherrepos=[]): docentry.updated = now docentry.save() - def transform_html(self, stylesheet, infile, outfile, - parameters={}, - format=True, - xinclude=False, - otherrepos=[]): - """Creates browser-ready HTML5 from a basic XHTML+RDFa file - using a XSLT transform. - - :param stylesheet: the filename of the XSLT stylesheet to use - :type stylesheet: string - :param infile: The filename of the basic XHTML+RDFa file to be - transformed - :type infile: string - :param outfile: The filename of the created HTML5 file - :type outfile: string - :param parameters: Any parameters passed to the XSLT stylesheet (see - :py:meth:`~ferenda.DocumentRepository.get_transform_configuration`) - :type parameters: dict - :param format: Whether to format/indent the resulting outfile - :type format: bool - :param xinclude: Whether to process xinlude directives in the infile - :type xinclude: bool - :returns: True if the transform resulted in a new or updated - outfile, False if the result was identical - to the previously existing outfile. - :rtype: bool - """ - assert not xinclude, "xinclude not supported yet" - # print("transform_html: stylesheet %s infile %s outfile %s" % (stylesheet, infile, outfile)) - - # Open the XSLT stylesheet, either as a normal file - # (user-provided) or a package resource (ferenda built-in) - # FIXME: load-path mechanism (cf manager.makeresources())? - if os.path.exists(stylesheet): - fp = open(stylesheet) - elif pkg_resources.resource_exists('ferenda',stylesheet): # prefix stylesheet with 'res/xsl'? - fp = pkg_resources.resource_stream('ferenda',stylesheet) - else: - raise ValueError("Stylesheet %s not found" % stylesheet) - parser = etree.XMLParser(remove_blank_text=format) - xsltree = etree.parse(fp,parser) - fp.close() - try: - transform = etree.XSLT(xsltree) - except etree.XSLTParseError as e: - raise errors.TransformError(str(e.error_log)) - - with open(infile) as fp: - intree = etree.parse(fp,parser) - try: - outtree = transform(intree,**parameters) - - except etree.XSLTApplyError as e: - raise errors.TransformError(str(e.error_log)) - if len(transform.error_log) > 0: - raise errors.TransformError(str(transform.error_log)) - - if self.config.staticsite: - self.transform_links(outtree.getroot(), outfile, otherrepos) - - res = etree.tostring(outtree,pretty_print=format).strip() - - if format: - bytefp = BytesIO(res) - parser = etree.XMLParser(remove_blank_text=True) - res = etree.tostring(etree.parse(bytefp,parser),pretty_print=True) - - fileno, tmpfile = mkstemp() - fp = os.fdopen(fileno) - fp.close() - - # FIXME: This is horrible - if res.startswith(b""): - res = b"\n"+res[17:-18].strip() - if res[-1] == b"<" or res[-1] == 60: - res = res[:-1] - with open(tmpfile,"wb") as fp: - fp.write(res) - - util.ensure_dir(outfile) - return util.replace_if_different(tmpfile,outfile) - - def transform_links(self, tree, base, otherrepos): - """ - Given a etree tree, transform all links that refer to any file - handled by this or any other repository into relative file - links. Useful for generating HTML files that can be used - offline, used if ``self.config.staticsite`` is set. - - """ - - repos = [self] - repos.extend(otherrepos) - for part in tree: - # depth-first transformation seems the easiest - self.transform_links(part, base, otherrepos) - if part.tag != "a": - continue - uri = part.get("href") - if not uri: - continue - - if uri == self.config.url: # root url + def get_url_transform_func(self, repos, basedir): + # This implementation always transforms URLs to local file + # paths (or if they can't be mapped, leaves them alone) + + # FIXME: apply some memoization to this + def transform(uri): + path = None + if uri == self.config.url: path = "data/index.html" else: for repo in repos: @@ -1579,7 +1502,6 @@ def transform_links(self, tree, base, otherrepos): dataset_params = repo.dataset_params_from_uri(uri) if basefile or (dataset_params is not None): break - if basefile: path = repo.store.generated_path(basefile) elif dataset_params is not None: @@ -1590,107 +1512,12 @@ def transform_links(self, tree, base, otherrepos): else: pseudobasefile = "index" path = repo.store.path(pseudobasefile,'toc','.html') - else: - continue - relpath = os.path.relpath(path,os.path.dirname(base)).replace(os.sep,'/') - part.set("href", relpath) - - # xsltpath = os.path.join(os.curdir,'../ferenda',self.xslt_template) - def get_transform_configuration(self, xsltdir, outfile=None): - """ - Set up a dict of parameters pointing to the configuration XML - file needed for XSLT transform. + if path: + return os.path.relpath(path, basedir) + else: + return uri + return transform - .. note:: - - Maybe this should be an internal method. - - :param xsltdir: path to the directory where the root xslt file is stored - :type xsltdir: str - :param outfile: path to the planned file resulting from the XSLT transfomrm - :type outfile: str - :returns: The path to the resources.xml file, wrapped through lxml.etree.XSLT.strparam and put in a a dict - :rtype: dict - """ - assert os.path.isdir(xsltdir), "%s does not exist (or is not a directory)" % xsltdir - params = {} - conffile = os.sep.join([self.config.datadir,'rsrc','resources.xml']) - if os.path.exists(conffile): - if outfile: - # We detect if stylesheet[@href] and script[@src] - # point correctly, and if not, create a new version of - # configurationfile where os.relpath has been applied - # to them. - tree = etree.parse(conffile) - if os.path.isabs(self.config.datadir): - datadir = self.config.datadir - else: - datadir = os.path.abspath(self.config.datadir) - - if not os.path.isabs(outfile): - outfile = os.path.abspath(outfile) - - assert outfile.startswith(datadir), "outfile %s not under datadir %s" % (outfile, datadir) - # "datadir/foo/bar/baz.html" -> "foo/bar" - # "/var/folders/sy/r4f/T/tmpcRojl/foo/bar/baz.html" -> "foo/bar"p - relative_outfile = outfile[len(datadir)+1:] - if os.sep in relative_outfile: - outdir = relative_outfile.rsplit(os.sep,1)[0] - else: - outdir = "" - for node in tree.findall("stylesheets/link"): - if not (re.match("https?://", node.get('href'))): - node.set('href', os.path.relpath(node.get('href'),outdir).replace(os.sep,"/")) - for node in tree.findall("javascripts/script"): - if not (re.match("https?://", node.get('src'))): - node.set('src', os.path.relpath(node.get('src'),outdir).replace(os.sep,"/")) - # loop through all css refs and use the first local ref: - depth = 0 - for node in tree.findall("stylesheets/link"): - if not (re.match("https?://", node.get('href'))): - depth = node.get('href').count('..') - break - if depth > 0: - # create a new file - (base, ext) = os.path.splitext(conffile) - modfile = base + ("-depth-%s" % depth) + ext - if not util.outfile_is_newer([conffile], modfile): - tree.write(modfile) - conffile = modfile - relpath = os.path.relpath(conffile,xsltdir).replace(os.sep,"/") - params['configurationfile'] = XSLT.strparam(relpath) - - return params - - _transform_resourcedir=None - def setup_transform_templates(self, xsltdir, mainxslt): - """Unpack/extract all XSLT files and other resources needed to - for the XSLT transform, if needed (which is the case if - ferenda is distributed as an egg, i.e. all files are contained - in a zip file). - - :param xsltdir: path to the directory where the supporting xslt files are stored - :type xsltdir: str - :param xsltdir: path to the main xslt file - :type xsltdir: str - :returns: The path to extracted files - :rtype: str - """ - # Unpack/extract all the files - if not self._transform_resourcedir: - self._transform_resourcedir = mkdtemp() - # copy everything to this temp dir (note: this gets cleaned up in __del__) - for f in pkg_resources.resource_listdir('ferenda',xsltdir): - source_fp = pkg_resources.resource_stream('ferenda', xsltdir+"/"+f) - dest = self._transform_resourcedir + "/" + f - with open(self._transform_resourcedir + "/" + f, "wb") as dest_fp: - dest_fp.write(source_fp.read()) - # print("extracted %s/%s to %s" % (xsltdir,f,dest)) - - if os.path.basename(mainxslt) not in pkg_resources.resource_listdir('ferenda',xsltdir): - shutil.copy2(mainxslt, self._transform_resourcedir) - - return self._transform_resourcedir def prep_annotation_file(self, basefile): """Helper function used by :py:meth:`generate` -- prepares a RDF/XML file @@ -2169,8 +1996,6 @@ def toc_generate_page(self, binding, value, documentlist, pagesets, effective_ba if effective_basefile == None: effective_basefile = binding + "/" + value outfile = self.store.path(effective_basefile, 'toc', '.html') - tmpfile = self.store.path(effective_basefile, 'toc', '.xhtml') - doc = self.make_document() doc.uri = self.dataset_uri(binding, value) d = Describer(doc.meta,doc.uri) @@ -2195,20 +2020,19 @@ def toc_generate_page(self, binding, value, documentlist, pagesets, effective_ba ul ]) - self.log.debug("Rendering XHTML to %s" % tmpfile) - self.render_xhtml(doc, tmpfile) - if not util.outfile_is_newer([tmpfile],outfile): - # Prepare a browser-ready HTML page using generic.xsl - self.log.debug("Transforming HTML to %s" % outfile) - # configure params - xsltdir = self.setup_transform_templates("res/xsl", "res/xsl/toc.xsl") - xsltfile = xsltdir + os.sep + os.path.basename("res/xsl/toc.xsl") - params = self.get_transform_configuration(xsltdir,outfile) - self.transform_html(xsltfile, - tmpfile, outfile, params, otherrepos=otherrepos) - self.log.info("Created %s" % outfile) - return outfile - # if we didn't actually create an outfile: + conffile = os.sep.join([self.config.datadir,'rsrc','resources.xml']) + transformer = Transformer('XSLT', "res/xsl/toc.xsl", ["res/xsl"], + config=conffile) + # FIXME: This is a naive way of calculating the relative depth + # of the outfile + depth = len(outfile[len(self.store.datadir)+1:].split(os.sep)) + tree = transformer.transform(self.render_xhtml_tree(doc), depth) + fixed = transformer.t.html5_doctype_workaround(etree.tostring(tree)) + + with self.store.open(effective_basefile, 'toc', '.html', "wb") as fp: + fp.write(fixed) + + self.log.info("Created %s" % outfile) return outfile diff --git a/ferenda/layeredconfig.py b/ferenda/layeredconfig.py index c8862a58..89852a49 100644 --- a/ferenda/layeredconfig.py +++ b/ferenda/layeredconfig.py @@ -6,6 +6,7 @@ import logging from ferenda.compat import OrderedDict from six.moves import configparser +from six import text_type as str class LayeredConfig(object): """Provide unified access to nested configuration parameters. The diff --git a/ferenda/manager.py b/ferenda/manager.py index ecbf1334..5a24c8c6 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -13,47 +13,44 @@ from __future__ import unicode_literals # system import os -import time import stat import subprocess import sys import inspect -import itertools import logging import json import mimetypes from ast import literal_eval from datetime import datetime import xml.etree.cElementTree as ET -import cgi from ferenda.compat import OrderedDict +from wsgiref.simple_server import make_server +from wsgiref.util import FileWrapper import six from six.moves.urllib_parse import urlsplit, parse_qsl, urlencode from six.moves import configparser input = six.moves.input -from wsgiref.simple_server import make_server -from wsgiref.util import FileWrapper -# from pprint import pprint - # 3rd party import pkg_resources import requests import requests.exceptions from rdflib import URIRef, Namespace, Literal from bs4 import BeautifulSoup +from lxml import etree # my modules from ferenda import DocumentRepository from ferenda import DocumentStore from ferenda import FulltextIndex from ferenda import LayeredConfig +from ferenda import Transformer from ferenda import TripleStore from ferenda import elements -from ferenda.elements import html from ferenda import errors from ferenda import util +from ferenda.elements import html # NOTE: This is part of the published API and must be callable in # scenarios without configfile or logger. @@ -356,10 +353,16 @@ def frontpage(repos, xhtml_path = os.path.splitext(path)[0] + ".xhtml" with open(xhtml_path,"w") as fp: fp.write(xhtml) - - xsltdir = repos[0].setup_transform_templates(os.path.dirname(stylesheet), stylesheet) - params = repos[0].get_transform_configuration(xsltdir,xhtml_path) - repos[0].transform_html(xsltdir+"/"+os.path.basename(stylesheet), xhtml_path, path, params, otherrepos=repos) + # FIXME: We don't need to actually store the xhtml file on + # disk -- we could just keep it in memory as an lxml tree and + # call .transform(tree) just like + # DocuementRepository.toc_create_page does + docroot = os.path.dirname(path) + conffile = os.sep.join([docroot,'rsrc','resources.xml']) + transformer = Transformer('XSLT', stylesheet, ["res/xsl"], + config=conffile, + documentroot=docroot) + transformer.transform_file(xhtml_path, path) return True @@ -481,26 +484,13 @@ def _wsgi_search(environ, start_response, args): 'href':url})) doc.body.append(html.Div(pages, **{'class':'pager'})) # Transform that XHTML into HTML5 - - # FIXME: this way of transforming a etree to HTML5 is way too - # complicated, dependent on args['repo'][0], stores temporary - # files on disk for no good reason, abuses - # get_transform_configuration with a fake path and duplicates code - xsltfile = "res/xsl/search.xsl" - tmpfile = args['documentroot']+"/_searchtmp-%s.xhtml" % os.getpid() - outfile = args['documentroot']+"/_searchtmp-%s.html" % os.getpid() - # create a fake path so taht get_transform_configuration selects a - # resources.xml with correct relative path - depth = len(list(filter(None,args['searchendpoint'].split("/")))) - fake_outfile = "%s/%s/search.html" % (args['documentroot'], - "/".join(["fake"]*depth)) - xsltdir = repo.setup_transform_templates(os.path.dirname(xsltfile), xsltfile) - params = repo.get_transform_configuration(xsltdir,fake_outfile) - - repo.render_xhtml(doc,tmpfile) - repo.transform_html(xsltdir + "/search.xsl", - tmpfile, outfile, params, args['repos'][1:]) - data = util.readfile(outfile,"rb") + conffile = os.sep.join([args['documentroot'],'rsrc','resources.xml']) + transformer = Transformer('XSLT', "res/xsl/search.xsl", ["res/xsl"], + config=conffile) + depth = len(args['searchendpoint'].split("/")) - 2 # '/mysearch/' = depth 1 + repo = DocumentRepository() + tree = transformer.transform(repo.render_xhtml_tree(doc), depth) + data = transformer.t.html5_doctype_workaround(etree.tostring(tree)) start_response("200 OK", [ ("Content-Type", "text/html; charset=utf-8"), ("Content-Length", str(len(data))) @@ -1127,7 +1117,11 @@ def _list_class_usage(cls): for attrname in dir(cls): attr = getattr(cls, attrname) if hasattr(attr, "runnable"): - res[attr.__name__] = attr.__doc__.split("\n")[0] + doc = attr.__doc__ + if doc: + res[attr.__name__] = doc.split("\n")[0] + else: + res[attr.__name__] = "(Undocumented)" return res diff --git a/ferenda/res/xsl/base.xsl b/ferenda/res/xsl/base.xsl index 7c762f29..2659bff4 100644 --- a/ferenda/res/xsl/base.xsl +++ b/ferenda/res/xsl/base.xsl @@ -23,7 +23,7 @@ diff --git a/ferenda/transformer.py b/ferenda/transformer.py index 07d780de..ecd4313e 100644 --- a/ferenda/transformer.py +++ b/ferenda/transformer.py @@ -4,23 +4,29 @@ from tempfile import mkdtemp import os import shutil +import re import pkg_resources from lxml import etree from lxml.etree import XSLT +from ferenda import errors, util + # assumption: A transformer is initialized with a single template. If # you want to use a different template, create a different # transformer. class Transformer(object): def __init__(self, transformertype, - template, - templatedirs, - documentroot=None): + template, + templatedirs, + documentroot=None, + config=None): + cls = {'XSLT': XSLTTransform, 'JINJA': JinjaTransform}[transformertype] self.t = cls(template, templatedirs) self.documentroot = documentroot + self.config = config # transform() always operate on the native datastructure -- this might # be different depending on the transformer engine. For XSLT, which is @@ -30,32 +36,53 @@ def __init__(self, transformertype, # transform_file instead # # valid parameters - # - configurationfile: resources.xml -- cannot be calculated until - # we know the outfile # - annotationfile: intermediate/basefile.grit.xml - def transform(self, indata, depth, parameters=None): + def transform(self, indata, depth, parameters=None, uritransform=None): if parameters == None: parameters = {} - configfile = self.t.getconfig(depth) - if configfile: - parameters['configfile'] = configfile - from pudb import set_trace; set_trace() - outdata = self.t.transform(indata, parameters) + + # the provided configuration (might be a file or a python dict + # or anything else, depending on the transformer engine) will + # contain lists of JS/CSS resources. In order to make it + # possible to use relative links to these (needed for offline + # static HTML files), we first do a transformer + # engine-specific adaption of the configuration depending on + # the directory depth level of the outfile (as provided + # through the depth parameter), then we provide this adapted + # configuration to the transform call + if self.config: + adapted_config = self.t.getconfig(self.config, depth) + else: + adapted_config = None + outdata = self.t.transform(indata, adapted_config, parameters) + if uritransform: + self._transform_links(outdata.getroot(), uritransform) return outdata + def _transform_links(self, tree, uritransform): + for part in tree: + # depth-first transformation seems the easiest + self._transform_links(part, uritransform) + if part.tag != "a": continue + uri = part.get("href") + if not uri: continue + part.set("href", uritransform(uri)) + + # accepts a file-like object, returns a file-like object def transform_stream(self, instream, parameters=None): return self.t.native_to_stream( - self.transform(self.t.stream_to_native(instream), - -)) + self.transform(self.t.stream_to_native(instream))) + # accepts two filenames, reads from one, writes to the other - def transform_file(self, infile, outfile, parameters): + def transform_file(self, infile, outfile, + parameters=None, uritransform=None): depth = self._depth(outfile, self.documentroot) self.t.native_to_file(self.transform(self.t.file_to_native(infile), depth, - parameters), + parameters, + uritransform), outfile) def _depth(self, outfile, root): @@ -67,7 +94,7 @@ def __init__(self, template, templatedirs): pass class XSLTTransform(TransformerEngine): - def __init__(self, template, templatedirs): + def __init__(self, template, templatedirs, **kwargs): self.format = True # FIXME: make configurable self.templdir = self._setup_templates(template, templatedirs) worktemplate = self.templdir + os.sep + os.path.basename(template) @@ -102,15 +129,47 @@ def _setup_templates(self, template, templatedirs): shutil.copy2(template, workdir) return workdir - # getconfig may return different data depending on engine -- in this case - # it creates a xml file and returns the path for it - def getconfig(self, depth): - pass - + # getconfig may return different data depending on engine -- in + # this case it creates a xml file and returns the path for it + def getconfig(self, configfile, depth): + filename = configfile + if depth != 0: + (base, ext) = os.path.splitext(configfile) + filename = "%(base)s-depth-%(depth)d%(ext)s" % locals() + if not util.outfile_is_newer([configfile], filename): + tree = etree.parse(configfile) + # adjust the relevant link attribute for some nodes + for xpath, attrib in (("stylesheets/link", "href"), + ("javascripts/script", "src")): + for node in tree.findall(xpath): + # don't adjust absolute links + if not (re.match("(https?://|/)", node.get(attrib))): + node.set(attrib, "../"*depth + node.get(attrib)) + tree.write(filename) + return filename + + def transform(self, indata, config=None, parameters={}): + strparams = {} + if config: + strparams['configurationfile'] = XSLT.strparam(config) + for key, value in parameters.items(): + if key.endswith("file"): + # relativize path of file relative to the XSL file + # we'll be using. The mechanism could be clearer... + value = os.path.relpath(value, self.templdir) + strparams[key] = XSLT.strparam(value) + try: + return self._transformer(indata,**strparams) + except etree.XSLTApplyError as e: + raise errors.TransformError(str(e.error_log)) + if len(transform.error_log) > 0: + raise errors.TransformError(str(transform.error_log)) + # nativedata = lxml.etree def native_to_file(self, nativedata, outfile): res = self.html5_doctype_workaround( etree.tostring(nativedata, pretty_print=self.format)) + util.ensure_dir(outfile) with open(outfile,"wb") as fp: fp.write(res) @@ -118,24 +177,19 @@ def native_to_file(self, nativedata, outfile): def html5_doctype_workaround(indata): # FIXME: This is horrible if indata.startswith(b""): - indata = b"\n"+indata[17:-18].strip() - if indata[-1] == b"<" or indata[-1] == 60: - indata = indata[:-1] + found = False + endidx = -1 + while not found: + if indata[endidx] == b"<" or indata[endidx] == 60: + found = True + else: + endidx -= 1 + indata = b"\n"+indata[17:endidx].strip() return indata def file_to_native(self, infile): return etree.parse(infile) - def transform(self, indata, parameters): - strparams = {} - for key, value in parameters.items(): - strparams[key] = XSLT.strparam(value) - try: - return self._transformer(indata,**parameters) - except etree.XSLTApplyError as e: - raise errors.TransformError(str(e.error_log)) - if len(transform.error_log) > 0: - raise errors.TransformError(str(transform.error_log)) # FIXME: hook in the transform_links step somehow? class JinjaTransform(TransformerEngine): diff --git a/ferenda/triplestore.py b/ferenda/triplestore.py index fa05329e..749691e5 100644 --- a/ferenda/triplestore.py +++ b/ferenda/triplestore.py @@ -17,6 +17,7 @@ from six import text_type as str from six import binary_type as bytes from six.moves.urllib_parse import quote +import pyparsing from ferenda.thirdparty import SQLite from ferenda import util, errors @@ -199,7 +200,10 @@ def triple_count(self, context=None): def select(self, query, format="sparql"): - res = self.graph.query(query) + try: + res = self.graph.query(query) + except pyparsing.ParseException as e: + raise errors.SparqlError(e) if format == "sparql": return res.serialize(format="xml") elif format == "json": @@ -222,7 +226,10 @@ def construct(self, query): :param query: A SPARQL query with all neccessary prefixes defined. :type query: str """ - res = self.graph.query(query) + try: + res = self.graph.query(query) + except pyparsing.ParseException as e: + raise errors.SparqlError(e) return res.graph def clear(self, context=None): @@ -418,7 +425,6 @@ def construct(self, query): url = self._endpoint_url() url += "?query=" + quote(query) try: - r = requests.get(url) format = "xml" headers = {'Accept': self._contenttype[format]} resp = requests.get(url, headers=headers, data=query) diff --git a/test/testDocRepo.py b/test/testDocRepo.py index a55e801d..56a4db6b 100644 --- a/test/testDocRepo.py +++ b/test/testDocRepo.py @@ -834,43 +834,6 @@ def test_elements_from_soup(self): self.maxDiff = 4096 self.assertEqual(serialize(body),serialize(result)) - # Move to Generate? - def test_transform_html(self): - base = self.datadir+os.sep - with open(base+"style.xslt","w") as fp: - fp.write(""" - - - - - - - - - - - -""") - with open(base+"paramfile.xml","w") as fp: - fp.write("""textnode""") - - with open(base+"infile.xml","w") as fp: - fp.write("""Document title""") - - d = DocumentRepository() - parampath = base+"paramfile.xml" - d.transform_html(base+"style.xslt", - base+"infile.xml", - base+"outfile.xml", - {'value':XSLT.strparam('blahonga'), - 'file' :XSLT.strparam(parampath.replace(os.sep,"/"))}) - - self.assertEqualXML(util.readfile(base+"outfile.xml"),""" - - blahonga - textnode - Document title - """) # class Relate(RepoTester) def test_relate_fulltext(self): @@ -1130,6 +1093,7 @@ def _get_repo(self, storetype=None): params = {'storetype':storetype, 'datadir':self.datadir, 'storerepository':'ferenda'} + self.storetype = None if storetype == 'SQLITE': params['storelocation'] = self.datadir+"/ferenda.sqlite" @@ -1284,12 +1248,14 @@ def test_generate_sqlite(self): self.store = self._load_store(self.repo) self._test_generated() - def _generate_complex(self, xsl=None): + def _generate_complex(self, xsl=None, staticsite=False): # Helper func for other tests -- this uses a single # semi-complex source doc, runs it through the generic.xsl # stylesheet, and then the tests using this helper confirm # various aspects of the transformed document self.repo = self._get_repo() + if staticsite: + self.repo.config.staticsite = True if xsl is not None: self.repo.xslt_template = xsl test = """ @@ -1309,6 +1275,7 @@ def _generate_complex(self, xsl=None): property="dct:title" content="Abstract">

Lorem ipsum dolor sit amet

+

external

. @@ -1577,7 +1551,6 @@ class TOC(RepoTester): dct:issued "1976-05-07"^^xsd:date; dct:publisher "Analytical Biochemistry" . """ - results1 = [{'uri':'http://example.org/books/A_Tale_of_Two_Cities', 'title': 'A Tale of Two Cities', 'issued': '1859-04-30'}, @@ -1608,6 +1581,61 @@ class TOC(RepoTester): {'uri':'http://example.org/articles/pm942051', 'title': 'A rapid and sensitive method for the quantitation of microgram quantities of protein utilizing the principle of protein dye-binding', 'issued': '1976-05-07'}] + + def setUp(self): + super(TOCSelect, self).setUp() + # (set up a triple store) and fill it with appropriate data + d = DocumentRepository() + defaults = d.get_default_options() + # FIXME: We really need to subclass at least the toc_select + # test to handle the four different possible storetypes. For + # now we go with the default type (SQLITE, guaranteed to + # always work) but the non-rdflib backends use different code + # paths. + self.store = TripleStore.connect(storetype=defaults['storetype'], + location=self.datadir+os.sep+"test.sqlite", + repository=defaults['storerepository']) + self.store.clear() + self.store.add_serialized(self.books,format="turtle", context="http://example.org/ctx/base") + self.store.add_serialized(self.articles,format="turtle", context="http://example.org/ctx/other") + + + def tearDown(self): + # clear triplestore + self.store.clear() + del self.store + super(TOCSelect, self).tearDown() + + # FIXME: adapt to TripleStore setting so that these tests run with + # all supported triplestores + def test_toc_select(self): + d = DocumentRepository(datadir=self.datadir, + loglevel='CRITICAL', + storelocation=self.datadir+os.sep+"test.sqlite") + d.rdf_type = rdflib.URIRef("http://purl.org/ontology/bibo/Book") + # make sure only one named graph, not entire store, gets searched + got = d.toc_select("http://example.org/ctx/base") + self.assertEqual(len(got),6) + want = self.results1 + for row in want: + self.assertIn(row, got) + + got = d.toc_select("http://example.org/ctx/other") + self.assertEqual(len(got),4) + want2 = self.results2 + for row in want2: + self.assertIn(row, got) + + got = d.toc_select() + self.assertEqual(len(got),10) + want3 = want+want2 + for row in want3: + self.assertIn(row, got) + + +class TOC(RepoTester): + results1 = TOCSelect.results1 + results2 = TOCSelect.results2 pagesets = [TocPageset('Sorted by title',[ TocPage('a','Documents starting with "a"','title', 'a'), @@ -1656,61 +1684,11 @@ def setUp(self): util.ensure_dir(resources) shutil.copy2("%s/files/base/rsrc/resources.xml"%os.path.dirname(__file__), resources) - - # (set up a triple store) and fill it with appropriate data - d = DocumentRepository() - defaults = d.get_default_options() - # FIXME: We really need to subclass at least the toc_select - # test to handle the four different possible storetypes. For - # now we go with the default type (SQLITE, guaranteed to - # always work) but the non-rdflib backends use different code - # paths. - self.store = TripleStore.connect(storetype=defaults['storetype'], - location=self.datadir+os.sep+"test.sqlite", - repository=defaults['storerepository']) - self.store.clear() - self.store.add_serialized(self.books,format="turtle", context="http://example.org/ctx/base") - self.store.add_serialized(self.articles,format="turtle", context="http://example.org/ctx/other") - - def tearDown(self): - # clear triplestore - self.store.clear() - del self.store - super(TOC, self).tearDown() - - - def test_toc_select(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL', - storelocation=self.datadir+os.sep+"test.sqlite") - d.rdf_type = rdflib.URIRef("http://purl.org/ontology/bibo/Book") - # make sure only one named graph, not entire store, gets searched - got = d.toc_select("http://example.org/ctx/base") - self.assertEqual(len(got),6) - want = self.results1 - for row in want: - self.assertIn(row, got) - - got = d.toc_select("http://example.org/ctx/other") - self.assertEqual(len(got),4) - want2 = self.results2 - for row in want2: - self.assertIn(row, got) - - got = d.toc_select() - self.assertEqual(len(got),10) - want3 = want+want2 - for row in want3: - self.assertIn(row, got) - # toc_query is tested by test_toc_select - def test_toc_criteria(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - dct = d.ns['dct'] + dct = self.repo.ns['dct'] want = self.criteria - got = d.toc_criteria([dct.title, dct.issued]) + got = self.repo.toc_criteria([dct.title, dct.issued]) self.assertEqual(len(want), len(got)) self.assertEqual(want[0].binding, got[0].binding) @@ -1728,11 +1706,7 @@ def test_toc_criteria(self): # toc_selector is tested by test_toc_criteria def test_toc_pagesets(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - data = self.results1 - - got = d.toc_pagesets(data, self.criteria) + got = self.repo.toc_pagesets(self.results1, self.criteria) want = self.pagesets self.assertEqual(len(got), 2) self.assertEqual(got[0].label, want[0].label) @@ -1741,47 +1715,13 @@ def test_toc_pagesets(self): self.assertEqual(got[1], want[1]) def test_select_for_pages(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - got = d.toc_select_for_pages(self.results1, self.pagesets, self.criteria) + got = self.repo.toc_select_for_pages(self.results1, self.pagesets, self.criteria) want = self.documentlists self.maxDiff = None self.assertEqual(got, want) def test_generate_page(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - path = d.toc_generate_page('title','a', self.documentlists[('title','a')], self.pagesets) - - # 1. first, test intermediate XHTML file - intermediate = path.replace(".html",".xhtml") - self.assertTrue(os.path.exists(intermediate)) - #with open(intermediate) as fp: - # print(fp.read().decode('utf-8')) - #print("=" * 60) - t = etree.parse(intermediate) - xhtmlns = "{http://www.w3.org/1999/xhtml}" - - # 1.1 Correct page title? - self.assertEqual(t.findtext(".//"+xhtmlns+"title"), - 'Documents starting with "a"') - - # 1.2 Correct navigation? - # @id='nav' -> @role='navigation' ? - navlinks = t.findall(".//"+xhtmlns+"ul[@role='navigation']//"+xhtmlns+"a") - self.assertEqual(len(navlinks), 9) # 10 pages in total, but current page isn't linked - self.assertEqual(navlinks[0].text, 'd') - self.assertEqual(navlinks[0].get("href"), 'http://localhost:8000/dataset/base?title=d') - self.assertEqual(navlinks[3].get("href"), 'http://localhost:8000/dataset/base?issued=1791') - - # 1.3 Correct document list? - # @id='documentlist' => @role='main' - docs = t.findall(".//"+xhtmlns+"ul[@role='main']/"+xhtmlns+"li/"+xhtmlns+"a") - self.assertEqual(len(docs),2) - # "And..." should go before "A Tale..." - self.assertEqual(docs[0].text, 'And Then There Were None') - self.assertEqual(docs[0].attrib['href'], 'http://example.org/books/And_Then_There_Were_None') - + path = self.repo.toc_generate_page('title','a', self.documentlists[('title','a')], self.pagesets) # 2. secondly, test resulting HTML file self.assertTrue(os.path.exists(path)) t = etree.parse(path) @@ -1829,9 +1769,7 @@ def test_generate_page(self): self.assertEqual(header.text, 'Documents starting with "a"') def test_generate_pages(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - paths = d.toc_generate_pages(self.documentlists,self.pagesets) + paths = self.repo.toc_generate_pages(self.documentlists,self.pagesets) self.assertEqual(len(paths), 10) #print("=============%s====================" % paths[0]) #with open(paths[0]) as fp: @@ -1840,9 +1778,7 @@ def test_generate_pages(self): self.assertTrue(os.path.exists(path)) def test_generate_first_page(self): - d = DocumentRepository(datadir=self.datadir, - loglevel='CRITICAL') - path = d.toc_generate_first_page(self.documentlists,self.pagesets) + path = self.repo.toc_generate_first_page(self.documentlists,self.pagesets) self.assertEqual(path, self.p("base/toc/index.html")) self.assertTrue(os.path.exists(path)) tree = etree.parse(path) diff --git a/test/testManager.py b/test/testManager.py index d3f2ea6c..c5f91d7e 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -270,6 +270,7 @@ def test_frontpage(self): test = staticmockclass() test2 = staticmockclass2() outfile = self.tempdir+'/index.html' + manager.makeresources([test,test2], self.tempdir+'/rsrc') res = manager.frontpage([test,test2], path=outfile) self.assertTrue(res) diff --git a/test/testTransform.py b/test/testTransform.py index dbc6c89a..91dc8b68 100644 --- a/test/testTransform.py +++ b/test/testTransform.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import unicode_literals + +import os + from ferenda.testutil import RepoTester from ferenda import util @@ -9,7 +12,8 @@ class Transform(RepoTester): def test_transform_html(self): - with open("_teststyle.xslt","w") as fp: + base = self.datadir+os.sep + with open(base+"teststyle.xslt","w") as fp: fp.write(""" @@ -23,15 +27,16 @@ def test_transform_html(self): """) - with open("_paramfile.xml","w") as fp: + with open(base+"paramfile.xml","w") as fp: fp.write("""textnode""") - with open("_infile.xml","w") as fp: + with open(base+"infile.xml","w") as fp: fp.write("""Document title""") - t = Transformer("XSLT", "_teststyle.xslt", ["res/xsl"], "") - t.transform_file("_infile.xml", "_outfile.xml", {'value':'blahonga', - 'file':'_paramfile.xml'}) - self.assertEqualXML(util.readfile("_outfile.xml"),""" + t = Transformer("XSLT", base+"teststyle.xslt", ["res/xsl"], "") + t.transform_file(base+"infile.xml", base+"outfile.xml", + {'value':'blahonga', + 'file':base+'paramfile.xml'}) + self.assertEqualXML(util.readfile(base+"outfile.xml"),""" blahonga textnode diff --git a/test/testTripleStore.py b/test/testTripleStore.py index 0dd0c24a..b393815c 100644 --- a/test/testTripleStore.py +++ b/test/testTripleStore.py @@ -244,6 +244,13 @@ def test_construct(self): if self.store.__class__ == SleepycatStore: self.store.graph.close() + def test_invalid_select(self): + with self.assertRaises(errors.SparqlError): + self.store.select("This is not a valid SPARQL query") + + def test_invalid_construct(self): + with self.assertRaises(errors.SparqlError): + self.store.construct("This is not a valid SPARQL query") @unittest.skipIf('SKIP_FUSEKI_TESTS' in os.environ, "Skipping Fuseki tests") diff --git a/tools/test.sh b/tools/test.sh index 7d109119..5f1410e5 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -3,7 +3,8 @@ if [ -n "$1" ] then PYTHONPATH=test python -Wi -m unittest -v "$1" else + # When running the entire suite, exit at first failure in order to + # not have to wait three minutes. python -Wi -m unittest discover -v -f test - # python -Wi -m unittest discover test python -V -fi +fi