replaced the tangled mess that was DocumentRepository.transform_html (and friends) with a new Transformer class
staffanm committed Sep 13, 2013
1 parent a5e8a41 commit a7cb810
Showing 13 changed files with 314 additions and 483 deletions.
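For reference, a minimal sketch of the new call pattern, pieced together from the call sites in this diff (ferenda/manager.py). The stylesheet name and paths are illustrative, not taken from the commit:

import os
from ferenda import Transformer

# Hypothetical paths; only the constructor and transform_file signatures
# below come from this diff.
docroot = "data"
conffile = os.sep.join([docroot, "rsrc", "resources.xml"])
transformer = Transformer('XSLT', "res/xsl/generic.xsl", ["res/xsl"],
                          config=conffile,
                          documentroot=docroot)
# The directory depth of outfile relative to documentroot decides how many
# "../" prefixes end up on the CSS/JS links in the generated HTML.
transformer.transform_file("data/index.xhtml", "data/index.html")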
2 changes: 1 addition & 1 deletion ferenda/__init__.py
@@ -14,11 +14,11 @@
from .tocpage import TocPage
from .toccriteria import TocCriteria
from .newscriteria import NewsCriteria
from .transformer import Transformer
from .document import Document
from .documententry import DocumentEntry
from .documentstore import DocumentStore
from .documentrepository import DocumentRepository
from .pdfdocumentrepository import PDFDocumentRepository
from .compositerepository import CompositeRepository
from .transformer import Transformer
__version__ = "0.1.5-dev" #gets pulled into setup.py and docs/conf.py
2 changes: 2 additions & 0 deletions ferenda/devel.py
@@ -31,6 +31,8 @@ class Devel(object):
class DummyStore(object):
def __init__(self, path, **kwargs):
pass
def list_basefiles_for(self, action, basedir=None):
return []
downloaded_suffix = ".html"
storage_policy = "file"
documentstore_class = DummyStore
348 changes: 86 additions & 262 deletions ferenda/documentrepository.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ferenda/layeredconfig.py
@@ -6,6 +6,7 @@
import logging
from ferenda.compat import OrderedDict
from six.moves import configparser
from six import text_type as str

class LayeredConfig(object):
"""Provide unified access to nested configuration parameters. The
60 changes: 27 additions & 33 deletions ferenda/manager.py
@@ -13,47 +13,44 @@
from __future__ import unicode_literals
# system
import os
import time
import stat
import subprocess
import sys
import inspect
import itertools
import logging
import json
import mimetypes
from ast import literal_eval
from datetime import datetime
import xml.etree.cElementTree as ET
import cgi
from ferenda.compat import OrderedDict
from wsgiref.simple_server import make_server
from wsgiref.util import FileWrapper

import six
from six.moves.urllib_parse import urlsplit, parse_qsl, urlencode
from six.moves import configparser
input = six.moves.input

from wsgiref.simple_server import make_server
from wsgiref.util import FileWrapper
# from pprint import pprint

# 3rd party
import pkg_resources
import requests
import requests.exceptions
from rdflib import URIRef, Namespace, Literal
from bs4 import BeautifulSoup
from lxml import etree

# my modules
from ferenda import DocumentRepository
from ferenda import DocumentStore
from ferenda import FulltextIndex
from ferenda import LayeredConfig
from ferenda import Transformer
from ferenda import TripleStore
from ferenda import elements
from ferenda.elements import html
from ferenda import errors
from ferenda import util
from ferenda.elements import html

# NOTE: This is part of the published API and must be callable in
# scenarios without configfile or logger.
@@ -356,10 +353,16 @@ def frontpage(repos,
xhtml_path = os.path.splitext(path)[0] + ".xhtml"
with open(xhtml_path,"w") as fp:
fp.write(xhtml)

xsltdir = repos[0].setup_transform_templates(os.path.dirname(stylesheet), stylesheet)
params = repos[0].get_transform_configuration(xsltdir,xhtml_path)
repos[0].transform_html(xsltdir+"/"+os.path.basename(stylesheet), xhtml_path, path, params, otherrepos=repos)
# FIXME: We don't need to actually store the xhtml file on
# disk -- we could just keep it in memory as an lxml tree and
# call .transform(tree) just like
# DocumentRepository.toc_create_page does
docroot = os.path.dirname(path)
conffile = os.sep.join([docroot,'rsrc','resources.xml'])
transformer = Transformer('XSLT', stylesheet, ["res/xsl"],
config=conffile,
documentroot=docroot)
transformer.transform_file(xhtml_path, path)
return True


@@ -481,26 +484,13 @@ def _wsgi_search(environ, start_response, args):
'href':url}))
doc.body.append(html.Div(pages, **{'class':'pager'}))
# Transform that XHTML into HTML5

# FIXME: this way of transforming a etree to HTML5 is way too
# complicated, dependent on args['repo'][0], stores temporary
# files on disk for no good reason, abuses
# get_transform_configuration with a fake path and duplicates code
xsltfile = "res/xsl/search.xsl"
tmpfile = args['documentroot']+"/_searchtmp-%s.xhtml" % os.getpid()
outfile = args['documentroot']+"/_searchtmp-%s.html" % os.getpid()
# create a fake path so that get_transform_configuration selects a
# resources.xml with correct relative path
depth = len(list(filter(None,args['searchendpoint'].split("/"))))
fake_outfile = "%s/%s/search.html" % (args['documentroot'],
"/".join(["fake"]*depth))
xsltdir = repo.setup_transform_templates(os.path.dirname(xsltfile), xsltfile)
params = repo.get_transform_configuration(xsltdir,fake_outfile)

repo.render_xhtml(doc,tmpfile)
repo.transform_html(xsltdir + "/search.xsl",
tmpfile, outfile, params, args['repos'][1:])
data = util.readfile(outfile,"rb")
conffile = os.sep.join([args['documentroot'],'rsrc','resources.xml'])
transformer = Transformer('XSLT', "res/xsl/search.xsl", ["res/xsl"],
config=conffile)
depth = len(args['searchendpoint'].split("/")) - 2 # '/mysearch/' = depth 1
repo = DocumentRepository()
tree = transformer.transform(repo.render_xhtml_tree(doc), depth)
data = transformer.t.html5_doctype_workaround(etree.tostring(tree))
start_response("200 OK", [
("Content-Type", "text/html; charset=utf-8"),
("Content-Length", str(len(data)))
@@ -1127,7 +1117,11 @@ def _list_class_usage(cls):
for attrname in dir(cls):
attr = getattr(cls, attrname)
if hasattr(attr, "runnable"):
res[attr.__name__] = attr.__doc__.split("\n")[0]
doc = attr.__doc__
if doc:
res[attr.__name__] = doc.split("\n")[0]
else:
res[attr.__name__] = "(Undocumented)"
return res


2 changes: 1 addition & 1 deletion ferenda/res/xsl/base.xsl
@@ -23,7 +23,7 @@
<xsl:template match="/">
<!-- this is an ugly workaround required to get the proper html5
doctype *and* the pre-rootnode conditional IE comments needed
for the h5bp template. Strip the <fakeroot> start and end
for the h5bp template. Strip the <remove-this-tag> start and end
tags as a postprocessing step. -->
<remove-this-tag>
<xsl:apply-templates/>
124 changes: 89 additions & 35 deletions ferenda/transformer.py
@@ -4,23 +4,29 @@
from tempfile import mkdtemp
import os
import shutil
import re

import pkg_resources
from lxml import etree
from lxml.etree import XSLT

from ferenda import errors, util

# assumption: A transformer is initialized with a single template. If
# you want to use a different template, create a different
# transformer.
class Transformer(object):
def __init__(self, transformertype,
template,
templatedirs,
documentroot=None):
template,
templatedirs,
documentroot=None,
config=None):

cls = {'XSLT': XSLTTransform,
'JINJA': JinjaTransform}[transformertype]
self.t = cls(template, templatedirs)
self.documentroot = documentroot
self.config = config

# transform() always operate on the native datastructure -- this might
# be different depending on the transformer engine. For XSLT, which is
@@ -30,32 +36,53 @@ def __init__(self, transformertype,
# transform_file instead
#
# valid parameters
# - configurationfile: resources.xml -- cannot be calculated until
# we know the outfile
# - annotationfile: intermediate/basefile.grit.xml
def transform(self, indata, depth, parameters=None):
def transform(self, indata, depth, parameters=None, uritransform=None):
if parameters == None:
parameters = {}
configfile = self.t.getconfig(depth)
if configfile:
parameters['configfile'] = configfile
from pudb import set_trace; set_trace()
outdata = self.t.transform(indata, parameters)

# the provided configuration (might be a file or a python dict
# or anything else, depending on the transformer engine) will
# contain lists of JS/CSS resources. In order to make it
# possible to use relative links to these (needed for offline
# static HTML files), we first do a transformer
# engine-specific adaption of the configuration depending on
# the directory depth level of the outfile (as provided
# through the depth parameter), then we provide this adapted
# configuration to the transform call
if self.config:
adapted_config = self.t.getconfig(self.config, depth)
else:
adapted_config = None
outdata = self.t.transform(indata, adapted_config, parameters)
if uritransform:
self._transform_links(outdata.getroot(), uritransform)
return outdata

def _transform_links(self, tree, uritransform):
for part in tree:
# depth-first transformation seems the easiest
self._transform_links(part, uritransform)
if part.tag != "a": continue
uri = part.get("href")
if not uri: continue
part.set("href", uritransform(uri))
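
The uritransform argument is any callable that maps an href found in the transformed tree to the value that should appear in the output; a minimal hypothetical example (the URI scheme is made up):

# Hypothetical uritransform callable: rewrite canonical URIs to local
# relative .html links, and pass everything else through unchanged.
def localize(uri):
    if uri.startswith("http://example.org/doc/"):
        return uri.replace("http://example.org/doc/", "") + ".html"
    return uri

# outdata = transformer.transform(tree, depth=1, uritransform=localize)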


# accepts a file-like object, returns a file-like object
def transform_stream(self, instream,
parameters=None):
return self.t.native_to_stream(
self.transform(self.t.stream_to_native(instream),

))
self.transform(self.t.stream_to_native(instream)))

# accepts two filenames, reads from one, writes to the other
def transform_file(self, infile, outfile, parameters):
def transform_file(self, infile, outfile,
parameters=None, uritransform=None):
depth = self._depth(outfile, self.documentroot)
self.t.native_to_file(self.transform(self.t.file_to_native(infile),
depth,
parameters),
parameters,
uritransform),
outfile)

def _depth(self, outfile, root):
Expand All @@ -67,7 +94,7 @@ def __init__(self, template, templatedirs):
pass

class XSLTTransform(TransformerEngine):
def __init__(self, template, templatedirs):
def __init__(self, template, templatedirs, **kwargs):
self.format = True # FIXME: make configurable
self.templdir = self._setup_templates(template, templatedirs)
worktemplate = self.templdir + os.sep + os.path.basename(template)
@@ -102,40 +129,67 @@ def _setup_templates(self, template, templatedirs):
shutil.copy2(template, workdir)
return workdir

# getconfig may return different data depending on engine -- in this case
# it creates a xml file and returns the path for it
def getconfig(self, depth):
pass

# getconfig may return different data depending on engine -- in
# this case it creates a xml file and returns the path for it
def getconfig(self, configfile, depth):
filename = configfile
if depth != 0:
(base, ext) = os.path.splitext(configfile)
filename = "%(base)s-depth-%(depth)d%(ext)s" % locals()
if not util.outfile_is_newer([configfile], filename):
tree = etree.parse(configfile)
# adjust the relevant link attribute for some nodes
for xpath, attrib in (("stylesheets/link", "href"),
("javascripts/script", "src")):
for node in tree.findall(xpath):
# don't adjust absolute links
if not (re.match("(https?://|/)", node.get(attrib))):
node.set(attrib, "../"*depth + node.get(attrib))
tree.write(filename)
return filename
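
To illustrate the depth adaption (an assumed example, not taken from the commit): at depth 2, a relative href such as css/main.css in resources.xml is rewritten to ../../css/main.css in the derived resources-depth-2.xml, while absolute links (http://..., https://... or /...) are left alone. The prefixing rule, shown standalone:

import re

def adapt(href, depth):
    if re.match("(https?://|/)", href):   # absolute link: leave untouched
        return href
    return "../" * depth + href           # relative link: one "../" per level

adapt("css/main.css", 2)    # -> '../../css/main.css'
adapt("/css/main.css", 2)   # -> '/css/main.css'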

def transform(self, indata, config=None, parameters={}):
strparams = {}
if config:
strparams['configurationfile'] = XSLT.strparam(config)
for key, value in parameters.items():
if key.endswith("file"):
# relativize path of file relative to the XSL file
# we'll be using. The mechanism could be clearer...
value = os.path.relpath(value, self.templdir)
strparams[key] = XSLT.strparam(value)
try:
return self._transformer(indata,**strparams)
except etree.XSLTApplyError as e:
raise errors.TransformError(str(e.error_log))
if len(transform.error_log) > 0:
raise errors.TransformError(str(transform.error_log))

# nativedata = lxml.etree
def native_to_file(self, nativedata, outfile):
res = self.html5_doctype_workaround(
etree.tostring(nativedata, pretty_print=self.format))
util.ensure_dir(outfile)
with open(outfile,"wb") as fp:
fp.write(res)

@staticmethod
def html5_doctype_workaround(indata):
# FIXME: This is horrible
if indata.startswith(b"<remove-this-tag>"):
indata = b"<!DOCTYPE html>\n"+indata[17:-18].strip()
if indata[-1] == b"<" or indata[-1] == 60:
indata = indata[:-1]
found = False
endidx = -1
while not found:
if indata[endidx] == b"<" or indata[endidx] == 60:
found = True
else:
endidx -= 1
indata = b"<!DOCTYPE html>\n"+indata[17:endidx].strip()
return indata
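
A hypothetical before/after for the workaround, given the end-slicing logic above (the input is a made-up minimal document):

# The wrapper element emitted by base.xsl is stripped and replaced with a
# real HTML5 doctype.
dirty = b"<remove-this-tag><html><body/></html></remove-this-tag>"
clean = XSLTTransform.html5_doctype_workaround(dirty)
# clean == b"<!DOCTYPE html>\n<html><body/></html>"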

def file_to_native(self, infile):
return etree.parse(infile)

def transform(self, indata, parameters):
strparams = {}
for key, value in parameters.items():
strparams[key] = XSLT.strparam(value)
try:
return self._transformer(indata,**parameters)
except etree.XSLTApplyError as e:
raise errors.TransformError(str(e.error_log))
if len(transform.error_log) > 0:
raise errors.TransformError(str(transform.error_log))
# FIXME: hook in the transform_links step somehow?

class JinjaTransform(TransformerEngine):
12 changes: 9 additions & 3 deletions ferenda/triplestore.py
@@ -17,6 +17,7 @@
from six import text_type as str
from six import binary_type as bytes
from six.moves.urllib_parse import quote
import pyparsing

from ferenda.thirdparty import SQLite
from ferenda import util, errors
@@ -199,7 +200,10 @@ def triple_count(self, context=None):


def select(self, query, format="sparql"):
res = self.graph.query(query)
try:
res = self.graph.query(query)
except pyparsing.ParseException as e:
raise errors.SparqlError(e)
if format == "sparql":
return res.serialize(format="xml")
elif format == "json":
@@ -222,7 +226,10 @@ def construct(self, query):
:param query: A SPARQL query with all necessary prefixes defined.
:type query: str
"""
res = self.graph.query(query)
try:
res = self.graph.query(query)
except pyparsing.ParseException as e:
raise errors.SparqlError(e)
return res.graph

def clear(self, context=None):
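
With both call sites wrapped, a malformed query now surfaces as ferenda.errors.SparqlError instead of a raw pyparsing.ParseException. A hedged usage sketch, assuming store is an already-connected RDFLib-backed TripleStore:

from ferenda import errors

try:
    result = store.select("SELEKT ?s WHERE { ?s ?p ?o }", format="json")
except errors.SparqlError as e:
    print("query rejected:", e)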
@@ -418,7 +425,6 @@ def construct(self, query):
url = self._endpoint_url()
url += "?query=" + quote(query)
try:
r = requests.get(url)
format = "xml"
headers = {'Accept': self._contenttype[format]}
resp = requests.get(url, headers=headers, data=query)