#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
"""Base class for handling a repository of documents. This inludes
downloading them from a remote source, parsing the raw data into a
structured XHTML+RDFa representation, transforming them to
browser-ready HTML, and some other stuff."""
# this replaces the LegalSource classes with a single class that has
# sensible logging, layered config handling (file + command line args)
# and in general does a lot of heavy lifting
# system
import os,sys
import logging
import logging.handlers
import multiprocessing # either get python 2.6 or the backported multiprocessing module
from tempfile import mktemp
import codecs
from time import time
import functools
import xml.etree.cElementTree as ET
import xml.dom.minidom
from datetime import datetime
import re
import urllib
# 3rd party
import BeautifulSoup
from configobj import ConfigObj
from mechanize import Browser, LinkNotFoundError, RobustFactory, URLError
from genshi.template import TemplateLoader
from rdflib import Literal, Namespace, URIRef, RDF, RDFS
# Assume RDFLib 3.0
from rdflib import Graph, ConjunctiveGraph
from rdflib.plugins.parsers.ntriples import unquote as ntriple_unquote
import pyRdfa # needed by extract_rdfa below
# mine
import Util
from LegalRef import LegalRef, Link
from DataObjects import UnicodeStructure, CompoundStructure, \
MapStructure, IntStructure, DateStructure, PredicateType, \
UnicodeSubject, Paragraph, Section, \
serialize
from SesameStore import SesameStore
__version__ = (1,6)
__author__ = u"Staffan Malmgren <staffan@tomtebo.org>"
# Magicality to make sure printing of unicode objects works no matter
# what platform we're running on
if sys.platform == 'win32':
if sys.stdout.encoding:
defaultencoding = sys.stdout.encoding
else:
# print "sys.stdout.encoding not set"
defaultencoding = 'cp850'
else:
if sys.stdout.encoding:
defaultencoding = sys.stdout.encoding
if defaultencoding == 'ANSI_X3.4-1968': # really?!
defaultencoding = 'iso-8859-1'
else:
import locale
locale.setlocale(locale.LC_ALL,'')
defaultencoding = locale.getpreferredencoding()
# for some reason, resetting sys.stdout to a more forgiving writer on
# OSX (builtin python 2.6) results in a strict ascii
# writer. Investigate further...
if (sys.platform != "darwin" and sys.platform != "linux2"):
sys.stdout = codecs.getwriter(defaultencoding)(sys.__stdout__, 'replace')
sys.stderr = codecs.getwriter(defaultencoding)(sys.__stderr__, 'replace')
# Global/static functions - global_init and global_run are used when
# running actions in parallel using multiprocessing.Pool. The argument
# to Pool.map needs to be a single picklable function (i.e. not an
# instance method), which takes a single argument. We use an
# initializer (global_init) to set up some other arguments that the
# method (global_run) needs.
#
# I wonder if it has to be this complicated?
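#
# The intended usage (mirroring what run_all does further down) is roughly:
#
#   pool = multiprocessing.Pool(processes, global_init,
#                               (modulename, classname, argv))
#   results = pool.map(global_run, iterable_of_basefiles)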
__execute_module = None
__execute_class = None
__execute_args = None
def global_init(modulename,classname,args):
"""This is a helper function to make L{multiprocessing} work nice under Windows"""
global __execute_module, __execute_class, __execute_args
__execute_module = modulename
__execute_class = classname
__execute_args = args
#log = multiprocessing.get_logger()
#if log.handlers == []:
# h = logging.StreamHandler()
# h.setLevel(logging.INFO)
# h.setFormatter(logging.Formatter("[%(levelname)s/%(process)d] %(message)s"))
# log.addHandler(h)
# log.setLevel(logging.INFO)
#log.info("initializing %s %r" % (__execute_class, __execute_args))
def global_run(argument):
"""This is a helper function to make L{multiprocessing} work nice under Windows"""
global __execute_module, __execute_class, __execute_args
#log = multiprocessing.get_logger()
#log.info("running %s %r %s" % (__execute_class, __execute_args, argument))
mod = __import__(__execute_module)
cls = getattr(mod, __execute_class)
return cls.run(__execute_args, argument)
#class SaneNamespaceManager(NamespaceManager):
# def compute_qname(self, uri):
# if not uri in self.__cache:
# namespace, name = split_uri(uri)
# namespace = URIRef(namespace)
# prefix = self.store.prefix(namespace)
# if prefix is None:
# raise Exception("Prefix for %s not bound" % namespace)
# self.__cache[uri] = (prefix, namespace, name)
# return self.__cache[uri]
class DocumentRepository(object):
"""Base class for downloadning, parsing and generating HTML
versions of a repository of documents.
If you want to do stuff with a set of documents (particularly
documents that can be fetched over the web), like downloading
them, parsing the data into some structured format, and
(re-)generating HTML versions of them, this class contains lots of
stuff to help you.
You use it by creating a new class that inherits from this class,
and overriding methods in that class. To get a very simple example
    going, you only need to specify start_url and document_url (see the
    minimal sketch right after this docstring).
To get more control over parsing and HTML generation, you override
additional methods. There are eight main entry points into the
module, with the following principal call chains:
    download_new
        download_everything
            download_single
                downloaded_path
                download_if_needed
                remote_url
    parse
        parsed_path
        soup_from_basefile
        parse_from_soup
        render_xhtml
    relate
    generate
        generated_path
        prep_annotation_file
            graph_to_annotation_file
    toc
        toc_navigation
        toc_title
        toc_style
            toc_style_list | toc_style_table | toc_style_multicol
        toc_page
    news
        news_selections
        news_selection
    frontpage_content
    tabs
"""
module_dir = "base"
"""The directory where this module will store downloaded, parsed
and generated files. You need to override this."""
genshi_tempate = "genshi/generic.xhtml"
"""The U{Genshi<http://genshi.edgewall.org/>} template used to
transform the parsed object structure into a standard XML file. If
your data is complex, you might want to override this (and write
your own Genshi template). If you prefer other ways of
transforming your data into a serialized XML file, you might want
to override L{render_xhtml} altogether."""
xslt_template = "xsl/generic.xsl"
"""A template used to transform the XML file into browser-ready
HTML. If your document type is complex, you might want to override
this (and write your own XSLT transform). You should include
base.xslt in that template, though."""
rdf_type = Namespace(Util.ns['rinfo'])['Rattsinformationsdokument']
"""The RDF type of the documents you are handling (expressed as a RDFLib URIRef)."""
source_encoding = "iso-8859-1"
"""The character set that the source HTML documents use (if applicable)"""
lang = "en"
"""The language that the source documents are written in (unless
otherwise specified, and that output document should use"""
start_url = "http://example.org/"
"""The main entry page for the remote web store of documents. May
be a list of documents, a search form or whatever. If it's
something more complicated than a simple list of documents, you
need to override download_everything in order to tell which
documents are to be downloaded."""
document_url = "http://example.org/docs/%s.html"
basefile_template = ".*"
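    """A regular expression matching the part of a document link's text
    that should be used as the basefile (see download_everything)."""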
# If set, uses BeautifulSoup as parser even for downloading
# (parsing the navigation/search/index pages). It's more robust
    # against invalid HTML, but might be slower and seems to return
    # incorrect results for link.text if the link text contains markup
browser_use_robustfactory = False
# this is a replacement for DispatchMixin.dispatch with built-in
# support for running the *_all methods (parse_all, relate_all and
    # generate_all) in parallel using multiprocessing
@classmethod
def run(cls,argv=sys.argv[1:],*extra):
"""Method for running individual methods in a consistent and
multiprocessing-friendly manner. You don't need to override or
call this."""
# OptionParser seems to require that we define each and every
        # possible option beforehand. Since each module may have its
# own settings, this is not really possible
from collections import defaultdict
options = defaultdict(lambda:defaultdict(dict))
args = []
for arg in argv:
if arg.startswith("--"):
if "=" in arg:
(key,value) = arg.split("=",1)
else:
(key,value) = (arg, 'True')
                # Note: Options may not contain hyphens (i.e. they can't
# be called "parse-force")
parts = key[2:].split("-")
if len(parts) == 1:
options[parts[0]] = value
elif len(parts) == 2:
print "options[%s][%s] = %r" % (parts[0], parts[1], value)
options[parts[0]][parts[1]] = value
elif len(parts) == 3:
options[parts[0]][parts[1]][parts[2]] = value
else:
args.append(arg)
for arg in extra:
args.append(arg)
(configfile,config,moduleconfig) = cls.initialize_config(options)
from pprint import pprint
#pprint(config)
#pprint(moduleconfig)
if len(args) == 0:
cls.print_valid_commands()
elif args[0].endswith("_all"):
cls.run_all(args[0],argv,config)
else:
c = cls(options)
func = getattr(c,args[0])
return func(*args[1:])
@classmethod
def print_valid_commands(cls):
internal_commands = ("run", "print_valid_commands")
print "Valid commands are:", ", ".join(
[str(m) for m in dir(cls) if (m not in internal_commands and
not m.startswith("_") and
callable(getattr(cls, m)))]
)
    # How should download_all and relate_all be parallelized (if at
    # all)? For relate_all in particular we need to collect the
    # results from each relate call at the end and do some custom
    # processing on them.
@classmethod
def run_all(cls, func_name_all, argv, config):
start = time()
# replace "foo_all" with "foo" in the argument array we provide run()
func_name = func_name_all[:-4]
argv[argv.index(func_name_all)] = func_name
argv.append("--logfile=%s" % mktemp())
# FIXME: find out which module this class belongs to
global_init_args = (cls.__module__,cls.__name__, argv)
cls.setup(func_name_all, config)
iterable = cls.get_iterable_for(func_name_all,config['datadir'])
if 'processes' in config and int(config['processes']) > 1:
print "Running multiprocessing"
p = multiprocessing.Pool(int(config['processes']),global_init,global_init_args)
results = p.map(global_run,iterable)
else:
print "Not running multiprocessing"
global_init(*global_init_args)
results = []
for basefile in iterable:
results.append(global_run(basefile))
cls.teardown(func_name_all, config)
        # FIXME: This should use the logging infrastructure, but
        # setup_logger is an instance method
# ret = cls.collect_results_for(func_name_all, results)
print u'%s: OK (%.3f sec)' % (func_name_all,time()-start)
@classmethod
def get_iterable_for(cls,funcname,base_dir):
if funcname == "parse_all":
directory = os.path.sep.join((base_dir, cls.module_dir, u"downloaded"))
suffix = ".html"
elif funcname in ("generate_all", "relate_all"):
directory = os.path.sep.join((base_dir, cls.module_dir, u"parsed"))
suffix = ".xhtml"
for x in Util.listDirs(directory,suffix,reverse=True):
yield cls.basefile_from_path(x)
@classmethod
def setup(cls,funcname,config):
"""Runs before any of the *_all methods starts executing"""
cbl = getattr(cls, funcname + "_setup")
cbl(config)
@classmethod
def teardown(cls,funcname,config):
"""Runs after any of the *_all methods has finished executing"""
cbl = getattr(cls, funcname + "_teardown")
cbl(config)
# @classmethod
# def collect_results_for(cls,funcname,results):
# if funcname == "relate_all":
# # results will be an array of NT files. Combine them into
# # one big NT file, submit it to sesame, and store it as a
# # NT file. Things to find out: the sesame server location
# # the context URI the name of the NT file
# for f in results:
# pass
# else:
# pass # nothin' to do
@classmethod
def initialize_config(cls,options):
configfile = ConfigObj(os.path.dirname(__file__)+"/ferenda.conf")
# Normally, you should read from self.config rather than
# self.configfile as this will make sure command line
# arguments take precedence over config file parameters. The
# exception is if you wish to save some sort of state
# (eg. "last-processed-id-number") in the config file.
config = DocumentRepository.merge_dict_recursive(dict(configfile), options)
if cls.module_dir not in config:
config[cls.module_dir] = {}
moduleconfig = config[cls.module_dir]
return (configfile,config,moduleconfig)
@classmethod
def basefile_from_path(cls,path):
seg = os.path.splitext(path)[0].split(os.sep)
return ":".join(seg[seg.index(cls.module_dir)+2:])
@classmethod
def context(cls):
"""Return the context URI under which RDF statements should be stored."""
return "http://example.org/ctx/%s" % (cls.module_dir)
@staticmethod
def merge_dict_recursive(base,other):
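        # Recursively merges 'other' into 'base', with values from 'other'
        # taking precedence. For example:
        #   merge_dict_recursive({'a': 1, 'b': {'c': 2}}, {'b': {'d': 3}})
        # returns {'a': 1, 'b': {'c': 2, 'd': 3}}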
for (key,value) in other.items():
if (isinstance(value,dict) and
(key in base) and
(isinstance(base[key],dict))):
base[key] = DocumentRepository.merge_dict_recursive(base[key],value)
else:
base[key] = value
return base
def __init__(self,options):
(self.configfile,self.config,self.moduleconfig) = self.initialize_config(options)
# If we have a particular log level for this module, use that,
# otherwise use the global log level. If that isn't defined
# either, use the INFO loglevel.
if 'log' in self.moduleconfig:
loglevel = self.moduleconfig['log']
else:
loglevel = self.config.get('log','INFO')
self.log = self.setup_logger(self.module_dir,loglevel)
self.base_dir = self.config['datadir']
if self.browser_use_robustfactory:
self.browser = Browser(factory=RobustFactory())
else:
self.browser = Browser()
self.browser.addheaders = [('User-agent', 'lagen.nu-bot (staffan@lagen.nu)')]
# logger = logging.getLogger("mechanize")
# logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.DEBUG)
# self.browser.set_debug_http(True)
# self.browser.set_debug_responses(True)
# self.browser.set_debug_redirects(True)
self.ns = {'rinfo': Namespace(Util.ns['rinfo']),
'rinfoex':Namespace(Util.ns['rinfoex']),
'dct': Namespace(Util.ns['dct'])}
def get_globals(self):
"""If your submodule defines classes or functions which your
genshi template expects to find, you need to implement this
(with a single "return globals()" statement. This is in order to
feed your modules global bindings to Genshi"""
return globals()
def canonical_uri(self,basefile):
"""return the canonical URI for this particular document/resource."""
# Note that there might not be a 1:1 mappning between
# documents and URIs -- don't know what we should do in those
# cases.
#
# It might also be impossible to provide the canonical_uri
# without actually parse()ing the document
return "http://example.org/res/%s/%s" % (self.module_dir, basefile)
def get_logger(self,name):
"""Create an additional logger (which can be turned on or off
in the config file) for debug messages in particular areas of
the code"""
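        # For example, given a (hypothetical) config file section like
        #
        #   [base]
        #       [[tracing]]
        #       log = DEBUG
        #
        # self.get_logger('tracing') returns a DEBUG-level logger, while
        # areas without a 'log' setting stay at CRITICAL (effectively silent).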
# By default, don't really log anything (we'd like to create a
# logger with no handlers, but that prints out a warning
# message)
loglevel = self.moduleconfig[name].get('log','CRITICAL')
return self.setup_logger(name,loglevel)
def setup_logger(self,name,loglevel):
loglevels = {'DEBUG':logging.DEBUG,
'INFO':logging.INFO,
'WARNING':logging.WARNING,
'ERROR':logging.ERROR,
'CRITICAL':logging.CRITICAL}
if not isinstance(loglevel,int):
loglevel = loglevels[loglevel]
l = logging.getLogger(name)
if l.handlers == []:
h = logging.StreamHandler()
h.setLevel(loglevel)
h.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"))
l.addHandler(h)
l.setLevel(loglevel)
return l
def store_triple(self,subj,pred,obj):
# store this changelog under a different context than the
# actual content, since that gets blown away by relate_all
store = SesameStore(self.config['triplestore'],self.config['repository'],self.context()+"/modified")
store.add_triple((subj,pred,obj))
store.commit()
################################################################
#
# STEP 1: Download documents from the web
#
################################################################
# This is a very simple generic implementation. Assumes all
# documents are linked from a single page, that they all have URLs
# matching the document_url template, and that the link text is
# always equal to basefile. If these assumptions don't hold, you
# need to override this method.
def download_everything(self,usecache=False):
self.log.info("Starting at %s" % self.start_url)
self.browser.open(self.start_url)
url_regex = self.document_url.replace("%s", "(.*)")
# self.log.info("url_regex: %s" % url_regex)
for link in self.browser.links(predicate=lambda l:re.match(url_regex,l.absolute_url)):
# self.log.debug("Found link (%r)" % (link))
try:
basefile = re.search(self.basefile_template, link.text).group(0)
# self.log.debug("Transformed into basefile %s" % (basefile))
self.download_single(basefile,usecache,link.absolute_url)
except AttributeError:
self.log.error("Couldn't find basefile information in link text %s" % link.text)
def download_new(self):
self.download_everything(usecache=True)
def download_single(self,basefile,usecache=False,url=None):
"""Downloads the document from the web (unless explicitly
specified, the URL to download is determined by
self.document_url combined with basefile, the location on disk
is determined by the function self.download_path). If usecache
is set and the document exists on disk no download is
attempted.
Otherwise, if the document exists on disk, but the version on
the web is unchanged, the file on disk is left unchanged
(i.e. the timestamp is not modified).
Returns True if the document was downloaded and stored on
disk, False if the file on disk was not updated.
"""
if not url:
url = self.remote_url(basefile)
filename = self.downloaded_path(basefile)
# self.log.debug("Usecache is %s, existance of %s is %s" % (usecache, filename,os.path.exists(filename)))
if not usecache or not os.path.exists(filename):
existed = os.path.exists(filename)
if self.download_if_needed(url,filename):
# the downloaded file was updated (or created) --
# let's make a note of this in the RDF graph!
uri = self.canonical_uri(basefile)
self.store_triple(URIRef(uri), self.ns['dct']['modified'], Literal(datetime.now()))
if existed:
self.log.debug("%s existed, but a new version was downloaded" % filename)
else:
self.log.debug("%s did not exist, so it was downloaded" % filename)
return True
else:
self.log.debug("%s exists and is unchanged" % filename)
else:
self.log.debug("%s already exists" % (filename))
return False
def download_if_needed(self,url,filename):
"""Downloads the url to local filename if it's needed. The
default implementation always downloads the url, and if the
local file is already present, replaces it."""
# FIXME: Check the timestamp of filename (if it exists), and
        # do an If-Modified-Since request.
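        # A possible sketch of that (untested -- just an illustration using
        # stdlib helpers):
        #
        #   from email.utils import formatdate
        #   if os.path.exists(filename):
        #       self.browser.addheaders.append(
        #           ('If-modified-since',
        #            formatdate(os.path.getmtime(filename), usegmt=True)))
        #
        # and then treat a 304 response as "file unchanged".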
tmpfile = mktemp()
# self.log.debug("Retrieving %s to %s" % (url,filename))
try:
self.browser.retrieve(url,tmpfile)
return Util.replace_if_different(tmpfile,filename)
except URLError, e:
self.log.error("Failed to fetch %s: %s" % (url, e))
def remote_url(self,basefile):
return self.document_url % urllib.quote(basefile)
# Splits the basefile on a few common delimiters (/, : and space)
# and constructs a path from the segments
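    # For example, generic_path(u"2009:123", u"downloaded", ".html") returns
    # "<datadir>/base/downloaded/2009/123.html" (using the platform's path
    # separator), and basefile_from_path reverses that transformation.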
def generic_path(self,basefile,maindir,suffix):
segments = [self.base_dir, self.module_dir, maindir]
segments.extend(re.split("[/: ]", basefile))
return os.path.sep.join(segments)+suffix
def downloaded_path(self,basefile):
return self.generic_path(basefile,u'downloaded','.html')
################################################################
#
# STEP 2: Parse the downloaded data into a structured XML document
# with RDFa metadata.
#
################################################################
@classmethod
def parse_all_setup(cls, config):
pass
@classmethod
def parse_all_teardown(cls, config):
pass
# The boilerplate code for handling exceptions and logging time
# duration might be extracted to decorator functions (generate
# uses the same boilerplate code, as might other functions). Maybe
    # even the parseforce handling?
def parse(self,basefile):
"""Takes the raw data downloaded by the download functions and
parses it into a structured XML document with RDFa sprinkled
throughout. It will also save the same RDF statements in a
separate RDF/XML file.
You will need to provide your own parsing logic, but often
        it's easier to just override parse_from_soup (assuming your
        input data is in an HTML format parseable by BeautifulSoup) and let
the base class read and write the files."""
try:
start = time()
infile = self.downloaded_path(basefile)
outfile = self.parsed_path(basefile)
force = ('parseforce' in self.moduleconfig and
self.moduleconfig['parseforce'] == 'True')
if not force and Util.outfile_is_newer([infile],outfile):
self.log.debug(u"%s: Överhoppad", basefile)
return
self.log.debug(u"%s: Starting", basefile)
# the actual function code
soup = self.soup_from_basefile(basefile,self.source_encoding)
doc = self.parse_from_soup(soup,basefile)
self.render_xhtml(self.genshi_tempate, doc,
self.parsed_path(basefile), self.get_globals())
# Check to see that all metadata contained in doc.meta is
# present in the serialized file.
#print "doc['meta']:"
#print doc['meta'].serialize(format="nt")
#print
distilled_graph = Graph()
distilled_graph.parse(outfile,format="rdfa")
#print "distilled_graph:"
#print distilled_graph.serialize(format="nt")
#print
distilled_file = self.distilled_path(basefile)
Util.ensureDir(distilled_file)
distilled_graph.serialize(distilled_file,format="pretty-xml", encoding="utf-8")
self.log.debug(u'%s: %s triples extracted', basefile, len(distilled_graph))
for triple in distilled_graph:
len_before = len(doc['meta'])
doc['meta'].remove(triple)
len_after = len(doc['meta'])
# should this even be a warning? The parse step may add extra metadata in the text (eg inserting links, which may become dct:references triples)
#if len_before == len_after:
# (s,p,o) = triple
# self.log.warning("The triple '%s %s %s .' from the XHTML file was not found in the original metadata" % (s.n3(),p.n3(), o.n3()))
if doc['meta']:
self.log.warning("%d triple(s) from the original metadata was not found in the serialized XHTML file:" % len(doc['meta']))
print doc['meta'].serialize(format="nt")
self.log.info(u'%s: OK (%.3f sec)', basefile,time()-start)
except KeyboardInterrupt:
raise
except:
self.log.exception("parse of %s failed" % basefile)
if 'fatalexceptions' in self.config:
raise
def soup_from_basefile(self,basefile,encoding='iso-8859-1'):
"""Helper function."""
filename = self.downloaded_path(basefile)
return BeautifulSoup.BeautifulSoup(
codecs.open(filename,encoding=encoding,errors='replace').read(),
convertEntities='html')
def parse_from_soup(self,soup,basefile):
"""Returns a dict with the keys 'meta', 'body', 'uri' and
'lang'.
body should be an iterable object, but in particular
it must be compatible with whatever template you've set
genshi_template to (the default generic.xhtml assumes a tree
of iterable objects built upon the DataObjects base
classes).
        meta should be an RDFLib graph.
uri should be the canonical uri for this document, as used by
the above graph.
        lang should be an ISO language code, e.g. 'sv' or 'en'.
        The default implementation creates a simple representation of
        the page body, a small metadata graph containing the title, and
a generic uri based on the module_dir and basefile.
"""
# Default language unless we can find out from source doc?
# Check html/@xml:lang || html/@lang
root = soup.find('html')
try:
lang = root['xml:lang']
except KeyError:
try:
lang = root['lang']
except KeyError:
lang = self.lang
title = soup.find('title').string
# self.log.info("Title: %s" % title)
uri = self.canonical_uri(basefile)
# self.log.info("URI: %s" % uri)
meta = Graph()
meta.bind('dct',self.ns['dct'])
meta.add((URIRef(uri), self.ns['dct']['title'], Literal(title,lang=lang)))
meta.add((URIRef(uri), self.ns['dct']['identifier'], Literal(basefile)))
# remove all HTML comments, script tags
comments = soup.findAll(text=lambda text:isinstance(text, BeautifulSoup.Comment))
[comment.extract() for comment in comments]
scripts = soup.findAll('script')
[script.extract() for script in scripts]
# block-level elements that commonly directly contain text
body = CompoundStructure()
for block in soup.findAll(['blockquote', 'center','dt','dd','li','th','td','h1','h2','h3','h4','h5','h6','p', 'pre']):
t = Util.normalizeSpace(''.join(block.findAll(text=True)))
block.extract() # to avoid seeing it again
if t:
# self.log.info("Paragraph (%s %s): '%s...'" % (block.name, id(block), t[:20]))
body.append(Paragraph([t]))
return {'body':body,
'meta':meta,
'uri':uri,
'lang':lang}
def render_xhtml(self,template,doc,outfile,globals):
"""Serializes the parsed object structure into a XML file with
RDFa attributes, by using Genshi with a suitable template."""
# only look in cwd and this file's directory
loader = TemplateLoader(['.' , os.path.dirname(__file__)],
variable_lookup='lenient')
tmpl = loader.load(template)
stream = tmpl.generate(doc=doc,**globals)
try:
tmpfile = mktemp()
res = stream.render()
fp = open(tmpfile,"w")
fp.write(res)
fp.close()
Util.replace_if_different(tmpfile,outfile)
except Exception, e:
            self.log.error(u'Error during template rendering: %r' % (sys.exc_info()[1]))
raise
if 'class="warning"' in res:
start = res.index('class="warning">')
end = res.index('</',start+16)
msg = Util.normalizeSpace(res[start+16:end].decode('utf-8'))
            self.log.error(u'template error \'%s\'' % (msg[:80]))
return res
def parsed_path(self,basefile):
return self.generic_path(basefile,u'parsed','.xhtml')
def distilled_path(self,basefile):
return self.generic_path(basefile,u'distilled','.rdf')
################################################################
#
# STEP 3: Extract and store the RDF data
#
################################################################
@classmethod
def relate_all_setup(cls, config):
store = SesameStore(config['triplestore'],config['repository'],cls.context())
print "Clearing context %s at repository %s" % (cls.context(), config['repository'])
store.clear()
@classmethod
def relate_all_teardown(cls, config):
pass
def relate(self,basefile):
"""Insert the (previously distilled) RDF statements into the triple store"""
self.log.debug("Adding %s to triple store" % self.distilled_path(basefile))
data = open(self.distilled_path(basefile)).read()
store = SesameStore(self.config['triplestore'],self.config['repository'],self.context())
store.add_serialized(data,format="xml")
def extract_rdfa(self,filename):
"""Helper function to extract RDF data from any XML document
        containing RDFa attributes. Returns an RDFLib graph of the
triples found."""
dom = xml.dom.minidom.parse(filename)
o = pyRdfa.Options(space_preserve=False)
o.warning_graph = None
g = pyRdfa.parseRDFa(dom, "http://example.org/", options=o)
# clean up whitespace for Literals
#for tup in g:
# (o,p,s) = tup
# if isinstance(s,Literal):
# g.remove(tup)
# l = Literal(u' '.join(s.split()), lang=s.language, datatype=s.datatype)
# g.add((o,p,l))
return g
################################################################
#
# STEP 4: Generate browser-ready HTML with navigation panels,
# information about related documents and so on.
#
################################################################
@classmethod
def generate_all_setup(cls, config):
pass
@classmethod
def generate_all_teardown(cls, config):
pass
def generate(self,basefile):
"""Generate a browser-ready HTML file from the structured XML
file constructed by parse. The generation is done by XSLT, and
normally you won't need to override this, but you might want
to provide your own xslt file and set self.xslt_template to
the name of that file. If you want to generate your
browser-ready HTML by any other means than XSLT, you should
override this method."""
try:
start = time()
infile = self.parsed_path(basefile)
outfile = self.generated_path(basefile)
force = ('generateforce' in self.moduleconfig and
self.moduleconfig['generateforce'] == 'True')
if not force and Util.outfile_is_newer([infile],outfile):
self.log.debug(u"%s: Överhoppad", basefile)
return
self.log.debug(u"%s: Starting", basefile)
# The actual function code
annotation_file = self.prep_annotation_file(basefile)
if annotation_file:
# params = {'annotationfile':'../data/sfs/intermediate/%s.ann.xml' % basefile}
params = {'annotationfile':'../'+annotation_file.replace("\\","/")}
else:
params = {}
Util.transform(self.xslt_template,
infile,
outfile,
parameters = params,
validate=False)
self.log.info(u'%s: OK (%.3f sec)', basefile, time()-start)
except KeyboardInterrupt:
raise
except:
self.log.exception("parse of %s failed" % basefile)
def prep_annotation_file(self, basefile):
"""Helper function used by generate -- prepares a RDF/XML file
containing statements that in some way annotates the
information found in the document that generate handles, like
URI/title of other documents that refers to this one."""
return None
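    # A possible subclass sketch (illustrative only -- the URI below is made
    # up): collect the annotation triples into an rdflib Graph and hand it
    # to graph_to_annotation_file:
    #
    #   def prep_annotation_file(self, basefile):
    #       graph = Graph()
    #       graph.add((URIRef("http://example.org/res/other/doc"),
    #                  self.ns['dct']['references'],
    #                  URIRef(self.canonical_uri(basefile))))
    #       return self.graph_to_annotation_file(graph, basefile)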
    # Helper for the prep_annotation_file helper -- it expects an
# RDFLib graph, and returns (the path to a file with) the same in
# Grit format.
def graph_to_annotation_file(self,graph,basename):
infile = mktemp()
fp = open(infile,"w")
fp.write(graph.serialize(format="pretty-xml"))
fp.close()
outfile = self.annotation_path(basename)
Util.transform("xsl/rdfxml-grit.xslt",
infile,
outfile,
validate=False)
return outfile
def generated_path(self,basefile):
return self.generic_path(basefile,u'generated','.html')
def annotation_path(self,basefile):
return self.generic_path(basefile,u'intermediate','.ann.xml')
################################################################
#
    # STEP 5: Generate HTML pages for a TOC of all documents, news
# pages of new/updated documents, and other odds'n ends.
#
################################################################
def toc(self):
"""Creates a set of pages that together acts as a table of
contents for all documents in the repository. For smaller
repositories a single page might be enough, but for
repositoriees with a few hundred documents or more, there will
usually be one page for all documents starting with A,
starting with B, and so on. There might be different ways of
browseing/drilling down, i.e. both by title, publication year,
keyword and so on."""
# Step 1: Select a table that contains most of the interesting
# info, eg:
#
# URI dct:title dct:issued dct:identifier
#
# and convert it to a list of dicts
# GENERALIZE: Subclasses should be able to change the query by
# implementing eg self.toc_query()
sq = """PREFIX dct:<http://purl.org/dc/terms/>
SELECT ?uri ?title ?id
WHERE {?uri dct:title ?title .
?uri dct:identifier ?id }"""
store = SesameStore(self.config['triplestore'],
self.config['repository'],
self.context())
data = store.select(sq,"python")
        # Step 2: For each criterion (a criterion is an RDF predicate +
        # selector function like first_letter or year_part + sort
        # function) defined for the class:
        # GENERALIZE: criteria should be initialized from a list in
        # self.toc_categories. The list should be able to be very sparse,
        # like [self.ns['dct']['title'],self.ns['dct']['issued']], and
        # the initialization routine should add the appropriate
        # binding, label, selector and sorter (at least for standard
        # DCT predicates).
criteria = ({'predicate':self.ns['dct']['title'],
'binding':'title', # must match sparql query
                     'label':'Sorted by title', # GENERALIZE: This string must be controllable/localizable
'selector':lambda x: x[0].lower(),
'sorter':cmp,
'pages': []},
{'predicate':self.ns['dct']['identifier'],
'binding':'id',
'label':'Sorted by identifier',
'selector':lambda x: x[0].lower(),
'sorter':cmp,
'pages': []})
g = Graph()
for qname in self.ns:
g.bind(qname, self.ns[qname])
for criterion in criteria:
# 2.1 Create the list of possible values from the selector
# function and...
selector_values = {}
selector = criterion['selector']
binding = criterion['binding']
qname = g.qname(criterion['predicate'])
for row in data:
selector_values[selector(row[binding])] = True
# 2.1 cont: For each value:
for value in sorted(selector_values.keys(),cmp=criterion['sorter']):
# 2.1.1 Prepare a filename based on the rdf predicate and the selector
# func value, eg. toc/dct/title/a.xhtml
tmpfile = os.path.sep.join((self.base_dir,
self.module_dir,
u'toc',
qname.split(":")[0],
qname.split(":")[1],
value.lower()+u".xhtml"))
# 2.1.2 Collate all selector func values into a list of dicts:
# [{'label':'A','outfile':'toc/dct/title/a.xhtml',...},
                #  {'label':'B','outfile':'toc/dct/title/b.xhtml',...}]
criterion['pages'].append({'label':value,
# GENERALIZE: make localizable
# (toc_page(predicate,value))
'title':'Documents starting with "%s"' % value,
'tmpfile':tmpfile,
'outfile':tmpfile.replace(".xhtml",".html")})
selector_values = {}
        # 4: Now that we've created the necessary base data for each criterion,
        # iterate through it again
        # GENERALIZE: from this point, criteria is fully loaded and
        # not necessarily structured around RDF predicates. Sources
        # with more specialized toc requirements (such as having each
        # possible dct:creator as a primary criterion, and years in
        # dct:issued as a secondary) can construct the criteria
        # structure themselves. Therefore, all code above should be a
        # call to toc_criteria() or maybe toc_navigation()
for criterion in criteria:
selector = criterion['selector']
binding = criterion['binding']
selector_values = [x['label'] for x in criterion['pages']]
# 4.1 For each selector value (reuse list from 2.1):
for page in criterion['pages']:
label = page['label']
title = page['title']
content = []
# Find documents that match this particular selector value
for row in data:
if selector(row[binding]) == label:
# 4.1.2 Prepare a list of dicts called content, like:
# [{'uri':'http://example.org/res/basefile',
# 'title':'Basefile title'}]
content.append({'uri':row['uri'],
'label':row[binding]})
# 4.1.4 Prepare a non-browser ready XHTML page using
# genshi/generic-toc.xhtml and navigation (3), title
# (4.1.1) and content (4.1.2)
# GENERALIZE: Allow for other genshi templates
# implementing eg table, column or tag-cloud based
# layouts
self.log.debug("Rendering XHTML to %s" % page['tmpfile'])
self.render_xhtml("genshi/generic-toc.xhtml",
{'navigation':criteria,
'title':title,
'content':content,
'lang':self.lang},
page['tmpfile'],
self.get_globals())
# 4.1.5 Prepare a browser-ready HTML page using generic.xsl
self.log.debug("Rendering HTML to %s" % page['tmpfile'])
Util.transform('xsl/generic.xsl',page['tmpfile'],page['outfile'], validate=False)
self.log.info("Created %s" % page['outfile'])
# 5. as a final step, make an index.html by copying the very first page
mainindex = os.path.sep.join((self.base_dir,
self.module_dir,
u'toc',
u'index.html'))
Util.copy_if_different(criteria[0]['pages'][0]['outfile'], mainindex)
def news(self):
"""Creates a set of pages where each page contains a list of
new/updated documents. Each page gets created in HTML and Atom
formats. To control the set of pages, see news_selections."""
for selection in self.news_selections():
result = self.news_selection(selection,some_cutoff_date)
tmpfile = mktemp()
            self.render_xhtml('genshi/news.xhtml',None,tmpfile,
                              {'title':selection, 'entries':result})
Util.transform('xsl/news.xsl',tmpfile,outfile)
self.render_atom('genshi/news.atom')
def news_selections(self):
"""Returns a list of news page titles. Each one will be used
as an argument to news_selection."""
return ("Nya och ändrade dokument")
def news_selection(self, selection_name, cutoff_date):
"""Returns a list of news entries for a particular news page."""
if selection_name == "Nya och ändrade dokument":
# FIXME: This should either be a list of RDF graphs or a
# list of Atom-Entry objects
            return ({'title': 'Lag (2009:123) om blahonga',
                     'date': '2009-11-27', # published or updated
                     'uri':'urn:lex:sv:sfs:2009:123',
                     'body':'<p>A typical text with some <b>HTML</b> and <a href="urn:lex:sfs:2009:123">canonical linkz</a></p>',
                     'readmore':'Författningstext'},)
def frontpage_content(self, primary=False):
"""If the module wants to provide any particular content on
the frontpage, it can do so by returning a XHTML fragment (in
text form) here. If primary is true, the caller wants the
module to take primary responsibility for the frontpage
content. If primary is false, the caller only expects a
smaller amount of content (like a smaller presentation of the
repository and the document it contains)."""
return "<div><h2>Module %s</h2><p>Handles %s documents</p></div>" % (module_dir, rdf_type)
def tabs(self,primary=False):
"""returns a list of tuples, where each tuple will be rendered
as a tab in the main UI. Normally, a module will only return a
single tab."""
return ([rdf_type,module_dir])