Skip to content

Commit

Permalink
tests for compositerepository, remaining decorators, describer and st…
Browse files Browse the repository at this point in the history
…art of devel, plus made legalref / legaluri helper modules for sources.legal.se instead of general utilities
  • Loading branch information
staffanm committed Oct 11, 2013
1 parent 145ea47 commit 9e7e57d
Show file tree
Hide file tree
Showing 21 changed files with 352 additions and 140 deletions.
4 changes: 2 additions & 2 deletions ferenda/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@
import unittest

try:
from unittest.mock import Mock, patch, call
from unittest.mock import Mock, MagicMock, patch, call
except ImportError: # pragma: no cover
from mock import Mock, patch, call
from mock import Mock, MagicMock, patch, call
88 changes: 48 additions & 40 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,34 @@

import os

from . import DocumentRepository, DocumentStore

from ferenda import DocumentRepository, DocumentStore
from ferenda import util, errors

class CompositeStore(DocumentStore):

def __init__(self, datadir, downloaded_suffix=".html", storage_policy="file", docrepos=[]):
def __init__(self, datadir, downloaded_suffix=".html",
storage_policy="file",
docrepo_instances=None):
self.datadir = datadir # docrepo.datadir + docrepo.alias
self.downloaded_suffix = downloaded_suffix
self.storage_policy = storage_policy
self.docrepos = docrepos
if not docrepo_instances:
docrepo_instances = {}
self.docrepo_instances = docrepo_instances

def list_basefiles_for(self, action, basedir=None):
if not basedir:
basedir = self.datadir
if action == "parse":
documents = set()
for inst in self.docrepos:
# assert self.docrepo_instances, "No docrepos are defined!"
for cls, inst in self.docrepo_instances.items():
for basefile in inst.store.list_basefiles_for("parse"):
if basefile not in documents:
documents.add(basefile)
yield basefile
else:
for basefile in inst.store.list_basefiles_for(action):
for basefile in super(CompositeStore, self).list_basefiles_for(action):
yield basefile


Expand Down Expand Up @@ -54,58 +59,61 @@ def __init__(self, **kwargs):
self.store = self.documentstore_class(self.config.datadir + os.sep + self.alias,
downloaded_suffix=self.downloaded_suffix,
storage_policy=self.storage_policy,
docrepos=self._instances)
docrepo_instances=self._instances)

def download(self):
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
# make sure that our store has access to our now
# initialized subrepo objects
if c not in self.store.docrepo_instances:
self.store.docrepo_instances[c] = inst
inst.download()

# NOTE: this impl should NOT use the @managedparsing decorator
def parse(self, basefile):
start = time()
self.log.debug("%s: Starting", basefile)
ret = False
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
try:
# each parse method should be smart about whether to re-parse
# or not (i.e. use the @managedparsing decorator)
ret = inst.parse(basefile)
except errors.ParseError: # or others
ret = False
with util.logtime(self.log.info, "%(basefile)s OK (%(elapsed).3f sec)",
{'basefile': basefile}):
ret = False
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
try:
# each parse method should be smart about whether to re-parse
# or not (i.e. use the @managedparsing decorator)
ret = inst.parse(basefile)
except errors.ParseError: # or others
ret = False
if ret:
break
if ret:
break
if ret:
self.copy_parsed(basefile, inst)
self.copy_parsed(basefile, inst)
return ret

def copy_parsed(self, basefile, instance):
# If the distilled and parsed links are recent, assume that
# all external resources are OK as well
if (util.outfile_is_newer([instance.distilled_path(basefile)],
self.distilled_path(basefile)) and
util.outfile_is_newer([instance.parsed_path(basefile)],
self.parsed_path(basefile))):
self.log.debug(
"%s: External resources are (probably) up-to-date" % basefile)
if (util.outfile_is_newer([instance.store.distilled_path(basefile)],
self.store.distilled_path(basefile)) and
util.outfile_is_newer([instance.store.parsed_path(basefile)],
self.store.parsed_path(basefile))):
self.log.debug("%s: Attachments are (likely) up-to-date" % basefile)
return

util.link_or_copy(instance.store.distilled_path(basefile),
self.store.distilled_path(basefile))

util.link_or_copy(instance.store.parsed_path(basefile),
self.store.parsed_path(basefile))

cnt = 0
for attachment in instance.store.list_attachments(doc.basefile, "parsed"):
for attachment in instance.store.list_attachments(basefile, "parsed"):
cnt += 1
src = instance.store.parser_path(basename, attachment=attachment)
target = self.store.parsed_path(basename, attachment=attachment)
src = instance.store.parsed_path(basefile, attachment=attachment)
target = self.store.parsed_path(basefile, attachment=attachment)
util.link_or_copy(src, target)

util.link_or_copy(instance.distilled_path(basefile),
self.distilled_path(basefile))

util.link_or_copy(instance.parsed_path(basefile),
self.parsed_path(basefile))

if cnt:
self.log.debug("%s: Linked %s external resources from %s to %s" %
self.log.debug("%s: Linked %s attachments from %s to %s" %
(basefile,
cnt,
os.path.dirname(instance.parsed_path(basefile)),
os.path.dirname(self.parsed_path(basefile))))
os.path.dirname(instance.store.parsed_path(basefile)),
os.path.dirname(self.store.parsed_path(basefile))))
32 changes: 23 additions & 9 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,16 @@ def wrapper(self, doc):

def render(f):
"""Handles the serialization of the :py:class:`~ferenda.Document`
object to XHTML+RDFa and RDF/XML files. Must be used in conjunction
with :py:func:`~ferenda.decorators.makedocument`."""
object to XHTML+RDFa and RDF/XML files. Must be used in
conjunction with :py:func:`~ferenda.decorators.makedocument`.
"""
# NOTE: The actual rendering is two lines of code. The bulk of
# this function validates that the XHTML+RDFa file that we end up
# with contains the exact same triples as is present in the doc
# object (including both the doc.meta Graph and any other Graph
# that might be present on any doc.body object)

def iterate_graphs(node):
res = []
if hasattr(node, 'meta') and node.meta is not None:
Expand All @@ -97,28 +105,34 @@ def wrapper(self, doc):
# css file + background images + png renderings of text
self.create_external_resources(doc)

# Check to see that all metadata contained in doc.meta is
# present in the serialized file.
# Validate that all triples specified in doc.meta and any
# .meta property on any body object is present in the
# XHTML+RDFa file.
distilled_graph = Graph()

with codecs.open(self.store.parsed_path(doc.basefile), encoding="utf-8") as fp: # unicode
distilled_graph.parse(data=fp.read(), format="rdfa", publicID=doc.uri)
with codecs.open(self.store.parsed_path(doc.basefile),
encoding="utf-8") as fp: # unicode
distilled_graph.parse(data=fp.read(), format="rdfa",
publicID=doc.uri)
# The act of parsing from RDFa binds a lot of namespaces
            # in the graph in an unnecessary manner. Particularly it
# binds both 'dc' and 'dcterms' to
# 'http://purl.org/dc/terms/', which makes serialization
# less than predictable. Blow these prefixes away.
distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
distilled_graph.bind(
"dcterms", URIRef("http://example.org/this-prefix-should-not-be-used"))
"dcterms",
URIRef("http://example.org/this-prefix-should-not-be-used"))

util.ensure_dir(self.store.distilled_path(doc.basefile))
with open(self.store.distilled_path(doc.basefile), "wb") as distilled_file:
with open(self.store.distilled_path(doc.basefile),
"wb") as distilled_file:
# print("============distilled===============")
# print(distilled_graph.serialize(format="turtle").decode('utf-8'))
distilled_graph.serialize(distilled_file, format="pretty-xml")
self.log.debug(
'%s: %s triples extracted to %s', doc.basefile, len(distilled_graph), self.store.distilled_path(doc.basefile))
'%s: %s triples extracted to %s', doc.basefile,
len(distilled_graph), self.store.distilled_path(doc.basefile))

for g in iterate_graphs(doc.body):
doc.meta += g
Expand Down
8 changes: 4 additions & 4 deletions ferenda/describer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ def getvalue(self, p):
"""
values = list(self.getvalues(p))
if len(values) == 0:
raise KeyError("No objects for predicate %s" % p)
raise KeyError("No values for predicate %s" % p)
elif len(values) > 1:
raise KeyError("More than one object for predicatee %s" % p)
raise KeyError("More than one value for predicate %s" % p)
return values[0]

def getrel(self, p):
Expand All @@ -94,7 +94,7 @@ def getrel(self, p):
"""
refs = list(self.getrels(p))
if len(refs) == 0:
raise KeyError("No objects for predicate %s" + p)
raise KeyError("No objects for predicate %s" % p)
elif len(refs) > 1:
raise KeyError("More than one object for predicatee %s" + p)
raise KeyError("More than one object for predicate %s" % p)
return refs[0]
58 changes: 30 additions & 28 deletions ferenda/devel.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,6 @@ class Devel(object):
"""

alias = "devel"
# FIXME: manager.py should not strictly require these to be present

class DummyStore(object):

def __init__(self, path, **kwargs):
pass

def list_basefiles_for(self, action, basedir=None):
return []
downloaded_suffix = ".html"
storage_policy = "file"
documentstore_class = DummyStore

# Don't document this -- just needed for ferenda.manager compatibility
def get_default_options(self):
return {}

@decorators.action
def dumprdf(self, filename, format="turtle"):
Expand Down Expand Up @@ -309,34 +293,52 @@ def select(self, template, uri, format="json"):
p['triples'] = len(res)
print(res.serialize(format=format).decode('utf-8'))


# FIXME: These are dummy implementations of methods and class
# variables that manager.py expects all docrepos to have. We don't
# want to have coverage counting these as missing lines, hence the
# pragma: no cover comments.

class DummyStore(object):

def __init__(self, path, **kwargs):
pass # pragma: no cover

def list_basefiles_for(self, action, basedir=None):
return [] # pragma: no cover

documentstore_class = DummyStore
downloaded_suffix = ".html"
storage_policy = "file"

def get_default_options(self):
return {} # pragma: no cover

def download(self):
pass
pass # pragma: no cover

def parse(self, basefile):
pass
pass # pragma: no cover

def relate(self, basefile):
pass
pass # pragma: no cover

def generate(self, basefile):
pass
pass # pragma: no cover

def toc(self, otherrepos):
pass
pass # pragma: no cover

def news(self, otherrepos):
pass
pass # pragma: no cover

def status(self):
pass

def list_basefiles_for(self, command):
return []
pass # pragma: no cover

@classmethod
def setup(cls, action, config):
pass
pass # pragma: no cover

@classmethod
def teardown(cls, action, config):
pass
pass # pragma: no cover
5 changes: 4 additions & 1 deletion ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,10 @@ def list_basefiles_for(self, action, basedir=None):
suffix = ".rdf"
elif action == "generate":
directory = os.path.sep.join((basedir, "parsed"))
suffix = ".xhtml"
if self.storage_policy == "dir":
suffix = "index.xhtml"
else:
suffix = ".xhtml"
elif action == "news":
directory = os.path.sep.join((basedir, "entries"))
suffix = ".json"
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/general/wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# mine
from ferenda import DocumentRepository
from ferenda import util
from ferenda.legalref import LegalRef, Link
# from ferenda.legalref import LegalRef, Link

# FIXME: Need to dynamically set this namespace (by inspecting the root?)
# as it varies with MW version
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/eu/eurlexcaselaw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rdflib import Graph

from ferenda import DocumentRepository
from ferenda.legalref import LegalRef
from ferenda.sources.legal.se.legalref import LegalRef
from ferenda.elements import Paragraph

# FIXME: 2008.json, containing a handful of cases, some which should not be fetched, and one continuation link.
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/dv.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from ferenda import DocumentStore, Describer, WordReader
from ferenda.decorators import managedparsing
from ferenda import util
from ferenda.legalref import LegalRef, Link
from ferenda.sources.legal.se.legalref import LegalRef, Link
from ferenda.elements import Body, Paragraph
from . import SwedishLegalSource, RPUBL

Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/jk.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .swedishlegalsource import Stycke, Sektion
from ferenda.decorators import downloadmax, recordlastdownload
from ferenda import util
from ferenda.legalref import LegalRef, Link
from ferenda.sources.legal.se.legalref import LegalRef, Link


class JK(SwedishLegalSource):
Expand Down
Loading

0 comments on commit 9e7e57d

Please sign in to comment.