Skip to content

Commit

Permalink
tests for compositerepository, remaining decorators, describer and st…
Browse files Browse the repository at this point in the history
…art of devel, plus made legalref / legaluri helper modules for sources.legal.se instead of general utilities
  • Loading branch information
staffanm committed Oct 11, 2013
1 parent 145ea47 commit 9e7e57d
Show file tree
Hide file tree
Showing 21 changed files with 352 additions and 140 deletions.
4 changes: 2 additions & 2 deletions ferenda/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@
import unittest

try:
from unittest.mock import Mock, patch, call
from unittest.mock import Mock, MagicMock, patch, call
except ImportError: # pragma: no cover
from mock import Mock, patch, call
from mock import Mock, MagicMock, patch, call
88 changes: 48 additions & 40 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,34 @@

import os

from . import DocumentRepository, DocumentStore

from ferenda import DocumentRepository, DocumentStore
from ferenda import util, errors

class CompositeStore(DocumentStore):

def __init__(self, datadir, downloaded_suffix=".html", storage_policy="file", docrepos=[]):
def __init__(self, datadir, downloaded_suffix=".html",
storage_policy="file",
docrepo_instances=None):
self.datadir = datadir # docrepo.datadir + docrepo.alias
self.downloaded_suffix = downloaded_suffix
self.storage_policy = storage_policy
self.docrepos = docrepos
if not docrepo_instances:
docrepo_instances = {}
self.docrepo_instances = docrepo_instances

def list_basefiles_for(self, action, basedir=None):
if not basedir:
basedir = self.datadir
if action == "parse":
documents = set()
for inst in self.docrepos:
# assert self.docrepo_instances, "No docrepos are defined!"
for cls, inst in self.docrepo_instances.items():
for basefile in inst.store.list_basefiles_for("parse"):
if basefile not in documents:
documents.add(basefile)
yield basefile
else:
for basefile in inst.store.list_basefiles_for(action):
for basefile in super(CompositeStore, self).list_basefiles_for(action):
yield basefile


Expand Down Expand Up @@ -54,58 +59,61 @@ def __init__(self, **kwargs):
self.store = self.documentstore_class(self.config.datadir + os.sep + self.alias,
downloaded_suffix=self.downloaded_suffix,
storage_policy=self.storage_policy,
docrepos=self._instances)
docrepo_instances=self._instances)

def download(self):
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
# make sure that our store has access to our now
# initialized subrepo objects
if c not in self.store.docrepo_instances:
self.store.docrepo_instances[c] = inst
inst.download()

# NOTE: this impl should NOT use the @managedparsing decorator
def parse(self, basefile):
start = time()
self.log.debug("%s: Starting", basefile)
ret = False
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
try:
# each parse method should be smart about whether to re-parse
# or not (i.e. use the @managedparsing decorator)
ret = inst.parse(basefile)
except errors.ParseError: # or others
ret = False
with util.logtime(self.log.info, "%(basefile)s OK (%(elapsed).3f sec)",
{'basefile': basefile}):
ret = False
for c in self.subrepos:
inst = self.get_instance(c, self.myoptions)
try:
# each parse method should be smart about whether to re-parse
# or not (i.e. use the @managedparsing decorator)
ret = inst.parse(basefile)
except errors.ParseError: # or others
ret = False
if ret:
break
if ret:
break
if ret:
self.copy_parsed(basefile, inst)
self.copy_parsed(basefile, inst)
return ret

def copy_parsed(self, basefile, instance):
# If the distilled and parsed links are recent, assume that
# all external resources are OK as well
if (util.outfile_is_newer([instance.distilled_path(basefile)],
self.distilled_path(basefile)) and
util.outfile_is_newer([instance.parsed_path(basefile)],
self.parsed_path(basefile))):
self.log.debug(
"%s: External resources are (probably) up-to-date" % basefile)
if (util.outfile_is_newer([instance.store.distilled_path(basefile)],
self.store.distilled_path(basefile)) and
util.outfile_is_newer([instance.store.parsed_path(basefile)],
self.store.parsed_path(basefile))):
self.log.debug("%s: Attachments are (likely) up-to-date" % basefile)
return

util.link_or_copy(instance.store.distilled_path(basefile),
self.store.distilled_path(basefile))

util.link_or_copy(instance.store.parsed_path(basefile),
self.store.parsed_path(basefile))

cnt = 0
for attachment in instance.store.list_attachments(doc.basefile, "parsed"):
for attachment in instance.store.list_attachments(basefile, "parsed"):
cnt += 1
src = instance.store.parser_path(basename, attachment=attachment)
target = self.store.parsed_path(basename, attachment=attachment)
src = instance.store.parsed_path(basefile, attachment=attachment)
target = self.store.parsed_path(basefile, attachment=attachment)
util.link_or_copy(src, target)

util.link_or_copy(instance.distilled_path(basefile),
self.distilled_path(basefile))

util.link_or_copy(instance.parsed_path(basefile),
self.parsed_path(basefile))

if cnt:
self.log.debug("%s: Linked %s external resources from %s to %s" %
self.log.debug("%s: Linked %s attachments from %s to %s" %
(basefile,
cnt,
os.path.dirname(instance.parsed_path(basefile)),
os.path.dirname(self.parsed_path(basefile))))
os.path.dirname(instance.store.parsed_path(basefile)),
os.path.dirname(self.store.parsed_path(basefile))))
32 changes: 23 additions & 9 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,16 @@ def wrapper(self, doc):

def render(f):
"""Handles the serialization of the :py:class:`~ferenda.Document`
object to XHTML+RDFa and RDF/XML files. Must be used in conjunction
with :py:func:`~ferenda.decorators.makedocument`."""
object to XHTML+RDFa and RDF/XML files. Must be used in
conjunction with :py:func:`~ferenda.decorators.makedocument`.
"""
# NOTE: The actual rendering is two lines of code. The bulk of
# this function validates that the XHTML+RDFa file that we end up
# with contains the exact same triples as is present in the doc
# object (including both the doc.meta Graph and any other Graph
# that might be present on any doc.body object)

def iterate_graphs(node):
res = []
if hasattr(node, 'meta') and node.meta is not None:
Expand All @@ -97,28 +105,34 @@ def wrapper(self, doc):
# css file + background images + png renderings of text
self.create_external_resources(doc)

# Check to see that all metadata contained in doc.meta is
# present in the serialized file.
# Validate that all triples specified in doc.meta and any
# .meta property on any body object is present in the
# XHTML+RDFa file.
distilled_graph = Graph()

with codecs.open(self.store.parsed_path(doc.basefile), encoding="utf-8") as fp: # unicode
distilled_graph.parse(data=fp.read(), format="rdfa", publicID=doc.uri)
with codecs.open(self.store.parsed_path(doc.basefile),
encoding="utf-8") as fp: # unicode
distilled_graph.parse(data=fp.read(), format="rdfa",
publicID=doc.uri)
# The act of parsing from RDFa binds a lot of namespaces
            # in the graph in an unnecessary manner. Particularly it
# binds both 'dc' and 'dcterms' to
# 'http://purl.org/dc/terms/', which makes serialization
# less than predictable. Blow these prefixes away.
distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
distilled_graph.bind(
"dcterms", URIRef("http://example.org/this-prefix-should-not-be-used"))
"dcterms",
URIRef("http://example.org/this-prefix-should-not-be-used"))

util.ensure_dir(self.store.distilled_path(doc.basefile))
with open(self.store.distilled_path(doc.basefile), "wb") as distilled_file:
with open(self.store.distilled_path(doc.basefile),
"wb") as distilled_file:
# print("============distilled===============")
# print(distilled_graph.serialize(format="turtle").decode('utf-8'))
distilled_graph.serialize(distilled_file, format="pretty-xml")
self.log.debug(
'%s: %s triples extracted to %s', doc.basefile, len(distilled_graph), self.store.distilled_path(doc.basefile))
'%s: %s triples extracted to %s', doc.basefile,
len(distilled_graph), self.store.distilled_path(doc.basefile))

for g in iterate_graphs(doc.body):
doc.meta += g
Expand Down
8 changes: 4 additions & 4 deletions ferenda/describer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ def getvalue(self, p):
"""
values = list(self.getvalues(p))
if len(values) == 0:
raise KeyError("No objects for predicate %s" % p)
raise KeyError("No values for predicate %s" % p)
elif len(values) > 1:
raise KeyError("More than one object for predicatee %s" % p)
raise KeyError("More than one value for predicate %s" % p)
return values[0]

def getrel(self, p):
Expand All @@ -94,7 +94,7 @@ def getrel(self, p):
"""
refs = list(self.getrels(p))
if len(refs) == 0:
raise KeyError("No objects for predicate %s" + p)
raise KeyError("No objects for predicate %s" % p)
elif len(refs) > 1:
raise KeyError("More than one object for predicatee %s" + p)
raise KeyError("More than one object for predicate %s" % p)
return refs[0]
58 changes: 30 additions & 28 deletions ferenda/devel.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,6 @@ class Devel(object):
"""

alias = "devel"
# FIXME: manager.py should not strictly require these to be present

class DummyStore(object):

def __init__(self, path, **kwargs):
pass

def list_basefiles_for(self, action, basedir=None):
return []
downloaded_suffix = ".html"
storage_policy = "file"
documentstore_class = DummyStore

# Don't document this -- just needed for ferenda.manager compatibility
def get_default_options(self):
return {}

@decorators.action
def dumprdf(self, filename, format="turtle"):
Expand Down Expand Up @@ -309,34 +293,52 @@ def select(self, template, uri, format="json"):
p['triples'] = len(res)
print(res.serialize(format=format).decode('utf-8'))


# FIXME: These are dummy implementations of methods and class
# variables that manager.py expects all docrepos to have. We don't
# want to have coverage counting these as missing lines, hence the
# pragma: no cover comments.

class DummyStore(object):

def __init__(self, path, **kwargs):
pass # pragma: no cover

def list_basefiles_for(self, action, basedir=None):
return [] # pragma: no cover

documentstore_class = DummyStore
downloaded_suffix = ".html"
storage_policy = "file"

def get_default_options(self):
return {} # pragma: no cover

def download(self):
pass
pass # pragma: no cover

def parse(self, basefile):
pass
pass # pragma: no cover

def relate(self, basefile):
pass
pass # pragma: no cover

def generate(self, basefile):
pass
pass # pragma: no cover

def toc(self, otherrepos):
pass
pass # pragma: no cover

def news(self, otherrepos):
pass
pass # pragma: no cover

def status(self):
pass

def list_basefiles_for(self, command):
return []
pass # pragma: no cover

@classmethod
def setup(cls, action, config):
pass
pass # pragma: no cover

@classmethod
def teardown(cls, action, config):
pass
pass # pragma: no cover
5 changes: 4 additions & 1 deletion ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,10 @@ def list_basefiles_for(self, action, basedir=None):
suffix = ".rdf"
elif action == "generate":
directory = os.path.sep.join((basedir, "parsed"))
suffix = ".xhtml"
if self.storage_policy == "dir":
suffix = "index.xhtml"
else:
suffix = ".xhtml"
elif action == "news":
directory = os.path.sep.join((basedir, "entries"))
suffix = ".json"
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/general/wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# mine
from ferenda import DocumentRepository
from ferenda import util
from ferenda.legalref import LegalRef, Link
# from ferenda.legalref import LegalRef, Link

# FIXME: Need to dynamically set this namespace (by inspecting the root?)
# as it varies with MW version
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/eu/eurlexcaselaw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rdflib import Graph

from ferenda import DocumentRepository
from ferenda.legalref import LegalRef
from ferenda.sources.legal.se.legalref import LegalRef
from ferenda.elements import Paragraph

# FIXME: 2008.json, containing a handful of cases, some which should not be fetched, and one continuation link.
Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/dv.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from ferenda import DocumentStore, Describer, WordReader
from ferenda.decorators import managedparsing
from ferenda import util
from ferenda.legalref import LegalRef, Link
from ferenda.sources.legal.se.legalref import LegalRef, Link
from ferenda.elements import Body, Paragraph
from . import SwedishLegalSource, RPUBL

Expand Down
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/jk.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .swedishlegalsource import Stycke, Sektion
from ferenda.decorators import downloadmax, recordlastdownload
from ferenda import util
from ferenda.legalref import LegalRef, Link
from ferenda.sources.legal.se.legalref import LegalRef, Link


class JK(SwedishLegalSource):
Expand Down
Loading

0 comments on commit 9e7e57d

Please sign in to comment.