Skip to content

Commit

Permalink
new generic ifneeded decorator, replaces ifparseneeded
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 13, 2018
1 parent 905a33d commit feb7368
Show file tree
Hide file tree
Showing 11 changed files with 394 additions and 76 deletions.
2 changes: 2 additions & 0 deletions doc/examples/patents.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def parse(self,doc):
# actually work
def ocr_and_structure(self, doc):
return True # A-OK!
# FIXME: no one calls repo.parseneeded anymore; they should call
# repo.store.needed(basefile, "parse")
def parseneeded(self, basefile):
    """Example override: every basefile is always considered in need of parsing."""
    needed = True
    return needed
required_predicates = []
Expand Down
13 changes: 7 additions & 6 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,17 +184,18 @@ def download(self, basefile=None):
@updateentry("parse")
def parse(self, basefile):
# first, check if we really need to parse. If any subrepo
# returns that parseneeded is false and we have parsed file in
# the mainrepo, then we're done. This is mainly to avoid the
# log message below (to be in line with expected repo
# behaviour of not logging anything at severity INFO if no real
# work was done), it does not noticeably affect performance
# returns that .store.needed(...., "parse") is false and we
# have parsed file in the mainrepo, then we're done. This is
# mainly to avoid the log message below (to be in line with
# expected repo behaviour of not logging anything at severity
# INFO if no real work was done), it does not noticeably affect
# performance
force = (self.config.force is True or
self.config.parseforce is True)
if not force:
for c in self.subrepos:
inst = self.get_instance(c)
needed = inst.parseneeded(basefile)
needed = inst.store.needed(basefile, "parse")
if not needed and os.path.exists(self.store.parsed_path(basefile)):
self.log.debug("%s: Skipped" % basefile)
return True # signals everything OK
Expand Down
38 changes: 35 additions & 3 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,18 @@
import sys
import time
import logging
try:
    from inspect import getfullargspec
except ImportError:
    # py2's inspect lacks getfullargspec. Alias getargspec to the same
    # name so later calls to getfullargspec() work on both versions --
    # importing it under its own name would leave getfullargspec unbound
    # on py2 and cause a NameError at call time. getargspec exposes the
    # same .args attribute, which is all this module uses.
    from inspect import getargspec as getfullargspec

from rdflib import Graph, URIRef
from rdflib.compare import graph_diff
from layeredconfig import LayeredConfig

from ferenda import util
from ferenda import DocumentEntry
from ferenda.documentstore import Needed
from ferenda.errors import DocumentRemovedError, ParseError
from ferenda.elements import serialize

Expand Down Expand Up @@ -82,7 +87,7 @@ def parseifneeded(f):
it should be re-generated."""
@functools.wraps(f)
def wrapper(self, basefile):
# note: We hardcode the use of .parseneeded() and the
# note: We hardcode the use of .store.needed(..., "parse") and the
# 'parseforce' config option, which means that this decorator
# can only be used sensibly with the .parse() function.
force = (self.config.force is True or
Expand All @@ -95,6 +100,33 @@ def wrapper(self, basefile):
return f(self, basefile)
return wrapper

def ifneeded(action):
    """Decorator factory: makes a repo method run only when needed.

    Returns a decorator for repo methods of the form
    ``method(self, basefile, ...)``.  Before calling the wrapped method,
    the wrapper asks ``self.store.needed(basefile, action)`` (or assumes
    neededness when ``self.config.force`` is set) whether there is any
    work to do.  If not, the call is skipped and True is returned to
    signal that everything is OK.  If the wrapped function accepts a
    ``needed`` parameter, the needed-object (which may carry per-subtask
    detail, e.g. for relate) is passed along.

    :param action: the action name to check neededness for, e.g.
                   "parse", "relate" or "generate"
    """
    def outer_wrapper(f):
        # NOTE: the original signature took a stray *args here; a
        # decorator is only ever called with the function itself, so
        # extra positionals were silently discarded.
        @functools.wraps(f)
        def inner_wrapper(self, basefile, *args, **kwargs):
            if self.config.force:
                needed = Needed(reason="force is True")
            else:
                needed = self.store.needed(basefile, action)
            if not needed:
                self.log.debug("%s: %s skipped" % (basefile, action))
                return True  # signals that everything is OK
            reason = ""
            if hasattr(needed, 'reason'):
                reason = " (%s)" % needed.reason
            # This msg belongs at DEBUG level, but while troubleshooting
            # why certain files are re-processed it is logged slightly
            # higher (INFO + 1, to get around compositerepo's
            # supress_subrepo_logging feature)
            # self.log.debug("%s: %s starting%s" % (basefile, action, reason))
            self.log.log(logging.INFO + 1,
                         "%s: %s starting%s" % (basefile, action, reason))
            # only pass needed through if the wrapped function wants it
            # and the caller hasn't supplied it explicitly
            if 'needed' in getfullargspec(f).args and 'needed' not in kwargs:
                kwargs['needed'] = needed
            return f(self, basefile, *args, **kwargs)
        return inner_wrapper
    return outer_wrapper

def render(f):
"""Handles the serialization of the :py:class:`~ferenda.Document`
Expand Down Expand Up @@ -277,11 +309,11 @@ def wrapper(self, basefile):

def managedparsing(f):
"""Use all standard decorators for parse() in the correct order
(:py:func:`~ferenda.decorators.parseifneeded`,
(:py:func:`~ferenda.decorators.ifneeded`, :py:func:`~ferenda.decorators.updateentry`,
:py:func:`~ferenda.decorators.makedocument`,
:py:func:`~ferenda.decorators.timed`,
:py:func:`~ferenda.decorators.render`)"""
return parseifneeded(
return ifneeded('parse')(
updateentry('parse')(
makedocument(
# handleerror( # is this really a good idea?
Expand Down
62 changes: 16 additions & 46 deletions ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
from ferenda.elements import (Body, Link,
UnorderedList, ListItem, Paragraph)
from ferenda.elements.html import elements_from_soup
from ferenda.documentstore import Relate
from ferenda.documentstore import RelateNeeded
# establish two central RDF Namespaces at the top level
DCTERMS = Namespace(util.ns['dcterms'])
PROV = Namespace(util.ns['prov'])
Expand Down Expand Up @@ -1118,20 +1118,9 @@ def parse_all_teardown(cls, config, *args, **kwargs):
this might change to a instance method.
"""

def parseneeded(self, basefile):
    """Return True iff there is a need to parse the given basefile.

    There is typically no reason to parse when the parsed result file
    already exists and is newer than the downloaded source file.
    """
    source = self.store.downloaded_path(basefile)
    result = self.store.parsed_path(basefile)
    return not util.outfile_is_newer([source], result)

@decorators.action
@decorators.managedparsing
def parse(self, doc):
def parse(self, doc, needed=True):
"""Parse downloaded documents into structured XML and RDF.
It will also save the same RDF statements in a separate
Expand Down Expand Up @@ -1820,8 +1809,9 @@ def relate_all_teardown(cls, config, *args, **kwargs):
return True

@decorators.action
@decorators.ifneeded('relate')
@decorators.updateentry('relate')
def relate(self, basefile, otherrepos=[]):
def relate(self, basefile, otherrepos=[], needed=RelateNeeded(True,True,True)):
"""Runs various indexing operations for the document.
This includes inserting RDF statements into a triple store,
Expand All @@ -1835,13 +1825,6 @@ def relate(self, basefile, otherrepos=[]):
(basefile, self.alias))
return False
entry = DocumentEntry(self.store.documententry_path(basefile))
if self.config.force:
relate = Relate(True, True, True)
else:
relate = self.store.needed(basefile, "relate")
if not(relate.triples or relate.dependencies or relate.fulltext):
self.log.debug("%s: skipped relate" % basefile)
return
timings = {'basefile': basefile,
'e_triples': 0,
'e_deps': 0,
Expand All @@ -1862,18 +1845,18 @@ def relate(self, basefile, otherrepos=[]):
if self not in otherrepos:
otherrepos.append(self)

if self.config.fulltextindex and relate.fulltext:
if self.config.fulltextindex and needed.fulltext:
start = time.time()
timings['v_fulltext'] = self.relate_fulltext(basefile, otherrepos)
timings['e_fulltext'] = time.time() - start
entry.indexed_ft = datetime.now()

if relate.dependencies:
if needed.dependencies:
start = time.time()
timings['v_deps'] = self.relate_dependencies(basefile, otherrepos)
timings['e_deps'] = time.time() - start
entry.indexed_dep = datetime.now()
if relate.triples:
if needed.triples:
# If using the Bulk upload feature, append to the
# temporary file that is to be bulk uploaded (see
# relate_all_setup).
Expand Down Expand Up @@ -2372,8 +2355,9 @@ def generate_all_teardown(cls, config, *args, **kwargs):
"""

@decorators.action
@decorators.ifneeded('generate')
@decorators.updateentry('generate')
def generate(self, basefile, otherrepos=[]):
def generate(self, basefile, otherrepos=[], needed=True):
"""Generate a browser-ready HTML file from structured XML and RDF.
Uses the XML and RDF files constructed by
Expand All @@ -2392,37 +2376,23 @@ def generate(self, basefile, otherrepos=[]):
:type basefile: str
:returns: None
"""
# This dependency management could be abstracted away like
# the parseifneeded decorator does for parse(). But unlike
# parse(), no one is expected to override generate(), so
# the proper place to handle this complexity is probably
# here.
infile = self.store.parsed_path(basefile)

annotations = self.store.annotation_path(basefile)
if os.path.exists(self.store.dependencies_path(basefile)):
deptxt = util.readfile(self.store.dependencies_path(basefile))
dependencies = deptxt.strip().split("\n")
else:
dependencies = []
dependencies.extend((infile, annotations))

outfile = self.store.generated_path(basefile)
if ((not self.config.force) and
util.outfile_is_newer(dependencies, outfile)):
self.log.debug("%s: Skipped", basefile)
return

with util.logtime(self.log.info, "%(basefile)s: generate OK (%(elapsed).3f sec)",
{'basefile': basefile}):

self.log.debug("%s: Starting", basefile)

# All bookkeping done, now lets prepare and transform!

infile = self.store.parsed_path(basefile)
outfile = self.store.generated_path(basefile)
# The annotationfile might be newer than all dependencies
# (and thus not need regenerateion) even though the
# outfile is older.
if os.path.exists(self.store.dependencies_path(basefile)):
deptxt = util.readfile(self.store.dependencies_path(basefile))
dependencies = deptxt.strip().split("\n")
else:
dependencies = []
if (self.config.force or (not
util.outfile_is_newer(dependencies, self.store.annotation_path(basefile)))):
with util.logtime(self.log.debug,
Expand Down
64 changes: 52 additions & 12 deletions ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,25 @@ def closed(self):
def name(self):
return self.fp.name

Relate = namedtuple('Relate', ['fulltext', 'dependencies', 'triples'])
class Needed(int):
    """An always-truthy marker that some action needs to be performed.

    Carries a *reason* attribute (a human-readable explanation of why
    the action is needed), and subclasses int so instances can stand in
    wherever a plain truth value was previously returned.
    """

    def __new__(cls, *args, **kwargs):
        instance = int.__new__(cls, *args)
        instance.reason = kwargs['reason']
        return instance

    def __bool__(self):
        # The mere existence of a Needed object means "yes, needed",
        # regardless of its integer value.
        return True

# Per-subtask neededness for relate(): each element is a plain bool or a
# Needed object, telling whether that particular sub-step must run.
RelateNeeded = namedtuple('RelateNeeded', ['fulltext', 'dependencies', 'triples'])

# make this namedtuple class work in a bool context: False iff all
# elements are falsy
RelateNeeded.__bool__ = lambda self: any(self)

# for reason, return the first truthy element's reason.  Returns None
# when no element is truthy, or when the first truthy element is a plain
# bool without a .reason attribute (previously these cases raised
# StopIteration resp. AttributeError -- the old FIXME).
RelateNeeded.reason = property(
    lambda self: next((getattr(x, 'reason', None) for x in self if x), None))



class DocumentStore(object):
"""Unifies handling of reading and writing of various data files
Expand Down Expand Up @@ -351,25 +365,42 @@ def open(self, basefile, maindir, suffix, mode="r",


def needed(self, basefile, action):
"""Determine if we really need to perform *action* for the given
*basefile*, or if the result of the action (in the form of the file
that the action creates, or similar) is newer than all of the actions
dependencies (in the form of source files for the action).
"""

# if this function is even called, it means that force is not
# true (or ferenda-build.py has not been called with a single
# basefile, which is an implied force)
if action == "parse":
infile = self.downloaded_path(basefile)
outfile = self.parsed_path(basefile)
return not util.outfile_is_newer([infile], outfile)
newer = util.outfile_is_newer([infile], outfile)
if not newer:
return Needed(reason=getattr(newer, 'reason', None))
else:
return False
elif action == "relate":
entry = DocumentEntry(self.documententry_path(basefile))
def newer(filename, dt):
def newer(filename, dt, field):
if not os.path.exists(filename):
return False
elif not dt: # has never been indexed
return True
return Needed(reason="%s has not been processed" % filename)
else:
return datetime.fromtimestamp(os.stat(filename).st_mtime) > dt
return Relate(fulltext=newer(self.parsed_path(basefile), entry.indexed_ft),
triples=newer(self.distilled_path(basefile), entry.indexed_ts),
dependencies=newer(self.distilled_path(basefile), entry.indexed_dep))
if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
else:
return False

return RelateNeeded(
fulltext=newer(self.parsed_path(basefile), entry.indexed_ft, 'indexed_ft'),
triples=newer(self.distilled_path(basefile), entry.indexed_ts, 'indexed_ts'),
dependencies=newer(self.dependencies_path(basefile), entry.indexed_dep,
'indexed_dep'))
elif action == "generate":
infile = self.parsed_path(basefile)
annotations = self.annotation_path(basefile)
Expand All @@ -380,7 +411,16 @@ def newer(filename, dt):
dependencies = []
dependencies.extend((infile, annotations))
outfile = self.generated_path(basefile)
return not util.outfile_is_newer(dependencies, outfile)
# support generated 404 files (when served through HTTP,
# served with HTTP status 404, but otherwise works just as
# regular generated files)
if not os.path.exists(outfile) and os.path.exists(outfile + ".404"):
outfile += ".404"
newer = util.outfile_is_newer(dependencies, outfile)
if not newer:
return Needed(reason = getattr(newer, 'reason', None))
else:
return False
else:
# custom actions will need to override needed and provide logic there
return True
Expand Down
2 changes: 1 addition & 1 deletion ferenda/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ def _run_class(enabled, argv, config):
# if we don't need to parse all basefiles, let's not
# even send jobs out to buildclients if we can avoid
# it
iterable = (x for x in iterable if inst.parseneeded(x))
iterable = (x for x in iterable if inst.store.needed(x, "parse"))
res = []
# semi-magic handling
kwargs['currentrepo'] = inst
Expand Down
27 changes: 24 additions & 3 deletions ferenda/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,18 +387,39 @@ def copy_if_different(src, dest):

# util.File

class OutfileIsNotNewer(int):
    """An always-falsy result for outfile_is_newer, carrying a *reason*.

    Evaluates to False in a bool context (the outfile is stale or
    missing) while still exposing a human-readable explanation via its
    *reason* attribute.
    """

    def __new__(cls, *args, **kwargs):
        instance = int.__new__(cls, *args)
        instance.reason = kwargs['reason']
        return instance

    def __bool__(self):
        # The mere existence of this object means "not newer".
        return False


def outfile_is_newer(infiles, outfile):
    """Check if *outfile* is newer than all of the files in *infiles*.

    Newer is defined as having a more recent modification time.  Returns
    True if so, a falsy value otherwise (including if outfile doesn't
    exist).

    If outfile isn't newer, the returned value evaluates to False in a
    bool context but also carries a *reason* attribute: a text
    description of which infile was newer than outfile (or that outfile
    is missing).
    """
    if not os.path.exists(outfile):
        return OutfileIsNotNewer(reason="outfile doesn't exist: %s" % outfile)
    outfile_mtime = os.stat(outfile).st_mtime
    for f in infiles:
        # an infile that doesn't exist can't make outfile stale; skip it
        if os.path.exists(f) and os.stat(f).st_mtime > outfile_mtime:
            return OutfileIsNotNewer(
                reason="%s is newer than outfile %s" % (f, outfile))
    return True

Expand Down
Loading

0 comments on commit feb7368

Please sign in to comment.