Skip to content

Commit

Permalink
new generic ifneeded decorator, replaces ifparseneeded
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 13, 2018
1 parent 905a33d commit feb7368
Show file tree
Hide file tree
Showing 11 changed files with 394 additions and 76 deletions.
2 changes: 2 additions & 0 deletions doc/examples/patents.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def parse(self,doc):
# actually work
def ocr_and_structure(self, doc):
return True # A-OK!
# FIXME: no one calls repo.parseneeded anymore; they should call
# repo.store.needed(basefile, "parse")
def parseneeded(self, basefile):
    """Example override: every basefile is always considered in need of parsing."""
    needed = True
    return needed
required_predicates = []
Expand Down
13 changes: 7 additions & 6 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,17 +184,18 @@ def download(self, basefile=None):
@updateentry("parse")
def parse(self, basefile):
# first, check if we really need to parse. If any subrepo
# returns that parseneeded is false and we have parsed file in
# the mainrepo, then we're done. This is mainly to avoid the
# log message below (to be in line with expected repo
# behaviour of not logging anything at severity INFO if no real
# work was done), it does not noticeably affect performance
# returns that .store.needed(...., "parse") is false and we
# have parsed file in the mainrepo, then we're done. This is
# mainly to avoid the log message below (to be in line with
# expected repo behaviour of not logging anything at severity
# INFO if no real work was done), it does not noticeably affect
# performance
force = (self.config.force is True or
self.config.parseforce is True)
if not force:
for c in self.subrepos:
inst = self.get_instance(c)
needed = inst.parseneeded(basefile)
needed = inst.store.needed(basefile, "parse")
if not needed and os.path.exists(self.store.parsed_path(basefile)):
self.log.debug("%s: Skipped" % basefile)
return True # signals everything OK
Expand Down
38 changes: 35 additions & 3 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,18 @@
import sys
import time
import logging
try:
    from inspect import getfullargspec
except ImportError:
    # py2's inspect lacks getfullargspec. Alias getargspec to the same
    # name so later calls to getfullargspec() work on both versions --
    # importing it under its own name would leave getfullargspec unbound
    # on py2 and cause a NameError at call time. getargspec exposes the
    # same .args attribute, which is all this module uses.
    from inspect import getargspec as getfullargspec

from rdflib import Graph, URIRef
from rdflib.compare import graph_diff
from layeredconfig import LayeredConfig

from ferenda import util
from ferenda import DocumentEntry
from ferenda.documentstore import Needed
from ferenda.errors import DocumentRemovedError, ParseError
from ferenda.elements import serialize

Expand Down Expand Up @@ -82,7 +87,7 @@ def parseifneeded(f):
it should be re-generated."""
@functools.wraps(f)
def wrapper(self, basefile):
# note: We hardcode the use of .parseneeded() and the
# note: We hardcode the use of .store.needed(..., "parse") and the
# 'parseforce' config option, which means that this decorator
# can only be used sensibly with the .parse() function.
force = (self.config.force is True or
Expand All @@ -95,6 +100,33 @@ def wrapper(self, basefile):
return f(self, basefile)
return wrapper

def ifneeded(action):
    """Decorator factory: makes a repo method run only when needed.

    Returns a decorator for repo methods of the form
    ``method(self, basefile, ...)``.  Before calling the wrapped method,
    the wrapper asks ``self.store.needed(basefile, action)`` (or assumes
    neededness when ``self.config.force`` is set) whether there is any
    work to do.  If not, the call is skipped and True is returned to
    signal that everything is OK.  If the wrapped function accepts a
    ``needed`` parameter, the needed-object (which may carry per-subtask
    detail, e.g. for relate) is passed along.

    :param action: the action name to check neededness for, e.g.
                   "parse", "relate" or "generate"
    """
    def outer_wrapper(f):
        # NOTE: the original signature took a stray *args here; a
        # decorator is only ever called with the function itself, so
        # extra positionals were silently discarded.
        @functools.wraps(f)
        def inner_wrapper(self, basefile, *args, **kwargs):
            if self.config.force:
                needed = Needed(reason="force is True")
            else:
                needed = self.store.needed(basefile, action)
            if not needed:
                self.log.debug("%s: %s skipped" % (basefile, action))
                return True  # signals that everything is OK
            reason = ""
            if hasattr(needed, 'reason'):
                reason = " (%s)" % needed.reason
            # This msg belongs at DEBUG level, but while troubleshooting
            # why certain files are re-processed it is logged slightly
            # higher (INFO + 1, to get around compositerepo's
            # supress_subrepo_logging feature)
            # self.log.debug("%s: %s starting%s" % (basefile, action, reason))
            self.log.log(logging.INFO + 1,
                         "%s: %s starting%s" % (basefile, action, reason))
            # only pass needed through if the wrapped function wants it
            # and the caller hasn't supplied it explicitly
            if 'needed' in getfullargspec(f).args and 'needed' not in kwargs:
                kwargs['needed'] = needed
            return f(self, basefile, *args, **kwargs)
        return inner_wrapper
    return outer_wrapper

def render(f):
"""Handles the serialization of the :py:class:`~ferenda.Document`
Expand Down Expand Up @@ -277,11 +309,11 @@ def wrapper(self, basefile):

def managedparsing(f):
"""Use all standard decorators for parse() in the correct order
(:py:func:`~ferenda.decorators.parseifneeded`,
(:py:func:`~ferenda.decorators.ifneeded`, :py:func:`~ferenda.decorators.updateentry`,
:py:func:`~ferenda.decorators.makedocument`,
:py:func:`~ferenda.decorators.timed`,
:py:func:`~ferenda.decorators.render`)"""
return parseifneeded(
return ifneeded('parse')(
updateentry('parse')(
makedocument(
# handleerror( # is this really a good idea?
Expand Down
62 changes: 16 additions & 46 deletions ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
from ferenda.elements import (Body, Link,
UnorderedList, ListItem, Paragraph)
from ferenda.elements.html import elements_from_soup
from ferenda.documentstore import Relate
from ferenda.documentstore import RelateNeeded
# establish two central RDF Namespaces at the top level
DCTERMS = Namespace(util.ns['dcterms'])
PROV = Namespace(util.ns['prov'])
Expand Down Expand Up @@ -1118,20 +1118,9 @@ def parse_all_teardown(cls, config, *args, **kwargs):
this might change to a instance method.
"""

def parseneeded(self, basefile):
    """Return True iff there is a need to parse the given basefile.

    There is typically no reason to parse when the parsed result file
    already exists and is newer than the downloaded source file.
    """
    source = self.store.downloaded_path(basefile)
    result = self.store.parsed_path(basefile)
    return not util.outfile_is_newer([source], result)

@decorators.action
@decorators.managedparsing
def parse(self, doc):
def parse(self, doc, needed=True):
"""Parse downloaded documents into structured XML and RDF.
It will also save the same RDF statements in a separate
Expand Down Expand Up @@ -1820,8 +1809,9 @@ def relate_all_teardown(cls, config, *args, **kwargs):
return True

@decorators.action
@decorators.ifneeded('relate')
@decorators.updateentry('relate')
def relate(self, basefile, otherrepos=[]):
def relate(self, basefile, otherrepos=[], needed=RelateNeeded(True,True,True)):
"""Runs various indexing operations for the document.
This includes inserting RDF statements into a triple store,
Expand All @@ -1835,13 +1825,6 @@ def relate(self, basefile, otherrepos=[]):
(basefile, self.alias))
return False
entry = DocumentEntry(self.store.documententry_path(basefile))
if self.config.force:
relate = Relate(True, True, True)
else:
relate = self.store.needed(basefile, "relate")
if not(relate.triples or relate.dependencies or relate.fulltext):
self.log.debug("%s: skipped relate" % basefile)
return
timings = {'basefile': basefile,
'e_triples': 0,
'e_deps': 0,
Expand All @@ -1862,18 +1845,18 @@ def relate(self, basefile, otherrepos=[]):
if self not in otherrepos:
otherrepos.append(self)

if self.config.fulltextindex and relate.fulltext:
if self.config.fulltextindex and needed.fulltext:
start = time.time()
timings['v_fulltext'] = self.relate_fulltext(basefile, otherrepos)
timings['e_fulltext'] = time.time() - start
entry.indexed_ft = datetime.now()

if relate.dependencies:
if needed.dependencies:
start = time.time()
timings['v_deps'] = self.relate_dependencies(basefile, otherrepos)
timings['e_deps'] = time.time() - start
entry.indexed_dep = datetime.now()
if relate.triples:
if needed.triples:
# If using the Bulk upload feature, append to the
# temporary file that is to be bulk uploaded (see
# relate_all_setup).
Expand Down Expand Up @@ -2372,8 +2355,9 @@ def generate_all_teardown(cls, config, *args, **kwargs):
"""

@decorators.action
@decorators.ifneeded('generate')
@decorators.updateentry('generate')
def generate(self, basefile, otherrepos=[]):
def generate(self, basefile, otherrepos=[], needed=True):
"""Generate a browser-ready HTML file from structured XML and RDF.
Uses the XML and RDF files constructed by
Expand All @@ -2392,37 +2376,23 @@ def generate(self, basefile, otherrepos=[]):
:type basefile: str
:returns: None
"""
# This dependency management could be abstracted away like
# the parseifneeded decorator does for parse(). But unlike
# parse(), no one is expected to override generate(), so
# the proper place to handle this complexity is probably
# here.
infile = self.store.parsed_path(basefile)

annotations = self.store.annotation_path(basefile)
if os.path.exists(self.store.dependencies_path(basefile)):
deptxt = util.readfile(self.store.dependencies_path(basefile))
dependencies = deptxt.strip().split("\n")
else:
dependencies = []
dependencies.extend((infile, annotations))

outfile = self.store.generated_path(basefile)
if ((not self.config.force) and
util.outfile_is_newer(dependencies, outfile)):
self.log.debug("%s: Skipped", basefile)
return

with util.logtime(self.log.info, "%(basefile)s: generate OK (%(elapsed).3f sec)",
{'basefile': basefile}):

self.log.debug("%s: Starting", basefile)

# All bookkeping done, now lets prepare and transform!

infile = self.store.parsed_path(basefile)
outfile = self.store.generated_path(basefile)
# The annotationfile might be newer than all dependencies
# (and thus not need regenerateion) even though the
# outfile is older.
if os.path.exists(self.store.dependencies_path(basefile)):
deptxt = util.readfile(self.store.dependencies_path(basefile))
dependencies = deptxt.strip().split("\n")
else:
dependencies = []
if (self.config.force or (not
util.outfile_is_newer(dependencies, self.store.annotation_path(basefile)))):
with util.logtime(self.log.debug,
Expand Down
64 changes: 52 additions & 12 deletions ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,25 @@ def closed(self):
def name(self):
return self.fp.name

Relate = namedtuple('Relate', ['fulltext', 'dependencies', 'triples'])
class Needed(int):
    """An always-truthy marker that some action needs to be performed.

    Carries a *reason* attribute (a human-readable explanation of why
    the action is needed), and subclasses int so instances can stand in
    wherever a plain truth value was previously returned.
    """

    def __new__(cls, *args, **kwargs):
        instance = int.__new__(cls, *args)
        instance.reason = kwargs['reason']
        return instance

    def __bool__(self):
        # The mere existence of a Needed object means "yes, needed",
        # regardless of its integer value.
        return True

# Per-subtask neededness for relate(): each element is a plain bool or a
# Needed object, telling whether that particular sub-step must run.
RelateNeeded = namedtuple('RelateNeeded', ['fulltext', 'dependencies', 'triples'])

# make this namedtuple class work in a bool context: False iff all
# elements are falsy
RelateNeeded.__bool__ = lambda self: any(self)

# for reason, return the first truthy element's reason.  Returns None
# when no element is truthy, or when the first truthy element is a plain
# bool without a .reason attribute (previously these cases raised
# StopIteration resp. AttributeError -- the old FIXME).
RelateNeeded.reason = property(
    lambda self: next((getattr(x, 'reason', None) for x in self if x), None))



class DocumentStore(object):
"""Unifies handling of reading and writing of various data files
Expand Down Expand Up @@ -351,25 +365,42 @@ def open(self, basefile, maindir, suffix, mode="r",


def needed(self, basefile, action):
"""Determine if we really need to perform *action* for the given
*basefile*, or if the result of the action (in the form of the file
that the action creates, or similar) is newer than all of the actions
dependencies (in the form of source files for the action).
"""

# if this function is even called, it means that force is not
# true (or ferenda-build.py has not been called with a single
# basefile, which is an implied force)
if action == "parse":
infile = self.downloaded_path(basefile)
outfile = self.parsed_path(basefile)
return not util.outfile_is_newer([infile], outfile)
newer = util.outfile_is_newer([infile], outfile)
if not newer:
return Needed(reason=getattr(newer, 'reason', None))
else:
return False
elif action == "relate":
entry = DocumentEntry(self.documententry_path(basefile))
def newer(filename, dt):
def newer(filename, dt, field):
if not os.path.exists(filename):
return False
elif not dt: # has never been indexed
return True
return Needed(reason="%s has not been processed" % filename)
else:
return datetime.fromtimestamp(os.stat(filename).st_mtime) > dt
return Relate(fulltext=newer(self.parsed_path(basefile), entry.indexed_ft),
triples=newer(self.distilled_path(basefile), entry.indexed_ts),
dependencies=newer(self.distilled_path(basefile), entry.indexed_dep))
if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
else:
return False

return RelateNeeded(
fulltext=newer(self.parsed_path(basefile), entry.indexed_ft, 'indexed_ft'),
triples=newer(self.distilled_path(basefile), entry.indexed_ts, 'indexed_ts'),
dependencies=newer(self.dependencies_path(basefile), entry.indexed_dep,
'indexed_dep'))
elif action == "generate":
infile = self.parsed_path(basefile)
annotations = self.annotation_path(basefile)
Expand All @@ -380,7 +411,16 @@ def newer(filename, dt):
dependencies = []
dependencies.extend((infile, annotations))
outfile = self.generated_path(basefile)
return not util.outfile_is_newer(dependencies, outfile)
# support generated 404 files (when served through HTTP,
# served with HTTP status 404, but otherwise works just as
# regular generated files)
if not os.path.exists(outfile) and os.path.exists(outfile + ".404"):
outfile += ".404"
newer = util.outfile_is_newer(dependencies, outfile)
if not newer:
return Needed(reason = getattr(newer, 'reason', None))
else:
return False
else:
# custom actions will need to override needed and provide logic there
return True
Expand Down
2 changes: 1 addition & 1 deletion ferenda/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ def _run_class(enabled, argv, config):
# if we don't need to parse all basefiles, let's not
# even send jobs out to buildclients if we can avoid
# it
iterable = (x for x in iterable if inst.parseneeded(x))
iterable = (x for x in iterable if inst.store.needed(x, "parse"))
res = []
# semi-magic handling
kwargs['currentrepo'] = inst
Expand Down
27 changes: 24 additions & 3 deletions ferenda/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,18 +387,39 @@ def copy_if_different(src, dest):

# util.File

class OutfileIsNotNewer(int):
    """An always-falsy result for outfile_is_newer, carrying a *reason*.

    Evaluates to False in a bool context (the outfile is stale or
    missing) while still exposing a human-readable explanation via its
    *reason* attribute.
    """

    def __new__(cls, *args, **kwargs):
        instance = int.__new__(cls, *args)
        instance.reason = kwargs['reason']
        return instance

    def __bool__(self):
        # The mere existence of this object means "not newer".
        return False


def outfile_is_newer(infiles, outfile):
    """Check if *outfile* is newer than all of the files in *infiles*.

    Newer is defined as having a more recent modification time.  Returns
    True if so, a falsy value otherwise (including if outfile doesn't
    exist).

    If outfile isn't newer, the returned value evaluates to False in a
    bool context but also carries a *reason* attribute: a text
    description of which infile was newer than outfile (or that outfile
    is missing).
    """
    if not os.path.exists(outfile):
        return OutfileIsNotNewer(reason="outfile doesn't exist: %s" % outfile)
    outfile_mtime = os.stat(outfile).st_mtime
    for f in infiles:
        # an infile that doesn't exist can't make outfile stale; skip it
        if os.path.exists(f) and os.stat(f).st_mtime > outfile_mtime:
            return OutfileIsNotNewer(
                reason="%s is newer than outfile %s" % (f, outfile))
    return True

Expand Down
Loading

0 comments on commit feb7368

Please sign in to comment.