Skip to content

Commit

Permalink
better handling of adjusting basefiles during parse
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 18, 2018
1 parent 33bd4bf commit 9d7c190
Show file tree
Hide file tree
Showing 13 changed files with 197 additions and 39 deletions.
17 changes: 15 additions & 2 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,28 @@ def parse(self, basefile):
break

if ret:
oldbasefile = basefile
if ret is not True and ret != basefile:
# this is a signal that parse discovered
# that the basefile was wrong
# this is a signal that parse discovered that the
# basefile was adjusted. We should raise
# DocumentRenamedError at the very end to get
# updateentry do the right thing.
basefile = ret
# Also, touch the old parsed path so we don't
# regenerate.
with self.store.open_parsed(oldbasefile, "w"):
pass


self.copy_parsed(basefile, inst)
self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
{'basefile': basefile,
'elapsed': time.time() - start})

if basefile != oldbasefile:
msg = "%s: In subrepo %s basefile turned out to really be %s" % (
oldbasefile, inst.qualified_class_name(), basefile)
raise errors.DocumentRenamedError(True, msg, oldbasefile, basefile)
return ret
else:
# subrepos should only contain those repos that actually
Expand Down
36 changes: 29 additions & 7 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from ferenda import util
from ferenda import DocumentEntry
from ferenda.documentstore import Needed
from ferenda.errors import DocumentRemovedError, ParseError
from ferenda.errors import DocumentRemovedError, ParseError, DocumentRenamedError
from ferenda.elements import serialize


Expand Down Expand Up @@ -161,7 +161,24 @@ def cssuri(baseuri, filename):
@functools.wraps(f)
def wrapper(self, doc):
# call the actual function that creates the doc data
oldbasefile = doc.basefile
ret = f(self, doc)
if doc.basefile != oldbasefile:
# means that basefile was adjusted. Touch the old parsed
# path first so we don't regenerate.
with self.store.open_parsed(oldbasefile, "w"):
pass
# move any intermediate files (in particular extracted
# image backgrounds from PDF files) that might be
# needed later.
old_intermediate = self.store.intermediate_path(oldbasefile)
new_intermediate = self.store.intermediate_path(doc.basefile)
if self.store.storage_policy == "dir":
old_intermediate = os.path.dirname(old_intermediate)
new_intermediate = os.path.dirname(new_intermediate)
if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
util.ensure_dir(new_intermediate)
os.rename(old_intermediate, new_intermediate)
# now render that doc data as files (JSON, XHTML, RDF/XML)
if self.config.serializejson == True:
with self.store.open_serialized(doc.basefile, "wb") as fp:
Expand Down Expand Up @@ -303,7 +320,12 @@ def makedocument(f):
@functools.wraps(f)
def wrapper(self, basefile):
doc = self.make_document(basefile)
return f(self, doc)
ret = f(self, doc)
if doc.basefile != basefile:
raise DocumentRenamedError(
"%s: Basefile turned out to really be %s" % (basefile, doc.basefile),
returnvalue=doc.basefile, oldbasefile=basefile, newbasefile=doc.basefile)
return ret
return wrapper


Expand Down Expand Up @@ -360,14 +382,14 @@ def updateentry(section):
def outer_wrapper(f, *args):
@functools.wraps(f)
def inner_wrapper(self, *args, **kwargs):
# try to find out if we have a basefile
if args and args[0]:
# try to find out if we have a basefile
basefile = args[0]
entrypath_arg = args[0]
else:
basefile = ".root"
args = ()
entrypath = self.store.documententry_path(basefile)
entrypath_arg = ".root"
entrypath = self.store.documententry_path
args = [self] + list(args)
return DocumentEntry.updateentry(f, section, entrypath, *args, **kwargs)
return DocumentEntry.updateentry(f, section, entrypath, entrypath_arg, *args, **kwargs)
return inner_wrapper
return outer_wrapper
2 changes: 1 addition & 1 deletion ferenda/devel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,7 @@ def statusreport(self, alias=None):
{"id": action,
"success": str(status["success"]),
"duration": str(status["duration"]),
"date": status["date"]})
"date": str(status["date"])})
if status["success"]:
successcnt += 1
else:
Expand Down
15 changes: 10 additions & 5 deletions ferenda/documententry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from rdflib.namespace import RDF

from ferenda import util
from ferenda.errors import DocumentRemovedError
from ferenda.errors import DocumentRemovedError, DocumentRenamedError

class DocumentEntry(object):

Expand Down Expand Up @@ -264,11 +264,12 @@ def guess_type(self, filename):
return "application/octet-stream"

@staticmethod
def updateentry(f, section, entrypath, *args, **kwargs):
def updateentry(f, section, entrypath, entrypath_arg, *args, **kwargs):
"""runs the provided function with the provided arguments, captures
any logged events emitted, catches any errors, and records the
result in the entry file under the provided section. The basefile
is assumed to be the first element in args.
result in the entry file under the provided section. Entrypath
should be a function that takes a basefile string and returns
the full path to the entry file for that basefile.
"""
def clear(key, d):
Expand All @@ -292,6 +293,10 @@ def clear(key, d):
except DocumentRemovedError as e:
success = "removed"
raise
except DocumentRenamedError as e:
entrypath_arg = e.newbasefile
success = True
return e.returnvalue
except Exception as e:
success = False
errortype, errorval, errortb = sys.exc_info()
Expand All @@ -305,7 +310,7 @@ def clear(key, d):
rootlog.removeHandler(handler)
if success is not None:
warnings = logstream.getvalue()
entry = DocumentEntry(entrypath)
entry = DocumentEntry(entrypath(entrypath_arg))
if section not in entry.status:
entry.status[section] = {}
entry.status[section]['success'] = success
Expand Down
5 changes: 3 additions & 2 deletions ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ def dataset_uri(self, param=None, value=None, feed=False):
>>> d = DocumentRepository()
>>> d.alias
'base'
>>> d.config.url = "http://exampole.org/"
>>> d.config.url = "http://example.org/"
>>> d.dataset_uri()
'http://example.org/dataset/base'
>>> d.dataset_uri("title","a")
Expand Down Expand Up @@ -798,7 +798,8 @@ def download(self, basefile=None, reporter=None):
try:
ret = DocumentEntry.updateentry(self.download_single,
'download',
self.store.documententry_path(basefile),
self.store.documententry_path,
basefile,
basefile,
link)
except requests.exceptions.HTTPError as e:
Expand Down
2 changes: 1 addition & 1 deletion ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def newer(filename, dt, field):
if not os.path.exists(filename):
return False
elif not dt: # has never been indexed
return Needed(reason="%s has not been processed" % filename)
return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path))
else:
if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
Expand Down
8 changes: 8 additions & 0 deletions ferenda/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ class DocumentSkippedError(DocumentRemovedError):
exist) since it's not interesting."""


class DocumentRenamedError(FerendaException):
    # Signals that, partway through processing, a document turned out to
    # be known under a different basefile than the one processing started
    # with. Handlers (cf. DocumentEntry.updateentry in this commit) read
    # .newbasefile to record the entry under the right path and return
    # .returnvalue as the wrapped function's result.
    def __init__(self, value, returnvalue, oldbasefile, newbasefile):
        """
        :param value: the human-readable exception message
        :param returnvalue: the value the interrupted function should be
                            treated as having returned
        :param oldbasefile: the basefile processing started with
        :param newbasefile: the basefile the document really belongs under
        """
        super(DocumentRenamedError, self).__init__(value)
        self.returnvalue = returnvalue
        self.oldbasefile = oldbasefile
        self.newbasefile = newbasefile


class PatchError(ParseError):
"""Raised if a patch cannot be applied by
:py:meth:`~ferenda.DocumentRepository.patch_if_needed`."""
Expand Down
22 changes: 15 additions & 7 deletions ferenda/sources/legal/se/swedishlegalsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import operator
import os
import re
import shutil
import sys
import unicodedata

Expand Down Expand Up @@ -586,7 +587,12 @@ def parse(self, doc):
resource = self.parse_metadata(fp, doc.basefile)
doc.meta = resource.graph
doc.uri = str(resource.identifier)
self._adjust_basefile(doc, orig_uri)
# _adjust_basefile returns true iff basefile was adjusted. We
# return the new adjusted basefile to the caller
if self._adjust_basefile(doc, orig_uri):
ret = doc.basefile
else:
ret = True
if resource.value(DCTERMS.title):
doc.lang = resource.value(DCTERMS.title).language
if options == "metadataonly":
Expand All @@ -601,7 +607,7 @@ def parse(self, doc):
self.postprocess_doc(doc)
self.parse_entry_update(doc)
# print(doc.meta.serialize(format="turtle").decode("utf-8"))
return True
return ret

def _adjust_basefile(self, doc, orig_uri):
# In some cases, we might discover during parse that we've
Expand All @@ -613,13 +619,15 @@ def _adjust_basefile(self, doc, orig_uri):
# if generate() hasn't run yet.
if orig_uri != doc.uri:
newbasefile = self.basefile_from_uri(doc.uri)
oldbasefile = doc.basefile
if newbasefile:
# change the basefile we're dealing with. Touch
# self.store.parsed_path(basefile) first so we don't
# regenerate.
with self.store.open_parsed(doc.basefile, "w"):
pass
# Decorators.render should deal with writing results
# to the correct locations, moving intermediate files
# etc, as long as doc.basefile is changed
doc.basefile = newbasefile
return True # basefile was adjusted
else:
return False # basefile was not adjusted

def parse_open(self, basefile, attachment=None):
"""Open the main downloaded file for the given basefile, caching the
Expand Down
11 changes: 6 additions & 5 deletions ferenda/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,15 +864,16 @@ def robust_fetch(method, url, logger, attempts=5, sleep=1, raise_for_status=True
except (requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
socket.timeout) as e:
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, attempts))
lastexception = e
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, attempts))
lastexception = e
time.sleep(sleep)
if response and response.status_code >= 400 and response.status_code != 404:
fetched = False # let's retry even for 400 or 500 class errors, maybe it'll go better in a second
logger.warning("Failed to fetch %s: status %s (%s remaining attempts)" % (url, response.status_code, attempts))
time.sleep(sleep)
attempts -= 1
time.sleep(sleep)
if not fetched:
logger.error("Failed to fetch %s, giving up" % url)
if lastexception and raise_for_status:
Expand Down
2 changes: 1 addition & 1 deletion lagen/nu/myndfskr.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def parse(self, basefile):
self.copy_parsed(subbasefile, inst)
break
else:
return super(MyndFskr, self).parse(basefile)
return super(MyndFskr, self).parse(basefile)

def facets(self):
# maybe if each entry in the list could be a tuple or a single
Expand Down
61 changes: 60 additions & 1 deletion test/testCompositeRepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@
print_function, unicode_literals)
from builtins import *

import os

import rdflib
from rdflib.namespace import DCTERMS

from ferenda import DocumentRepository, util, errors
from ferenda.testutil import RepoTester
from ferenda.decorators import updateentry
from ferenda.decorators import updateentry, managedparsing
from ferenda.elements.html import Body, H1
# SUT
from ferenda import CompositeRepository

Expand Down Expand Up @@ -173,3 +179,56 @@ def test_config(self):
def test_super(self):
got = self.repo.qualified_class_name()
self.assertEqual("Q:testCompositeRepo.SubrepoBSubclass", got)


class RenamingRepo(DocumentRepository):
    # Test fixture: a repo whose parse() discovers mid-parse that the
    # document's real basefile differs from the one it was invoked with
    # (every "a/..." basefile is renamed to "b/...").
    alias = "renaming"

    @managedparsing
    def parse(self, doc):
        # create an intermediate file before we know the correct
        # path for it. Later steps should move this file to the
        # correct place.
        util.writefile(self.store.intermediate_path(doc.basefile), "dummy")
        # minimal metadata and body so rendering has something to emit
        doc.meta.add((rdflib.URIRef(doc.uri), DCTERMS.title,
                      rdflib.Literal("Hello World", lang="en")))
        doc.body = Body([H1(["Hello world"])])
        # the rename itself: signalled to the framework simply by
        # mutating doc.basefile before returning
        doc.basefile = doc.basefile.replace("a/", "b/")
        return True

class BasefileRename(RepoTester):
    # End-to-end test of basefile renaming through a CompositeRepository:
    # parsing "a/1" must leave all real artifacts stored under "b/1",
    # with only a placeholder parsed file left under "a/1".

    class RenamingCompositeRepo(CompositeRepository):
        alias = "composite"
        subrepos = RenamingRepo,  # single-element tuple of subrepo classes

    repoclass = RenamingCompositeRepo

    def test_rename(self):
        self.repo.store.basefiles[RenamingRepo].add("a/1")
        ret = self.repo.parse("a/1")
        exists = os.path.exists
        store = self.repo.get_instance(RenamingRepo).store
        compositestore = self.repo.store
        # all result files should live under the new basefile
        self.assertTrue(exists(store.parsed_path("b/1")))
        self.assertTrue(exists(store.distilled_path("b/1")))
        self.assertTrue(exists(store.documententry_path("b/1")))
        self.assertTrue(exists(store.intermediate_path("b/1")))
        # to make @ifneeded report that a re-parse is not needed
        self.assertTrue(exists(store.parsed_path("a/1")))
        # ...but that placeholder is expected to be an empty touch-file
        self.assertEqual(0, os.path.getsize(store.parsed_path("a/1")))
        # make sure only b/1 files exists at distilled/intermediate/docentry
        self.assertFalse(exists(store.distilled_path("a/1")))
        self.assertFalse(exists(store.documententry_path("a/1")))
        self.assertFalse(exists(store.intermediate_path("a/1")))
        # make sure the composite docentry is correctly placed and pointing
        self.assertFalse(exists(compositestore.documententry_path("a/1")))
        self.assertTrue(exists(compositestore.documententry_path("b/1")))
        # the composite entry should be a symlink into the subrepo's store;
        # resolve it relative to its own directory and compare
        self.assertTrue(os.path.islink(compositestore.documententry_path("b/1")))
        link = os.path.normpath(os.path.join(
            os.path.dirname(compositestore.documententry_path("b/1")),
            os.readlink(compositestore.documententry_path("b/1"))))
        self.assertEqual(store.documententry_path("b/1"), link)

        # make sure a reparse works
        self.assertTrue(self.repo.parse("a/1"))
1 change: 0 additions & 1 deletion test/testDecorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,6 @@ def test_makedocument(self):
@makedocument
def testfunc(repo, doc):
return doc

doc = testfunc(DocumentRepository(), "base/file")
self.assertIsInstance(doc, Document)
self.assertEqual(doc.basefile, "base/file")
Expand Down
Loading

0 comments on commit 9d7c190

Please sign in to comment.