From 9d7c1903447582de4316e9951e39f4bb3b8af445 Mon Sep 17 00:00:00 2001
From: Staffan Malmgren
Date: Sat, 18 Aug 2018 21:47:22 +0200
Subject: [PATCH] better handling of adjusting basefiles during parse

---
 ferenda/compositerepository.py                 | 17 +++++-
 ferenda/decorators.py                          | 36 ++++++++---
 ferenda/devel.py                               |  2 +-
 ferenda/documententry.py                       | 15 +++--
 ferenda/documentrepository.py                  |  5 +-
 ferenda/documentstore.py                       |  2 +-
 ferenda/errors.py                              |  8 +++
 .../sources/legal/se/swedishlegalsource.py     | 22 ++++---
 ferenda/util.py                                | 11 ++--
 lagen/nu/myndfskr.py                           |  2 +-
 test/testCompositeRepo.py                      | 61 ++++++++++++++++++-
 test/testDecorators.py                         |  1 -
 test/testDocRepo.py                            | 54 ++++++++++++++--
 13 files changed, 197 insertions(+), 39 deletions(-)

diff --git a/ferenda/compositerepository.py b/ferenda/compositerepository.py
index 09e9afa7..69045350 100644
--- a/ferenda/compositerepository.py
+++ b/ferenda/compositerepository.py
@@ -221,15 +221,28 @@ def parse(self, basefile):
                     break
         if ret:
+            oldbasefile = basefile
             if ret is not True and ret != basefile:
-                # this is a signal that parse discovered
-                # that the basefile was wrong
+                # this is a signal that parse discovered that the
+                # basefile was adjusted. We should raise
+                # DocumentRenamedError at the very end to get
+                # updateentry to do the right thing.
                 basefile = ret
+                # Also, touch the old parsed path so we don't
+                # regenerate.
+                with self.store.open_parsed(oldbasefile, "w"):
+                    pass
+
             self.copy_parsed(basefile, inst)
             self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
                           {'basefile': basefile,
                            'elapsed': time.time() - start})
+
+            if basefile != oldbasefile:
+                msg = "%s: In subrepo %s basefile turned out to really be %s" % (
+                    oldbasefile, inst.qualified_class_name(), basefile)
+                raise errors.DocumentRenamedError(True, msg, oldbasefile, basefile)
             return ret
         else:
            # subrepos should only contain those repos that actually
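
The hunk above relies on a small convention that the rest of this patch builds on: a subrepo's parse() returns True when the basefile was correct, or the corrected basefile string when it was not, and the composite repository then turns the rename into a DocumentRenamedError so that the entry-tracking machinery (changed further down) records the result under the new name. The following is an illustrative sketch only, not part of the patch; check_subrepo_result and subrepo_name are made-up names, and the exception arguments simply mirror the raise in the hunk above.

    from ferenda.errors import DocumentRenamedError

    def check_subrepo_result(ret, basefile, subrepo_name):
        """Return the effective basefile, raising if parse() renamed it."""
        if ret is True or ret == basefile:
            return basefile
        # the subrepo discovered a better basefile; signal the rename so the
        # caller can record the document entry under the new name
        msg = "%s: In subrepo %s basefile turned out to really be %s" % (
            basefile, subrepo_name, ret)
        raise DocumentRenamedError(True, msg, basefile, ret)
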
diff --git a/ferenda/decorators.py b/ferenda/decorators.py
index 716c0139..5915783a 100644
--- a/ferenda/decorators.py
+++ b/ferenda/decorators.py
@@ -34,7 +34,7 @@
 from ferenda import util
 from ferenda import DocumentEntry
 from ferenda.documentstore import Needed
-from ferenda.errors import DocumentRemovedError, ParseError
+from ferenda.errors import DocumentRemovedError, ParseError, DocumentRenamedError
 from ferenda.elements import serialize


@@ -161,7 +161,24 @@ def cssuri(baseuri, filename):
     @functools.wraps(f)
     def wrapper(self, doc):
         # call the actual function that creates the doc data
+        oldbasefile = doc.basefile
         ret = f(self, doc)
+        if doc.basefile != oldbasefile:
+            # means that basefile was adjusted. Touch the old parsed
+            # path first so we don't regenerate.
+            with self.store.open_parsed(oldbasefile, "w"):
+                pass
+            # move any intermediate files (in particular extracted
+            # image backgrounds from PDF files) that might be
+            # needed later.
+            old_intermediate = self.store.intermediate_path(oldbasefile)
+            new_intermediate = self.store.intermediate_path(doc.basefile)
+            if self.store.storage_policy == "dir":
+                old_intermediate = os.path.dirname(old_intermediate)
+                new_intermediate = os.path.dirname(new_intermediate)
+            if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
+                util.ensure_dir(new_intermediate)
+                os.rename(old_intermediate, new_intermediate)
         # now render thath doc data as files (JSON, XHTML, RDF/XML)
         if self.config.serializejson == True:
             with self.store.open_serialized(doc.basefile, "wb") as fp:
@@ -303,7 +320,12 @@ def makedocument(f):
     @functools.wraps(f)
     def wrapper(self, basefile):
         doc = self.make_document(basefile)
-        return f(self, doc)
+        ret = f(self, doc)
+        if doc.basefile != basefile:
+            raise DocumentRenamedError(
+                "%s: Basefile turned out to really be %s" % (basefile, doc.basefile),
+                returnvalue=doc.basefile, oldbasefile=basefile, newbasefile=doc.basefile)
+        return ret
     return wrapper


@@ -360,14 +382,14 @@ def updateentry(section):
     def outer_wrapper(f, *args):
         @functools.wraps(f)
         def inner_wrapper(self, *args, **kwargs):
+            # try to find out if we have a basefile
             if args and args[0]:
-                # try to find out if we have a basefile
-                basefile = args[0]
+                entrypath_arg = args[0]
             else:
-                basefile = ".root"
                 args = ()
-            entrypath = self.store.documententry_path(basefile)
+                entrypath_arg = ".root"
+            entrypath = self.store.documententry_path
             args = [self] + list(args)
-            return DocumentEntry.updateentry(f, section, entrypath, *args, **kwargs)
+            return DocumentEntry.updateentry(f, section, entrypath, entrypath_arg, *args, **kwargs)
         return inner_wrapper
     return outer_wrapper
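
The bulk of the render-wrapper addition above deals with moving already-extracted intermediate files (for example PDF image backgrounds) over to the adjusted basefile. Below is a stand-alone sketch of that move, assuming a ferenda-style store object with an intermediate_path() method and a storage_policy attribute; the helper name move_intermediate is made up.

    import os

    from ferenda import util

    def move_intermediate(store, oldbasefile, newbasefile):
        old = store.intermediate_path(oldbasefile)
        new = store.intermediate_path(newbasefile)
        if store.storage_policy == "dir":
            # with a "dir" storage policy each basefile has its own
            # directory, so move the whole directory rather than one file
            old = os.path.dirname(old)
            new = os.path.dirname(new)
        if os.path.exists(old) and not os.path.exists(new):
            util.ensure_dir(new)
            os.rename(old, new)
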
""" def clear(key, d): @@ -292,6 +293,10 @@ def clear(key, d): except DocumentRemovedError as e: success = "removed" raise + except DocumentRenamedError as e: + entrypath_arg = e.newbasefile + success = True + return e.returnvalue except Exception as e: success = False errortype, errorval, errortb = sys.exc_info() @@ -305,7 +310,7 @@ def clear(key, d): rootlog.removeHandler(handler) if success is not None: warnings = logstream.getvalue() - entry = DocumentEntry(entrypath) + entry = DocumentEntry(entrypath(entrypath_arg)) if section not in entry.status: entry.status[section] = {} entry.status[section]['success'] = success diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 8dc08fbb..dcda9ca9 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -665,7 +665,7 @@ def dataset_uri(self, param=None, value=None, feed=False): >>> d = DocumentRepository() >>> d.alias 'base' - >>> d.config.url = "http://exampole.org/" + >>> d.config.url = "http://example.org/" >>> d.dataset_uri() 'http://example.org/dataset/base' >>> d.dataset_uri("title","a") @@ -798,7 +798,8 @@ def download(self, basefile=None, reporter=None): try: ret = DocumentEntry.updateentry(self.download_single, 'download', - self.store.documententry_path(basefile), + self.store.documententry_path, + basefile, basefile, link) except requests.exceptions.HTTPError as e: diff --git a/ferenda/documentstore.py b/ferenda/documentstore.py index ffbf13a6..1fb66aa5 100644 --- a/ferenda/documentstore.py +++ b/ferenda/documentstore.py @@ -376,7 +376,7 @@ def newer(filename, dt, field): if not os.path.exists(filename): return False elif not dt: # has never been indexed - return Needed(reason="%s has not been processed" % filename) + return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path)) else: if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt: return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path)) diff --git a/ferenda/errors.py b/ferenda/errors.py index 3cdb94fe..a77f961d 100644 --- a/ferenda/errors.py +++ b/ferenda/errors.py @@ -65,6 +65,14 @@ class DocumentSkippedError(DocumentRemovedError): exist) since it's not interesting.""" +class DocumentRenamedError(FerendaException): + def __init__(self, value, returnvalue, oldbasefile, newbasefile): + super(DocumentRenamedError, self).__init__(value) + self.returnvalue = returnvalue + self.oldbasefile = oldbasefile + self.newbasefile = newbasefile + + class PatchError(ParseError): """Raised if a patch cannot be applied by :py:meth:`~ferenda.DocumentRepository.patch_if_needed`.""" diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 1502b809..d27d2fd2 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -19,6 +19,7 @@ import operator import os import re +import shutil import sys import unicodedata @@ -586,7 +587,12 @@ def parse(self, doc): resource = self.parse_metadata(fp, doc.basefile) doc.meta = resource.graph doc.uri = str(resource.identifier) - self._adjust_basefile(doc, orig_uri) + # _adjust_basefile returns true iff basefile was adjusted. 
diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py
index 8dc08fbb..dcda9ca9 100644
--- a/ferenda/documentrepository.py
+++ b/ferenda/documentrepository.py
@@ -665,7 +665,7 @@ def dataset_uri(self, param=None, value=None, feed=False):
         >>> d = DocumentRepository()
         >>> d.alias
         'base'
-        >>> d.config.url = "http://exampole.org/"
+        >>> d.config.url = "http://example.org/"
         >>> d.dataset_uri()
         'http://example.org/dataset/base'
         >>> d.dataset_uri("title","a")
@@ -798,7 +798,8 @@ def download(self, basefile=None, reporter=None):
             try:
                 ret = DocumentEntry.updateentry(self.download_single,
                                                 'download',
-                                                self.store.documententry_path(basefile),
+                                                self.store.documententry_path,
+                                                basefile,
                                                 basefile,
                                                 link)
             except requests.exceptions.HTTPError as e:
diff --git a/ferenda/documentstore.py b/ferenda/documentstore.py
index ffbf13a6..1fb66aa5 100644
--- a/ferenda/documentstore.py
+++ b/ferenda/documentstore.py
@@ -376,7 +376,7 @@ def newer(filename, dt, field):
             if not os.path.exists(filename):
                 return False
             elif not dt:  # has never been indexed
-                return Needed(reason="%s has not been processed" % filename)
+                return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path))
             else:
                 if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
                     return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
diff --git a/ferenda/errors.py b/ferenda/errors.py
index 3cdb94fe..a77f961d 100644
--- a/ferenda/errors.py
+++ b/ferenda/errors.py
@@ -65,6 +65,14 @@ class DocumentSkippedError(DocumentRemovedError):
     exist) since it's not interesting."""


+class DocumentRenamedError(FerendaException):
+    def __init__(self, value, returnvalue, oldbasefile, newbasefile):
+        super(DocumentRenamedError, self).__init__(value)
+        self.returnvalue = returnvalue
+        self.oldbasefile = oldbasefile
+        self.newbasefile = newbasefile
+
+
 class PatchError(ParseError):
     """Raised if a patch cannot be applied by
     :py:meth:`~ferenda.DocumentRepository.patch_if_needed`."""
diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py
index 1502b809..d27d2fd2 100644
--- a/ferenda/sources/legal/se/swedishlegalsource.py
+++ b/ferenda/sources/legal/se/swedishlegalsource.py
@@ -19,6 +19,7 @@
 import operator
 import os
 import re
+import shutil
 import sys
 import unicodedata

@@ -586,7 +587,12 @@ def parse(self, doc):
             resource = self.parse_metadata(fp, doc.basefile)
         doc.meta = resource.graph
         doc.uri = str(resource.identifier)
-        self._adjust_basefile(doc, orig_uri)
+        # _adjust_basefile returns True iff basefile was adjusted. We
+        # return the new adjusted basefile to the caller
+        if self._adjust_basefile(doc, orig_uri):
+            ret = doc.basefile
+        else:
+            ret = True
         if resource.value(DCTERMS.title):
             doc.lang = resource.value(DCTERMS.title).language
         if options == "metadataonly":
@@ -601,7 +607,7 @@ def parse(self, doc):
             self.postprocess_doc(doc)
             self.parse_entry_update(doc)
             # print(doc.meta.serialize(format="turtle").decode("utf-8"))
-        return True
+        return ret

     def _adjust_basefile(self, doc, orig_uri):
         # In some cases, we might discover during parse that we've
@@ -613,13 +619,15 @@ def _adjust_basefile(self, doc, orig_uri):
         # if generate() hasn't run yet.
         if orig_uri != doc.uri:
             newbasefile = self.basefile_from_uri(doc.uri)
+            oldbasefile = doc.basefile
             if newbasefile:
-                # change the basefile we're dealing with. Touch
-                # self.store.parsed_path(basefile) first so we don't
-                # regenerate.
-                with self.store.open_parsed(doc.basefile, "w"):
-                    pass
+                # Decorators.render should deal with writing results
+                # to the correct locations, moving intermediate files
+                # etc, as long as doc.basefile is changed
                 doc.basefile = newbasefile
+                return True  # basefile was adjusted
+            else:
+                return False  # basefile was not adjusted

     def parse_open(self, basefile, attachment=None):
         """Open the main downloaded file for the given basefile, caching the
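
The swedishlegalsource change is the producing side of the rename mechanism: when the URI minted from the parsed metadata maps back to a different basefile, doc.basefile is simply reassigned and the adjusted name is returned instead of True, while the decorators take care of moving files. A condensed, illustrative sketch of that flow (MyRepo is a made-up repository; basefile_from_uri is the existing DocumentRepository method):

    from ferenda import DocumentRepository

    class MyRepo(DocumentRepository):

        def adjust_basefile(self, doc, orig_uri):
            # condensed version of SwedishLegalSource._adjust_basefile above
            if orig_uri != doc.uri:
                newbasefile = self.basefile_from_uri(doc.uri)
                if newbasefile:
                    doc.basefile = newbasefile  # decorators move the files
                    return True
            return False
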
diff --git a/ferenda/util.py b/ferenda/util.py
index 13ed4181..de51fd0e 100755
--- a/ferenda/util.py
+++ b/ferenda/util.py
@@ -864,15 +864,16 @@ def robust_fetch(method, url, logger, attempts=5, sleep=1, raise_for_status=True
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.Timeout,
                 socket.timeout) as e:
-                logger.warning(
-                    "Failed to fetch %s: err %s (%s remaining attempts)" %
-                    (url, e, attempts))
-                lastexception = e
+            logger.warning(
+                "Failed to fetch %s: err %s (%s remaining attempts)" %
+                (url, e, attempts))
+            lastexception = e
+            time.sleep(sleep)
         if response and response.status_code >= 400 and response.status_code != 404:
             fetched = False
             # let's retry even for 400 or 500 class errors, maybe it'll go better in a second
             logger.warning("Failed to fetch %s: status %s (%s remaining attempts)" %
                            (url, response.status_code, attempts))
+            time.sleep(sleep)
         attempts -= 1
-        time.sleep(sleep)
         if not fetched:
             logger.error("Failed to fetch %s, giving up" % url)
         if lastexception and raise_for_status:
diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py
index 771d573f..65788988 100644
--- a/lagen/nu/myndfskr.py
+++ b/lagen/nu/myndfskr.py
@@ -221,7 +221,7 @@ def parse(self, basefile):
                 self.copy_parsed(subbasefile, inst)
                 break
         else:
-                return super(MyndFskr, self).parse(basefile)
+            return super(MyndFskr, self).parse(basefile)

     def facets(self):
         # maybe if each entry in the list could be a tuple or a single
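
Apart from the rename handling, the util.py hunk above also changes robust_fetch so that the backoff sleep happens in the failure branches rather than unconditionally at the end of every loop iteration, i.e. a successful fetch no longer pauses. A simplified sketch of that retry shape (fetch_once is a hypothetical callable returning a (success, result) pair):

    import time

    def retry(fetch_once, attempts=5, sleep=1):
        while attempts > 0:
            success, result = fetch_once()
            if success:
                return result
            time.sleep(sleep)  # back off only after a failed attempt
            attempts -= 1
        return None
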
diff --git a/test/testCompositeRepo.py b/test/testCompositeRepo.py
index 140b5413..6304b5e5 100644
--- a/test/testCompositeRepo.py
+++ b/test/testCompositeRepo.py
@@ -3,9 +3,15 @@
     print_function, unicode_literals)
 from builtins import *

+import os
+
+import rdflib
+from rdflib.namespace import DCTERMS
+
 from ferenda import DocumentRepository, util, errors
 from ferenda.testutil import RepoTester
-from ferenda.decorators import updateentry
+from ferenda.decorators import updateentry, managedparsing
+from ferenda.elements.html import Body, H1

 # SUT
 from ferenda import CompositeRepository
@@ -173,3 +179,56 @@ def test_config(self):
     def test_super(self):
         got = self.repo.qualified_class_name()
         self.assertEqual("Q:testCompositeRepo.SubrepoBSubclass", got)
+
+
+class RenamingRepo(DocumentRepository):
+    alias = "renaming"
+
+    @managedparsing
+    def parse(self, doc):
+        # create an intermediate file before we know the correct
+        # path for it. Later steps should move this file to the
+        # correct place.
+        util.writefile(self.store.intermediate_path(doc.basefile), "dummy")
+        doc.meta.add((rdflib.URIRef(doc.uri), DCTERMS.title,
+                      rdflib.Literal("Hello World", lang="en")))
+        doc.body = Body([H1(["Hello world"])])
+        doc.basefile = doc.basefile.replace("a/", "b/")
+        return True
+
+class BasefileRename(RepoTester):
+
+    class RenamingCompositeRepo(CompositeRepository):
+        alias = "composite"
+        subrepos = RenamingRepo,
+
+    repoclass = RenamingCompositeRepo
+
+    def test_rename(self):
+        self.repo.store.basefiles[RenamingRepo].add("a/1")
+        ret = self.repo.parse("a/1")
+        exists = os.path.exists
+        store = self.repo.get_instance(RenamingRepo).store
+        compositestore = self.repo.store
+        self.assertTrue(exists(store.parsed_path("b/1")))
+        self.assertTrue(exists(store.distilled_path("b/1")))
+        self.assertTrue(exists(store.documententry_path("b/1")))
+        self.assertTrue(exists(store.intermediate_path("b/1")))
+        # to make @ifneeded report that a re-parse is not needed
+        self.assertTrue(exists(store.parsed_path("a/1")))
+        self.assertEqual(0, os.path.getsize(store.parsed_path("a/1")))
+        # make sure only b/1 files exists at distilled/intermediate/docentry
+        self.assertFalse(exists(store.distilled_path("a/1")))
+        self.assertFalse(exists(store.documententry_path("a/1")))
+        self.assertFalse(exists(store.intermediate_path("a/1")))
+        # make sure the composite docentry is correctly placed and pointing
+        self.assertFalse(exists(compositestore.documententry_path("a/1")))
+        self.assertTrue(exists(compositestore.documententry_path("b/1")))
+        self.assertTrue(os.path.islink(compositestore.documententry_path("b/1")))
+        link = os.path.normpath(os.path.join(
+            os.path.dirname(compositestore.documententry_path("b/1")),
+            os.readlink(compositestore.documententry_path("b/1"))))
+        self.assertEqual(store.documententry_path("b/1"), link)
+
+        # make sure a reparse works
+        self.assertTrue(self.repo.parse("a/1"))
diff --git a/test/testDecorators.py b/test/testDecorators.py
index 09049ec1..90205937 100644
--- a/test/testDecorators.py
+++ b/test/testDecorators.py
@@ -335,7 +335,6 @@ def test_makedocument(self):
         @makedocument
         def testfunc(repo, doc):
             return doc
-
         doc = testfunc(DocumentRepository(), "base/file")
         self.assertIsInstance(doc, Document)
         self.assertEqual(doc.basefile, "base/file")
diff --git a/test/testDocRepo.py b/test/testDocRepo.py
index dcc815a4..5d4d0b4d 100644
--- a/test/testDocRepo.py
+++ b/test/testDocRepo.py
@@ -18,18 +18,21 @@
 from layeredconfig import LayeredConfig, Defaults, INIFile
 import lxml.etree as etree
 import rdflib
+from rdflib.namespace import DCTERMS
 import requests.exceptions

 from ferenda.compat import Mock, patch, call, unittest
 from ferenda import DocumentEntry, Describer, Facet, Transformer
 from ferenda.fulltextindex import WhooshIndex
+from ferenda.elements.html import Body, H1
+from ferenda.decorators import managedparsing
 from ferenda.errors import *

 # The main system under test (SUT)
 from test.quiet import silence
 from ferenda import DocumentRepository
-from ferenda.testutil import RepoTester
+from ferenda.testutil import RepoTester, parametrize

 # helper classes
 from examplerepos import DocRepo1, DocRepo2, DocRepo3
@@ -2145,13 +2148,52 @@ def test_inexact_patch(self):
         result, desc = self.repo.patch_if_needed("123/a", self.sourcedoc2)
         self.assertEqual(self.targetdoc3, result)
+class BasefileRename(RepoTester):
+    # If parse() changes doc.basefile during parse (in case the parse process
+    # discovers that download() got the basefile wrong) the various decorators in
+    # @managedparsing should respect this new basefile (creating parsed/distilled/documententry files under that new basefile).
+    class RenamingRepo(DocumentRepository):
+        @managedparsing
+        def parse(self, doc):
+            # create an intermediate file before we know the correct
+            # path for it. Later steps should move this file to the
+            # correct place.
+            util.writefile(self.store.intermediate_path(doc.basefile), "dummy")
+            doc.meta.add((rdflib.URIRef(doc.uri), DCTERMS.title,
+                          rdflib.Literal("Hello World", lang="en")))
+            doc.body = Body([H1(["Hello world"])])
+            doc.basefile = doc.basefile.replace("a/", "b/")
+            return True
+
+    repoclass = RenamingRepo
+
+    def test_rename(self):
+        ret = self.repo.parse("a/1")
+        exists = os.path.exists
+        store = self.repo.store
+        self.assertTrue(ret)
+        self.assertTrue(exists(store.parsed_path("b/1")))
+        self.assertTrue(exists(store.distilled_path("b/1")))
+        self.assertTrue(exists(store.documententry_path("b/1")))
+        self.assertTrue(exists(store.intermediate_path("b/1")))
+        # to make @ifneeded report that a re-parse is not needed
+        self.assertTrue(exists(store.parsed_path("a/1")))
+        self.assertEqual(0, os.path.getsize(store.parsed_path("a/1")))
+        # make sure only b/1 files exists at distilled/intermediate/docentry
+        self.assertFalse(exists(store.distilled_path("a/1")))
+        self.assertFalse(exists(store.documententry_path("a/1")))
+        self.assertFalse(exists(store.intermediate_path("a/1")))
+
+        # make sure a reparse works
+        self.assertTrue(self.repo.parse("a/1"))
+
 #
 # Add doctests in the module
-# from ferenda import documentrepository
-# from ferenda.testutil import Py23DocChecker
-# def load_tests(loader,tests,ignore):
-#     tests.addTests(doctest.DocTestSuite(documentrepository, checker=Py23DocChecker()))
-#     return tests
+from ferenda import documentrepository
+from ferenda.testutil import Py23DocChecker
+def load_tests(loader,tests,ignore):
+    tests.addTests(doctest.DocTestSuite(documentrepository, checker=Py23DocChecker()))
+    return tests
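
Taken together, the tests above describe the behaviour a downstream repository can now rely on: parse() may reassign doc.basefile mid-parse, and the @managedparsing decorators will write parsed/distilled/documententry files under the new name. A minimal repository mirroring the RenamingRepo fixture above; ExampleRepo and the "wrong/" to "right/" mapping are assumed, not part of ferenda.

    import rdflib
    from rdflib.namespace import DCTERMS

    from ferenda import DocumentRepository
    from ferenda.decorators import managedparsing
    from ferenda.elements.html import Body, H1

    class ExampleRepo(DocumentRepository):
        alias = "example"

        @managedparsing
        def parse(self, doc):
            doc.meta.add((rdflib.URIRef(doc.uri), DCTERMS.title,
                          rdflib.Literal("Example document", lang="en")))
            doc.body = Body([H1(["Example document"])])
            if doc.basefile.startswith("wrong/"):
                # parse discovered the document's real identity
                doc.basefile = doc.basefile.replace("wrong/", "right/", 1)
            return True
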