Skip to content

Commit

Permalink
better handling of adjusting basefiles during parse
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 18, 2018
1 parent 33bd4bf commit 9d7c190
Show file tree
Hide file tree
Showing 13 changed files with 197 additions and 39 deletions.
17 changes: 15 additions & 2 deletions ferenda/compositerepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,28 @@ def parse(self, basefile):
break

if ret:
oldbasefile = basefile
if ret is not True and ret != basefile:
# this is a signal that parse discovered
# that the basefile was wrong
# this is a signal that parse discovered that the
# basefile was adjusted. We should raise
# DocumentRenamedError at the very end to get
# updateentry do the right thing.
basefile = ret
# Also, touch the old parsed path so we don't
# regenerate.
with self.store.open_parsed(oldbasefile, "w"):
pass


self.copy_parsed(basefile, inst)
self.log.info("%(basefile)s parse OK (%(elapsed).3f sec)",
{'basefile': basefile,
'elapsed': time.time() - start})

if basefile != oldbasefile:
msg = "%s: In subrepo %s basefile turned out to really be %s" % (
oldbasefile, inst.qualified_class_name(), basefile)
raise errors.DocumentRenamedError(True, msg, oldbasefile, basefile)
return ret
else:
# subrepos should only contain those repos that actually
Expand Down
36 changes: 29 additions & 7 deletions ferenda/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from ferenda import util
from ferenda import DocumentEntry
from ferenda.documentstore import Needed
from ferenda.errors import DocumentRemovedError, ParseError
from ferenda.errors import DocumentRemovedError, ParseError, DocumentRenamedError
from ferenda.elements import serialize


Expand Down Expand Up @@ -161,7 +161,24 @@ def cssuri(baseuri, filename):
@functools.wraps(f)
def wrapper(self, doc):
# call the actual function that creates the doc data
oldbasefile = doc.basefile
ret = f(self, doc)
if doc.basefile != oldbasefile:
# means that basefile was adjusted. Touch the old parsed
# path first so we don't regenerate.
with self.store.open_parsed(oldbasefile, "w"):
pass
# move any intermediate files (in particular extracted
# image backgrounds from PDF files) that might be
# needed later.
old_intermediate = self.store.intermediate_path(oldbasefile)
new_intermediate = self.store.intermediate_path(doc.basefile)
if self.store.storage_policy == "dir":
old_intermediate = os.path.dirname(old_intermediate)
new_intermediate = os.path.dirname(new_intermediate)
if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
util.ensure_dir(new_intermediate)
os.rename(old_intermediate, new_intermediate)
# now render that doc data as files (JSON, XHTML, RDF/XML)
if self.config.serializejson == True:
with self.store.open_serialized(doc.basefile, "wb") as fp:
Expand Down Expand Up @@ -303,7 +320,12 @@ def makedocument(f):
@functools.wraps(f)
def wrapper(self, basefile):
doc = self.make_document(basefile)
return f(self, doc)
ret = f(self, doc)
if doc.basefile != basefile:
raise DocumentRenamedError(
"%s: Basefile turned out to really be %s" % (basefile, doc.basefile),
returnvalue=doc.basefile, oldbasefile=basefile, newbasefile=doc.basefile)
return ret
return wrapper


Expand Down Expand Up @@ -360,14 +382,14 @@ def updateentry(section):
def outer_wrapper(f, *args):
@functools.wraps(f)
def inner_wrapper(self, *args, **kwargs):
# try to find out if we have a basefile
if args and args[0]:
# try to find out if we have a basefile
basefile = args[0]
entrypath_arg = args[0]
else:
basefile = ".root"
args = ()
entrypath = self.store.documententry_path(basefile)
entrypath_arg = ".root"
entrypath = self.store.documententry_path
args = [self] + list(args)
return DocumentEntry.updateentry(f, section, entrypath, *args, **kwargs)
return DocumentEntry.updateentry(f, section, entrypath, entrypath_arg, *args, **kwargs)
return inner_wrapper
return outer_wrapper
2 changes: 1 addition & 1 deletion ferenda/devel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,7 +1397,7 @@ def statusreport(self, alias=None):
{"id": action,
"success": str(status["success"]),
"duration": str(status["duration"]),
"date": status["date"]})
"date": str(status["date"])})
if status["success"]:
successcnt += 1
else:
Expand Down
15 changes: 10 additions & 5 deletions ferenda/documententry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from rdflib.namespace import RDF

from ferenda import util
from ferenda.errors import DocumentRemovedError
from ferenda.errors import DocumentRemovedError, DocumentRenamedError

class DocumentEntry(object):

Expand Down Expand Up @@ -264,11 +264,12 @@ def guess_type(self, filename):
return "application/octet-stream"

@staticmethod
def updateentry(f, section, entrypath, *args, **kwargs):
def updateentry(f, section, entrypath, entrypath_arg, *args, **kwargs):
"""runs the provided function with the provided arguments, captures
any logged events emitted, catches any errors, and records the
result in the entry file under the provided section. The basefile
is assumed to be the first element in args.
result in the entry file under the provided section. Entrypath
should be a function that takes a basefile string and returns
the full path to the entry file for that basefile.
"""
def clear(key, d):
Expand All @@ -292,6 +293,10 @@ def clear(key, d):
except DocumentRemovedError as e:
success = "removed"
raise
except DocumentRenamedError as e:
entrypath_arg = e.newbasefile
success = True
return e.returnvalue
except Exception as e:
success = False
errortype, errorval, errortb = sys.exc_info()
Expand All @@ -305,7 +310,7 @@ def clear(key, d):
rootlog.removeHandler(handler)
if success is not None:
warnings = logstream.getvalue()
entry = DocumentEntry(entrypath)
entry = DocumentEntry(entrypath(entrypath_arg))
if section not in entry.status:
entry.status[section] = {}
entry.status[section]['success'] = success
Expand Down
5 changes: 3 additions & 2 deletions ferenda/documentrepository.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ def dataset_uri(self, param=None, value=None, feed=False):
>>> d = DocumentRepository()
>>> d.alias
'base'
>>> d.config.url = "http://exampole.org/"
>>> d.config.url = "http://example.org/"
>>> d.dataset_uri()
'http://example.org/dataset/base'
>>> d.dataset_uri("title","a")
Expand Down Expand Up @@ -798,7 +798,8 @@ def download(self, basefile=None, reporter=None):
try:
ret = DocumentEntry.updateentry(self.download_single,
'download',
self.store.documententry_path(basefile),
self.store.documententry_path,
basefile,
basefile,
link)
except requests.exceptions.HTTPError as e:
Expand Down
2 changes: 1 addition & 1 deletion ferenda/documentstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def newer(filename, dt, field):
if not os.path.exists(filename):
return False
elif not dt: # has never been indexed
return Needed(reason="%s has not been processed" % filename)
return Needed(reason="%s has not been processed according to %s in documententry %s" % (filename, field, entry._path))
else:
if datetime.fromtimestamp(os.stat(filename).st_mtime) > dt:
return Needed(reason="%s is newer than %s in documententry %s" % (filename, field, entry._path))
Expand Down
8 changes: 8 additions & 0 deletions ferenda/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ class DocumentSkippedError(DocumentRemovedError):
exist) since it's not interesting."""


class DocumentRenamedError(FerendaException):
    # Signals that, partway through processing, a document turned out to
    # be known under a different basefile than the one processing started
    # with. Handlers (cf. DocumentEntry.updateentry in this commit) read
    # .newbasefile to record the entry under the right path and return
    # .returnvalue as the wrapped function's result.
    def __init__(self, value, returnvalue, oldbasefile, newbasefile):
        """
        :param value: the human-readable exception message
        :param returnvalue: the value the interrupted function should be
                            treated as having returned
        :param oldbasefile: the basefile processing started with
        :param newbasefile: the basefile the document really belongs under
        """
        super(DocumentRenamedError, self).__init__(value)
        self.returnvalue = returnvalue
        self.oldbasefile = oldbasefile
        self.newbasefile = newbasefile


class PatchError(ParseError):
"""Raised if a patch cannot be applied by
:py:meth:`~ferenda.DocumentRepository.patch_if_needed`."""
Expand Down
22 changes: 15 additions & 7 deletions ferenda/sources/legal/se/swedishlegalsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import operator
import os
import re
import shutil
import sys
import unicodedata

Expand Down Expand Up @@ -586,7 +587,12 @@ def parse(self, doc):
resource = self.parse_metadata(fp, doc.basefile)
doc.meta = resource.graph
doc.uri = str(resource.identifier)
self._adjust_basefile(doc, orig_uri)
# _adjust_basefile returns true iff basefile was adjusted. We
# return the new adjusted basefile to the caller
if self._adjust_basefile(doc, orig_uri):
ret = doc.basefile
else:
ret = True
if resource.value(DCTERMS.title):
doc.lang = resource.value(DCTERMS.title).language
if options == "metadataonly":
Expand All @@ -601,7 +607,7 @@ def parse(self, doc):
self.postprocess_doc(doc)
self.parse_entry_update(doc)
# print(doc.meta.serialize(format="turtle").decode("utf-8"))
return True
return ret

def _adjust_basefile(self, doc, orig_uri):
# In some cases, we might discover during parse that we've
Expand All @@ -613,13 +619,15 @@ def _adjust_basefile(self, doc, orig_uri):
# if generate() hasn't run yet.
if orig_uri != doc.uri:
newbasefile = self.basefile_from_uri(doc.uri)
oldbasefile = doc.basefile
if newbasefile:
# change the basefile we're dealing with. Touch
# self.store.parsed_path(basefile) first so we don't
# regenerate.
with self.store.open_parsed(doc.basefile, "w"):
pass
# Decorators.render should deal with writing results
# to the correct locations, moving intermediate files
# etc, as long as doc.basefile is changed
doc.basefile = newbasefile
return True # basefile was adjusted
else:
return False # basefile was not adjusted

def parse_open(self, basefile, attachment=None):
"""Open the main downloaded file for the given basefile, caching the
Expand Down
11 changes: 6 additions & 5 deletions ferenda/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,15 +864,16 @@ def robust_fetch(method, url, logger, attempts=5, sleep=1, raise_for_status=True
except (requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
socket.timeout) as e:
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, attempts))
lastexception = e
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, attempts))
lastexception = e
time.sleep(sleep)
if response and response.status_code >= 400 and response.status_code != 404:
fetched = False # let's retry even for 400 or 500 class errors, maybe it'll go better in a second
logger.warning("Failed to fetch %s: status %s (%s remaining attempts)" % (url, response.status_code, attempts))
time.sleep(sleep)
attempts -= 1
time.sleep(sleep)
if not fetched:
logger.error("Failed to fetch %s, giving up" % url)
if lastexception and raise_for_status:
Expand Down
2 changes: 1 addition & 1 deletion lagen/nu/myndfskr.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def parse(self, basefile):
self.copy_parsed(subbasefile, inst)
break
else:
return super(MyndFskr, self).parse(basefile)
return super(MyndFskr, self).parse(basefile)

def facets(self):
# maybe if each entry in the list could be a tuple or a single
Expand Down
61 changes: 60 additions & 1 deletion test/testCompositeRepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@
print_function, unicode_literals)
from builtins import *

import os

import rdflib
from rdflib.namespace import DCTERMS

from ferenda import DocumentRepository, util, errors
from ferenda.testutil import RepoTester
from ferenda.decorators import updateentry
from ferenda.decorators import updateentry, managedparsing
from ferenda.elements.html import Body, H1
# SUT
from ferenda import CompositeRepository

Expand Down Expand Up @@ -173,3 +179,56 @@ def test_config(self):
def test_super(self):
got = self.repo.qualified_class_name()
self.assertEqual("Q:testCompositeRepo.SubrepoBSubclass", got)


class RenamingRepo(DocumentRepository):
    # Test fixture: a repo whose parse() discovers mid-parse that the
    # document's real basefile differs from the one it was invoked with
    # (every "a/..." basefile is renamed to "b/...").
    alias = "renaming"

    @managedparsing
    def parse(self, doc):
        # create an intermediate file before we know the correct
        # path for it. Later steps should move this file to the
        # correct place.
        util.writefile(self.store.intermediate_path(doc.basefile), "dummy")
        # minimal metadata and body so rendering has something to emit
        doc.meta.add((rdflib.URIRef(doc.uri), DCTERMS.title,
                      rdflib.Literal("Hello World", lang="en")))
        doc.body = Body([H1(["Hello world"])])
        # the rename itself: signalled to the framework simply by
        # mutating doc.basefile before returning
        doc.basefile = doc.basefile.replace("a/", "b/")
        return True

class BasefileRename(RepoTester):
    # End-to-end test of basefile renaming through a CompositeRepository:
    # parsing "a/1" must leave all real artifacts stored under "b/1",
    # with only a placeholder parsed file left under "a/1".

    class RenamingCompositeRepo(CompositeRepository):
        alias = "composite"
        subrepos = RenamingRepo,  # single-element tuple of subrepo classes

    repoclass = RenamingCompositeRepo

    def test_rename(self):
        self.repo.store.basefiles[RenamingRepo].add("a/1")
        ret = self.repo.parse("a/1")
        exists = os.path.exists
        store = self.repo.get_instance(RenamingRepo).store
        compositestore = self.repo.store
        # all result files should live under the new basefile
        self.assertTrue(exists(store.parsed_path("b/1")))
        self.assertTrue(exists(store.distilled_path("b/1")))
        self.assertTrue(exists(store.documententry_path("b/1")))
        self.assertTrue(exists(store.intermediate_path("b/1")))
        # to make @ifneeded report that a re-parse is not needed
        self.assertTrue(exists(store.parsed_path("a/1")))
        # ...but that placeholder is expected to be an empty touch-file
        self.assertEqual(0, os.path.getsize(store.parsed_path("a/1")))
        # make sure only b/1 files exists at distilled/intermediate/docentry
        self.assertFalse(exists(store.distilled_path("a/1")))
        self.assertFalse(exists(store.documententry_path("a/1")))
        self.assertFalse(exists(store.intermediate_path("a/1")))
        # make sure the composite docentry is correctly placed and pointing
        self.assertFalse(exists(compositestore.documententry_path("a/1")))
        self.assertTrue(exists(compositestore.documententry_path("b/1")))
        # the composite entry should be a symlink into the subrepo's store;
        # resolve it relative to its own directory and compare
        self.assertTrue(os.path.islink(compositestore.documententry_path("b/1")))
        link = os.path.normpath(os.path.join(
            os.path.dirname(compositestore.documententry_path("b/1")),
            os.readlink(compositestore.documententry_path("b/1"))))
        self.assertEqual(store.documententry_path("b/1"), link)

        # make sure a reparse works
        self.assertTrue(self.repo.parse("a/1"))
1 change: 0 additions & 1 deletion test/testDecorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,6 @@ def test_makedocument(self):
@makedocument
def testfunc(repo, doc):
return doc

doc = testfunc(DocumentRepository(), "base/file")
self.assertIsInstance(doc, Document)
self.assertEqual(doc.basefile, "base/file")
Expand Down
Loading

0 comments on commit 9d7c190

Please sign in to comment.