fix merge conflict
staffanm committed Nov 17, 2017
2 parents 1c58ca5 + dc9fa60 commit 1334b0f
Showing 44 changed files with 723 additions and 615 deletions.
8 changes: 6 additions & 2 deletions ferenda/devel.py
@@ -193,16 +193,20 @@ def format_exception():
pfp = StringIO(codecs.decode(pfp.read(), "rot13"))
try:
ps = PatchSet.from_stream(pfp)
text = "\n".join(ps.patches[0].merge(text.split("\n")))
lines = text.split("\n")
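# adjust() shifts each hunk whose context lines have drifted from the
# position recorded in the patch; the returned offsets log those shifts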
offsets = ps.patches[0].adjust(lines)
text = "\n".join(ps.patches[0].merge(lines))
if ps.patches[0].hunks[0].comment:
patchdescription = ps.patches[0].hunks[0].comment
else:
patchdescription = ""
instructions = Div([
P(["Existing patch at %s has been applied (" % patchpath,
A("ignore existing patch", href=ignorepatchlink), ")"]),
P("Contents of that patch, for reference"),
P(["Contents of that patch, for reference"]),
Pre([util.readfile(patchpath)])])
if any(offsets):
instructions.append(P("Patch did not apply cleanly; the following adjustments were made: %s" % offsets))
except (PatchSyntaxError, PatchConflictError) as e:
instructions = Div([
P(["Existing patch at %s could not be applied (" % patchpath,
4 changes: 3 additions & 1 deletion ferenda/documentrepository.py
@@ -1330,7 +1330,9 @@ def patch_if_needed(self, basefile, text):
else:
desc = "(No patch description available)"
try:
stream = ps.patches[0].merge(text.split("\n"))
lines = text.split("\n")
ps.patches[0].adjust(lines)
stream = ps.patches[0].merge(lines)
return "\n".join(stream), desc
except PatchConflictError as e:
raise errors.PatchError(e)
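
Both call sites above now follow the same adjust-then-merge pattern. A minimal sketch of that pattern (the patch path is hypothetical, and error handling is elided):

    from ferenda.thirdparty.patchit import PatchSet

    with open("patches/example.patch") as pfp:  # hypothetical patch file
        ps = PatchSet.from_stream(pfp)
    lines = text.split("\n")                    # text holds the source to patch
    offsets = ps.patches[0].adjust(lines)       # move hunks until context fits
    if any(offsets):
        print("hunks were moved by %s lines" % offsets)
    text = "\n".join(ps.patches[0].merge(lines))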
65 changes: 37 additions & 28 deletions ferenda/pdfreader.py
@@ -10,6 +10,7 @@
import logging
import os
import re
import shutil
import tempfile
import warnings
import unicodedata
@@ -202,8 +203,15 @@ def __init__(self,
os.unlink(convertedfile)
return res

def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
def _tesseract(self, pdffile, workdir, lang, hocr=True):
root = os.path.splitext(os.path.basename(pdffile))[0]

# step 0: copy the pdf into a temp dir (which is probably on
# local disk, saving us some network traffic if the pdf file
# is huge and on an NFS mount somewhere)
tmpdir = tempfile.mkdtemp()
tmppdffile = os.sep.join([tmpdir, os.path.basename(pdffile)])
util.copy_if_different(pdffile, tmppdffile)

# step 1: find the number of pages
cmd = "pdfinfo %s" % tmppdffile
@@ -219,36 +227,35 @@ def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
topage = min((i + 1) * 10, number_of_pages)
if frompage > topage:
continue
cmd = "pdfimages -png -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(workdir)s/%(root)s" % locals(
cmd = "pdfimages -all -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(tmpdir)s/%(root)s" % locals(
)
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
# step 2.1: Combine the recently extracted images (which
# are always png) into a new tif (so that we add 10 pages
# at a time to the tif, as imagemagick can create a number
# of pretty large files for each page, so converting 200
# images will fill 10 G of your temp space -- which we'd
# like to avoid)
cmd = "convert %(workdir)s/%(root)s-*.png -compress Zip %(workdir)s/%(root)s-tmp%(idx)04d.tif" % locals(
# step 2.1: convert and combine the recently extracted
# images (which can be ppm, jpg, ccitt or whatever) into a
# new tif (so that we add 10 pages at a time to the tif,
# as imagemagick can create a number of pretty large files
# for each page, so converting 200 images will fill 10 G
# of your temp space -- which we'd like to avoid)
cmd = "convert %(tmpdir)s/%(root)s-* -compress Zip %(tmpdir)s/%(root)s_tmp%(idx)04d.tif" % locals(
)
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
# step 2.2: Remove png files now that they're in the .tif
for f in glob("%(workdir)s/%(root)s-*.png" % locals()):
# step 2.2: Remove extracted image files now that they're in the .tif
for f in glob("%(tmpdir)s/%(root)s-*" % locals()):
os.unlink(f)

# Step 3: Combine all the 10-page tifs into a giant tif using tiffcp
cmd = "tiffcp -c zip %(workdir)s/%(root)s-tmp*.tif %(workdir)s/%(root)s.tif" % locals()
cmd = "tiffcp -c zip %(tmpdir)s/%(root)s_tmp*.tif %(tmpdir)s/%(root)s.tif" % locals()
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)

# Step 4: OCR the giant tif file to create a .hocr.html file
# Note that -psm 1 (automatic page segmentation with
# orientation and script detection) requires the installation
# of tesseract-ocr-3.01.osd.tar.gz
usehocr = "hocr" if hocr else ""
suffix = ".hocr" if hocr else ""
cmd = "tesseract %(workdir)s/%(root)s.tif %(workdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s" % locals(
cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s" % locals(
)
self.log.debug("running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
@@ -257,15 +264,14 @@ def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
# suffix, while earlier versions add an automatic .html. Other
# parts of the code expects the .html suffix, so we check to
# see if we have new-tesseract behaviour and compensate.
if os.path.exists("%(workdir)s/%(root)s%(suffix)s.hocr" % locals()):
util.robust_rename("%(workdir)s/%(root)s%(suffix)s.hocr" % locals(),
"%(workdir)s/%(root)s%(suffix)s.html" % locals())

if os.path.exists("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals()):
util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals(),
"%(tmpdir)s/%(root)s%(suffix)s.html" % locals())

# Step 5: Cleanup (the main .tif file can stay)
os.unlink(tmppdffile)
for f in glob("%(workdir)s/%(root)s-tmp*.tif" % locals()):
os.unlink(f)
# Step 5: Move our hOCR file to the workdir, then cleanup
util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.html" % locals(),
"%(workdir)s/%(root)s%(suffix)s.html" % locals())
shutil.rmtree(tmpdir)

def _pdftohtml(self, tmppdffile, workdir, images):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
@@ -579,8 +585,9 @@ def txt(element_text):
continue
assert element.tag == 'text', "Got <%s>, expected <text>" % element.tag
# eliminate "empty" textboxes, including "<text><i> </i></text>\n"
if element.text and txt(
element.text).strip() == "" and not element.getchildren():
if (((element.text and txt(element.text).strip() == "") or
(element.text is None)) and
not element.getchildren()):
# print "Skipping empty box"
continue
if len(page) > 0:
@@ -991,16 +998,19 @@ def convert(self, filename, workdir=None, images=True,
if ocr_lang:
converter = self._tesseract
converter_extra = {'lang': ocr_lang}
tmpfilename = filename
else:
converter = self._pdftohtml
converter_extra = {'images': images}
tmpfilename = os.sep.join([workdir, os.path.basename(filename)])

tmpfilename = os.sep.join([workdir, os.path.basename(filename)])
# copying the filename to the workdir is only needed if we use
# PDFReader._pdftohtml

if not util.outfile_is_newer([filename], convertedfile):
util.copy_if_different(filename, tmpfilename)
if not ocr_lang:
# this is somewhat expensive and not really needed when the converter is tesseract
util.copy_if_different(filename, tmpfilename)
# this is the expensive operation
converter(tmpfilename, workdir, **converter_extra)

Expand All @@ -1020,7 +1030,6 @@ def convert(self, filename, workdir=None, images=True,
# (in _parse_xml), a workaround will be applied to the
# document on the fly.
pass
#
if keep_xml == "bz2":
with open(convertedfile.replace(".bz2", ""), mode="rb") as rfp:
# BZ2File supports the with statement in py27+,
1 change: 1 addition & 0 deletions ferenda/sources/legal/se/dv.py
@@ -1428,6 +1428,7 @@ def postprocess_doc(self, doc):
if self.config.mapfiletype == "nginx":
path = urlparse(doc.uri).path
else:
idx = len(self.urispace_base) + len(self.urispace_segment) + 2
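# the extra 2 accounts for the "/" separators after urispace_base and urispace_segment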
path = doc.uri[idx:]

def map_append_needed(mapped_path, filename):
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/sou.py
@@ -151,7 +151,7 @@ class SOUKB(Offtryck, PDFDocumentRepository):
@classmethod
def get_default_options(cls):
opts = super(SOUKB, cls).get_default_options()
opts['ocr'] = False
opts['ocr'] = True
return opts


7 changes: 4 additions & 3 deletions ferenda/sources/legal/se/swedishlegalsource.py
@@ -583,7 +583,6 @@ def parse_open(self, basefile, attachment=None):
def patch_if_needed(self, fp, basefile):
"""Override of DocumentRepository.patch_if_needed with different,
streamier API."""

if self.config.ignorepatch is True:
return fp
# 1. do we have a patch?
@@ -624,8 +623,10 @@ def patch_if_needed(self, fp, basefile):
# an attribute with the description
# lines = [l.decode().rstrip() for l in fp.readlines()]
lines = [l.rstrip("\n") for l in fp.readlines()]
import pudb; pu.db
patchedlines = list(ps.patches[0].merge(lines))
offsets = ps.patches[0].adjust(lines)
if any(offsets):
self.log.warning("Patch source ranges had to be adjusted: %s" % offsets)
patchedlines = ps.patches[0].merge(lines)
patchedtext = "\n".join(patchedlines)
if binarystream:
fp = BytesIO(patchedtext.encode(self.source_encoding))
50 changes: 49 additions & 1 deletion ferenda/thirdparty/patchit.py
@@ -66,6 +66,44 @@ def add_operation(self, symbol, text):
symbol = self.operation_symbol_map[symbol]
self.operations.append(self.Operation(symbol, text))

def adjust(self, lines):
"""Adjust the source_range of Hunk based on it's context lines.
:param lines: collection of lines for the entire source text
:type lines: list
:returns: offset compared to existing source_range
"""
done = False
offset = 0
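# try offsets 0, +1, -1, +2, -2, ... until the context lines match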
while self.source_range[0] + offset >= 0 and self.source_range[1] + offset <= len(lines):
if self.match(lines, offset):
done = True
break
offset = -offset
if self.match(lines, offset):
done = True
break
offset = -offset + 1
if not done:
raise PatchConflictError('Cannot match context lines')
self.source_range = tuple([x + offset for x in self.source_range])
return offset

def match(self, lines, offset):
"""Check if the context lines at a particular offset matches the source text"""
# maybe we should read from front and back until we encounter our first non OP_EQUAL?
for idx, (symbol, text) in enumerate(self.operations):
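# inserted lines have no counterpart in the source, so they push later
# context lookups back by one; deleted lines consume a source line just
# like context lines do, so no compensation is needed for them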
if symbol == self.OP_INSERT:
offset -= 1
elif symbol == self.OP_DELETE:
offset += 0
else: # self.OP_EQUAL, i.e. a context line
lineidx = self.source_range[0] - 1 + offset + idx
if lineidx < 0 or lineidx >= len(lines) or lines[lineidx] != text:
return False
return True


def merge(self, lines):
"""Merge Hunk into `lines`.
@@ -85,7 +123,7 @@ def merge(self, lines):
raise PatchConflictError('Unexpected end of stream')

if line != text:
raise PatchConflictError('patch conflict')
raise PatchConflictError('patch conflict: expected %r, got %r' % (text, line))
if symbol == Hunk.OP_EQUAL:
yield line

@@ -123,6 +161,16 @@ def merge(self, lines):
for line in lines_enumerator:
yield line

def adjust(self, lines):
"""Adjust the source_range of all hunks, to allow for inexact matching as long as any context lines still fit."""
offset = 0
offsets = []
for hunk in self.hunks:
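# pre-shift this hunk by the offset the previous hunk needed before
# searching for a context match of its own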
hunk.source_range = tuple([x + offset for x in hunk.source_range])
offset = hunk.adjust(lines)
offsets.append(offset)
return offsets


class PatchSet(object):
"""Collection of Patches."""
10 changes: 10 additions & 0 deletions lagen/nu/res/patches/sfs/patches/1974/152.patch
@@ -0,0 +1,10 @@
--- data/sfs/intermediate/1974/152.txt.bz2
+++
@@ -234,6 +234,7 @@ Formattering av rubrik innan 2 kap 12 §
En rättegång ska genomföras rättvist och inom skälig tid.
Förhandling vid domstol ska vara offentlig. Lag
(2010:1408).
+
Skydd mot diskriminering

12 § Lag eller annan föreskrift får inte innebära att någon
1 change: 0 additions & 1 deletion lagen/nu/res/patches/sfs/patches/1998/1513.desc

This file was deleted.

13 changes: 0 additions & 13 deletions lagen/nu/res/patches/sfs/patches/1998/1513.patch

This file was deleted.

1 change: 0 additions & 1 deletion lagen/nu/res/patches/sfs/patches/2009/62.desc

This file was deleted.

14 changes: 0 additions & 14 deletions lagen/nu/res/patches/sfs/patches/2009/62.patch

This file was deleted.

2 changes: 1 addition & 1 deletion test/files/repo/dirregeringen/parsed/2016/15.xhtml
@@ -88,7 +88,7 @@
<div class="unorderedsection" about="https://lagen.nu/dir/2016:15#US9" property="dcterms:title" content="Behovet av särskilt författningsstöd för behandling av personuppgifter i den offentliga sektorn" typeof="bibo:DocumentPart">
<span rel="dcterms:isPartOf" href="https://lagen.nu/dir/2016:15#US7"/>
<p class="textbox fontspec3" style="top: 151px; left: 106px; height: 243px; width: 430px">De statliga och kommunala myndigheternas personuppgiftsbehandling kommer huvudsakligen att ske med stöd av de rättsliga grunder som kommer till uttryck i artikel 6.1 c och e i dataskyddsförordningen. Myndigheternas behandling av personuppgifter är alltså i normalfallet antingen nödvändig för att fullgöra en rättslig skyldighet eller utföra en arbetsuppgift av allmänt intresse eller i samband med myndighetsutövning. Detsamma gäller sådan behandling av personuppgifter som sker hos andra än myndigheter vid utförandet av förvaltningsuppgifter, exempelvis bilprovningsföretag eller fristående skolor. Det kan emellertid också gälla för andra verksamheter där arbetsuppgifterna mot bakgrund av verksamhetens syfte bedöms ha ett allmänt intresse. </p>
<p class="textbox fontspec3" style="top: 398px; left: 106px; height: 395px; width: 430px">I dag sker behandling av detta slag till viss del med stöd av reglering i särskilda registerförfattningar men i stor utsträckning enbart med stöd av den generella regleringen i <a href="https://lagen.nu/1998:204" rel="dcterms:references">personuppgiftslagen</a> (10 § b), c) och d) PUL). Enligt artikel 6.3 i förordningen måste dock grunden för behandling av personuppgifter som bygger på någon av de rättsliga grunderna i artikel 6.1 c och e fastställas i unionsrätten eller den nationella rätten. Detta innebär att det inte kommer vara möjligt att endast stödja sig på den generella regleringen i förordningen vid sådan behandling. Det behöver därför analyseras vad dataskyddsförordningens krav i denna del innebär i fråga om nationell författningsreglering och om det bör införas generella bestämmelser till stöd för åtminstone den offentliga sektorns behandling av personuppgifter. Informationshanteringsutredningen föreslår en sådan reglering i 8 § i förslaget till myndighetsdatalag (<a href="https://lagen.nu/utr/sou/2015:39" rel="dcterms:references">SOU 2015:39</a>). Enligt förslaget får en myndighet behandla personuppgifter om det är nödvändigt för att den ska kunna utföra sin verksamhet. Informationshanteringsutredningens förslag med beaktande av de synpunkter som framförts vid remissbehandlingen är en lämplig utgångspunkt för utredarens analys. </p>
<p class="textbox fontspec3" style="top: 398px; left: 106px; height: 395px; width: 430px">I dag sker behandling av detta slag till viss del med stöd av reglering i särskilda registerförfattningar men i stor utsträckning enbart med stöd av den generella regleringen i <a href="https://lagen.nu/1998:204" rel="dcterms:references">personuppgiftslagen</a> (10 § b), c) och d) PUL). Enligt artikel 6.3 i förordningen måste dock grunden för behandling av personuppgifter som bygger på någon av de rättsliga grunderna i artikel 6.1 c och e fastställas i unionsrätten eller den nationella rätten. Detta innebär att det inte kommer vara möjligt att endast stödja sig på den generella regleringen i förordningen vid sådan behandling. Det behöver därför analyseras vad dataskyddsförordningens krav i denna del innebär i fråga om nationell författningsreglering och om det bör införas generella bestämmelser till stöd för åtminstone den offentliga sektorns behandling av personuppgifter. Informationshanteringsutredningen föreslår en sådan reglering i 8 § i förslaget till myndighetsdatalag (<a href="https://lagen.nu/sou/2015:39" rel="dcterms:references">SOU 2015:39</a>). Enligt förslaget får en myndighet behandla personuppgifter om det är nödvändigt för att den ska kunna utföra sin verksamhet. Informationshanteringsutredningens förslag med beaktande av de synpunkter som framförts vid remissbehandlingen är en lämplig utgångspunkt för utredarens analys. </p>
<p class="textbox fontspec3" style="top: 797px; left: 128px; height: 15px; width: 141px">Utredaren ska därför </p>
<span id="sid8" class="sidbrytning" src="https://lagen.nu/dir/2016:15/sid8.png" width="892" height="1263"/>
<ul>