fix merge conflict
staffanm committed Nov 17, 2017
2 parents 1c58ca5 + dc9fa60 commit 1334b0f
Showing 44 changed files with 723 additions and 615 deletions.
8 changes: 6 additions & 2 deletions ferenda/devel.py
@@ -193,16 +193,20 @@ def format_exception():
pfp = StringIO(codecs.decode(pfp.read(), "rot13"))
try:
ps = PatchSet.from_stream(pfp)
text = "\n".join(ps.patches[0].merge(text.split("\n")))
lines = text.split("\n")
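# adjust() shifts each hunk whose context lines have drifted from the
# position recorded in the patch; the returned offsets log those shifts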
offsets = ps.patches[0].adjust(lines)
text = "\n".join(ps.patches[0].merge(lines))
if ps.patches[0].hunks[0].comment:
patchdescription = ps.patches[0].hunks[0].comment
else:
patchdescription = ""
instructions = Div([
P(["Existing patch at %s has been applied (" % patchpath,
A("ignore existing patch", href=ignorepatchlink), ")"]),
P("Contents of that patch, for reference"),
P(["Contents of that patch, for reference"]),
Pre([util.readfile(patchpath)])])
if any(offsets):
instructions.append(P("Patch did not apply cleanly; the following adjustments were made: %s" % offsets))
except (PatchSyntaxError, PatchConflictError) as e:
instructions = Div([
P(["Existing patch at %s could not be applied (" % patchpath,
4 changes: 3 additions & 1 deletion ferenda/documentrepository.py
@@ -1330,7 +1330,9 @@ def patch_if_needed(self, basefile, text):
else:
desc = "(No patch description available)"
try:
stream = ps.patches[0].merge(text.split("\n"))
lines = text.split("\n")
ps.patches[0].adjust(lines)
stream = ps.patches[0].merge(lines)
return "\n".join(stream), desc
except PatchConflictError as e:
raise errors.PatchError(e)
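
Both call sites above now follow the same adjust-then-merge pattern. A minimal sketch of that pattern (the patch path is hypothetical, and error handling is elided):

    from ferenda.thirdparty.patchit import PatchSet

    with open("patches/example.patch") as pfp:  # hypothetical patch file
        ps = PatchSet.from_stream(pfp)
    lines = text.split("\n")                    # text holds the source to patch
    offsets = ps.patches[0].adjust(lines)       # move hunks until context fits
    if any(offsets):
        print("hunks were moved by %s lines" % offsets)
    text = "\n".join(ps.patches[0].merge(lines))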
65 changes: 37 additions & 28 deletions ferenda/pdfreader.py
@@ -10,6 +10,7 @@
import logging
import os
import re
import shutil
import tempfile
import warnings
import unicodedata
@@ -202,8 +203,15 @@ def __init__(self,
os.unlink(convertedfile)
return res

def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
def _tesseract(self, pdffile, workdir, lang, hocr=True):
root = os.path.splitext(os.path.basename(pdffile))[0]

# step 0: copy the pdf into a temp dir (which is probably on
# local disk, saving us some network traffic if the pdf file
# is huge and on an NFS mount somewhere)
tmpdir = tempfile.mkdtemp()
tmppdffile = os.sep.join([tmpdir, os.path.basename(pdffile)])
util.copy_if_different(pdffile, tmppdffile)

# step 1: find the number of pages
cmd = "pdfinfo %s" % tmppdffile
@@ -219,36 +227,35 @@ def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
topage = min((i + 1) * 10, number_of_pages)
if frompage > topage:
continue
cmd = "pdfimages -png -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(workdir)s/%(root)s" % locals(
cmd = "pdfimages -all -p -f %(frompage)s -l %(topage)s %(tmppdffile)s %(tmpdir)s/%(root)s" % locals(
)
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
# step 2.1: Combine the recently extracted images (which
# are always png) into a new tif (so that we add 10 pages
# at a time to the tif, as imagemagick can create a number
# of pretty large files for each page, so converting 200
# images will fill 10 G of your temp space -- which we'd
# like to avoid)
cmd = "convert %(workdir)s/%(root)s-*.png -compress Zip %(workdir)s/%(root)s-tmp%(idx)04d.tif" % locals(
# step 2.1: convert and combine the recently extracted
# images (which can be ppm, jpg, ccitt or whatever) into a
# new tif (so that we add 10 pages at a time to the tif,
# as imagemagick can create a number of pretty large files
# for each page, so converting 200 images will fill 10 G
# of your temp space -- which we'd like to avoid)
cmd = "convert %(tmpdir)s/%(root)s-* -compress Zip %(tmpdir)s/%(root)s_tmp%(idx)04d.tif" % locals(
)
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
# step 2.2: Remove png files now that they're in the .tif
for f in glob("%(workdir)s/%(root)s-*.png" % locals()):
# step 2.2: Remove extracted image files now that they're in the .tif
for f in glob("%(tmpdir)s/%(root)s-*" % locals()):
os.unlink(f)

# Step 3: Combine all the 10-page tifs into a giant tif using tiffcp
cmd = "tiffcp -c zip %(workdir)s/%(root)s-tmp*.tif %(workdir)s/%(root)s.tif" % locals()
cmd = "tiffcp -c zip %(tmpdir)s/%(root)s_tmp*.tif %(tmpdir)s/%(root)s.tif" % locals()
self.log.debug("- running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)

# Step 4: OCR the giant tif file to create a .hocr.html file
# Note that -psm 1 (automatic page segmentation with
# orientation and script detection) requires the installation
# of tesseract-ocr-3.01.osd.tar.gz
usehocr = "hocr" if hocr else ""
suffix = ".hocr" if hocr else ""
cmd = "tesseract %(workdir)s/%(root)s.tif %(workdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s" % locals(
cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s" % locals(
)
self.log.debug("running " + cmd)
(returncode, stdout, stderr) = util.runcmd(cmd, require_success=True)
@@ -257,15 +264,14 @@ def _tesseract(self, tmppdffile, workdir, lang, hocr=True):
# suffix, while earlier versions add an automatic .html. Other
# parts of the code expects the .html suffix, so we check to
# see if we have new-tesseract behaviour and compensate.
if os.path.exists("%(workdir)s/%(root)s%(suffix)s.hocr" % locals()):
util.robust_rename("%(workdir)s/%(root)s%(suffix)s.hocr" % locals(),
"%(workdir)s/%(root)s%(suffix)s.html" % locals())

if os.path.exists("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals()):
util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.hocr" % locals(),
"%(tmpdir)s/%(root)s%(suffix)s.html" % locals())

# Step 5: Cleanup (the main .tif file can stay)
os.unlink(tmppdffile)
for f in glob("%(workdir)s/%(root)s-tmp*.tif" % locals()):
os.unlink(f)
# Step 5: Move our hOCR file to the workdir, then cleanup
util.robust_rename("%(tmpdir)s/%(root)s%(suffix)s.html" % locals(),
"%(workdir)s/%(root)s%(suffix)s.html" % locals())
shutil.rmtree(tmpdir)

def _pdftohtml(self, tmppdffile, workdir, images):
root = os.path.splitext(os.path.basename(tmppdffile))[0]
@@ -579,8 +585,9 @@ def txt(element_text):
continue
assert element.tag == 'text', "Got <%s>, expected <text>" % element.tag
# eliminate "empty" textboxes, including "<text><i> </i></text>\n"
if element.text and txt(
element.text).strip() == "" and not element.getchildren():
if (((element.text and txt(element.text).strip() == "") or
(element.text is None)) and
not element.getchildren()):
# print "Skipping empty box"
continue
if len(page) > 0:
@@ -991,16 +998,19 @@ def convert(self, filename, workdir=None, images=True,
if ocr_lang:
converter = self._tesseract
converter_extra = {'lang': ocr_lang}
tmpfilename = filename
else:
converter = self._pdftohtml
converter_extra = {'images': images}
tmpfilename = os.sep.join([workdir, os.path.basename(filename)])

tmpfilename = os.sep.join([workdir, os.path.basename(filename)])
# copying the filename to the workdir is only needed if we use
# PDFReader._pdftohtml

if not util.outfile_is_newer([filename], convertedfile):
util.copy_if_different(filename, tmpfilename)
if not ocr_lang:
# this is somewhat expensive and not really needed when the converter is tesseract
util.copy_if_different(filename, tmpfilename)
# this is the expensive operation
converter(tmpfilename, workdir, **converter_extra)

Expand All @@ -1020,7 +1030,6 @@ def convert(self, filename, workdir=None, images=True,
# (in _parse_xml), a workaround will be applied to the
# document on the fly.
pass
#
if keep_xml == "bz2":
with open(convertedfile.replace(".bz2", ""), mode="rb") as rfp:
# BZ2File supports the with statement in py27+,
1 change: 1 addition & 0 deletions ferenda/sources/legal/se/dv.py
@@ -1428,6 +1428,7 @@ def postprocess_doc(self, doc):
if self.config.mapfiletype == "nginx":
path = urlparse(doc.uri).path
else:
idx = len(self.urispace_base) + len(self.urispace_segment) + 2
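# the extra 2 accounts for the "/" separators after urispace_base and urispace_segment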
path = doc.uri[idx:]

def map_append_needed(mapped_path, filename):
2 changes: 1 addition & 1 deletion ferenda/sources/legal/se/sou.py
@@ -151,7 +151,7 @@ class SOUKB(Offtryck, PDFDocumentRepository):
@classmethod
def get_default_options(cls):
opts = super(SOUKB, cls).get_default_options()
opts['ocr'] = False
opts['ocr'] = True
return opts


7 changes: 4 additions & 3 deletions ferenda/sources/legal/se/swedishlegalsource.py
@@ -583,7 +583,6 @@ def parse_open(self, basefile, attachment=None):
def patch_if_needed(self, fp, basefile):
"""Override of DocumentRepository.patch_if_needed with different,
streamier API."""

if self.config.ignorepatch is True:
return fp
# 1. do we have a patch?
@@ -624,8 +623,10 @@ def patch_if_needed(self, fp, basefile):
# an attribute with the description
# lines = [l.decode().rstrip() for l in fp.readlines()]
lines = [l.rstrip("\n") for l in fp.readlines()]
import pudb; pu.db
patchedlines = list(ps.patches[0].merge(lines))
offsets = ps.patches[0].adjust(lines)
if any(offsets):
self.log.warning("Patch source ranges had to be adjusted: %s" % offsets)
patchedlines = ps.patches[0].merge(lines)
patchedtext = "\n".join(patchedlines)
if binarystream:
fp = BytesIO(patchedtext.encode(self.source_encoding))
50 changes: 49 additions & 1 deletion ferenda/thirdparty/patchit.py
@@ -66,6 +66,44 @@ def add_operation(self, symbol, text):
symbol = self.operation_symbol_map[symbol]
self.operations.append(self.Operation(symbol, text))

def adjust(self, lines):
"""Adjust the source_range of Hunk based on it's context lines.
:param lines: collection of lines for the entire source text
:type lines: list
:returns: offset compared to existing source_range
"""
done = False
offset = 0
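# try offsets 0, +1, -1, +2, -2, ... until the context lines match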
while self.source_range[0] + offset >= 0 and self.source_range[1] + offset <= len(lines):
if self.match(lines, offset):
done = True
break
offset = -offset
if self.match(lines, offset):
done = True
break
offset = -offset + 1
if not done:
raise PatchConflictError('Cannot match context lines')
self.source_range = tuple([x + offset for x in self.source_range])
return offset

def match(self, lines, offset):
"""Check if the context lines at a particular offset matches the source text"""
# maybe we should read from front and back until we encounter our first non OP_EQUAL?
for idx, (symbol, text) in enumerate(self.operations):
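# inserted lines have no counterpart in the source, so they push later
# context lookups back by one; deleted lines consume a source line just
# like context lines do, so no compensation is needed for them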
if symbol == self.OP_INSERT:
offset -= 1
elif symbol == self.OP_DELETE:
offset += 0
else: # self.OP_EQUAL, i.e. a context line
lineidx = self.source_range[0] - 1 + offset + idx
if lineidx < 0 or lineidx >= len(lines) or lines[lineidx] != text:
return False
return True


def merge(self, lines):
"""Merge Hunk into `lines`.
@@ -85,7 +123,7 @@ def merge(self, lines):
raise PatchConflictError('Unexpected end of stream')

if line != text:
raise PatchConflictError('patch conflict')
raise PatchConflictError('patch conflict: expected %r, got %r' % (text, line))
if symbol == Hunk.OP_EQUAL:
yield line

@@ -123,6 +161,16 @@ def merge(self, lines):
for line in lines_enumerator:
yield line

def adjust(self, lines):
"""Adjust the source_range of all hunks, to allow for inexact matching as long as any context lines still fit."""
offset = 0
offsets = []
for hunk in self.hunks:
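# pre-shift this hunk by the offset the previous hunk needed before
# searching for a context match of its own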
hunk.source_range = tuple([x + offset for x in hunk.source_range])
offset = hunk.adjust(lines)
offsets.append(offset)
return offsets


class PatchSet(object):
"""Collection of Patches."""
10 changes: 10 additions & 0 deletions lagen/nu/res/patches/sfs/patches/1974/152.patch
@@ -0,0 +1,10 @@
--- data/sfs/intermediate/1974/152.txt.bz2
+++
@@ -234,6 +234,7 @@ Formattering av rubrik innan 2 kap 12 §
En rättegång ska genomföras rättvist och inom skälig tid.
Förhandling vid domstol ska vara offentlig. Lag
(2010:1408).
+
Skydd mot diskriminering

12 § Lag eller annan föreskrift får inte innebära att någon
1 change: 0 additions & 1 deletion lagen/nu/res/patches/sfs/patches/1998/1513.desc

This file was deleted.

13 changes: 0 additions & 13 deletions lagen/nu/res/patches/sfs/patches/1998/1513.patch

This file was deleted.

1 change: 0 additions & 1 deletion lagen/nu/res/patches/sfs/patches/2009/62.desc

This file was deleted.

14 changes: 0 additions & 14 deletions lagen/nu/res/patches/sfs/patches/2009/62.patch

This file was deleted.

2 changes: 1 addition & 1 deletion test/files/repo/dirregeringen/parsed/2016/15.xhtml
@@ -88,7 +88,7 @@
<div class="unorderedsection" about="https://lagen.nu/dir/2016:15#US9" property="dcterms:title" content="Behovet av särskilt författningsstöd för behandling av personuppgifter i den offentliga sektorn" typeof="bibo:DocumentPart">
<span rel="dcterms:isPartOf" href="https://lagen.nu/dir/2016:15#US7"/>
<p class="textbox fontspec3" style="top: 151px; left: 106px; height: 243px; width: 430px">De statliga och kommunala myndigheternas personuppgiftsbehandling kommer huvudsakligen att ske med stöd av de rättsliga grunder som kommer till uttryck i artikel 6.1 c och e i dataskyddsförordningen. Myndigheternas behandling av personuppgifter är alltså i normalfallet antingen nödvändig för att fullgöra en rättslig skyldighet eller utföra en arbetsuppgift av allmänt intresse eller i samband med myndighetsutövning. Detsamma gäller sådan behandling av personuppgifter som sker hos andra än myndigheter vid utförandet av förvaltningsuppgifter, exempelvis bilprovningsföretag eller fristående skolor. Det kan emellertid också gälla för andra verksamheter där arbetsuppgifterna mot bakgrund av verksamhetens syfte bedöms ha ett allmänt intresse. </p>
<p class="textbox fontspec3" style="top: 398px; left: 106px; height: 395px; width: 430px">I dag sker behandling av detta slag till viss del med stöd av reglering i särskilda registerförfattningar men i stor utsträckning enbart med stöd av den generella regleringen i <a href="https://lagen.nu/1998:204" rel="dcterms:references">personuppgiftslagen</a> (10 § b), c) och d) PUL). Enligt artikel 6.3 i förordningen måste dock grunden för behandling av personuppgifter som bygger på någon av de rättsliga grunderna i artikel 6.1 c och e fastställas i unionsrätten eller den nationella rätten. Detta innebär att det inte kommer vara möjligt att endast stödja sig på den generella regleringen i förordningen vid sådan behandling. Det behöver därför analyseras vad dataskyddsförordningens krav i denna del innebär i fråga om nationell författningsreglering och om det bör införas generella bestämmelser till stöd för åtminstone den offentliga sektorns behandling av personuppgifter. Informationshanteringsutredningen föreslår en sådan reglering i 8 § i förslaget till myndighetsdatalag (<a href="https://lagen.nu/utr/sou/2015:39" rel="dcterms:references">SOU 2015:39</a>). Enligt förslaget får en myndighet behandla personuppgifter om det är nödvändigt för att den ska kunna utföra sin verksamhet. Informationshanteringsutredningens förslag med beaktande av de synpunkter som framförts vid remissbehandlingen är en lämplig utgångspunkt för utredarens analys. </p>
<p class="textbox fontspec3" style="top: 398px; left: 106px; height: 395px; width: 430px">I dag sker behandling av detta slag till viss del med stöd av reglering i särskilda registerförfattningar men i stor utsträckning enbart med stöd av den generella regleringen i <a href="https://lagen.nu/1998:204" rel="dcterms:references">personuppgiftslagen</a> (10 § b), c) och d) PUL). Enligt artikel 6.3 i förordningen måste dock grunden för behandling av personuppgifter som bygger på någon av de rättsliga grunderna i artikel 6.1 c och e fastställas i unionsrätten eller den nationella rätten. Detta innebär att det inte kommer vara möjligt att endast stödja sig på den generella regleringen i förordningen vid sådan behandling. Det behöver därför analyseras vad dataskyddsförordningens krav i denna del innebär i fråga om nationell författningsreglering och om det bör införas generella bestämmelser till stöd för åtminstone den offentliga sektorns behandling av personuppgifter. Informationshanteringsutredningen föreslår en sådan reglering i 8 § i förslaget till myndighetsdatalag (<a href="https://lagen.nu/sou/2015:39" rel="dcterms:references">SOU 2015:39</a>). Enligt förslaget får en myndighet behandla personuppgifter om det är nödvändigt för att den ska kunna utföra sin verksamhet. Informationshanteringsutredningens förslag med beaktande av de synpunkter som framförts vid remissbehandlingen är en lämplig utgångspunkt för utredarens analys. </p>
<p class="textbox fontspec3" style="top: 797px; left: 128px; height: 15px; width: 141px">Utredaren ska därför </p>
<span id="sid8" class="sidbrytning" src="https://lagen.nu/dir/2016:15/sid8.png" width="892" height="1263"/>
<ul>