Skip to content

Commit

Permalink
fixed minor parsing issues for documents with ocr errors / spurious spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Aug 7, 2018
1 parent 9e37da0 commit d8aa8c9
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 4 deletions.
6 changes: 3 additions & 3 deletions ferenda/sources/legal/se/dv.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ def parse_not(self, text, basefile, filetype):
if coll == "HDO":
# keep this in sync w extract_notis
re_notisstart = re.compile(
"(?:Den (?P<avgdatum>\d+):[ae].\s+|)(?P<ordinal>\d+)\.[ \xa0]*\((?P<malnr>\w[ \xa0]\d+-\d+)\)",
"(?:Den (?P<avgdatum>\d+):[ae].\s+|)(?P<ordinal>\d+)\s*\.\s*\((?P<malnr>\w[ \xa0]\d+-\d+)\)",
flags=re.UNICODE)
re_avgdatum = re_malnr = re_notisstart
re_lagrum = re_sokord = None
Expand Down Expand Up @@ -966,7 +966,7 @@ def parse_ooxml(self, text, basefile):
# domstolen") följt av referatnumret ("NJA 1987
# s. 113").
firstfield = soup.find("w:t")
# Ibland ärdomstolsnamnet uppsplittat på två
# Ibland är domstolsnamnet uppsplittat på två
# w:r-element. Bäst att gå på all text i
# föräldra-w:tc-cellen
firstfield = firstfield.find_parent("w:tc")
Expand Down Expand Up @@ -1091,7 +1091,7 @@ def parse_antiword_docbook(self, text, basefile):
def sanitize_metadata(self, head, basefile):
basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')
nja_regex = re.compile(
"NJA ?(\d+) ?s\.? ?(\d+) ?\( ?(?:NJA|) ?[ :]?(\d+) ?: ?(\d+)")
"NJA ?(\d+) ?s\.? ?(\d+) *\( ?(?:NJA|) ?[ :]?(\d+) ?: ?(\d+)")
date_regex = re.compile("(\d+)[^\d]+(\d+)[^\d]+(\d+)")
referat_regex = re.compile(
"(?P<type>[A-ZÅÄÖ]+)[^\d]*(?P<year>\d+)[^\d]+(?P<ordinal>\d+)")
Expand Down
5 changes: 4 additions & 1 deletion ferenda/sources/legal/se/propositioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,10 @@ def find_firstpage_metadata(self, firstpage, basefile):
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
try:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
except ValueError as e:
self.log.warning("%s: Couldn't parse date %s" % (basefile, m.group(1)))
return res

def extract_body(self, fp, basefile):
Expand Down
11 changes: 11 additions & 0 deletions lagen/nu/res/patches/dv/patches/HDO/Ö1229-17.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- qngn/qi/vagrezrqvngr/UQB/Ö1229-17.kzy.om2
+++
@@ -25,7 +25,7 @@ Xbeevtrenq eäggfsnyyforgrpxavat
<j:c>
<j:e>
<j:g>
- AWN 2017 f.683 (AWN:65)
+ AWN 2017 f.683 (AWN 2017:65)
</j:g>
</j:e>
</j:c>
11 changes: 11 additions & 0 deletions lagen/nu/res/patches/dv/patches/HDO/Ö663-17.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- qngn/qi/vagrezrqvngr/UQB/Ö663-17.kzy.om2
+++
@@ -25,7 +25,7 @@ Xbeevtrenq eäggfsnyyforgrpxavat
<j:c>
<j:e>
<j:g>
- AWN 2017 f.683 (AWN:65)
+ AWN 2017 f.683 (AWN 2017:65)
</j:g>
</j:e>
</j:c>
11 changes: 11 additions & 0 deletions lagen/nu/res/patches/dv/patches/HDO/Ö754-17.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- qngn/qi/vagrezrqvngr/UQB/Ö754-17.kzy.om2
+++
@@ -25,7 +25,7 @@ Xbeevtrenq eäggfsnyyforgrpxavat
<j:c>
<j:e>
<j:g>
- AWN 2017 f.683 (AWN:65)
+ AWN 2017 f.683 (AWN 2017:65)
</j:g>
</j:e>
</j:c>

0 comments on commit d8aa8c9

Please sign in to comment.