py2 compat, better sanitize_identifier for offtryck

staffanm · Jun 13, 2017 · 99d0e61 · 99d0e61
1 parent 75430f5
commit 99d0e61
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 2 deletions.
diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py
@@ -677,7 +677,10 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox,
         del attribs['font']
         return self._textdecoder(Textbox(textelements, **attribs), self.fontspec)
 
-    ws_trans = str.maketrans("\n\t\xa0", "   ")
+    import string
+    ws_trans = {ord("\n"): " ",
+                ord("\t"): " ",
+                ord("\xa0"): " "}
 
     def _parse_xml_make_textelement(self, element, **origkwargs):
         # the complication is that a hierarchical sequence of tags
@@ -716,6 +719,9 @@ def cleantag(kwargs):
 
         def normspace(txt):
             # like util.normalize_space, but preserves a single leading/trailing space
+            if not isinstance(txt, str): # under py2, element.text can
+                                         # sometimes be a bytestring?
+                txt = txt.decode()
             txt = txt.translate(self.ws_trans)
             startspace = " " if txt.startswith(" ") else ""
             endspace = " " if txt.endswith(" ") and len(txt) > 1 else ""

diff --git a/ferenda/sources/legal/se/direktiv.py b/ferenda/sources/legal/se/direktiv.py
@@ -272,7 +272,7 @@ def metadata_from_basefile(self, basefile):
         return a
 
     def infer_identifier(self, basefile):
-        return "Dir %s" % basefile
+        return "Dir. %s" % basefile
 
     def postprocess_doc(self, doc):
         next_is_title = False

diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py
@@ -647,6 +647,16 @@ def sanitize_identifier(self, identifier):
                    self.SO: "%s %s:%s"}
         try:
             parts = re.split("[\.:/ ]+", identifier.strip())
+            id_template = pattern[self.document_type]
+            # do we have enough parts for our template?
+            if len(parts) == id_template.count("%s") - 1:
+                # we're probably missing the first part (eg "Prop",
+                # "Ds") and so what we have is a basefile-like
+                # thing. Reconstruct the first part.
+                parts.insert(0, re.split("[\.:/ ]+", self.infer_identifier(identifier))[0])
+            # make sure the initial char is capitalized (this is
+            # preferred to .capitalize() for strings that should be
+            # all-caps, eg "SOU")
             parts[0] = parts[0][0].upper() + parts[0][1:]
             return pattern[self.document_type] % tuple(parts)
         except: