From 99d0e61837a2f9c9d10978a0b7b71bf505983c1e Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 13 Jun 2017 22:02:12 +0200 Subject: [PATCH] py2 compat, better sanitize_identifier for offtryck --- ferenda/pdfreader.py | 8 +++++++- ferenda/sources/legal/se/direktiv.py | 2 +- ferenda/sources/legal/se/offtryck.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py index ba45c4bc..17b2064d 100644 --- a/ferenda/pdfreader.py +++ b/ferenda/pdfreader.py @@ -677,7 +677,10 @@ def _parse_xml_make_textbox(self, element, nextelement, after_footnote, lastbox, del attribs['font'] return self._textdecoder(Textbox(textelements, **attribs), self.fontspec) - ws_trans = str.maketrans("\n\t\xa0", " ") + import string + ws_trans = {ord("\n"): " ", + ord("\t"): " ", + ord("\xa0"): " "} def _parse_xml_make_textelement(self, element, **origkwargs): # the complication is that a hierarchical sequence of tags @@ -716,6 +719,9 @@ def cleantag(kwargs): def normspace(txt): # like util.normalize_space, but preserves a single leading/trailing space + if not isinstance(txt, str): # under py2, element.text can + # sometimes be a bytestring? + txt = txt.decode() txt = txt.translate(self.ws_trans) startspace = " " if txt.startswith(" ") else "" endspace = " " if txt.endswith(" ") and len(txt) > 1 else "" diff --git a/ferenda/sources/legal/se/direktiv.py b/ferenda/sources/legal/se/direktiv.py index f154cd17..ba12de35 100644 --- a/ferenda/sources/legal/se/direktiv.py +++ b/ferenda/sources/legal/se/direktiv.py @@ -272,7 +272,7 @@ def metadata_from_basefile(self, basefile): return a def infer_identifier(self, basefile): - return "Dir %s" % basefile + return "Dir. 
%s" % basefile def postprocess_doc(self, doc): next_is_title = False diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py index 0a2a5142..1d5f2f11 100644 --- a/ferenda/sources/legal/se/offtryck.py +++ b/ferenda/sources/legal/se/offtryck.py @@ -647,6 +647,16 @@ def sanitize_identifier(self, identifier): self.SO: "%s %s:%s"} try: parts = re.split("[\.:/ ]+", identifier.strip()) + id_template = pattern[self.document_type] + # do we have enough parts for our template? + if len(parts) == id_template.count("%s") - 1: + # we're probably missing the first part (eg "Prop", + # "Ds") and so what we have is a basefile-like + # thing. Reconstruct the first part. + parts.insert(0, re.split("[\.:/ ]+", self.infer_identifier(identifier))[0]) + # make sure the initial char is capitalized (this is + # preferred to .capitalize() for strings that should be + # all-caps, eg "SOU") parts[0] = parts[0][0].upper() + parts[0][1:] return pattern[self.document_type] % tuple(parts) except: