From 8e2c536e6627c9184a798eba4d068a1ae503dda2 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 7 Aug 2018 22:25:39 +0200 Subject: [PATCH] fall back on inferred identifier if we really can't interpret the identifier on the HTML page --- ferenda/sources/legal/se/regeringen.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ferenda/sources/legal/se/regeringen.py b/ferenda/sources/legal/se/regeringen.py index 159fa8b5..828bc657 100644 --- a/ferenda/sources/legal/se/regeringen.py +++ b/ferenda/sources/legal/se/regeringen.py @@ -336,8 +336,17 @@ def sanitize_metadata(self, a, basefile): if k in a: a[k] = util.normalize_space(a[k]) # trim identifier - a["dcterms:identifier"] = self.sanitize_identifier( - a["dcterms:identifier"].replace("ID-nummer: ", "")) + try: + # The identifier displayed on the HTML page is not always + # correct -- it might be missing digits (eg "SOU 207:111" + # instead of "SOU 2017:111"). Try to sanitize it, but if + # we fail, infer it from our basefile instead. + a["dcterms:identifier"] = self.sanitize_identifier( + a["dcterms:identifier"].replace("ID-nummer: ", "")) + except ValueError as e: + inferred_identifier = str(self.infer_identifier(basefile)) + self.log.error("%s: Irregular identifier %s, using inferred identifier %s instead" % (basefile, a["dcterms:identifier"], inferred_identifier)) + a["dcterms:identifier"] = inferred_identifier # save for later self._identifier = a["dcterms:identifier"] # it's rare, but in some cases a document can be published by