From 7daca4f2f3bc70646c1c2fc1d96ae8c06ae38634 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 4 Mar 2016 09:16:35 +0100 Subject: [PATCH] make sure extract_head doesn't accidentally read a chunk that ends in the middle of a multibyte char --- ferenda/sources/legal/se/direktiv.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ferenda/sources/legal/se/direktiv.py b/ferenda/sources/legal/se/direktiv.py index bc2f9a66..12c9eb71 100644 --- a/ferenda/sources/legal/se/direktiv.py +++ b/ferenda/sources/legal/se/direktiv.py @@ -90,6 +90,15 @@ def extract_head(self, fp, basefile): # open() or bz2.BZ2File() in self.parse_open(), it might # return bytes or unicode strings. This seem to be a # problem in BZ2File (or how we use it). Just roll with it. + # + # if the very last byte is the start of a multi-byte UTF-8 + # character, skip it so that we don't get a UnicodeDecodeError + # because of the incomplete character. In py2, wrap + # in future.types.newbytes to get a py3 compatible + # interface. + textheader = bytes(textheader) + if textheader[-1] == ord(bytes(b'\xc3')): + textheader = textheader[:-1] textheader = textheader.decode(self.source_encoding) idx = textheader.index("-"*64) header = textheader[:idx]