make sure extract_head doesn't accidentally read a chunk that ends in t…

…he middle of a multibyte char
staffanm · Mar 4, 2016 · 7daca4f · 7daca4f
1 parent a345d91
commit 7daca4f
Showing 1 changed file with 9 additions and 0 deletions.
diff --git a/ferenda/sources/legal/se/direktiv.py b/ferenda/sources/legal/se/direktiv.py
@@ -90,6 +90,15 @@ def extract_head(self, fp, basefile):
             # open() or bz2.BZ2File() in self.parse_open(), it might
             # return bytes or unicode strings. This seems to be a
             # problem in BZ2File (or how we use it). Just roll with it.
+            # 
+            # If the very last byte is the start of a multi-byte UTF-8
+            # character, skip it so that we don't get a
+            # UnicodeDecodeError because of the incomplete character.
+            # Note that only the lead byte 0xC3 is checked for. In py2,
+            # wrap in future.types.newbytes to get a py3 compatible
+            # interface.
+            textheader = bytes(textheader)
+            if textheader[-1] == ord(bytes(b'\xc3')):
+                textheader = textheader[:-1]
             textheader = textheader.decode(self.source_encoding)
         idx = textheader.index("-"*64)
         header = textheader[:idx]