From 7daca4f2f3bc70646c1c2fc1d96ae8c06ae38634 Mon Sep 17 00:00:00 2001
From: Staffan Malmgren <staffan.malmgren@gmail.com>
Date: Fri, 4 Mar 2016 09:16:35 +0100
Subject: [PATCH] make sure extract_head doesn't accidentally read a chunk that
 ends in the middle of a multibyte char

---
 ferenda/sources/legal/se/direktiv.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ferenda/sources/legal/se/direktiv.py b/ferenda/sources/legal/se/direktiv.py
index bc2f9a66..12c9eb71 100644
--- a/ferenda/sources/legal/se/direktiv.py
+++ b/ferenda/sources/legal/se/direktiv.py
@@ -90,6 +90,15 @@ def extract_head(self, fp, basefile):
             # open() or bz2.BZ2File() in self.parse_open(), it might
             # return bytes or unicode strings. This seem to be a
             # problem in BZ2File (or how we use it). Just roll with it.
+            #
+            # If the very last byte is 0xC3, the lead byte of a two-byte
+            # UTF-8 character, drop it so that we don't get a
+            # UnicodeDecodeError because of the incomplete character. In
+            # py2, wrap in future.types.newbytes to get a py3-compatible
+            # interface.
+            textheader = bytes(textheader)
+            if textheader[-1] == ord(bytes(b'\xc3')):
+                textheader = textheader[:-1]
             textheader = textheader.decode(self.source_encoding)
         idx = textheader.index("-"*64)
         header = textheader[:idx]