Decode HTTP responses with correct charset before trying to parse.

pombredanne · Jun 12, 2013 · 05e3770
1 parent ec90e13
commit 05e3770
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 2 deletions.
diff --git a/micawber/compat.py b/micawber/compat.py
@@ -7,8 +7,12 @@
     from urllib.parse import urlencode
     text_type = str
     string_types = str,
+    def get_charset(response):
+        return response.headers.get_param('charset')
 else:
     from urllib2 import Request, urlopen, URLError, HTTPError
     from urllib import urlencode
     text_type = unicode
     string_types = basestring,
+    def get_charset(response):
+        return response.headers.getparam('charset')
diff --git a/micawber/providers.py b/micawber/providers.py
@@ -2,7 +2,7 @@
 import pickle
 import re
 import socket
-from .compat import urlencode, Request, urlopen, URLError, HTTPError
+from .compat import urlencode, Request, urlopen, URLError, HTTPError, get_charset
 try:
     import simplejson as json
 except ImportError:
@@ -35,7 +35,10 @@ def fetch(self, url):
         if resp.code < 200 or resp.code >= 300:
             return False
 
-        content = resp.read()
+        # RFC 2616 (section 3.7.1): text/* bodies with no declared charset default to ISO-8859-1
+        charset = get_charset(resp) or 'iso-8859-1'
+
+        content = resp.read().decode(charset)
         resp.close()
         return content