2 changes: 2 additions & 0 deletions README.rst
@@ -13,6 +13,7 @@ This is a Python library of web-related functions, such as:
* encoding multipart/form-data
* convert raw HTTP headers to dicts and vice-versa
* construct HTTP auth header
* converting HTML pages to unicode
* RFC-compliant url joining
* sanitize urls (like browsers do)
* extract arguments from urls
@@ -25,6 +26,7 @@ The w3lib package consists of four modules:
* ``w3lib.url`` - functions for working with URLs
* ``w3lib.html`` - functions for working with HTML
* ``w3lib.http`` - functions for working with HTTP
* ``w3lib.encoding`` - functions for working with character encoding
* ``w3lib.form`` - functions for working with web forms

Requirements
187 changes: 187 additions & 0 deletions w3lib/encoding.py
@@ -0,0 +1,187 @@
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings

_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)

def http_content_type_encoding(content_type):
    """Extract the encoding in the content-type header"""
    if content_type:
        match = _HEADER_ENCODING_RE.search(content_type)
        if match:
            return resolve_encoding(match.group(1))
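
# Example (behaviour pinned by the tests added in this PR): the charset
# parameter is extracted and resolved to a canonical codec name.
# >>> http_content_type_encoding("Content-Type: text/html; charset=ISO-8859-4")
# 'iso8859-4'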

# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_RE = re.compile(
    r'<\s*(?:meta\s+(?:%s\s+%s|%s)|\?xml\s[^>]+%s|body)' % \
    (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE), re.I)

def html_body_declared_encoding(html_body_str):
"""encoding specified in meta tags in the html body, or None if no
suitable encoding was found
"""
# html5 suggests the first 1024 bytes are sufficient, we allow for more
chunk = html_body_str[:4096]
match = _BODY_ENCODING_RE.search(chunk)
if match:
encoding = match.group('charset') or match.group('charset2') \
or match.group('xmlcharset')
return resolve_encoding(encoding)
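
# Example (matching the tests added in this PR): each declaration style
# resolves to the same codec name.
# >>> html_body_declared_encoding('<meta charset="utf-8">')
# 'utf-8'
# >>> html_body_declared_encoding('<?xml version="1.0" encoding="utf-8"?>')
# 'utf-8'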

# Default encoding translation
# this maps canonicalized encodings to target encodings
# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
# in addition, gb18030 supersedes gb2312 & gbk
# the keys are converted using _c18n_encoding and in sorted order
DEFAULT_ENCODING_TRANSLATION = {
    'ascii': 'cp1252',
    'euc_kr': 'cp949',
    'gb2312': 'gb18030',
    'gbk': 'gb18030',
    'iso8859_11': 'cp874',
    'iso8859_9': 'cp1254',
    'latin_1': 'cp1252',
    'macintosh': 'mac_roman',
    'shift_jis': 'cp932',
    'tis_620': 'cp874',
    'win_1251': 'cp1251',
    'windows_31j': 'cp932',
    'win_31j': 'cp932',
    'windows_874': 'cp874',
    'win_874': 'cp874',
    'x_sjis': 'cp932',
    'zh_cn': 'gb18030'
}

def _c18n_encoding(encoding):
"""Cannonicalize an encoding name

This performs normalization and translates aliases using python's
encoding aliases
"""
normed = encodings.normalize_encoding(encoding).lower()
return encodings.aliases.aliases.get(normed, normed)

def resolve_encoding(encoding_alias):
"""Return the encoding the given encoding alias maps to, or None if the
encoding cannot be interpreted
"""
c18n_encoding = _c18n_encoding(encoding_alias)
translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
try:
return codecs.lookup(translated).name
except LookupError:
return None
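
# Example (matching the tests added in this PR): aliases are normalized and
# the translation map above widens legacy encodings.
# >>> resolve_encoding('latin1')
# 'cp1252'
# >>> resolve_encoding('gb2312')
# 'gb18030'
# >>> resolve_encoding('unknown encoding') is None
# True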

_BOM_TABLE = [
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF8, 'utf-8')
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)

def read_bom(data):
"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

If no BOM can be detected, (None, None) is returned.
"""
# common case is no BOM, so this is fast
if data[0] in _FIRST_CHARS:
for bom, encoding in _BOM_TABLE:
if data.startswith(bom):
return encoding, bom
return None, None
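
# Example (the UTF-16-BE encoded water character from the tests in this PR):
# >>> read_bom('\xfe\xff\x6c\x34')
# ('utf-16-be', '\xfe\xff')
# >>> read_bom('no bom here')
# (None, None)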

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
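
# The handler resumes decoding one byte past each error, so every offending
# byte yields its own replacement character (matching the tests in this PR):
# >>> 'WORD\xe3\xab'.decode('utf-8', 'w3lib_replace')
# u'WORD\ufffd\ufffd'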

def to_unicode(data_str, encoding):
"""Convert a str object to unicode using the encoding given

Characters that cannot be converted will be converted to '\ufffd' (the
unicode replacement character).
"""
data_str.decode(encoding, 'w3lib_replace')

def _enc_unicode(data_str, encoding):
"""convert the data_str to unicode inserting the unicode replacement
character where necessary.

returns (encoding, unicode)
"""
return encoding, data_str.decode(encoding, 'w3lib_replace')

def html_to_unicode(content_type_header, html_body_str,
        default_encoding='utf8', auto_detect_fun=None):
    """Convert raw html bytes to unicode

    This attempts to make a reasonable guess at the content encoding of the
    html body, following a similar process to a web browser.

    It will try in order:
    * http content type header
    * BOM (byte-order mark)
    * meta or xml tag declarations
    * auto-detection, if the `auto_detect_fun` keyword argument is not None
    * default encoding in keyword arg (which defaults to utf8)

    If an encoding other than the auto-detected or default encoding is used,
    overrides will be applied, converting some character encodings to more
    suitable alternatives.

    If a BOM is found matching the encoding, it will be stripped.

    The `auto_detect_fun` argument can be used to pass a function that will
    sniff the encoding of the text. This function must take the raw text as an
    argument and return the name of an encoding that python can process, or
    None. To use chardet, for example, you can define the function as:
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
    or to use UnicodeDammit (shipped with the BeautifulSoup library):
        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding

    If the locale of the website or user language preference is known, then a
    better default encoding can be supplied.

    If the content type header is not present, None can be passed signifying
    that the header was not present.

    This method will not fail; if characters cannot be converted to unicode,
    '\ufffd' (the unicode replacement character) will be inserted instead.

    Returns a tuple of (encoding used, unicode)
    """
    enc = http_content_type_encoding(content_type_header)
    bom_enc, bom = read_bom(html_body_str)
    if enc is not None:
        # remove BOM if it agrees with the encoding
        if enc == bom_enc:
            html_body_str = html_body_str[len(bom):]
        elif enc == 'utf-16' or enc == 'utf-32':
            # read endianness from BOM, or default to big endian
            # tools.ietf.org/html/rfc2781 section 4.3
            if bom_enc is not None and bom_enc.startswith(enc):
                enc = bom_enc
                html_body_str = html_body_str[len(bom):]
            else:
                enc += '-be'
        return _enc_unicode(html_body_str, enc)
    if bom_enc is not None:
        return _enc_unicode(html_body_str[len(bom):], bom_enc)
    enc = html_body_declared_encoding(html_body_str)
    if enc is None and (auto_detect_fun is not None):
        enc = auto_detect_fun(html_body_str)
    if enc is None:
        enc = default_encoding
    return _enc_unicode(html_body_str, enc)
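
A minimal usage sketch (assuming the chardet package is installed; the
detection lambda comes from the docstring above, and the expected results for
the first call are taken from this PR's tests):

    from w3lib.encoding import html_to_unicode

    # the declared charset wins, and iso-8859-1 is widened to cp1252
    encoding, body = html_to_unicode('text/html; charset=iso-8859-1', '\xa3')
    # encoding == 'cp1252', body == u'\xa3'

    # with no usable header, detection falls back to BOM, meta/xml
    # declarations, auto_detect_fun, and finally default_encoding
    import chardet
    encoding, body = html_to_unicode(None, '\xc2\xa3',
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding'))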
180 changes: 180 additions & 0 deletions w3lib/tests/test_encoding.py
@@ -0,0 +1,180 @@
import unittest, codecs
from w3lib.encoding import (html_body_declared_encoding, read_bom,
    http_content_type_encoding, resolve_encoding, html_to_unicode)

class RequestEncodingTests(unittest.TestCase):
    def test_bom(self):
        # cjk water character in unicode
        water_unicode = u'\u6C34'
        # BOM + water character encoded
        utf16be = '\xfe\xff\x6c\x34'
        utf16le = '\xff\xfe\x34\x6c'
        utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
        utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
        for string in (utf16be, utf16le, utf32be, utf32le):
            bom_encoding, bom = read_bom(string)
            decoded = string[len(bom):].decode(bom_encoding)
            self.assertEqual(water_unicode, decoded)
        enc, bom = read_bom("foo")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)

    def test_http_encoding_header(self):
        header_value = "Content-Type: text/html; charset=ISO-8859-4"
        extracted = http_content_type_encoding(header_value)
        self.assertEqual(extracted, "iso8859-4")
        self.assertEqual(None, http_content_type_encoding("something else"))

    def test_html_body_declared_encoding(self):
        format1 = """
        <meta http-equiv="Content-Type"
        content="text/html; charset=utf-8">
        """
        format2 = """<meta charset="utf-8">"""
        format3 = """<?xml version="1.0" encoding="utf-8"?>"""
        format4 = """ bad html still supported < meta http-equiv='Content-Type'
        content="text/html; charset=utf-8">"""
        for fragment in (format1, format2, format3, format4):
            encoding = html_body_declared_encoding(fragment)
            self.assertEqual(encoding, 'utf-8')
        self.assertEqual(None, html_body_declared_encoding("something else"))

class CodecsEncodingTestCase(unittest.TestCase):
    def test_resolve_encoding(self):
        self.assertEqual(resolve_encoding('latin1'), 'cp1252')
        self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
        self.assertEqual(resolve_encoding('unknown encoding'), None)

def ct(charset):
    return "Content-Type: text/html; charset=" + charset if charset else None

def norm_encoding(enc):
    return codecs.lookup(enc).name

class HtmlConversionTests(unittest.TestCase):

    def test_unicode_body(self):
        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
        original_string = unicode_string.encode('cp1251')
        encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
        # check body_as_unicode
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(body_unicode, unicode_string)

    def _assert_encoding(self, content_type, body, expected_encoding,
            expected_unicode):
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(norm_encoding(encoding),
            norm_encoding(expected_encoding))
        self.assertEqual(body_unicode, expected_unicode)

    def test_content_type_and_conversion(self):
        """Test content type header is interpreted and text converted as
        expected
        """
        self._assert_encoding('utf-8', "\xc2\xa3", 'utf-8', u"\xa3")
        # something like this in the scrapy tests - but that's invalid?
        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
        # iso-8859-1 is overridden to cp1252
        self._assert_encoding('iso-8859-1', "\xa3", 'cp1252', u"\xa3")
        self._assert_encoding('', "\xc2\xa3", 'utf-8', u"\xa3")
        self._assert_encoding('none', "\xc2\xa3", 'utf-8', u"\xa3")
        self._assert_encoding('gb2312', "\xa8D", 'gb18030', u"\u2015")
        self._assert_encoding('gbk', "\xa8D", 'gb18030', u"\u2015")

    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
        # unlike scrapy, the BOM is stripped
        self._assert_encoding('utf-8', "\xef\xbb\xbfWORD\xe3\xab",
            'utf-8', u'WORD\ufffd\ufffd')
        self._assert_encoding(None, "\xef\xbb\xbfWORD\xe3\xab",
            'utf-8', u'WORD\ufffd\ufffd')

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            'PREFIX\xe3\xabSUFFIX')
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char (u'\ufffd')
        assert u'\ufffd' in body_unicode, repr(body_unicode)
        assert u'PREFIX' in body_unicode, repr(body_unicode)
        assert u'SUFFIX' in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            '\xf0<span>value</span>')
        assert u'<span>value</span>' in body_unicode, repr(body_unicode)

    def _assert_encoding_detected(self, content_type, expected_encoding, body,
            **kwargs):
        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

    def test_BOM(self):
        # utf-16 cases already tested, as is the BOM detection function

        # http header takes precedence, irrespective of BOM
        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
        expected = u'\ufffd\ufffd\x00h\x00i'
        self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)

        # BOM is stripped when it agrees with the encoding, or used to
        # determine encoding
        bom_utf8_str = codecs.BOM_UTF8 + 'hi'
        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")

    def test_utf16_32(self):
        # tools.ietf.org/html/rfc2781 section 4.3

        # use BOM and strip it
        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")

        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")

        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")

        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")

        # if there is no BOM, big endian should be chosen
        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")

    def test_html_encoding(self):
        # extracting the encoding from raw html is tested elsewhere
        body = """blah blah < meta http-equiv="Content-Type"
        content="text/html; charset=iso-8859-1"> other stuff"""
        self._assert_encoding_detected(None, 'cp1252', body)

        # header encoding takes precedence
        self._assert_encoding_detected('utf-8', 'utf-8', body)
        # BOM encoding takes precedence
        self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)

    def test_autodetect(self):
        asciif = lambda x: 'ascii'
        body = """<meta charset="utf-8">"""
        # body encoding takes precedence
        self._assert_encoding_detected(None, 'utf-8', body,
            auto_detect_fun=asciif)
        # if no other encoding, the auto detect encoding is used
        self._assert_encoding_detected(None, 'ascii', "no encoding info",
            auto_detect_fun=asciif)

    def test_default_encoding(self):
        # if no other method is available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, 'utf-8', "no encoding info")
        # this can be overridden
        self._assert_encoding_detected(None, 'ascii', "no encoding info",
            default_encoding='ascii')
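
The suite should run with the standard unittest runner, e.g.:

    python -m unittest w3lib.tests.test_encoding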