diff --git a/README.rst b/README.rst
index b2cb5c1f..e4b7978f 100644
--- a/README.rst
+++ b/README.rst
@@ -13,6 +13,7 @@ This is a Python library of web-related functions, such as:
-* encoding mulitpart/form-data
+* encoding multipart/form-data
 * convert raw HTTP headers to dicts and vice-versa
 * construct HTTP auth header
+* convert HTML pages to unicode
 * RFC-compliant url joining
 * sanitize urls (like browsers do)
 * extract arguments from urls
@@ -23,8 +24,9 @@
-The w3lib package consists of four modules:
+The w3lib package consists of five modules:
 
 * ``w3lib.url`` - functions for working with URLs
 * ``w3lib.html`` - functions for working with HTML
 * ``w3lib.http`` - functions for working with HTTP
+* ``w3lib.encoding`` - functions for working with character encoding
 * ``w3lib.form`` - functions for working with web forms
 
 Requirements
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
new file mode 100644
index 00000000..7ca9d7ea
--- /dev/null
+++ b/w3lib/encoding.py
@@ -0,0 +1,187 @@
+"""
+Functions for handling encoding of web pages
+"""
+import re, codecs, encodings
+
+_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
+
+def http_content_type_encoding(content_type):
+    """Extract the encoding in the content-type header"""
+    if content_type:
+        match = _HEADER_ENCODING_RE.search(content_type)
+        if match:
+            return resolve_encoding(match.group(1))
+
+# regexps for parsing encoding declarations in HTML meta tags
+_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
+_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
+_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
+_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
+_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
+
+# check for meta tags or an xml decl., and stop the search if a body tag is encountered
+_BODY_ENCODING_RE = re.compile(
+    r'<\s*(?:meta\s+(?:%s\s+%s|%s)|\?xml\s[^>]+%s|body)' % \
+        (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE), re.I)
+
+def html_body_declared_encoding(html_body_str):
+    """Return the encoding specified in meta tags in the html body, or None
+    if no suitable encoding was found
+    """
+    # html5 suggests the first 1024 bytes are sufficient; we allow for more
+    chunk = html_body_str[:4096]
+    match = _BODY_ENCODING_RE.search(chunk)
+    if match:
+        encoding = match.group('charset') or match.group('charset2') \
+                or match.group('xmlcharset')
+        return resolve_encoding(encoding)
+
+# Default encoding translation
+# this maps canonicalized encodings to target encodings
+# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
+# in addition, gb18030 supersedes gb2312 & gbk
+# the keys are converted using _c18n_encoding and in sorted order
+DEFAULT_ENCODING_TRANSLATION = {
+    'ascii': 'cp1252',
+    'euc_kr': 'cp949',
+    'gb2312': 'gb18030',
+    'gbk': 'gb18030',
+    'iso8859_11': 'cp874',
+    'iso8859_9': 'cp1254',
+    'latin_1': 'cp1252',
+    'macintosh': 'mac_roman',
+    'shift_jis': 'cp932',
+    'tis_620': 'cp874',
+    'win_1251': 'cp1251',
+    'windows_31j': 'cp932',
+    'win_31j': 'cp932',
+    'windows_874': 'cp874',
+    'win_874': 'cp874',
+    'x_sjis': 'cp932',
+    'zh_cn': 'gb18030'
+}
+
+def _c18n_encoding(encoding):
+    """Canonicalize an encoding name
+
+    This performs normalization and translates aliases using Python's
+    encoding aliases
+    """
+    normed = encodings.normalize_encoding(encoding).lower()
+    return encodings.aliases.aliases.get(normed, normed)
+
+def resolve_encoding(encoding_alias):
+    """Return the encoding the given encoding alias maps to, or None if the
+    encoding cannot be interpreted
+    """
+    c18n_encoding = _c18n_encoding(encoding_alias)
+    translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
+    try:
+        return codecs.lookup(translated).name
+    except LookupError:
+        return None
+
+_BOM_TABLE = [
+    (codecs.BOM_UTF32_BE, 'utf-32-be'),
+    (codecs.BOM_UTF32_LE, 'utf-32-le'),
+    (codecs.BOM_UTF16_BE, 'utf-16-be'),
+    (codecs.BOM_UTF16_LE, 'utf-16-le'),
+    (codecs.BOM_UTF8, 'utf-8')
+]
+_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
+
+def read_bom(data):
+    """Read the byte order mark in the text, if present, and return the
+    encoding represented by the BOM and the BOM itself.
+
+    If no BOM can be detected, (None, None) is returned.
+    """
+    # common case is no BOM, so this is fast
+    if data and data[0] in _FIRST_CHARS:
+        for bom, encoding in _BOM_TABLE:
+            if data.startswith(bom):
+                return encoding, bom
+    return None, None
+
+# Python's decoder doesn't follow the unicode standard when handling
+# badly encoded utf-8 strings; see http://bugs.python.org/issue8271
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+def to_unicode(data_str, encoding):
+    """Convert a str object to unicode using the encoding given
+
+    Characters that cannot be converted will be converted to '\ufffd' (the
+    unicode replacement character).
+    """
+    return data_str.decode(encoding, 'w3lib_replace')
+
+def _enc_unicode(data_str, encoding):
+    """Convert data_str to unicode, inserting the unicode replacement
+    character where necessary.
+
+    returns (encoding, unicode)
+    """
+    return encoding, data_str.decode(encoding, 'w3lib_replace')
+
+def html_to_unicode(content_type_header, html_body_str,
+        default_encoding='utf8', auto_detect_fun=None):
+    """Convert raw html bytes to unicode
+
+    This attempts to make a reasonable guess at the content encoding of the
+    html body, following a process similar to a web browser's.
+
+    It will try, in order:
+    * http content type header
+    * BOM (byte-order mark)
+    * meta or xml tag declarations
+    * auto-detection, if the `auto_detect_fun` keyword argument is not None
+    * default encoding in keyword arg (which defaults to utf8)
+
+    If an encoding other than the auto-detected or default encoding is used,
+    overrides will be applied, converting some character encodings to more
+    suitable alternatives.
+
+    If a BOM is found matching the encoding, it will be stripped.
+
+    The `auto_detect_fun` argument can be used to pass a function that will
+    sniff the encoding of the text. This function must take the raw text as an
+    argument and return the name of an encoding that Python can process, or
+    None. To use chardet, for example, you can define the function as:
+        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
+    or to use UnicodeDammit (shipped with the BeautifulSoup library):
+        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding
+
+    If the locale of the website or the user's language preference is known,
+    a better default encoding can be supplied.
+
+    If the content type header is not present, None can be passed to signify
+    a missing header.
+
+    This method will not fail: if characters cannot be converted to unicode,
+    '\ufffd' (the unicode replacement character) will be inserted instead.
+
+    returns a tuple of (encoding used, unicode)
+    """
+    enc = http_content_type_encoding(content_type_header)
+    bom_enc, bom = read_bom(html_body_str)
+    if enc is not None:
+        # remove BOM if it agrees with the encoding
+        if enc == bom_enc:
+            html_body_str = html_body_str[len(bom):]
+        elif enc == 'utf-16' or enc == 'utf-32':
+            # read endianness from BOM, or default to big endian
+            # tools.ietf.org/html/rfc2781 section 4.3
+            if bom_enc is not None and bom_enc.startswith(enc):
+                enc = bom_enc
+                html_body_str = html_body_str[len(bom):]
+            else:
+                enc += '-be'
+        return _enc_unicode(html_body_str, enc)
+    if bom_enc is not None:
+        return _enc_unicode(html_body_str[len(bom):], bom_enc)
+    enc = html_body_declared_encoding(html_body_str)
+    if enc is None and (auto_detect_fun is not None):
+        enc = auto_detect_fun(html_body_str)
+    if enc is None:
+        enc = default_encoding
+    return _enc_unicode(html_body_str, enc)
diff --git a/w3lib/tests/test_encoding.py b/w3lib/tests/test_encoding.py
new file mode 100644
index 00000000..a5712eb3
--- /dev/null
+++ b/w3lib/tests/test_encoding.py
@@ -0,0 +1,180 @@
+import unittest, codecs
+from w3lib.encoding import (html_body_declared_encoding, read_bom,
+    http_content_type_encoding, resolve_encoding, html_to_unicode)
+
+class RequestEncodingTests(unittest.TestCase):
+    def test_bom(self):
+        # cjk water character in unicode
+        water_unicode = u'\u6C34'
+        # BOM + water character encoded
+        utf16be = '\xfe\xff\x6c\x34'
+        utf16le = '\xff\xfe\x34\x6c'
+        utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
+        utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
+        for string in (utf16be, utf16le, utf32be, utf32le):
+            bom_encoding, bom = read_bom(string)
+            decoded = string[len(bom):].decode(bom_encoding)
+            self.assertEqual(water_unicode, decoded)
+        enc, bom = read_bom("foo")
+        self.assertEqual(enc, None)
+        self.assertEqual(bom, None)
+
+    def test_http_encoding_header(self):
+        header_value = "Content-Type: text/html; charset=ISO-8859-4"
+        extracted = http_content_type_encoding(header_value)
+        self.assertEqual(extracted, "iso8859-4")
+        self.assertEqual(None, http_content_type_encoding("something else"))
+
+    def test_html_body_declared_encoding(self):
+        format1 = """
+        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+        """
+        format2 = """<meta charset="utf-8">"""
+        format3 = """<?xml version="1.0" encoding="utf-8"?>"""
+        format4 = """ bad html still supported < meta http-equiv='Content-Type'
+            content="text/html; charset=utf-8">"""
+        for fragment in (format1, format2, format3, format4):
+            encoding = html_body_declared_encoding(fragment)
+            self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(None, html_body_declared_encoding("something else"))
+
+class CodecsEncodingTestCase(unittest.TestCase):
+    def test_resolve_encoding(self):
+        self.assertEqual(resolve_encoding('latin1'), 'cp1252')
+        self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
+        self.assertEqual(resolve_encoding('unknown encoding'), None)
+
+def ct(charset):
+    return "Content-Type: text/html; charset=" + charset if charset else None
+
+def norm_encoding(enc):
+    return codecs.lookup(enc).name
+
+class HtmlConversionTests(unittest.TestCase):
+
+    def test_unicode_body(self):
+        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+        original_string = unicode_string.encode('cp1251')
+        encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
+        # check the body is converted to unicode
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(body_unicode, unicode_string)
+
+    def _assert_encoding(self, content_type, body, expected_encoding,
+            expected_unicode):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding),
+                norm_encoding(expected_encoding))
+        self.assertEqual(body_unicode, expected_unicode)
+
+    def test_content_type_and_conversion(self):
+        """Test content type header is interpreted and text converted as
+        expected
+        """
+        self._assert_encoding('utf-8', "\xc2\xa3", 'utf-8', u"\xa3")
+        # something like this in the scrapy tests - but that's invalid?
+        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+        # iso-8859-1 is overridden to cp1252
+        self._assert_encoding('iso-8859-1', "\xa3", 'cp1252', u"\xa3")
+        self._assert_encoding('', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('none', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('gb2312', "\xa8D", 'gb18030', u"\u2015")
+        self._assert_encoding('gbk', "\xa8D", 'gb18030', u"\u2015")
+
+    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
+        # unlike scrapy, the BOM is stripped
+        self._assert_encoding('utf-8', "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+        self._assert_encoding(None, "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+
+    def test_replace_wrong_encoding(self):
+        """Test invalid chars are replaced properly"""
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                'PREFIX\xe3\xabSUFFIX')
+        # XXX: Policy for replacing invalid chars may suffer minor variations
+        # but it should always contain the unicode replacement char (u'\ufffd')
+        assert u'\ufffd' in body_unicode, repr(body_unicode)
+        assert u'PREFIX' in body_unicode, repr(body_unicode)
+        assert u'SUFFIX' in body_unicode, repr(body_unicode)
+
+        # Do not destroy html tags due to encoding bugs
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                '\xf0<span>value</span>')
+        assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+
+    def _assert_encoding_detected(self, content_type, expected_encoding, body,
+            **kwargs):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
+
+    def test_BOM(self):
+        # utf-16 cases already tested, as is the BOM detection function
+
+        # http header takes precedence, irrespective of BOM
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        expected = u'\ufffd\ufffd\x00h\x00i'
+        self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
+
+        # BOM is stripped when it agrees with the encoding, or used to
+        # determine the encoding
+        bom_utf8_str = codecs.BOM_UTF8 + 'hi'
+        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
+        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+
+    def test_utf16_32(self):
+        # tools.ietf.org/html/rfc2781 section 4.3
+
+        # use the BOM and strip it
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
+        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+
+        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
+        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
+        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+
+        # if there is no BOM, big endian should be chosen
+        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+
+    def test_html_encoding(self):
+        # extracting the encoding from raw html is tested elsewhere
+        body = """blah blah < meta http-equiv="Content-Type"
+            content="text/html; charset=iso-8859-1"> other stuff"""
+        self._assert_encoding_detected(None, 'cp1252', body)
+
+        # header encoding takes precedence
+        self._assert_encoding_detected('utf-8', 'utf-8', body)
+        # BOM encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)
+
+    def test_autodetect(self):
+        asciif = lambda x: 'ascii'
+        body = """<meta charset="utf-8">"""
+        # body encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', body,
+                auto_detect_fun=asciif)
+        # if no other encoding is found, the auto-detected encoding is used.
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                auto_detect_fun=asciif)
+
+    def test_default_encoding(self):
+        # if no other method is available, the default encoding of utf-8 is used
+        self._assert_encoding_detected(None, 'utf-8', "no encoding info")
+        # this can be overridden
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                default_encoding='ascii')
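
A quick usage sketch of the new html_to_unicode entry point, for reviewers.
The inputs and expected results below are taken from the tests above, so this
is illustrative rather than additional coverage:

    import codecs
    from w3lib.encoding import html_to_unicode

    # Encoding comes from the Content-Type header when present;
    # iso-8859-1 is overridden to cp1252 via the translation map.
    encoding, body = html_to_unicode(
        'Content-Type: text/html; charset=ISO-8859-1', '\xa3 100')
    print encoding, repr(body)   # cp1252 u'\xa3 100'

    # No header: a BOM determines the encoding and is stripped.
    encoding, body = html_to_unicode(None, codecs.BOM_UTF8 + 'hi')
    print encoding, repr(body)   # utf-8 u'hi'

    # No header or BOM: a meta declaration in the body is used.
    encoding, body = html_to_unicode(None, '<meta charset="utf-8">\xc2\xa3')
    print encoding               # utf-8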
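
And a sketch of the auto-detection hook, following the chardet suggestion in
the html_to_unicode docstring; it assumes the third-party chardet package is
installed (that dependency is not part of this patch):

    import chardet
    from w3lib.encoding import html_to_unicode

    # Consulted only when the header, BOM and in-body declarations all fail;
    # if the detector also returns None, default_encoding (utf8) is used.
    detect = lambda x: chardet.detect(x).get('encoding')
    encoding, body = html_to_unicode(None, 'no encoding info',
        auto_detect_fun=detect)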