From fb78fdefb3420498f083fbbd6a8ab75747c36d19 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Wed, 8 Feb 2012 22:48:42 +0000
Subject: [PATCH 1/3] add functions for determining the encoding of html

This is based on the encoding detection in scrapy
---
 README.rst                   |   2 +
 w3lib/encoding.py            | 185 +++++++++++++++++++++++++++++++++++
 w3lib/tests/test_encoding.py | 171 ++++++++++++++++++++++++++++++++
 3 files changed, 358 insertions(+)
 create mode 100644 w3lib/encoding.py
 create mode 100644 w3lib/tests/test_encoding.py

diff --git a/README.rst b/README.rst
index b2cb5c1f..e4b7978f 100644
--- a/README.rst
+++ b/README.rst
@@ -13,6 +13,7 @@ This is a Python library of web-related functions, such as:
 * encoding multipart/form-data
 * convert raw HTTP headers to dicts and vice-versa
 * construct HTTP auth header
+* converting HTML pages to unicode
 * RFC-compliant url joining
 * sanitize urls (like browsers do)
 * extract arguments from urls
@@ -25,6 +26,7 @@ The w3lib package consists of four modules:
 * ``w3lib.url`` - functions for working with URLs
 * ``w3lib.html`` - functions for working with HTML
 * ``w3lib.http`` - functions for working with HTTP
+* ``w3lib.encoding`` - functions for working with character encoding
 * ``w3lib.form`` - functions for working with web forms

 Requirements
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
new file mode 100644
index 00000000..095821bc
--- /dev/null
+++ b/w3lib/encoding.py
@@ -0,0 +1,185 @@
+"""
+Functions for handling encoding of web pages
+"""
+import re, codecs, encodings
+
+_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
+
+def http_content_type_encoding(content_type):
+    """Extract the encoding in the content-type header"""
+    if content_type:
+        match = _HEADER_ENCODING_RE.search(content_type)
+        if match:
+            return resolve_encoding(match.group(1))
+
+# regexp for parsing HTTP meta tags
+_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
+_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
+_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
+_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
+_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
+
+# check for meta tags, or xml decl. and stop search if a body tag is encountered
+_BODY_ENCODING_RE = re.compile(
+    r'<\s*(?:meta\s+(?:%s\s+%s|%s)|\?xml\s[^>]+%s|body)' % \
+        (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE), re.I)
+
+def html_body_declared_encoding(html_body_str):
+    """encoding specified in meta tags in the html body, or None if no
+    suitable encoding was found
+    """
+    # html5 suggests the first 1024 bytes are sufficient; we allow for more
+    chunk = html_body_str[:4096]
+    match = _BODY_ENCODING_RE.search(chunk)
+    if match:
+        encoding = match.group('charset') or match.group('charset2') \
+                or match.group('xmlcharset')
+        return resolve_encoding(encoding)
+
+# Default encoding translation
+# this maps canonicalized encodings to target encodings
+# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
+# in addition, gb18030 supersedes gb2312 & gbk
+# the keys are converted using _c18n_encoding and kept in sorted order
+DEFAULT_ENCODING_TRANSLATION = {
+    'ascii': 'cp1252',
+    'euc_kr': 'cp949',
+    'gb2312': 'gb18030',
+    'gbk': 'gb18030',
+    'iso8859_11': 'cp874',
+    'iso8859_9': 'cp1254',
+    'latin_1': 'cp1252',
+    'macintosh': 'mac_roman',
+    'shift_jis': 'cp932',
+    'tis_620': 'cp874',
+    'win_1251': 'cp1251',
+    'windows_31j': 'cp932',
+    'win_31j': 'cp932',
+    'windows_874': 'cp874',
+    'win_874': 'cp874',
+    'x_sjis': 'cp932',
+    'zh_cn': 'gb18030'
+}
+
+def _c18n_encoding(encoding):
+    """Canonicalize an encoding name
+
+    This performs normalization and translates aliases using python's
+    encoding aliases
+    """
+    normed = encodings.normalize_encoding(encoding).lower()
+    return encodings.aliases.aliases.get(normed, normed)
+
+def resolve_encoding(encoding_alias):
+    """Return the encoding the given encoding alias maps to, or None if the
+    encoding cannot be interpreted
+    """
+    c18n_encoding = _c18n_encoding(encoding_alias)
+    translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
+    try:
+        return codecs.lookup(translated).name
+    except LookupError:
+        return None
+
+_BOM_TABLE = [
+    (codecs.BOM_UTF32_BE, 'utf-32-be'),
+    (codecs.BOM_UTF32_LE, 'utf-32-le'),
+    (codecs.BOM_UTF16_BE, 'utf-16-be'),
+    (codecs.BOM_UTF16_LE, 'utf-16-le'),
+    (codecs.BOM_UTF8, 'utf-8')
+]
+_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
+
+def read_bom(data):
+    """Read the byte order mark in the text, if present, and
+    return the encoding represented by the BOM and the remainder of the text.
+    """
+    # common case is no BOM, so this is fast
+    if data[0] in _FIRST_CHARS:
+        for bom, encoding in _BOM_TABLE:
+            if data.startswith(bom):
+                return encoding, data[len(bom):]
+    return None, data
+
+# Python decoder doesn't follow unicode standard when handling
+# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+def to_unicode(data_str, encoding):
+    """Convert a str object to unicode using the encoding given
+
+    Characters that cannot be converted will be converted to '\ufffd' (the
+    unicode replacement character).
+    """
+    return data_str.decode(encoding, 'w3lib_replace')
+
+def _enc_unicode(data_str, encoding):
+    """convert the data_str to unicode, inserting the unicode replacement
+    character where necessary.
+
+    returns (encoding, unicode)
+    """
+    return encoding, data_str.decode(encoding, 'w3lib_replace')
+
+def html_to_unicode(content_type_header, html_body_str,
+        default_encoding='utf8', auto_detect_fun=None):
+    """Convert raw html bytes to unicode
+
+    This attempts to make a reasonable guess at the content encoding of the
+    html body, following a similar process to that of a web browser.
+
+    It will try in order:
+    * http content type header
+    * BOM (byte-order mark)
+    * meta or xml tag declarations
+    * auto-detection, if the `auto_detect_fun` keyword argument is not None
+    * default encoding in keyword arg (which defaults to utf8)
+
+    If an encoding other than the auto-detected or default encoding is used,
+    overrides will be applied, converting some character encodings to more
+    suitable alternatives.
+
+    If a BOM is found matching the encoding, it will be stripped.
+
+    The `auto_detect_fun` argument can be used to pass a function that will
+    sniff the encoding of the text. This function must take the raw text as an
+    argument and return the name of an encoding that python can process, or
+    None. To use chardet, for example, you can define the function as:
+        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
+    or to use UnicodeDammit (shipped with the BeautifulSoup library):
+        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding
+
+    If the locale of the website or user language preference is known, then a
+    better default encoding can be supplied.
+
+    If the content type header is not present, None can be passed instead and
+    it will be ignored.
+
+    This method will not fail; if characters cannot be converted to unicode,
+    '\ufffd' (the unicode replacement character) will be inserted instead.
+
+    returns a tuple of (encoding used, unicode)
+    """
+    enc = http_content_type_encoding(content_type_header)
+    bom_enc, rest_of_data = read_bom(html_body_str)
+    if enc is not None:
+        # remove BOM if it agrees with the encoding
+        if enc == bom_enc:
+            html_body_str = rest_of_data
+        elif enc == 'utf-16':
+            # read endianness from BOM, or default to big endian
+            # tools.ietf.org/html/rfc2781 section 4.3
+            if bom_enc is not None and bom_enc.startswith('utf-16'):
+                enc = bom_enc
+                html_body_str = rest_of_data
+            else:
+                enc = 'utf-16-be'
+        return _enc_unicode(html_body_str, enc)
+    if bom_enc is not None:
+        return _enc_unicode(rest_of_data, bom_enc)
+    enc = html_body_declared_encoding(html_body_str)
+    if enc is None and (auto_detect_fun is not None):
+        enc = auto_detect_fun(html_body_str)
+    if enc is None:
+        enc = default_encoding
+    return _enc_unicode(html_body_str, enc)
diff --git a/w3lib/tests/test_encoding.py b/w3lib/tests/test_encoding.py
new file mode 100644
index 00000000..3ed63ac1
--- /dev/null
+++ b/w3lib/tests/test_encoding.py
@@ -0,0 +1,171 @@
+import unittest, codecs
+from w3lib.encoding import (html_body_declared_encoding, read_bom,
+    http_content_type_encoding, resolve_encoding, html_to_unicode)
+
+class RequestEncodingTests(unittest.TestCase):
+    def test_bom(self):
+        # cjk water character in unicode
+        water_unicode = u'\u6C34'
+        # BOM + water character encoded
+        utf16be = '\xfe\xff\x6c\x34'
+        utf16le = '\xff\xfe\x34\x6c'
+        utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
+        utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
+        for string in (utf16be, utf16le, utf32be, utf32le):
+            bom_encoding, data = read_bom(string)
+            decoded = data.decode(bom_encoding)
+            self.assertEqual(water_unicode, decoded)
+        # text is left untouched and None is returned as encoding if no BOM
+        enc, text = read_bom("foo")
+        self.assertEqual(enc, None)
+        self.assertEqual(text, "foo")
+
+    def test_http_encoding_header(self):
+        header_value = "Content-Type: text/html; charset=ISO-8859-4"
+        extracted = http_content_type_encoding(header_value)
+        self.assertEqual(extracted, "iso8859-4")
+        self.assertEqual(None, http_content_type_encoding("something else"))
+
+    def test_html_body_declared_encoding(self):
+        format1 = """<meta http-equiv="Content-Type"
+            content="text/html; charset=utf-8">"""
+        format2 = """<meta charset="utf-8">"""
+        format3 = """<?xml version="1.0" encoding="utf-8"?>"""
+        format4 = """ bad html still supported < meta http-equiv='Content-Type'
+            content="text/html; charset=utf-8">"""
+        for fragment in (format1, format2, format3, format4):
+            encoding = html_body_declared_encoding(fragment)
+            self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(None, html_body_declared_encoding("something else"))
+
+class CodecsEncodingTestCase(unittest.TestCase):
+    def test_resolve_encoding(self):
+        self.assertEqual(resolve_encoding('latin1'), 'cp1252')
+        self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
+        self.assertEqual(resolve_encoding('unknown encoding'), None)
+
+def ct(charset):
+    return "Content-Type: text/html; charset=" + charset if charset else None
+
+def norm_encoding(enc):
+    return codecs.lookup(enc).name
+
+class HtmlConversionTests(unittest.TestCase):
+
+    def test_unicode_body(self):
+        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+        original_string = unicode_string.encode('cp1251')
+        encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
+        # check body_as_unicode
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(body_unicode, unicode_string)
+
+    def _assert_encoding(self, content_type, body, expected_encoding,
+            expected_unicode):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding),
+                norm_encoding(expected_encoding))
+        self.assertEqual(body_unicode, expected_unicode)
+
+    def test_content_type_and_conversion(self):
+        """Test content type header is interpreted and text converted as
+        expected
+        """
+        self._assert_encoding('utf-8', "\xc2\xa3", 'utf-8', u"\xa3")
+        # something like this in the scrapy tests - but that's invalid?
+        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+        # iso-8859-1 is overridden to cp1252
+        self._assert_encoding('iso-8859-1', "\xa3", 'cp1252', u"\xa3")
+        self._assert_encoding('', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('none', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('gb2312', "\xa8D", 'gb18030', u"\u2015")
+        self._assert_encoding('gbk', "\xa8D", 'gb18030', u"\u2015")
+
+    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
+        # unlike scrapy, the BOM is stripped
+        self._assert_encoding('utf-8', "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+        self._assert_encoding(None, "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+
+    def test_replace_wrong_encoding(self):
+        """Test invalid chars are replaced properly"""
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                'PREFIX\xe3\xabSUFFIX')
+        # XXX: Policy for replacing invalid chars may suffer minor variations
+        # but it should always contain the unicode replacement char (u'\ufffd')
+        assert u'\ufffd' in body_unicode, repr(body_unicode)
+        assert u'PREFIX' in body_unicode, repr(body_unicode)
+        assert u'SUFFIX' in body_unicode, repr(body_unicode)
+
+        # Do not destroy html tags due to encoding bugs
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                '\xf0<span>value</span>')
+        assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+
+    def _assert_encoding_detected(self, content_type, expected_encoding, body,
+            **kwargs):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
+
+    def test_BOM(self):
+        # utf-16 cases already tested, as is the BOM detection function
+
+        # http header takes precedence, irrespective of BOM
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        expected = u'\ufffd\ufffd\x00h\x00i'
+        self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
+
+        # BOM is stripped when it agrees with the encoding, or used to
+        # determine encoding
+        bom_utf8_str = codecs.BOM_UTF8 + 'hi'
+        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
+        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+
+    def test_utf16(self):
+        # tools.ietf.org/html/rfc2781 section 4.3
+
+        # USE BOM and strip it
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
+        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+
+        # if there is no BOM, but the data is known to be utf-16,
+        # big endian should be chosen
+        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+
+    def test_html_encoding(self):
+        # extracting the encoding from raw html is tested elsewhere
+        body = """blah blah < meta http-equiv="Content-Type"
+            content="text/html; charset=iso-8859-1"> other stuff"""
+        self._assert_encoding_detected(None, 'cp1252', body)
+
+        # header encoding takes precedence
+        self._assert_encoding_detected('utf-8', 'utf-8', body)
+        # BOM encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)
+
+    def test_autodetect(self):
+        asciif = lambda x: 'ascii'
+        body = """<meta charset="utf-8">"""
+        # body encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', body,
+                auto_detect_fun=asciif)
+        # if no other encoding is found, the auto detect encoding is used.
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                auto_detect_fun=asciif)
+
+    def test_default_encoding(self):
+        # if no other method available, the default encoding of utf-8 is used
+        self._assert_encoding_detected(None, 'utf-8', "no encoding info")
+        # this can be overridden
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                default_encoding='ascii')
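
A quick usage sketch of the API this patch adds (illustrative only, not part
of the commit; it assumes this branch is installed). The byte values mirror
the tests above, though the exact strings are made up:

    from w3lib.encoding import html_to_unicode

    # the content type header wins when present; iso-8859-1 is widened to
    # its cp1252 superset via DEFAULT_ENCODING_TRANSLATION
    encoding, ubody = html_to_unicode('text/html; charset=iso-8859-1', '\xa3 100')
    assert (encoding, ubody) == ('cp1252', u'\xa3 100')

    # with no header, a declaration inside the body is used instead
    encoding, ubody = html_to_unicode(None, '<meta charset="utf-8">\xc2\xa3')
    assert encoding == 'utf-8' and ubody == u'<meta charset="utf-8">\xa3'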

From 1aabc7372ecb301bf655d15b317e078f095d1f3d Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Mon, 13 Feb 2012 17:25:37 +0000
Subject: [PATCH 2/3] extend BOM detection to utf-32 and perf. improvement

---
 w3lib/encoding.py            | 22 ++++++++++++----------
 w3lib/tests/test_encoding.py | 25 +++++++++++++++++--------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 095821bc..bd21f076 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -92,14 +92,16 @@ def resolve_encoding(encoding_alias):
 
 def read_bom(data):
     """Read the byte order mark in the text, if present, and
-    return the encoding represented by the BOM and the remainder of the text.
+    return the encoding represented by the BOM, together with the BOM itself.
+
+    If no BOM can be detected, (None, None) is returned.
     """
     # common case is no BOM, so this is fast
     if data[0] in _FIRST_CHARS:
         for bom, encoding in _BOM_TABLE:
             if data.startswith(bom):
-                return encoding, data[len(bom):]
-    return None, data
+                return encoding, bom
+    return None, None
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
@@ -161,22 +163,22 @@ def html_to_unicode(content_type_header, html_body_str,
     returns a tuple of (encoding used, unicode)
     """
     enc = http_content_type_encoding(content_type_header)
-    bom_enc, rest_of_data = read_bom(html_body_str)
+    bom_enc, bom = read_bom(html_body_str)
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc:
-            html_body_str = rest_of_data
-        elif enc == 'utf-16':
+            html_body_str = html_body_str[len(bom):]
+        elif enc == 'utf-16' or enc == 'utf-32':
             # read endianness from BOM, or default to big endian
             # tools.ietf.org/html/rfc2781 section 4.3
-            if bom_enc is not None and bom_enc.startswith('utf-16'):
+            if bom_enc is not None and bom_enc.startswith(enc):
                 enc = bom_enc
-                html_body_str = rest_of_data
+                html_body_str = html_body_str[len(bom):]
             else:
-                enc = 'utf-16-be'
+                enc += '-be'
         return _enc_unicode(html_body_str, enc)
     if bom_enc is not None:
-        return _enc_unicode(rest_of_data, bom_enc)
+        return _enc_unicode(html_body_str[len(bom):], bom_enc)
     enc = html_body_declared_encoding(html_body_str)
     if enc is None and (auto_detect_fun is not None):
         enc = auto_detect_fun(html_body_str)
diff --git a/w3lib/tests/test_encoding.py b/w3lib/tests/test_encoding.py
index 3ed63ac1..a5712eb3 100644
--- a/w3lib/tests/test_encoding.py
+++ b/w3lib/tests/test_encoding.py
@@ -12,13 +12,12 @@ def test_bom(self):
         utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
         utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
         for string in (utf16be, utf16le, utf32be, utf32le):
-            bom_encoding, data = read_bom(string)
-            decoded = data.decode(bom_encoding)
+            bom_encoding, bom = read_bom(string)
+            decoded = string[len(bom):].decode(bom_encoding)
             self.assertEqual(water_unicode, decoded)
-        # text is left untouched and None is returned as encoding if no BOM
-        enc, text = read_bom("foo")
+        enc, bom = read_bom("foo")
         self.assertEqual(enc, None)
-        self.assertEqual(text, "foo")
+        self.assertEqual(bom, None)
 
     def test_http_encoding_header(self):
         header_value = "Content-Type: text/html; charset=ISO-8859-4"
@@ -126,7 +125,7 @@ def test_BOM(self):
         self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
         self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
 
-    def test_utf16(self):
+    def test_utf16_32(self):
         # tools.ietf.org/html/rfc2781 section 4.3
 
         # USE BOM and strip it
@@ -138,9 +137,19 @@ def test_utf16(self):
         self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
         self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
 
-        # if there is no BOM, but the data is known to be utf-16,
-        # big endian should be chosen
+        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
+        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
+        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+
+        # if there is no BOM, big endian should be chosen
         self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+
+
 
     def test_html_encoding(self):
         # extracting the encoding from raw html is tested elsewhere
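
Another sketch, showing the behaviour this patch changes (again illustrative
and not part of the commit): read_bom now reports the BOM bytes rather than
the remaining text, and utf-32 gets the same endianness handling as utf-16.
The values mirror the tests above:

    import codecs
    from w3lib.encoding import read_bom, html_to_unicode

    # (encoding, bom) when a BOM is found, (None, None) otherwise
    data = codecs.BOM_UTF32_LE + u'hi'.encode('utf-32-le')
    assert read_bom(data) == ('utf-32-le', codecs.BOM_UTF32_LE)
    assert read_bom('no bom here') == (None, None)

    # a declared utf-32 encoding takes its endianness from the BOM when one
    # is present (defaulting to big endian), and the BOM is stripped
    encoding, text = html_to_unicode('charset=utf-32',
        codecs.BOM_UTF32_BE + u'hi'.encode('utf-32-be'))
    assert (encoding, text) == ('utf-32-be', u'hi')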

From 2ef15191356c5a209f744bb3b569bf1433eab737 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Tue, 14 Feb 2012 10:56:42 +0000
Subject: [PATCH 3/3] small improvement to documentation

The content type header parameter to html_to_unicode has been documented
more clearly.
---
 w3lib/encoding.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index bd21f076..7ca9d7ea 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -154,9 +154,9 @@ def html_to_unicode(content_type_header, html_body_str,
     If the locale of the website or user language preference is known, then a
     better default encoding can be supplied.
 
-    If the content type header is not present, None can be passed instead and
-    it will be ignored.
-
+    If the content type header is not present, None can be passed to signify
+    that the header was missing.
+
     This method will not fail; if characters cannot be converted to unicode,
     '\ufffd' (the unicode replacement character) will be inserted instead.
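
Finally, a sketch of the fallback chain end to end (illustrative; the chardet
detector and the cyrillic sample bytes are assumptions, though the lambda
follows the html_to_unicode docstring):

    import chardet
    from w3lib.encoding import html_to_unicode

    raw = u'\u043f\u0440\u0438\u0432\u0435\u0442'.encode('cp1251')

    # no header, BOM or meta declaration: fall back to the supplied detector
    # (what it reports depends on the detector and the input)
    encoding, ubody = html_to_unicode(None, raw,
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding'))

    # or skip auto-detection and rely on a caller-supplied default
    encoding, ubody = html_to_unicode(None, raw, default_encoding='cp1251')
    assert encoding == 'cp1251' and ubody == u'\u043f\u0440\u0438\u0432\u0435\u0442'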