2 changes: 2 additions & 0 deletions README.rst
@@ -13,6 +13,7 @@ This is a Python library of web-related functions, such as:
* encoding multipart/form-data
* convert raw HTTP headers to dicts and vice-versa
* construct HTTP auth header
* converting HTML pages to unicode
* RFC-compliant url joining
* sanitize urls (like browsers do)
* extract arguments from urls
@@ -25,6 +26,7 @@ The w3lib package consists of four modules:
* ``w3lib.url`` - functions for working with URLs
* ``w3lib.html`` - functions for working with HTML
* ``w3lib.http`` - functions for working with HTTP
* ``w3lib.encoding`` - functions for working with character encoding
* ``w3lib.form`` - functions for working with web forms

Requirements
187 changes: 187 additions & 0 deletions w3lib/encoding.py
@@ -0,0 +1,187 @@
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings

_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)

def http_content_type_encoding(content_type):
    """Extract the encoding in the content-type header"""
    if content_type:
        match = _HEADER_ENCODING_RE.search(content_type)
        if match:
            return resolve_encoding(match.group(1))
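
# Example (behaviour pinned by the tests added in this PR): the charset
# parameter is extracted and resolved to a canonical codec name.
# >>> http_content_type_encoding("Content-Type: text/html; charset=ISO-8859-4")
# 'iso8859-4'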

# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_RE = re.compile(
    r'<\s*(?:meta\s+(?:%s\s+%s|%s)|\?xml\s[^>]+%s|body)' % \
    (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE), re.I)

def html_body_declared_encoding(html_body_str):
"""encoding specified in meta tags in the html body, or None if no
suitable encoding was found
"""
# html5 suggests the first 1024 bytes are sufficient, we allow for more
chunk = html_body_str[:4096]
match = _BODY_ENCODING_RE.search(chunk)
if match:
encoding = match.group('charset') or match.group('charset2') \
or match.group('xmlcharset')
return resolve_encoding(encoding)
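
# Example (matching the tests added in this PR): each declaration style
# resolves to the same codec name.
# >>> html_body_declared_encoding('<meta charset="utf-8">')
# 'utf-8'
# >>> html_body_declared_encoding('<?xml version="1.0" encoding="utf-8"?>')
# 'utf-8'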

# Default encoding translation
# this maps canonicalized encodings to target encodings
# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
# in addition, gb18030 supersedes gb2312 & gbk
# the keys are converted using _c18n_encoding and in sorted order
DEFAULT_ENCODING_TRANSLATION = {
    'ascii': 'cp1252',
    'euc_kr': 'cp949',
    'gb2312': 'gb18030',
    'gbk': 'gb18030',
    'iso8859_11': 'cp874',
    'iso8859_9': 'cp1254',
    'latin_1': 'cp1252',
    'macintosh': 'mac_roman',
    'shift_jis': 'cp932',
    'tis_620': 'cp874',
    'win_1251': 'cp1251',
    'windows_31j': 'cp932',
    'win_31j': 'cp932',
    'windows_874': 'cp874',
    'win_874': 'cp874',
    'x_sjis': 'cp932',
    'zh_cn': 'gb18030'
}

def _c18n_encoding(encoding):
"""Cannonicalize an encoding name

This performs normalization and translates aliases using python's
encoding aliases
"""
normed = encodings.normalize_encoding(encoding).lower()
return encodings.aliases.aliases.get(normed, normed)

def resolve_encoding(encoding_alias):
"""Return the encoding the given encoding alias maps to, or None if the
encoding cannot be interpreted
"""
c18n_encoding = _c18n_encoding(encoding_alias)
translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
try:
return codecs.lookup(translated).name
except LookupError:
return None
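
# Example (matching the tests added in this PR): aliases are normalized and
# the translation map above widens legacy encodings.
# >>> resolve_encoding('latin1')
# 'cp1252'
# >>> resolve_encoding('gb2312')
# 'gb18030'
# >>> resolve_encoding('unknown encoding') is None
# True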

_BOM_TABLE = [
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF8, 'utf-8')
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)

def read_bom(data):
"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

If no BOM can be detected, (None, None) is returned.
"""
# common case is no BOM, so this is fast
if data[0] in _FIRST_CHARS:
for bom, encoding in _BOM_TABLE:
if data.startswith(bom):
return encoding, bom
return None, None
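
# Example (the UTF-16-BE encoded water character from the tests in this PR):
# >>> read_bom('\xfe\xff\x6c\x34')
# ('utf-16-be', '\xfe\xff')
# >>> read_bom('no bom here')
# (None, None)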

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
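
# The handler resumes decoding one byte past each error, so every offending
# byte yields its own replacement character (matching the tests in this PR):
# >>> 'WORD\xe3\xab'.decode('utf-8', 'w3lib_replace')
# u'WORD\ufffd\ufffd'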

def to_unicode(data_str, encoding):
"""Convert a str object to unicode using the encoding given

Characters that cannot be converted will be converted to '\ufffd' (the
unicode replacement character).
"""
data_str.decode(encoding, 'w3lib_replace')

def _enc_unicode(data_str, encoding):
"""convert the data_str to unicode inserting the unicode replacement
character where necessary.

returns (encoding, unicode)
"""
return encoding, data_str.decode(encoding, 'w3lib_replace')

def html_to_unicode(content_type_header, html_body_str,
        default_encoding='utf8', auto_detect_fun=None):
    """Convert raw html bytes to unicode

    This attempts to make a reasonable guess at the content encoding of the
    html body, following a similar process to a web browser.

    It will try in order:
    * http content type header
    * BOM (byte-order mark)
    * meta or xml tag declarations
    * auto-detection, if the `auto_detect_fun` keyword argument is not None
    * default encoding in keyword arg (which defaults to utf8)

    If an encoding other than the auto-detected or default encoding is used,
    overrides will be applied, converting some character encodings to more
    suitable alternatives.

    If a BOM is found matching the encoding, it will be stripped.

    The `auto_detect_fun` argument can be used to pass a function that will
    sniff the encoding of the text. This function must take the raw text as an
    argument and return the name of an encoding that python can process, or
    None. To use chardet, for example, you can define the function as:
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
    or to use UnicodeDammit (shipped with the BeautifulSoup library):
        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding

    If the locale of the website or user language preference is known, then a
    better default encoding can be supplied.

    If the content type header is not present, None can be passed signifying
    that the header was not present.

    This method will not fail; if characters cannot be converted to unicode,
    '\ufffd' (the unicode replacement character) will be inserted instead.

    Returns a tuple of (encoding used, unicode)
    """
    enc = http_content_type_encoding(content_type_header)
    bom_enc, bom = read_bom(html_body_str)
    if enc is not None:
        # remove BOM if it agrees with the encoding
        if enc == bom_enc:
            html_body_str = html_body_str[len(bom):]
        elif enc == 'utf-16' or enc == 'utf-32':
            # read endianness from BOM, or default to big endian
            # tools.ietf.org/html/rfc2781 section 4.3
            if bom_enc is not None and bom_enc.startswith(enc):
                enc = bom_enc
                html_body_str = html_body_str[len(bom):]
            else:
                enc += '-be'
        return _enc_unicode(html_body_str, enc)
    if bom_enc is not None:
        return _enc_unicode(html_body_str[len(bom):], bom_enc)
    enc = html_body_declared_encoding(html_body_str)
    if enc is None and (auto_detect_fun is not None):
        enc = auto_detect_fun(html_body_str)
    if enc is None:
        enc = default_encoding
    return _enc_unicode(html_body_str, enc)
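
A minimal usage sketch (assuming the chardet package is installed; the
detection lambda comes from the docstring above, and the expected results for
the first call are taken from this PR's tests):

    from w3lib.encoding import html_to_unicode

    # the declared charset wins, and iso-8859-1 is widened to cp1252
    encoding, body = html_to_unicode('text/html; charset=iso-8859-1', '\xa3')
    # encoding == 'cp1252', body == u'\xa3'

    # with no usable header, detection falls back to BOM, meta/xml
    # declarations, auto_detect_fun, and finally default_encoding
    import chardet
    encoding, body = html_to_unicode(None, '\xc2\xa3',
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding'))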
180 changes: 180 additions & 0 deletions w3lib/tests/test_encoding.py
@@ -0,0 +1,180 @@
import unittest, codecs
from w3lib.encoding import (html_body_declared_encoding, read_bom,
    http_content_type_encoding, resolve_encoding, html_to_unicode)

class RequestEncodingTests(unittest.TestCase):
    def test_bom(self):
        # cjk water character in unicode
        water_unicode = u'\u6C34'
        # BOM + water character encoded
        utf16be = '\xfe\xff\x6c\x34'
        utf16le = '\xff\xfe\x34\x6c'
        utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
        utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
        for string in (utf16be, utf16le, utf32be, utf32le):
            bom_encoding, bom = read_bom(string)
            decoded = string[len(bom):].decode(bom_encoding)
            self.assertEqual(water_unicode, decoded)
        enc, bom = read_bom("foo")
        self.assertEqual(enc, None)
        self.assertEqual(bom, None)

    def test_http_encoding_header(self):
        header_value = "Content-Type: text/html; charset=ISO-8859-4"
        extracted = http_content_type_encoding(header_value)
        self.assertEqual(extracted, "iso8859-4")
        self.assertEqual(None, http_content_type_encoding("something else"))

    def test_html_body_declared_encoding(self):
        format1 = """
        <meta http-equiv="Content-Type"
        content="text/html; charset=utf-8">
        """
        format2 = """<meta charset="utf-8">"""
        format3 = """<?xml version="1.0" encoding="utf-8"?>"""
        format4 = """ bad html still supported < meta http-equiv='Content-Type'
        content="text/html; charset=utf-8">"""
        for fragment in (format1, format2, format3, format4):
            encoding = html_body_declared_encoding(fragment)
            self.assertEqual(encoding, 'utf-8')
        self.assertEqual(None, html_body_declared_encoding("something else"))

class CodecsEncodingTestCase(unittest.TestCase):
    def test_resolve_encoding(self):
        self.assertEqual(resolve_encoding('latin1'), 'cp1252')
        self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
        self.assertEqual(resolve_encoding('unknown encoding'), None)

def ct(charset):
    return "Content-Type: text/html; charset=" + charset if charset else None

def norm_encoding(enc):
    return codecs.lookup(enc).name

class HtmlConversionTests(unittest.TestCase):

    def test_unicode_body(self):
        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
        original_string = unicode_string.encode('cp1251')
        encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
        # check body_as_unicode
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(body_unicode, unicode_string)

    def _assert_encoding(self, content_type, body, expected_encoding,
            expected_unicode):
        encoding, body_unicode = html_to_unicode(ct(content_type), body)
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(norm_encoding(encoding),
            norm_encoding(expected_encoding))
        self.assertEqual(body_unicode, expected_unicode)

    def test_content_type_and_conversion(self):
        """Test content type header is interpreted and text converted as
        expected
        """
        self._assert_encoding('utf-8', "\xc2\xa3", 'utf-8', u"\xa3")
        # something like this in the scrapy tests - but that's invalid?
        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
        # iso-8859-1 is overridden to cp1252
        self._assert_encoding('iso-8859-1', "\xa3", 'cp1252', u"\xa3")
        self._assert_encoding('', "\xc2\xa3", 'utf-8', u"\xa3")
        self._assert_encoding('none', "\xc2\xa3", 'utf-8', u"\xa3")
        self._assert_encoding('gb2312', "\xa8D", 'gb18030', u"\u2015")
        self._assert_encoding('gbk', "\xa8D", 'gb18030', u"\u2015")

    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
        # unlike scrapy, the BOM is stripped
        self._assert_encoding('utf-8', "\xef\xbb\xbfWORD\xe3\xab",
            'utf-8', u'WORD\ufffd\ufffd')
        self._assert_encoding(None, "\xef\xbb\xbfWORD\xe3\xab",
            'utf-8', u'WORD\ufffd\ufffd')

    def test_replace_wrong_encoding(self):
        """Test invalid chars are replaced properly"""
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            'PREFIX\xe3\xabSUFFIX')
        # XXX: Policy for replacing invalid chars may suffer minor variations
        # but it should always contain the unicode replacement char (u'\ufffd')
        assert u'\ufffd' in body_unicode, repr(body_unicode)
        assert u'PREFIX' in body_unicode, repr(body_unicode)
        assert u'SUFFIX' in body_unicode, repr(body_unicode)

        # Do not destroy html tags due to encoding bugs
        encoding, body_unicode = html_to_unicode(ct('utf-8'),
            '\xf0<span>value</span>')
        assert u'<span>value</span>' in body_unicode, repr(body_unicode)

    def _assert_encoding_detected(self, content_type, expected_encoding, body,
            **kwargs):
        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
        self.assertTrue(isinstance(body_unicode, unicode))
        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))

    def test_BOM(self):
        # utf-16 cases already tested, as is the BOM detection function

        # http header takes precedence, irrespective of BOM
        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
        expected = u'\ufffd\ufffd\x00h\x00i'
        self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)

        # BOM is stripped when it agrees with the encoding, or used to
        # determine encoding
        bom_utf8_str = codecs.BOM_UTF8 + 'hi'
        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")

    def test_utf16_32(self):
        # tools.ietf.org/html/rfc2781 section 4.3

        # use BOM and strip it
        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")

        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")

        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")

        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")

        # if there is no BOM, big endian should be chosen
        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")

    def test_html_encoding(self):
        # extracting the encoding from raw html is tested elsewhere
        body = """blah blah < meta http-equiv="Content-Type"
        content="text/html; charset=iso-8859-1"> other stuff"""
        self._assert_encoding_detected(None, 'cp1252', body)

        # header encoding takes precedence
        self._assert_encoding_detected('utf-8', 'utf-8', body)
        # BOM encoding takes precedence
        self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)

    def test_autodetect(self):
        asciif = lambda x: 'ascii'
        body = """<meta charset="utf-8">"""
        # body encoding takes precedence
        self._assert_encoding_detected(None, 'utf-8', body,
            auto_detect_fun=asciif)
        # if no other encoding, the auto detect encoding is used
        self._assert_encoding_detected(None, 'ascii', "no encoding info",
            auto_detect_fun=asciif)

    def test_default_encoding(self):
        # if no other method is available, the default encoding of utf-8 is used
        self._assert_encoding_detected(None, 'utf-8', "no encoding info")
        # this can be overridden
        self._assert_encoding_detected(None, 'ascii', "no encoding info",
            default_encoding='ascii')
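
The suite should run with the standard unittest runner, e.g.:

    python -m unittest w3lib.tests.test_encoding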