From fb78fdefb3420498f083fbbd6a8ab75747c36d19 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Wed, 8 Feb 2012 22:48:42 +0000
Subject: [PATCH 1/3] add functions for determining the encoding of html

This is based on the encoding detection in scrapy
---
 README.rst                   |   2 +
 w3lib/encoding.py            | 185 +++++++++++++++++++++++++++++++++++
 w3lib/tests/test_encoding.py | 171 ++++++++++++++++++++++++++++++++
 3 files changed, 358 insertions(+)
 create mode 100644 w3lib/encoding.py
 create mode 100644 w3lib/tests/test_encoding.py

diff --git a/README.rst b/README.rst
index b2cb5c1f..e4b7978f 100644
--- a/README.rst
+++ b/README.rst
@@ -13,6 +13,7 @@ This is a Python library of web-related functions, such as:
 * encoding multipart/form-data
 * convert raw HTTP headers to dicts and vice-versa
 * construct HTTP auth header
+* converting HTML pages to unicode
 * RFC-compliant url joining
 * sanitize urls (like browsers do)
 * extract arguments from urls
@@ -25,6 +26,7 @@ The w3lib package consists of four modules:
 * ``w3lib.url`` - functions for working with URLs
 * ``w3lib.html`` - functions for working with HTML
 * ``w3lib.http`` - functions for working with HTTP
+* ``w3lib.encoding`` - functions for working with character encoding
 * ``w3lib.form`` - functions for working with web forms

 Requirements
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
new file mode 100644
index 00000000..095821bc
--- /dev/null
+++ b/w3lib/encoding.py
@@ -0,0 +1,185 @@
+"""
+Functions for handling encoding of web pages
+"""
+import re, codecs, encodings
+
+_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
+
+def http_content_type_encoding(content_type):
+    """Extract the encoding in the content-type header"""
+    if content_type:
+        match = _HEADER_ENCODING_RE.search(content_type)
+        if match:
+            return resolve_encoding(match.group(1))
+
+# regexp for parsing HTTP meta tags
+_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
+_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
+_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
+_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
+_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
+
+# check for meta tags, or xml decl. and stop search if a body tag is encountered
+_BODY_ENCODING_RE = re.compile(
+    r'<\s*(?:meta\s+(?:%s\s+%s|%s)|\?xml\s[^>]+%s|body)' % \
+        (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE), re.I)
+
+def html_body_declared_encoding(html_body_str):
+    """encoding specified in meta tags in the html body, or None if no
+    suitable encoding was found
+    """
+    # html5 suggests the first 1024 bytes are sufficient; we allow for more
+    chunk = html_body_str[:4096]
+    match = _BODY_ENCODING_RE.search(chunk)
+    if match:
+        encoding = match.group('charset') or match.group('charset2') \
+                or match.group('xmlcharset')
+        return resolve_encoding(encoding)
+
+# Default encoding translation
+# this maps canonicalized encodings to target encodings
+# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
+# in addition, gb18030 supersedes gb2312 & gbk
+# the keys are converted using _c18n_encoding and kept in sorted order
+DEFAULT_ENCODING_TRANSLATION = {
+    'ascii': 'cp1252',
+    'euc_kr': 'cp949',
+    'gb2312': 'gb18030',
+    'gbk': 'gb18030',
+    'iso8859_11': 'cp874',
+    'iso8859_9': 'cp1254',
+    'latin_1': 'cp1252',
+    'macintosh': 'mac_roman',
+    'shift_jis': 'cp932',
+    'tis_620': 'cp874',
+    'win_1251': 'cp1251',
+    'windows_31j': 'cp932',
+    'win_31j': 'cp932',
+    'windows_874': 'cp874',
+    'win_874': 'cp874',
+    'x_sjis': 'cp932',
+    'zh_cn': 'gb18030'
+}
+
+def _c18n_encoding(encoding):
+    """Canonicalize an encoding name
+
+    This performs normalization and translates aliases using python's
+    encoding aliases
+    """
+    normed = encodings.normalize_encoding(encoding).lower()
+    return encodings.aliases.aliases.get(normed, normed)
+
+def resolve_encoding(encoding_alias):
+    """Return the encoding the given encoding alias maps to, or None if the
+    encoding cannot be interpreted
+    """
+    c18n_encoding = _c18n_encoding(encoding_alias)
+    translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
+    try:
+        return codecs.lookup(translated).name
+    except LookupError:
+        return None
+
+_BOM_TABLE = [
+    (codecs.BOM_UTF32_BE, 'utf-32-be'),
+    (codecs.BOM_UTF32_LE, 'utf-32-le'),
+    (codecs.BOM_UTF16_BE, 'utf-16-be'),
+    (codecs.BOM_UTF16_LE, 'utf-16-le'),
+    (codecs.BOM_UTF8, 'utf-8')
+]
+_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
+
+def read_bom(data):
+    """Read the byte order mark in the text, if present, and
+    return the encoding represented by the BOM and the remainder of the text.
+    """
+    # common case is no BOM, so this is fast
+    if data[0] in _FIRST_CHARS:
+        for bom, encoding in _BOM_TABLE:
+            if data.startswith(bom):
+                return encoding, data[len(bom):]
+    return None, data
+
+# Python decoder doesn't follow unicode standard when handling
+# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
+
+def to_unicode(data_str, encoding):
+    """Convert a str object to unicode using the encoding given
+
+    Characters that cannot be converted will be converted to '\ufffd' (the
+    unicode replacement character).
+    """
+    return data_str.decode(encoding, 'w3lib_replace')
+
+def _enc_unicode(data_str, encoding):
+    """convert the data_str to unicode, inserting the unicode replacement
+    character where necessary.
+
+    returns (encoding, unicode)
+    """
+    return encoding, data_str.decode(encoding, 'w3lib_replace')
+
+def html_to_unicode(content_type_header, html_body_str,
+        default_encoding='utf8', auto_detect_fun=None):
+    """Convert raw html bytes to unicode
+
+    This attempts to make a reasonable guess at the content encoding of the
+    html body, following a similar process to that of a web browser.
+
+    It will try in order:
+    * http content type header
+    * BOM (byte-order mark)
+    * meta or xml tag declarations
+    * auto-detection, if the `auto_detect_fun` keyword argument is not None
+    * default encoding in keyword arg (which defaults to utf8)
+
+    If an encoding other than the auto-detected or default encoding is used,
+    overrides will be applied, converting some character encodings to more
+    suitable alternatives.
+
+    If a BOM is found matching the encoding, it will be stripped.
+
+    The `auto_detect_fun` argument can be used to pass a function that will
+    sniff the encoding of the text. This function must take the raw text as an
+    argument and return the name of an encoding that python can process, or
+    None. To use chardet, for example, you can define the function as:
+        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
+    or to use UnicodeDammit (shipped with the BeautifulSoup library):
+        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding
+
+    If the locale of the website or user language preference is known, then a
+    better default encoding can be supplied.
+
+    If the content type header is not present, None can be passed instead and
+    it will be ignored.
+
+    This method will not fail; if characters cannot be converted to unicode,
+    '\ufffd' (the unicode replacement character) will be inserted instead.
+
+    returns a tuple of (encoding used, unicode)
+    """
+    enc = http_content_type_encoding(content_type_header)
+    bom_enc, rest_of_data = read_bom(html_body_str)
+    if enc is not None:
+        # remove BOM if it agrees with the encoding
+        if enc == bom_enc:
+            html_body_str = rest_of_data
+        elif enc == 'utf-16':
+            # read endianness from BOM, or default to big endian
+            # tools.ietf.org/html/rfc2781 section 4.3
+            if bom_enc is not None and bom_enc.startswith('utf-16'):
+                enc = bom_enc
+                html_body_str = rest_of_data
+            else:
+                enc = 'utf-16-be'
+        return _enc_unicode(html_body_str, enc)
+    if bom_enc is not None:
+        return _enc_unicode(rest_of_data, bom_enc)
+    enc = html_body_declared_encoding(html_body_str)
+    if enc is None and (auto_detect_fun is not None):
+        enc = auto_detect_fun(html_body_str)
+    if enc is None:
+        enc = default_encoding
+    return _enc_unicode(html_body_str, enc)
diff --git a/w3lib/tests/test_encoding.py b/w3lib/tests/test_encoding.py
new file mode 100644
index 00000000..3ed63ac1
--- /dev/null
+++ b/w3lib/tests/test_encoding.py
@@ -0,0 +1,171 @@
+import unittest, codecs
+from w3lib.encoding import (html_body_declared_encoding, read_bom,
+    http_content_type_encoding, resolve_encoding, html_to_unicode)
+
+class RequestEncodingTests(unittest.TestCase):
+    def test_bom(self):
+        # cjk water character in unicode
+        water_unicode = u'\u6C34'
+        # BOM + water character encoded
+        utf16be = '\xfe\xff\x6c\x34'
+        utf16le = '\xff\xfe\x34\x6c'
+        utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
+        utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
+        for string in (utf16be, utf16le, utf32be, utf32le):
+            bom_encoding, data = read_bom(string)
+            decoded = data.decode(bom_encoding)
+            self.assertEqual(water_unicode, decoded)
+        # text is left untouched and None is returned as encoding if no BOM
+        enc, text = read_bom("foo")
+        self.assertEqual(enc, None)
+        self.assertEqual(text, "foo")
+
+    def test_http_encoding_header(self):
+        header_value = "Content-Type: text/html; charset=ISO-8859-4"
+        extracted = http_content_type_encoding(header_value)
+        self.assertEqual(extracted, "iso8859-4")
+        self.assertEqual(None, http_content_type_encoding("something else"))
+
+    def test_html_body_declared_encoding(self):
+        format1 = """<meta http-equiv="Content-Type"
+            content="text/html; charset=utf-8">"""
+        format2 = """<meta charset="utf-8">"""
+        format3 = """<?xml version="1.0" encoding="utf-8"?>"""
+        format4 = """ bad html still supported < meta http-equiv='Content-Type'
+            content="text/html; charset=utf-8">"""
+        for fragment in (format1, format2, format3, format4):
+            encoding = html_body_declared_encoding(fragment)
+            self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(None, html_body_declared_encoding("something else"))
+
+class CodecsEncodingTestCase(unittest.TestCase):
+    def test_resolve_encoding(self):
+        self.assertEqual(resolve_encoding('latin1'), 'cp1252')
+        self.assertEqual(resolve_encoding(' Latin-1'), 'cp1252')
+        self.assertEqual(resolve_encoding('unknown encoding'), None)
+
+def ct(charset):
+    return "Content-Type: text/html; charset=" + charset if charset else None
+
+def norm_encoding(enc):
+    return codecs.lookup(enc).name
+
+class HtmlConversionTests(unittest.TestCase):
+
+    def test_unicode_body(self):
+        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+        original_string = unicode_string.encode('cp1251')
+        encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
+        # check body_as_unicode
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(body_unicode, unicode_string)
+
+    def _assert_encoding(self, content_type, body, expected_encoding,
+            expected_unicode):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding),
+                norm_encoding(expected_encoding))
+        self.assertEqual(body_unicode, expected_unicode)
+
+    def test_content_type_and_conversion(self):
+        """Test content type header is interpreted and text converted as
+        expected
+        """
+        self._assert_encoding('utf-8', "\xc2\xa3", 'utf-8', u"\xa3")
+        # something like this in the scrapy tests - but that's invalid?
+        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+        # iso-8859-1 is overridden to cp1252
+        self._assert_encoding('iso-8859-1', "\xa3", 'cp1252', u"\xa3")
+        self._assert_encoding('', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('none', "\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('gb2312', "\xa8D", 'gb18030', u"\u2015")
+        self._assert_encoding('gbk', "\xa8D", 'gb18030', u"\u2015")
+
+    def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
+        # unlike scrapy, the BOM is stripped
+        self._assert_encoding('utf-8', "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+        self._assert_encoding(None, "\xef\xbb\xbfWORD\xe3\xab",
+                'utf-8', u'WORD\ufffd\ufffd')
+
+    def test_replace_wrong_encoding(self):
+        """Test invalid chars are replaced properly"""
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                'PREFIX\xe3\xabSUFFIX')
+        # XXX: Policy for replacing invalid chars may suffer minor variations
+        # but it should always contain the unicode replacement char (u'\ufffd')
+        assert u'\ufffd' in body_unicode, repr(body_unicode)
+        assert u'PREFIX' in body_unicode, repr(body_unicode)
+        assert u'SUFFIX' in body_unicode, repr(body_unicode)
+
+        # Do not destroy html tags due to encoding bugs
+        encoding, body_unicode = html_to_unicode(ct('utf-8'),
+                '\xf0<span>value</span>')
+        assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+
+    def _assert_encoding_detected(self, content_type, expected_encoding, body,
+            **kwargs):
+        encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
+        self.assertTrue(isinstance(body_unicode, unicode))
+        self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
+
+    def test_BOM(self):
+        # utf-16 cases already tested, as is the BOM detection function
+
+        # http header takes precedence, irrespective of BOM
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        expected = u'\ufffd\ufffd\x00h\x00i'
+        self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
+
+        # BOM is stripped when it agrees with the encoding, or used to
+        # determine encoding
+        bom_utf8_str = codecs.BOM_UTF8 + 'hi'
+        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
+        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+
+    def test_utf16(self):
+        # tools.ietf.org/html/rfc2781 section 4.3
+
+        # USE BOM and strip it
+        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
+        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
+        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+
+        # if there is no BOM, but the data is known to be utf-16,
+        # big endian should be chosen
+        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+
+    def test_html_encoding(self):
+        # extracting the encoding from raw html is tested elsewhere
+        body = """blah blah < meta http-equiv="Content-Type"
+            content="text/html; charset=iso-8859-1"> other stuff"""
+        self._assert_encoding_detected(None, 'cp1252', body)
+
+        # header encoding takes precedence
+        self._assert_encoding_detected('utf-8', 'utf-8', body)
+        # BOM encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', codecs.BOM_UTF8 + body)
+
+    def test_autodetect(self):
+        asciif = lambda x: 'ascii'
+        body = """<meta charset="utf-8">"""
+        # body encoding takes precedence
+        self._assert_encoding_detected(None, 'utf-8', body,
+                auto_detect_fun=asciif)
+        # if no other encoding is found, the auto detect encoding is used.
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                auto_detect_fun=asciif)
+
+    def test_default_encoding(self):
+        # if no other method available, the default encoding of utf-8 is used
+        self._assert_encoding_detected(None, 'utf-8', "no encoding info")
+        # this can be overridden
+        self._assert_encoding_detected(None, 'ascii', "no encoding info",
+                default_encoding='ascii')
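
A quick usage sketch of the API this patch adds (illustrative only, not part
of the commit; it assumes this branch is installed). The byte values mirror
the tests above, though the exact strings are made up:

    from w3lib.encoding import html_to_unicode

    # the content type header wins when present; iso-8859-1 is widened to
    # its cp1252 superset via DEFAULT_ENCODING_TRANSLATION
    encoding, ubody = html_to_unicode('text/html; charset=iso-8859-1', '\xa3 100')
    assert (encoding, ubody) == ('cp1252', u'\xa3 100')

    # with no header, a declaration inside the body is used instead
    encoding, ubody = html_to_unicode(None, '<meta charset="utf-8">\xc2\xa3')
    assert encoding == 'utf-8' and ubody == u'<meta charset="utf-8">\xa3'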

From 1aabc7372ecb301bf655d15b317e078f095d1f3d Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Mon, 13 Feb 2012 17:25:37 +0000
Subject: [PATCH 2/3] extend BOM detection to utf-32 and perf. improvement

---
 w3lib/encoding.py            | 22 ++++++++++++----------
 w3lib/tests/test_encoding.py | 25 +++++++++++++++++--------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 095821bc..bd21f076 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -92,14 +92,16 @@ def resolve_encoding(encoding_alias):
 
 def read_bom(data):
     """Read the byte order mark in the text, if present, and
-    return the encoding represented by the BOM and the remainder of the text.
+    return the encoding represented by the BOM, together with the BOM itself.
+
+    If no BOM can be detected, (None, None) is returned.
     """
     # common case is no BOM, so this is fast
     if data[0] in _FIRST_CHARS:
         for bom, encoding in _BOM_TABLE:
             if data.startswith(bom):
-                return encoding, data[len(bom):]
-    return None, data
+                return encoding, bom
+    return None, None
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
@@ -161,22 +163,22 @@ def html_to_unicode(content_type_header, html_body_str,
     returns a tuple of (encoding used, unicode)
     """
     enc = http_content_type_encoding(content_type_header)
-    bom_enc, rest_of_data = read_bom(html_body_str)
+    bom_enc, bom = read_bom(html_body_str)
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc:
-            html_body_str = rest_of_data
-        elif enc == 'utf-16':
+            html_body_str = html_body_str[len(bom):]
+        elif enc == 'utf-16' or enc == 'utf-32':
             # read endianness from BOM, or default to big endian
             # tools.ietf.org/html/rfc2781 section 4.3
-            if bom_enc is not None and bom_enc.startswith('utf-16'):
+            if bom_enc is not None and bom_enc.startswith(enc):
                 enc = bom_enc
-                html_body_str = rest_of_data
+                html_body_str = html_body_str[len(bom):]
             else:
-                enc = 'utf-16-be'
+                enc += '-be'
         return _enc_unicode(html_body_str, enc)
     if bom_enc is not None:
-        return _enc_unicode(rest_of_data, bom_enc)
+        return _enc_unicode(html_body_str[len(bom):], bom_enc)
     enc = html_body_declared_encoding(html_body_str)
     if enc is None and (auto_detect_fun is not None):
         enc = auto_detect_fun(html_body_str)
diff --git a/w3lib/tests/test_encoding.py b/w3lib/tests/test_encoding.py
index 3ed63ac1..a5712eb3 100644
--- a/w3lib/tests/test_encoding.py
+++ b/w3lib/tests/test_encoding.py
@@ -12,13 +12,12 @@ def test_bom(self):
         utf32be = '\x00\x00\xfe\xff\x00\x00\x6c\x34'
         utf32le = '\xff\xfe\x00\x00\x34\x6c\x00\x00'
         for string in (utf16be, utf16le, utf32be, utf32le):
-            bom_encoding, data = read_bom(string)
-            decoded = data.decode(bom_encoding)
+            bom_encoding, bom = read_bom(string)
+            decoded = string[len(bom):].decode(bom_encoding)
             self.assertEqual(water_unicode, decoded)
-        # text is left untouched and None is returned as encoding if no BOM
-        enc, text = read_bom("foo")
+        enc, bom = read_bom("foo")
         self.assertEqual(enc, None)
-        self.assertEqual(text, "foo")
+        self.assertEqual(bom, None)
 
     def test_http_encoding_header(self):
         header_value = "Content-Type: text/html; charset=ISO-8859-4"
@@ -126,7 +125,7 @@ def test_BOM(self):
         self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
         self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
 
-    def test_utf16(self):
+    def test_utf16_32(self):
         # tools.ietf.org/html/rfc2781 section 4.3
 
         # USE BOM and strip it
@@ -138,9 +137,19 @@ def test_utf16(self):
         self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
         self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
 
-        # if there is no BOM, but the data is known to be utf-16,
-        # big endian should be chosen
+        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
+        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
+        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+
+        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
+        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
+        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+
+        # if there is no BOM, big endian should be chosen
         self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
+        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+
+
 
     def test_html_encoding(self):
         # extracting the encoding from raw html is tested elsewhere
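
Another sketch, showing the behaviour this patch changes (again illustrative
and not part of the commit): read_bom now reports the BOM bytes rather than
the remaining text, and utf-32 gets the same endianness handling as utf-16.
The values mirror the tests above:

    import codecs
    from w3lib.encoding import read_bom, html_to_unicode

    # (encoding, bom) when a BOM is found, (None, None) otherwise
    data = codecs.BOM_UTF32_LE + u'hi'.encode('utf-32-le')
    assert read_bom(data) == ('utf-32-le', codecs.BOM_UTF32_LE)
    assert read_bom('no bom here') == (None, None)

    # a declared utf-32 encoding takes its endianness from the BOM when one
    # is present (defaulting to big endian), and the BOM is stripped
    encoding, text = html_to_unicode('charset=utf-32',
        codecs.BOM_UTF32_BE + u'hi'.encode('utf-32-be'))
    assert (encoding, text) == ('utf-32-be', u'hi')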

From 2ef15191356c5a209f744bb3b569bf1433eab737 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Tue, 14 Feb 2012 10:56:42 +0000
Subject: [PATCH 3/3] small improvement to documentation

The content type header parameter to html_to_unicode has been documented
more clearly.
---
 w3lib/encoding.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index bd21f076..7ca9d7ea 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -154,9 +154,9 @@ def html_to_unicode(content_type_header, html_body_str,
     If the locale of the website or user language preference is known, then a
     better default encoding can be supplied.
 
-    If the content type header is not present, None can be passed instead and
-    it will be ignored.
-
+    If the content type header is not present, None can be passed to signify
+    that the header was missing.
+
     This method will not fail; if characters cannot be converted to unicode,
     '\ufffd' (the unicode replacement character) will be inserted instead.
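
Finally, a sketch of the fallback chain end to end (illustrative; the chardet
detector and the cyrillic sample bytes are assumptions, though the lambda
follows the html_to_unicode docstring):

    import chardet
    from w3lib.encoding import html_to_unicode

    raw = u'\u043f\u0440\u0438\u0432\u0435\u0442'.encode('cp1251')

    # no header, BOM or meta declaration: fall back to the supplied detector
    # (what it reports depends on the detector and the input)
    encoding, ubody = html_to_unicode(None, raw,
        auto_detect_fun=lambda x: chardet.detect(x).get('encoding'))

    # or skip auto-detection and rely on a caller-supplied default
    encoding, ubody = html_to_unicode(None, raw, default_encoding='cp1251')
    assert encoding == 'cp1251' and ubody == u'\u043f\u0440\u0438\u0432\u0435\u0442'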