diff --git a/NEWS b/NEWS index 82f9ee70..a7d9b84a 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,9 @@ Changes to safe_url_string: Package is now properly marked as ``zip_safe``. +html_body_declared_encoding also detects encoding +when not sole attribute in ``<meta>`` + 1.13.0 (2015-11-05) ------------------- diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 6e856a3e..df2e5ce4 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -8,10 +8,14 @@ class RequestEncodingTests(unittest.TestCase): # Content-Type as meta http-equiv b"""""", b"""\n""", + b"""""", + b"""""", + b"""""", b"""""", b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""", # html5 meta charset b"""""", + b"""""", # xml encoding b"""""", ] diff --git a/w3lib/encoding.py b/w3lib/encoding.py index dfac016b..b7370367 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -22,14 +22,25 @@ def http_content_type_encoding(content_type): # regexp for parsing HTTP meta tags _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' +_SKIP_ATTRS = '''(?x)(?:\\s+ + [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name + (?:\\s*=\\s* + (?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \" + '[^']*' # attr in ' + | + "[^"]*" # attr in " + | + [^'"\\s]+ # attr having no ' nor " + ))? +)*?''' _HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type') _CONTENT_RE = _TEMPLATE % ('content', r'(?P[^;]+);\s*charset=(?P[\w-]+)') _CONTENT2_RE = _TEMPLATE % ('charset', r'(?P[\w-]+)') _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P[\w-]+)') # check for meta tags, or xml decl. 
and stop search if a body tag is encountered -_BODY_ENCODING_PATTERN = r'<\s*(?:meta(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % \ - (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) +_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % ( + _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I) _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)