From 04415e4908bf67b2b7e4ce65923464cfab58bf4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B9=CE=BA=CF=8C=CE=BB=CE=B1=CE=BF=CF=82-=CE=94?= =?UTF-8?q?=CE=B9=CE=B3=CE=B5=CE=BD=CE=AE=CF=82=20=CE=9A=CE=B1=CF=81=CE=B1?= =?UTF-8?q?=CE=B3=CE=B9=CE=AC=CE=BD=CE=BD=CE=B7=CF=82?= Date: Mon, 8 Feb 2016 17:17:10 +0200 Subject: [PATCH 1/4] add test for encoding as a separate attribute --- tests/test_encoding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 6e856a3e..39955ad3 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -8,6 +8,7 @@ class RequestEncodingTests(unittest.TestCase): # Content-Type as meta http-equiv b"""""", b"""\n""", + b"""""", b"""""", b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""", # html5 meta charset From 1970e87d27000178de659fe6c867d5660d953b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B9=CE=BA=CF=8C=CE=BB=CE=B1=CE=BF=CF=82-=CE=94?= =?UTF-8?q?=CE=B9=CE=B3=CE=B5=CE=BD=CE=AE=CF=82=20=CE=9A=CE=B1=CF=81=CE=B1?= =?UTF-8?q?=CE=B3=CE=B9=CE=AC=CE=BD=CE=BD=CE=B7=CF=82?= Date: Mon, 4 Apr 2016 16:40:01 +0300 Subject: [PATCH 2/4] test for unusual quotes and spaces --- tests/test_encoding.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 39955ad3..df2e5ce4 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -9,10 +9,13 @@ class RequestEncodingTests(unittest.TestCase): b"""""", b"""\n""", b"""""", + b"""""", + b"""""", b"""""", b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""", # html5 meta charset b"""""", + b"""""", # xml encoding b"""""", ] From 5d088f0a85aeef6c5dfd1ed1f7fd092614734913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B9=CE=BA=CF=8C=CE=BB=CE=B1=CE=BF=CF=82-=CE=94?= =?UTF-8?q?=CE=B9=CE=B3=CE=B5=CE=BD=CE=AE=CF=82=20=CE=9A=CE=B1=CF=81=CE=B1?= 
=?UTF-8?q?=CE=B3=CE=B9=CE=AC=CE=BD=CE=BD=CE=B7=CF=82?= Date: Mon, 8 Feb 2016 18:39:27 +0200 Subject: [PATCH 3/4] Fix charset detection when meta has many attrs --- w3lib/encoding.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index dfac016b..b7370367 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -22,14 +22,25 @@ def http_content_type_encoding(content_type): # regexp for parsing HTTP meta tags _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' +_SKIP_ATTRS = '''(?x)(?:\\s+ + [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name + (?:\\s*=\\s* + (?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \" + '[^']*' # attr in ' + | + "[^"]*" # attr in " + | + [^'"\\s]+ # attr having no ' nor " + ))? +)*?''' _HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type') _CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)') _CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)') _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)') # check for meta tags, or xml decl. 
and stop search if a body tag is encountered -_BODY_ENCODING_PATTERN = r'<\s*(?:meta(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % \ - (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) +_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % ( + _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I) _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I) From f7f48f86dd42c21841a6da3f37eaae38ee609c96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B9=CE=BA=CF=8C=CE=BB=CE=B1=CE=BF=CF=82-=CE=94?= =?UTF-8?q?=CE=B9=CE=B3=CE=B5=CE=BD=CE=AE=CF=82=20=CE=9A=CE=B1=CF=81=CE=B1?= =?UTF-8?q?=CE=B3=CE=B9=CE=AC=CE=BD=CE=BD=CE=B7=CF=82?= Date: Mon, 4 Apr 2016 22:33:52 +0300 Subject: [PATCH 4/4] release note for html_body_declared_encoding patch --- NEWS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS b/NEWS index 82f9ee70..a7d9b84a 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,9 @@ Changes to safe_url_string: Package is now properly marked as ``zip_safe``. +html_body_declared_encoding also detects encoding +when not sole attribute in <meta> + 1.13.0 (2015-11-05) -------------------