diff --git a/NEWS b/NEWS
index 82f9ee70..a7d9b84a 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,9 @@ Changes to safe_url_string:
Package is now properly marked as ``zip_safe``.
+html_body_declared_encoding also detects encoding
+when not sole attribute in ``<meta>``.
+
1.13.0 (2015-11-05)
-------------------
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 6e856a3e..df2e5ce4 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -8,10 +8,14 @@ class RequestEncodingTests(unittest.TestCase):
# Content-Type as meta http-equiv
b"""""",
b"""\n""",
+ b"""""",
+ b"""""",
+ b"""""",
b"""""",
b""" bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
# html5 meta charset
b"""""",
+ b"""""",
# xml encoding
b"""""",
]
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index dfac016b..b7370367 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -22,14 +22,25 @@ def http_content_type_encoding(content_type):
# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
+_SKIP_ATTRS = '''(?x)(?:\\s+
+ [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
+ (?:\\s*=\\s*
+ (?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
+ '[^']*' # attr in '
+ |
+ "[^"]*" # attr in "
+ |
+ [^'"\\s]+ # attr having no ' nor "
+ ))?
+)*?'''
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
# check for meta tags, or xml decl. and stop search if a body tag is encountered
-_BODY_ENCODING_PATTERN = r'<\s*(?:meta(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % \
- (_HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
+_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
+ _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)