scrapy-plugins · elacuesta · Mar 24, 2022 · Mar 23, 2022 · Mar 23, 2022 · Mar 23, 2022
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
@@ -6,7 +6,7 @@
 from inspect import isawaitable
 from ipaddress import ip_address
 from time import time
-from typing import Callable, Dict, Optional, Type, TypeVar
+from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar
 
 from playwright.async_api import (
     BrowserContext,
@@ -283,8 +283,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
 
         headers = Headers(response.headers)
         headers.pop("Content-Encoding", None)
-        encoding = _get_response_encoding(headers, body_str) or "utf-8"
-        body = body_str.encode(encoding)
+        body, encoding = _encode_body(headers=headers, text=body_str)
         respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
         return respcls(
             url=page.url,
@@ -352,11 +351,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
         return _request_handler
 
 
-def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
-    encoding = None
+def _possible_encodings(headers: Headers, text: str) -> Generator[Optional[str], None, None]:
     if headers.get("content-type"):
         content_type = to_unicode(headers["content-type"])
-        encoding = http_content_type_encoding(content_type)
-    if not encoding:
-        encoding = html_body_declared_encoding(body)
-    return encoding
+        yield http_content_type_encoding(content_type)  # header charset is offered first
+    yield html_body_declared_encoding(text)  # then whatever charset the document declares
+
+
+def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
+    """Return (body, charset): first declared charset able to encode text, else utf-8."""
+    for candidate in _possible_encodings(headers, text):
+        if candidate:  # skip sources that declared no charset
+            try:
+                return text.encode(candidate), candidate
+            except UnicodeEncodeError:
+                continue  # this charset cannot represent the text; try the next one
+    return text.encode("utf-8"), "utf-8"  # fallback
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -0,0 +1,62 @@
+import pytest
+from scrapy.http.headers import Headers
+
+from scrapy_playwright.handler import _encode_body
+
+
+def body_str(charset: str, content: str = "áéíóú") -> str:  # HTML page with a <meta charset>
+    return f"""
+        <!doctype html>
+        <html>
+        <head>
+        <meta charset="{charset}">
+        </head>
+        <body>
+        <p>{content}</p>
+        </body>
+        </html>
+    """.strip()
+
+
+@pytest.mark.asyncio
+async def test_encode_from_headers():
+    """A usable charset from the headers wins over the one declared in the body"""
+    markup = body_str("gb2312")
+    content_type = {"content-type": "text/html; charset=ISO-8859-1"}
+    encoded, charset = _encode_body(headers=Headers(content_type), text=markup)
+    # ISO-8859-1 is expected to be reported under the name cp1252
+    expected_charset = "cp1252"
+    assert charset == expected_charset
+    assert encoded == markup.encode(expected_charset)
+
+
+@pytest.mark.asyncio
+async def test_encode_from_body():
+    """Without a charset in the headers, the one from the <meta> tag is used"""
+    markup = body_str("gb2312")
+    encoded, charset = _encode_body(text=markup, headers=Headers({}))
+    assert charset == "gb18030"  # name expected for the declared gb2312
+    assert encoded == markup.encode("gb18030")
+
+
+@pytest.mark.asyncio
+async def test_encode_fallback():
+    """With no charset in headers or body, the utf-8 fallback is used"""
+    markup = "<html>áéíóú</html>"
+    encoded, charset = _encode_body(text=markup, headers=Headers())
+    assert charset == "utf-8"
+    assert encoded == markup.encode("utf-8")
+
+
+@pytest.mark.asyncio
+async def test_encode_mismatch():
+    """Headers and body declare different charsets and the headers one
+    cannot encode the text: the body one is used (first one that works)
+    """
+    markup = body_str("gb2312", content="空手道")
+    content_type = {"content-type": "text/html; charset=ISO-8859-1"}
+    encoded, charset = _encode_body(headers=Headers(content_type), text=markup)
+    # the CJK content cannot be encoded with the charset from the headers
+    expected_charset = "gb18030"
+    assert charset == expected_charset
+    assert encoded == markup.encode(expected_charset)
diff --git a/tests/test_misc.py b/tests/test_misc.py