From dba1ed538ba7ce6e889876a884ec1b767c695f52 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta <eugenio.lacuesta@gmail.com>
Date: Tue, 22 Mar 2022 22:00:08 -0300
Subject: [PATCH 1/3] Try declared encodings in order when encoding the response body

---
 scrapy_playwright/handler.py | 25 +++++++++++------
 tests/test_encoding.py       | 54 ++++++++++++++++++++++++++++++++++++
 tests/test_misc.py           | 29 -------------------
 3 files changed, 70 insertions(+), 38 deletions(-)
 create mode 100644 tests/test_encoding.py
 delete mode 100644 tests/test_misc.py
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index d8bc5b92..d121a9e1 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -6,7 +6,7 @@
 from inspect import isawaitable
 from ipaddress import ip_address
 from time import time
-from typing import Callable, Dict, Optional, Type, TypeVar
+from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar
 
 from playwright.async_api import (
     BrowserContext,
@@ -284,8 +284,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
 
         headers = Headers(response.headers)
         headers.pop("Content-Encoding", None)
-        encoding = _get_response_encoding(headers, body_str) or "utf-8"
-        body = body_str.encode(encoding)
+        body, encoding = _encode_body(headers=headers, text=body_str)
         respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
         return respcls(
             url=page.url,
@@ -353,11 +352,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
         return _request_handler
 
 
-def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
-    encoding = None
+def _possible_encodings(headers: Headers, text: str) -> Generator[Optional[str], None, None]:
     if headers.get("content-type"):
         content_type = to_unicode(headers["content-type"])
-        encoding = http_content_type_encoding(content_type)
-    if not encoding:
-        encoding = html_body_declared_encoding(body)
-    return encoding
+        yield http_content_type_encoding(content_type)  # header charset first; may be None
+    yield html_body_declared_encoding(text)  # then the charset declared in the body, if any
+
+
+def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
+    """Return (encoded body, encoding used): first declared charset that works, else utf-8."""
+    for encoding in filter(None, _possible_encodings(headers, text)):
+        try:
+            return text.encode(encoding), encoding
+        except (LookupError, UnicodeEncodeError):
+            # LookupError: declared codec is not a text encoding (e.g. "base64")
+            continue
+    return text.encode("utf-8"), "utf-8"  # fallback
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
new file mode 100644
index 00000000..073135b2
--- /dev/null
+++ b/tests/test_encoding.py
@@ -0,0 +1,54 @@
+import pytest
+from scrapy.http.headers import Headers
+
+from scrapy_playwright.handler import _encode_body
+
+
+def body_str(charset: str, content: str = "áéíóú") -> str:
+    return f"""
+        <!doctype html>
+        <html>
+        <head>
+        <p>{content}</p>
+        <meta charset="{charset}">
+        </head>
+        </html>
+    """.strip()
+
+
+@pytest.mark.asyncio
+async def test_encode_from_headers():
+    text = body_str("gb2312")
+    body, encoding = _encode_body(
+        headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
+        text=text,
+    )
+    assert encoding == "cp1252"  # header's ISO-8859-1 is reported as cp1252
+    assert body == text.encode(encoding)
+
+
+@pytest.mark.asyncio
+async def test_encode_from_body():
+    text = body_str("gb2312")
+    body, encoding = _encode_body(headers=Headers({}), text=text)
+    assert encoding == "gb18030"
+    assert body == body_str("gb2312").encode(encoding)
+
+
+@pytest.mark.asyncio
+async def test_encode_fallback():
+    text = "<html>áéíóú</html>"
+    body, encoding = _encode_body(headers=Headers(), text=text)
+    assert encoding == "utf-8"
+    assert body == "<html>áéíóú</html>".encode(encoding)
+
+
+@pytest.mark.asyncio
+async def test_encode_mismatch():
+    text = body_str("gb2312", content="空手道")
+    body, encoding = _encode_body(
+        headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
+        text=text,
+    )
+    assert encoding == "gb18030"  # body's gb2312 is reported as gb18030
+    assert body == text.encode(encoding)
diff --git a/tests/test_misc.py b/tests/test_misc.py
deleted file mode 100644
index 9db94d63..00000000
--- a/tests/test_misc.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import pytest
-from scrapy.http.headers import Headers
-
-from scrapy_playwright.handler import _get_response_encoding
-
-
-@pytest.mark.asyncio
-async def test_get_response_encoding():
-    assert (
-        _get_response_encoding(
-            headers=Headers({"content-type": "text/html; charset=UTF-8"}),
-            body="",
-        )
-        == "utf-8"
-    )
-    assert (
-        _get_response_encoding(
-            headers=Headers(),
-            body="""<!doctype html>
-<html lang="cn">
-<head>
-  <meta charset="gb2312">
-</head>
-</html>
-""",
-        )
-        == "gb18030"
-    )
-    assert _get_response_encoding(headers=Headers(), body="") is None

From 63228164e0701b5ea8340af797613fb9c52f3a8e Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta <eugenio.lacuesta@gmail.com>
Date: Tue, 22 Mar 2022 22:08:27 -0300
Subject: [PATCH 2/3] Add docstrings to encoding tests and reuse the text variable

---
 tests/test_encoding.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 073135b2..0efaed4c 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -18,6 +18,7 @@ def body_str(charset: str, content: str = "áéíóú") -> str:
 
 @pytest.mark.asyncio
 async def test_encode_from_headers():
+    """Charset declared in headers takes precedence"""
     text = body_str("gb2312")
     body, encoding = _encode_body(
         headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
@@ -29,22 +30,27 @@ async def test_encode_from_headers():
 
 @pytest.mark.asyncio
 async def test_encode_from_body():
+    """No charset declared in headers, use the one declared in the body"""
     text = body_str("gb2312")
     body, encoding = _encode_body(headers=Headers({}), text=text)
     assert encoding == "gb18030"
-    assert body == body_str("gb2312").encode(encoding)
+    assert body == text.encode(encoding)
 
 
 @pytest.mark.asyncio
 async def test_encode_fallback():
+    """No charset declared, use utf-8 as fallback"""
     text = "<html>áéíóú</html>"
     body, encoding = _encode_body(headers=Headers(), text=text)
     assert encoding == "utf-8"
-    assert body == "<html>áéíóú</html>".encode(encoding)
+    assert body == text.encode(encoding)
 
 
 @pytest.mark.asyncio
 async def test_encode_mismatch():
+    """Charsets declared in headers and body do not match, and the one from the
+    headers fails to encode: use the one from the body (first one that works)
+    """
     text = body_str("gb2312", content="空手道")
     body, encoding = _encode_body(
         headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),

From 0a7ee63ff1ba790ca1e79ce01801b748d52b9a1f Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta <eugenio.lacuesta@gmail.com>
Date: Wed, 23 Mar 2022 09:53:32 -0300
Subject: [PATCH 3/3] Move test content from <head> to <body> in test HTML

---
 tests/test_encoding.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 0efaed4c..4d6bf1aa 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -9,9 +9,11 @@ def body_str(charset: str, content: str = "áéíóú") -> str:
         <!doctype html>
         <html>
         <head>
-        <p>{content}</p>
         <meta charset="{charset}">
         </head>
+        <body>
+        <p>{content}</p>
+        </body>
         </html>
     """.strip()