From dba1ed538ba7ce6e889876a884ec1b767c695f52 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 22 Mar 2022 22:00:08 -0300 Subject: [PATCH 1/3] Try body encodings --- scrapy_playwright/handler.py | 25 +++++++++++------ tests/test_encoding.py | 54 ++++++++++++++++++++++++++++++++++++ tests/test_misc.py | 29 ------------------- 3 files changed, 70 insertions(+), 38 deletions(-) create mode 100644 tests/test_encoding.py delete mode 100644 tests/test_misc.py diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index d8bc5b92..d121a9e1 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -6,7 +6,7 @@ from inspect import isawaitable from ipaddress import ip_address from time import time -from typing import Callable, Dict, Optional, Type, TypeVar +from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar from playwright.async_api import ( BrowserContext, @@ -284,8 +284,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res headers = Headers(response.headers) headers.pop("Content-Encoding", None) - encoding = _get_response_encoding(headers, body_str) or "utf-8" - body = body_str.encode(encoding) + body, encoding = _encode_body(headers=headers, text=body_str) respcls = responsetypes.from_args(headers=headers, url=page.url, body=body) return respcls( url=page.url, @@ -353,11 +352,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) return _request_handler -def _get_response_encoding(headers: Headers, body: str) -> Optional[str]: - encoding = None +def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]: if headers.get("content-type"): content_type = to_unicode(headers["content-type"]) - encoding = http_content_type_encoding(content_type) - if not encoding: - encoding = html_body_declared_encoding(body) - return encoding + yield http_content_type_encoding(content_type) + yield html_body_declared_encoding(text) + + +def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]: + for encoding in filter(None, _possible_encodings(headers, text)): + try: + body = text.encode(encoding) + except UnicodeEncodeError: + pass + else: + return body, encoding + return text.encode("utf-8"), "utf-8" # fallback diff --git a/tests/test_encoding.py b/tests/test_encoding.py new file mode 100644 index 00000000..073135b2 --- /dev/null +++ b/tests/test_encoding.py @@ -0,0 +1,54 @@ +import pytest +from scrapy.http.headers import Headers + +from scrapy_playwright.handler import _encode_body + + +def body_str(charset: str, content: str = "áéíóú") -> str: + return f""" + + + +

{content}

+ + + + """.strip() + + +@pytest.mark.asyncio +async def test_encode_from_headers(): + text = body_str("gb2312") + body, encoding = _encode_body( + headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), + text=text, + ) + assert encoding == "cp1252" + assert body == text.encode(encoding) + + +@pytest.mark.asyncio +async def test_encode_from_body(): + text = body_str("gb2312") + body, encoding = _encode_body(headers=Headers({}), text=text) + assert encoding == "gb18030" + assert body == body_str("gb2312").encode(encoding) + + +@pytest.mark.asyncio +async def test_encode_fallback(): + text = "áéíóú" + body, encoding = _encode_body(headers=Headers(), text=text) + assert encoding == "utf-8" + assert body == "áéíóú".encode(encoding) + + +@pytest.mark.asyncio +async def test_encode_mismatch(): + text = body_str("gb2312", content="空手道") + body, encoding = _encode_body( + headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), + text=text, + ) + assert encoding == "gb18030" + assert body == text.encode(encoding) diff --git a/tests/test_misc.py b/tests/test_misc.py deleted file mode 100644 index 9db94d63..00000000 --- a/tests/test_misc.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest -from scrapy.http.headers import Headers - -from scrapy_playwright.handler import _get_response_encoding - - -@pytest.mark.asyncio -async def test_get_response_encoding(): - assert ( - _get_response_encoding( - headers=Headers({"content-type": "text/html; charset=UTF-8"}), - body="", - ) - == "utf-8" - ) - assert ( - _get_response_encoding( - headers=Headers(), - body=""" - - - - - -""", - ) - == "gb18030" - ) - assert _get_response_encoding(headers=Headers(), body="") is None From 63228164e0701b5ea8340af797613fb9c52f3a8e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 22 Mar 2022 22:08:27 -0300 Subject: [PATCH 2/3] Test docstrings --- tests/test_encoding.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 073135b2..0efaed4c 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -18,6 +18,7 @@ def body_str(charset: str, content: str = "áéíóú") -> str: @pytest.mark.asyncio async def test_encode_from_headers(): + """Charset declared in headers takes precedence""" text = body_str("gb2312") body, encoding = _encode_body( headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), @@ -29,22 +30,27 @@ async def test_encode_from_headers(): @pytest.mark.asyncio async def test_encode_from_body(): + """No charset declared in headers, use the one declared in the body""" text = body_str("gb2312") body, encoding = _encode_body(headers=Headers({}), text=text) assert encoding == "gb18030" - assert body == body_str("gb2312").encode(encoding) + assert body == text.encode(encoding) @pytest.mark.asyncio async def test_encode_fallback(): + """No charset declared, use utf-8 as fallback""" text = "áéíóú" body, encoding = _encode_body(headers=Headers(), text=text) assert encoding == "utf-8" - assert body == "áéíóú".encode(encoding) + assert body == text.encode(encoding) @pytest.mark.asyncio async def test_encode_mismatch(): + """Charset declared in headers and body do not match, and the headers + one fails to encode: use the one in the body (first one that works) + """ text = body_str("gb2312", content="空手道") body, encoding = _encode_body( headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), From 0a7ee63ff1ba790ca1e79ce01801b748d52b9a1f Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 23 Mar 2022 09:53:32 -0300 Subject: [PATCH 3/3] Update html in test --- tests/test_encoding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 0efaed4c..4d6bf1aa 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -9,9 +9,11 @@ def body_str(charset: str, content: str = "áéíóú") -> str: -

{content}

+ +

{content}

+ """.strip()