from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar


def _possible_encodings(headers: Headers, text: str) -> Generator[Optional[str], None, None]:
    """Yield candidate encodings for a response body, in order of precedence.

    The charset taken from the ``Content-Type`` header (only when that header
    is present) comes first, followed by the encoding declared inside the
    document itself. Either lookup may come back empty, so consumers must
    skip falsy items.
    """
    if headers.get("content-type"):
        content_type = to_unicode(headers["content-type"])
        yield http_content_type_encoding(content_type)
    yield html_body_declared_encoding(text)


def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
    """Encode ``text`` with the first declared encoding that is able to do so.

    Candidates come from ``_possible_encodings``. A candidate is skipped when
    it cannot represent the text or cannot be used by ``str.encode`` at all.
    If no candidate works (or none is declared), UTF-8 is used.

    Returns:
        A ``(body, encoding)`` tuple: the encoded bytes and the name of the
        encoding that produced them.
    """
    for encoding in filter(None, _possible_encodings(headers, text)):
        try:
            body = text.encode(encoding)
        except (UnicodeError, LookupError):
            # UnicodeError (parent of UnicodeEncodeError): the codec cannot
            # represent this text. LookupError: the declared name resolves to
            # a codec that str.encode refuses (e.g. a bytes-to-bytes
            # transform). Either way, move on to the next candidate rather
            # than failing the whole download.
            continue
        else:
            return body, encoding
    return text.encode("utf-8"), "utf-8"  # fallback
import pytest
from scrapy.http.headers import Headers

from scrapy_playwright.handler import _encode_body


def body_str(charset: str, content: str = "áéíóú") -> str:
    """Return a small HTML document that declares ``charset`` and holds ``content``.

    The charset declaration is what lets the tests exercise the
    "encoding declared in the body" path of ``_encode_body``.
    """
    return f"""
        <!doctype html>
        <html>
        <head>
        <meta charset="{charset}">
        </head>
        <body>
        <p>{content}</p>
        </body>
        </html>
    """.strip()


@pytest.mark.asyncio
async def test_encode_from_headers():
    """Charset declared in headers takes precedence"""
    text = body_str("gb2312")
    body, encoding = _encode_body(
        headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
        text=text,
    )
    assert encoding == "cp1252"
    assert body == text.encode(encoding)


@pytest.mark.asyncio
async def test_encode_from_headers_empty_text():
    """Charset declared in headers is used even when there is no text to encode"""
    body, encoding = _encode_body(
        headers=Headers({"content-type": "text/html; charset=UTF-8"}),
        text="",
    )
    assert encoding == "utf-8"
    assert body == b""


@pytest.mark.asyncio
async def test_encode_from_body():
    """No charset declared in headers, use the one declared in the body"""
    text = body_str("gb2312")
    body, encoding = _encode_body(headers=Headers({}), text=text)
    assert encoding == "gb18030"
    assert body == text.encode(encoding)


@pytest.mark.asyncio
async def test_encode_fallback():
    """No charset declared, use utf-8 as fallback"""
    text = "áéíóú"
    body, encoding = _encode_body(headers=Headers(), text=text)
    assert encoding == "utf-8"
    assert body == text.encode(encoding)


@pytest.mark.asyncio
async def test_encode_mismatch():
    """Charset declared in headers and body do not match, and the headers
    one fails to encode: use the one in the body (first one that works)
    """
    text = body_str("gb2312", content="空手道")
    body, encoding = _encode_body(
        headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
        text=text,
    )
    assert encoding == "gb18030"
    assert body == text.encode(encoding)