Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from inspect import isawaitable
from ipaddress import ip_address
from time import time
from typing import Callable, Dict, Optional, Type, TypeVar
from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar

from playwright.async_api import (
BrowserContext,
Expand Down Expand Up @@ -283,8 +283,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res

headers = Headers(response.headers)
headers.pop("Content-Encoding", None)
encoding = _get_response_encoding(headers, body_str) or "utf-8"
body = body_str.encode(encoding)
body, encoding = _encode_body(headers=headers, text=body_str)
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
return respcls(
url=page.url,
Expand Down Expand Up @@ -352,11 +351,19 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
return _request_handler


def _get_response_encoding(headers: Headers, body: str) -> Optional[str]:
encoding = None
def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
if headers.get("content-type"):
content_type = to_unicode(headers["content-type"])
encoding = http_content_type_encoding(content_type)
if not encoding:
encoding = html_body_declared_encoding(body)
return encoding
yield http_content_type_encoding(content_type)
yield html_body_declared_encoding(text)


def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
    """Encode ``text`` with the first candidate charset that can represent it.

    Candidates come from ``_possible_encodings(headers, text)`` and are tried
    in the order they are yielded; falsy candidates are skipped. A candidate
    that raises ``UnicodeEncodeError`` is discarded and the next one is tried.

    Returns a ``(body, encoding)`` tuple: the encoded bytes and the name of
    the encoding that produced them. If no candidate succeeds, the text is
    encoded as UTF-8 and ``"utf-8"`` is reported.
    """
    candidates = (name for name in _possible_encodings(headers, text) if name)
    for candidate in candidates:
        try:
            return text.encode(candidate), candidate
        except UnicodeEncodeError:
            # The declared charset cannot represent the text; try the next one.
            continue
    # Nothing declared, or nothing that worked: fall back to UTF-8.
    return text.encode("utf-8"), "utf-8"
62 changes: 62 additions & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pytest
from scrapy.http.headers import Headers

from scrapy_playwright.handler import _encode_body


def body_str(charset: str, content: str = "áéíóú") -> str:
    """Return a small HTML document used as a test fixture.

    ``charset`` is placed in the ``<meta charset="...">`` tag and ``content``
    inside the single ``<p>`` element. Leading and trailing whitespace of the
    rendered document is stripped.
    """
    document = f"""
<!doctype html>
<html>
<head>
<meta charset="{charset}">
</head>
<body>
<p>{content}</p>
</body>
</html>
"""
    return document.strip()


@pytest.mark.asyncio
async def test_encode_from_headers():
    """The charset from the Content-Type header wins over the one in the body."""
    html = body_str("gb2312")
    headers = Headers({"content-type": "text/html; charset=ISO-8859-1"})
    encoded, charset = _encode_body(headers=headers, text=html)
    # ISO-8859-1 is expected to be reported as cp1252.
    assert charset == "cp1252"
    assert encoded == html.encode(charset)


@pytest.mark.asyncio
async def test_encode_from_body():
    """Without a header charset, the one declared in the body is used."""
    html = body_str("gb2312")
    no_headers = Headers({})
    encoded, charset = _encode_body(headers=no_headers, text=html)
    # The gb2312 declaration is expected to be reported as gb18030.
    assert charset == "gb18030"
    assert encoded == html.encode(charset)


@pytest.mark.asyncio
async def test_encode_fallback():
    """With no charset declared anywhere, utf-8 is the fallback."""
    html = "<html>áéíóú</html>"
    encoded, charset = _encode_body(headers=Headers(), text=html)
    assert charset == "utf-8"
    assert encoded == html.encode(charset)


@pytest.mark.asyncio
async def test_encode_mismatch():
    """Header and body charsets disagree and the header one cannot encode
    the text: the body charset (the first one that works) is used instead.
    """
    html = body_str("gb2312", content="空手道")
    headers = Headers({"content-type": "text/html; charset=ISO-8859-1"})
    encoded, charset = _encode_body(headers=headers, text=html)
    assert charset == "gb18030"
    assert encoded == html.encode(charset)
29 changes: 0 additions & 29 deletions tests/test_misc.py

This file was deleted.