diff --git a/README.md b/README.md
index e4a6a4e8..2bbf0026 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
   the default value will be used (30000 ms at the time of writing this).
   See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout).
 
+* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`)
+
+  The path to a coroutine function (`async def`) that processes headers for a given request
+  and returns a dictionary with the headers to be used (note that, depending on the browser,
+  additional default headers will be sent as well).
+
+  The function must return a `dict` object, and receives the following keyword arguments:
+
+  ```python
+  browser_type: str, playwright_request: playwright.async_api.Request, scrapy_headers: scrapy.http.headers.Headers
+  ```
+
+  The default value (`scrapy_playwright.headers.use_scrapy_headers`) tries to emulate Scrapy's
+  behaviour for navigation requests, i.e. overriding headers with their values from the Scrapy request.
+  For non-navigation requests (e.g. images, stylesheets, scripts, etc.), only the `User-Agent` header
+  is overridden, for consistency.
+
+  There is another function available: `scrapy_playwright.headers.use_playwright_headers`,
+  which will return the headers from the Playwright request without any changes.
 
 ## Basic usage
 
@@ -135,8 +154,8 @@ class AwesomeSpider(scrapy.Spider):
 By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
 `USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
 This could cause some sites to react in unexpected ways, for instance if the user agent
-does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser,
-set the Scrapy user agent to `None`.
+does not match the running Browser. If you prefer the `User-Agent` sent by
+default by the specific browser you're using, set the Scrapy user agent to `None`.
 
 ## Receiving the Page object in the callback
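To make the new setting concrete, here is a minimal sketch of a user-supplied processing function. The module path `myproject/headers.py`, the function name, and the cookie-forwarding behaviour are illustrative assumptions, not part of this patch:

```python
# myproject/headers.py (hypothetical module, shown for illustration only)
from playwright.async_api import Request as PlaywrightRequest
from scrapy.http.headers import Headers


async def use_browser_headers_with_scrapy_cookies(
    browser_type: str,
    playwright_request: PlaywrightRequest,
    scrapy_headers: Headers,
) -> dict:
    """Keep the browser's own headers, but carry over Scrapy's Cookie header."""
    headers = dict(playwright_request.headers)
    scrapy = scrapy_headers.to_unicode_dict()
    if "cookie" in scrapy:
        headers["cookie"] = scrapy["cookie"]
    return headers
```

It would then be enabled with `PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "myproject.headers.use_browser_headers_with_scrapy_cookies"`.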
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index a0ac8853..5edada12 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -5,7 +5,6 @@
 from contextlib import suppress
 from time import time
 from typing import Callable, Dict, Optional, Type, TypeVar
-from urllib.parse import urlparse
 
 from playwright.async_api import (
     BrowserContext,
@@ -22,9 +21,11 @@
 from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.defer import deferred_from_coro
+from scrapy.utils.misc import load_object
 from scrapy.utils.reactor import verify_installed_reactor
 from twisted.internet.defer import Deferred, inlineCallbacks
 
+from scrapy_playwright.headers import use_scrapy_headers
 from scrapy_playwright.page import PageCoroutine
 
@@ -66,6 +67,7 @@ def __init__(self, crawler: Crawler) -> None:
         self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
         self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
+
         self.default_navigation_timeout: Optional[float] = None
         if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
             with suppress(TypeError, ValueError):
@@ -73,6 +75,13 @@
                 crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
             )
 
+        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
+            self.process_request_headers = load_object(
+                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
+            )
+        else:
+            self.process_request_headers = use_scrapy_headers
+
         default_context_kwargs: dict = {}
         if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
             default_context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXT_ARGS")
@@ -180,9 +189,8 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
         await page.route(
             "**",
             self._make_request_handler(
-                url=request.url,
                 method=request.method,
-                headers=request.headers.to_unicode_dict(),
+                scrapy_headers=request.headers,
                 body=request.body,
                 encoding=getattr(request, "encoding", None),
             ),
@@ -249,23 +257,24 @@ def close_browser_context_callback() -> None:
         return close_browser_context_callback
 
     def _make_request_handler(
-        self, url: str, method: str, headers: dict, body: Optional[bytes], encoding: str = "utf8"
+        self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8"
     ) -> Callable:
-        def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
+        async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
             """Override request headers, method and body."""
-            headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
-            if pw_request.url == url:
-                overrides: dict = {"method": method, "headers": headers}
+            processed_headers = await self.process_request_headers(
+                self.browser_type, playwright_request, scrapy_headers
+            )
+
+            # the request that reaches the callback should contain the headers that were sent
+            scrapy_headers.clear()
+            scrapy_headers.update(processed_headers)
+
+            overrides: dict = {"headers": processed_headers}
+            if playwright_request.is_navigation_request():
+                overrides["method"] = method
                 if body is not None:
                     overrides["post_data"] = body.decode(encoding)
-                if self.browser_type == "firefox":
-                    # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
-                    overrides["headers"]["host"] = urlparse(pw_request.url).netloc
-            else:
-                overrides = {"headers": pw_request.headers.copy()}
-                # override user agent, for consistency with other requests
-                if headers.get("user-agent"):
-                    overrides["headers"]["user-agent"] = headers["user-agent"]
-            asyncio.create_task(route.continue_(**overrides))
-
-        return request_handler
+
+            await route.continue_(**overrides)
+
+        return _request_handler
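One behavioural consequence of the rewritten handler is worth spelling out: since the processed headers are written back into the Scrapy `Headers` object, the request that reaches the spider callback shows the headers that were actually sent. A minimal sketch of observing this (the spider name and URL are placeholders):

```python
import scrapy


class HeadersInspectionSpider(scrapy.Spider):
    name = "headers_inspection"  # placeholder name

    def start_requests(self):
        yield scrapy.Request("https://example.org", meta={"playwright": True})

    def parse(self, response):
        # After this patch, these are the headers the browser actually sent,
        # as returned by the configured processing function.
        self.logger.info(response.request.headers.to_unicode_dict())
```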
diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py
new file mode 100644
index 00000000..d3771195
--- /dev/null
+++ b/scrapy_playwright/headers.py
@@ -0,0 +1,44 @@
+from urllib.parse import urlparse
+
+from playwright.async_api import Request as PlaywrightRequest
+from scrapy.http.headers import Headers
+
+
+"""
+This module includes functions to process request headers.
+Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
+"""
+
+
+async def use_scrapy_headers(
+    browser_type: str,
+    playwright_request: PlaywrightRequest,
+    scrapy_headers: Headers,
+) -> dict:
+    """Scrapy headers take precedence over Playwright headers for navigation requests.
+    For non-navigation requests, only the User-Agent is taken from the Scrapy headers."""
+
+    headers = scrapy_headers.to_unicode_dict()
+
+    # Scrapy's user agent has priority over Playwright's
+    headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
+
+    if playwright_request.is_navigation_request():
+        if browser_type == "firefox":
+            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
+            headers["host"] = urlparse(playwright_request.url).netloc
+        return headers
+    else:
+        # override user agent, for consistency with other requests
+        if headers.get("user-agent"):
+            playwright_request.headers["user-agent"] = headers["user-agent"]
+        return playwright_request.headers
+
+
+async def use_playwright_headers(
+    browser_type: str,
+    playwright_request: PlaywrightRequest,
+    scrapy_headers: Headers,
+) -> dict:
+    """Return headers from the Playwright request, unaltered."""
+    return playwright_request.headers
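Opting out of Scrapy-driven headers is then a one-line settings change, e.g. in a project's `settings.py`:

```python
# settings.py
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "scrapy_playwright.headers.use_playwright_headers"
```

Since the handler resolves the setting with `load_object`, a callable can seemingly also be passed directly instead of a dotted path, as the `test_use_custom_headers` case below does.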
diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py
index bb293a8b..ecb6cc48 100644
--- a/tests/test_playwright_requests.py
+++ b/tests/test_playwright_requests.py
@@ -259,6 +259,53 @@ async def test_user_agent(self):
             headers = {key.lower(): value for key, value in headers.items()}
             assert headers["user-agent"] == "foobar"
 
+    @pytest.mark.asyncio
+    async def test_use_playwright_headers(self):
+        """Ignore Scrapy headers"""
+        settings_dict = {
+            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
+            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
+            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
+        }
+        async with make_handler(settings_dict) as handler:
+            with MockServer() as server:
+                req = Request(
+                    url=server.urljoin("/headers"),
+                    meta={"playwright": True},
+                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
+                )
+                resp = await handler._download_request(req, Spider("foo"))
+                headers = json.loads(resp.css("pre::text").get())
+                headers = {key.lower(): value for key, value in headers.items()}
+                assert headers["user-agent"] == self.browser_type
+                assert "asdf" not in headers
+
+    @pytest.mark.asyncio
+    async def test_use_custom_headers(self):
+        """Custom header processing function"""
+
+        async def important_headers(*args, **kwargs) -> dict:
+            return {"foo": "bar"}
+
+        settings_dict = {
+            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
+            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
+            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
+        }
+        async with make_handler(settings_dict) as handler:
+            with MockServer() as server:
+                req = Request(
+                    url=server.urljoin("/headers"),
+                    meta={"playwright": True},
+                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
+                )
+                resp = await handler._download_request(req, Spider("foo"))
+                headers = json.loads(resp.css("pre::text").get())
+                headers = {key.lower(): value for key, value in headers.items()}
+                assert headers["foo"] == "bar"
+                assert headers.get("user-agent") not in (self.browser_type, "foobar")
+                assert "asdf" not in headers
+
     @pytest.mark.asyncio
     async def test_event_handler_dialog_callable(self):
         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: