From dfcc3ff943daa72228f7099f76b684fd094eca4f Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 20 Oct 2021 12:26:54 -0300 Subject: [PATCH 1/9] Process request headers --- scrapy_playwright/handler.py | 38 +++++++++++++++++-------------- scrapy_playwright/headers.py | 33 +++++++++++++++++++++++++++ tests/test_playwright_requests.py | 21 +++++++++++++++++ 3 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 scrapy_playwright/headers.py diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 8e63ca98..ec8df6ca 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -4,7 +4,6 @@ from collections import defaultdict from time import time from typing import Callable, Dict, Optional, Type, TypeVar -from urllib.parse import urlparse from playwright.async_api import ( BrowserContext, @@ -21,9 +20,11 @@ from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes from scrapy.utils.defer import deferred_from_coro +from scrapy.utils.misc import load_object from scrapy.utils.reactor import verify_installed_reactor from twisted.internet.defer import Deferred, inlineCallbacks +from scrapy_playwright.headers import use_scrapy_headers from scrapy_playwright.page import PageCoroutine @@ -68,6 +69,12 @@ def __init__(self, crawler: Crawler) -> None: self.default_navigation_timeout: Optional[int] = ( crawler.settings.getint("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") or None ) + if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"): + self.process_request_headers = load_object( + crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] + ) + else: + self.process_request_headers = use_scrapy_headers default_context_kwargs: dict = {} if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings: @@ -176,9 +183,8 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: await page.route( "**", self._make_request_handler( - url=request.url, method=request.method, - 
headers=request.headers.to_unicode_dict(), + scrapy_headers=request.headers, body=request.body, encoding=getattr(request, "encoding", None), ), @@ -245,23 +251,21 @@ def close_browser_context_callback() -> None: return close_browser_context_callback def _make_request_handler( - self, url: str, method: str, headers: dict, body: Optional[bytes], encoding: str = "utf8" + self, method: str, scrapy_headers: dict, body: Optional[bytes], encoding: str = "utf8" ) -> Callable: - def request_handler(route: Route, pw_request: PlaywrightRequest) -> None: + async def request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" - headers.setdefault("user-agent", pw_request.headers.get("user-agent")) - if pw_request.url == url: - overrides: dict = {"method": method, "headers": headers} + processed_headers = await self.process_request_headers( + self.browser_type, playwright_request, scrapy_headers + ) + scrapy_headers.clear() + scrapy_headers.update(processed_headers) + overrides: dict = {"headers": processed_headers} + if playwright_request.is_navigation_request(): + overrides["method"] = method if body is not None: overrides["post_data"] = body.decode(encoding) - if self.browser_type == "firefox": - # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET - overrides["headers"]["host"] = urlparse(pw_request.url).netloc - else: - overrides = {"headers": pw_request.headers.copy()} - # override user agent, for consistency with other requests - if headers.get("user-agent"): - overrides["headers"]["user-agent"] = headers["user-agent"] - asyncio.create_task(route.continue_(**overrides)) + + await route.continue_(**overrides) return request_handler diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py new file mode 100644 index 00000000..6c427f93 --- /dev/null +++ b/scrapy_playwright/headers.py @@ -0,0 +1,33 @@ +from urllib.parse import urlparse + +from playwright.async_api import 
Request as PlaywrightRequest + + +async def use_scrapy_headers( + browser_type: str, + playwright_request: PlaywrightRequest, + scrapy_headers: dict, +) -> dict: + headers = scrapy_headers.to_unicode_dict() + + # Scrapy's user agent has priority over Playwright's + headers.setdefault("user-agent", playwright_request.headers.get("user-agent")) + + if playwright_request.is_navigation_request(): + if browser_type == "firefox": + # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET + headers["host"] = urlparse(playwright_request.url).netloc + return headers + else: + # override user agent, for consistency with other requests + if headers.get("user-agent"): + playwright_request.headers["user-agent"] = headers["user-agent"] + return playwright_request.headers + + +async def use_playwright_headers( + browser_type: str, + playwright_request: PlaywrightRequest, + scrapy_headers: dict, +) -> dict: + return playwright_request.headers diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 38c9087e..5e170eb5 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -224,6 +224,27 @@ async def test_user_agent(self): headers = {key.lower(): value for key, value in headers.items()} assert headers["user-agent"] == "foobar" + @pytest.mark.asyncio + async def test_use_playwright_headers(self): + """Ignore Scrapy headers""" + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers", # noqa: E501 + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + 
headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + assert "asdf" not in headers + @pytest.mark.asyncio async def test_event_handler_dialog_callable(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: From a46b1db62b6fca9e5d4030ab3f68fa76c5c7f865 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 20 Oct 2021 12:33:12 -0300 Subject: [PATCH 2/9] Typing fix --- scrapy_playwright/handler.py | 2 +- scrapy_playwright/headers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index ec8df6ca..23cdd85e 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -251,7 +251,7 @@ def close_browser_context_callback() -> None: return close_browser_context_callback def _make_request_handler( - self, method: str, scrapy_headers: dict, body: Optional[bytes], encoding: str = "utf8" + self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8" ) -> Callable: async def request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py index 6c427f93..26982c6c 100644 --- a/scrapy_playwright/headers.py +++ b/scrapy_playwright/headers.py @@ -1,12 +1,13 @@ from urllib.parse import urlparse from playwright.async_api import Request as PlaywrightRequest +from scrapy.http.headers import Headers async def use_scrapy_headers( browser_type: str, playwright_request: PlaywrightRequest, - scrapy_headers: dict, + scrapy_headers: Headers, ) -> dict: headers = scrapy_headers.to_unicode_dict() From c7225c44fc96809a5949d8e24276e03fd90748c9 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 20 Oct 2021 12:36:22 -0300 Subject: [PATCH 3/9] Rename private request handler --- scrapy_playwright/handler.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 23cdd85e..2f01c6b1 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -253,7 +253,7 @@ def close_browser_context_callback() -> None: def _make_request_handler( self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8" ) -> Callable: - async def request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: + async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" processed_headers = await self.process_request_headers( self.browser_type, playwright_request, scrapy_headers @@ -268,4 +268,4 @@ async def request_handler(route: Route, playwright_request: PlaywrightRequest) - await route.continue_(**overrides) - return request_handler + return _request_handler From 7ed9d28b39bb627ea4860c3f9016bee1ad96618c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 20 Oct 2021 12:37:42 -0300 Subject: [PATCH 4/9] Additional typing fix --- scrapy_playwright/headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py index 26982c6c..9cc33d51 100644 --- a/scrapy_playwright/headers.py +++ b/scrapy_playwright/headers.py @@ -29,6 +29,6 @@ async def use_scrapy_headers( async def use_playwright_headers( browser_type: str, playwright_request: PlaywrightRequest, - scrapy_headers: dict, + scrapy_headers: Headers, ) -> dict: return playwright_request.headers From b10bfb19aca2008caa1df15d976fc5155e450dd5 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 20 Oct 2021 12:50:15 -0300 Subject: [PATCH 5/9] Clarifying comment --- scrapy_playwright/handler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 2f01c6b1..34445241 100644 --- a/scrapy_playwright/handler.py 
+++ b/scrapy_playwright/handler.py @@ -258,8 +258,11 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) processed_headers = await self.process_request_headers( self.browser_type, playwright_request, scrapy_headers ) + + # the request that reches the callback should contain the headers that were sent scrapy_headers.clear() scrapy_headers.update(processed_headers) + overrides: dict = {"headers": processed_headers} if playwright_request.is_navigation_request(): overrides["method"] = method From 690d0020b7c1a47f3a989a3bb5d37f38ec77b3c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Oct 2021 18:53:48 +0200 Subject: [PATCH 6/9] =?UTF-8?q?scrapy=5Fplaywright/handler.py:=20fix=20typ?= =?UTF-8?q?o:=20reches=20=E2=86=92=20reaches?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_playwright/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 34445241..a11d5063 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -259,7 +259,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) self.browser_type, playwright_request, scrapy_headers ) - # the request that reches the callback should contain the headers that were sent + # the request that reaches the callback should contain the headers that were sent scrapy_headers.clear() scrapy_headers.update(processed_headers) From 53e925987aadf77b0d90d30faaddac2e6c17fc1b Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 26 Jan 2022 10:14:23 -0300 Subject: [PATCH 7/9] Add test/docstring --- scrapy_playwright/headers.py | 10 ++++++++++ tests/test_playwright_requests.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py index 9cc33d51..d3771195 100644 --- a/scrapy_playwright/headers.py +++ 
b/scrapy_playwright/headers.py @@ -4,11 +4,20 @@ from scrapy.http.headers import Headers +""" +This module includes functions to process request headers. +Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information. +""" + + async def use_scrapy_headers( browser_type: str, playwright_request: PlaywrightRequest, scrapy_headers: Headers, ) -> dict: + """Scrapy headers take precedence over Playwright headers for navigation requests. + For non-navigation requests, only User-Agent is taken from the Scrapy headers.""" + headers = scrapy_headers.to_unicode_dict() # Scrapy's user agent has priority over Playwright's @@ -31,4 +40,5 @@ async def use_playwright_headers( playwright_request: PlaywrightRequest, scrapy_headers: Headers, ) -> dict: + """Return headers from the Playwright request, unaltered""" return playwright_request.headers diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 32d1c566..ecb6cc48 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -280,6 +280,32 @@ async def test_use_playwright_headers(self): assert headers["user-agent"] == self.browser_type assert "asdf" not in headers + @pytest.mark.asyncio + async def test_use_custom_headers(self): + """Custom header processing function""" + + async def important_headers(*args, **kwargs) -> dict: + return {"foo": "bar"} + + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + 
assert headers["foo"] == "bar" + assert headers.get("user-agent") not in (self.browser_type, "foobar") + assert "asdf" not in headers + @pytest.mark.asyncio async def test_event_handler_dialog_callable(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: From 16ac2a2d57a2628e738d368a502effbc10cf3f2a Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 26 Jan 2022 12:56:00 -0300 Subject: [PATCH 8/9] Docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS --- README.md | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e4a6a4e8..03f5c15e 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" the default value will be used (30000 ms at the time of writing this). See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout). +* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str|Callable`, default `scrapy_playwright.headers.use_scrapy_headers`) + + A coroutine function (`async def`), or the path to one, that processes headers for a given request + and returns a dictionary with the headers to be use (note that, depending on the browser, additional + default headers will be sent as well). + + The function must return a `dict` object, and receives the following keyword arguments: + + ```python + browser_type: str, playwright_request: playwright.async_api.Request, scrapy_headers: scrapy.http.headers.Headers + ``` + + The default value (`scrapy_playwright.headers.use_scrapy_headers`) tries to emulate Scrapy's + behaviour for navigation requests, i.e. overriding headers with their values from the Scrapy request. + For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header + is overridden, for consistency. 
+ + There is another function available: `scrapy_playwright.headers.use_playwright_headers`, + which will return the headers from the Playwright request without any changes. ## Basic usage @@ -135,8 +154,8 @@ class AwesomeSpider(scrapy.Spider): By default, outgoing requests include the `User-Agent` set by Scrapy (either with the `USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute). This could cause some sites to react in unexpected ways, for instance if the user agent -does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser, -set the Scrapy user agent to `None`. +does not match the running Browser. If you prefer the `User-Agent` sent by +default by the specific browser you're using, set the Scrapy user agent to `None`. ## Receiving the Page object in the callback From 42a1280766b36317d3f6dfa97e5405e1839c6470 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 26 Jan 2022 17:15:55 -0300 Subject: [PATCH 9/9] Update readme --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 03f5c15e..2bbf0026 100644 --- a/README.md +++ b/README.md @@ -103,11 +103,11 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" the default value will be used (30000 ms at the time of writing this). See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout). 
-* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str|Callable`, default `scrapy_playwright.headers.use_scrapy_headers`) +* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`) - A coroutine function (`async def`), or the path to one, that processes headers for a given request - and returns a dictionary with the headers to be use (note that, depending on the browser, additional - default headers will be sent as well). + The path to a coroutine function (`async def`) that processes headers for a given request + and returns a dictionary with the headers to be used (note that, depending on the browser, + additional default headers will be sent as well). The function must return a `dict` object, and receives the following keyword arguments: