23 changes: 21 additions & 2 deletions README.md
@@ -103,6 +103,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
the default value will be used (30000 ms at the time of writing this).
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser_contextset_default_navigation_timeouttimeout).
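
For example, a minimal sketch of setting it in `settings.py` (the value shown is hypothetical; the setting takes milliseconds):

```python
# settings.py: 10-second navigation timeout (value is in milliseconds)
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 10 * 1000
```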

* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `str`, default `scrapy_playwright.headers.use_scrapy_headers`)

The path to a coroutine function (`async def`) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
additional default headers will be sent as well).

The function receives the following keyword arguments and must return a `dict` object:

```python
browser_type: str, playwright_request: playwright.async_api.Request, scrapy_headers: scrapy.http.headers.Headers
```

The default value (`scrapy_playwright.headers.use_scrapy_headers`) tries to emulate Scrapy's
behaviour for navigation requests, i.e. overriding headers with their values from the Scrapy request.
For non-navigation requests (e.g. images, stylesheets, scripts, etc.), only the `User-Agent` header
is overridden, for consistency.

There is another function available: `scrapy_playwright.headers.use_playwright_headers`,
which will return the headers from the Playwright request without any changes.
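
As an illustration, here is a minimal sketch of a custom processing function (the module path, function name, and extra header are hypothetical):

```python
# myproject/headers.py (hypothetical module)
async def custom_headers(browser_type, playwright_request, scrapy_headers) -> dict:
    # start from Scrapy's headers and add one illustrative extra header
    headers = scrapy_headers.to_unicode_dict()
    headers["x-example"] = "value"  # hypothetical header
    return headers
```

which could then be enabled in `settings.py`:

```python
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "myproject.headers.custom_headers"
```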

## Basic usage

@@ -135,8 +154,8 @@ class AwesomeSpider(scrapy.Spider):
By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
This could cause some sites to react in unexpected ways, for instance if the user agent
- does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser,
- set the Scrapy user agent to `None`.
+ does not match the running Browser. If you prefer the `User-Agent` sent by
+ default by the specific browser you're using, set the Scrapy user agent to `None`.
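
For instance, a minimal sketch in `settings.py`:

```python
# settings.py: let the browser send its own default User-Agent
USER_AGENT = None
```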


## Receiving the Page object in the callback
47 changes: 28 additions & 19 deletions scrapy_playwright/handler.py
@@ -5,7 +5,6 @@
from contextlib import suppress
from time import time
from typing import Callable, Dict, Optional, Type, TypeVar
-from urllib.parse import urlparse

from playwright.async_api import (
BrowserContext,
@@ -22,9 +21,11 @@
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.defer import deferred_from_coro
+from scrapy.utils.misc import load_object
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks

+from scrapy_playwright.headers import use_scrapy_headers
from scrapy_playwright.page import PageCoroutine


@@ -66,13 +67,21 @@ def __init__(self, crawler: Crawler) -> None:

        self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
        self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}

        self.default_navigation_timeout: Optional[float] = None
        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
            with suppress(TypeError, ValueError):
                self.default_navigation_timeout = float(
                    crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
                )

+        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
+            self.process_request_headers = load_object(
+                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
+            )
+        else:
+            self.process_request_headers = use_scrapy_headers
Comment on lines +78 to +83

@Gallaecio (Contributor) commented on Oct 20, 2021:

> Shame, this could be a 1-liner in Scrapy 2.4+ 🙁
>
> (technically also here, but I’m guessing you don’t want to have load_object parse a string in the default scenario)

Member Author replied:

> Indeed, I'd like to avoid any (admittedly small) overhead if we already have the object.
>
> This is a nice catch, I was planning on documenting the setting as accepting either paths or functions directly, but didn't remember that was only valid on Scrapy 2.4+. I'll be sure to mention that when I write the docs, thanks!
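
A sketch of the one-liner being discussed, assuming Scrapy 2.4+, where `load_object` also accepts a non-string object and returns it unchanged:

```python
# hypothetical simplification, valid only on Scrapy 2.4+
self.process_request_headers = load_object(
    crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS") or use_scrapy_headers
)
```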


        default_context_kwargs: dict = {}
        if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
            default_context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXT_ARGS")
@@ -180,9 +189,8 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
        await page.route(
            "**",
            self._make_request_handler(
-                url=request.url,
                method=request.method,
-                headers=request.headers.to_unicode_dict(),
+                scrapy_headers=request.headers,
                body=request.body,
                encoding=getattr(request, "encoding", None),
            ),
@@ -249,23 +257,24 @@ def close_browser_context_callback() -> None:
        return close_browser_context_callback

    def _make_request_handler(
-        self, url: str, method: str, headers: dict, body: Optional[bytes], encoding: str = "utf8"
+        self, method: str, scrapy_headers: Headers, body: Optional[bytes], encoding: str = "utf8"
    ) -> Callable:
-        def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
+        async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
            """Override request headers, method and body."""
-            headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
-            if pw_request.url == url:
-                overrides: dict = {"method": method, "headers": headers}
+            processed_headers = await self.process_request_headers(
+                self.browser_type, playwright_request, scrapy_headers
+            )
+
+            # the request that reaches the callback should contain the headers that were sent
+            scrapy_headers.clear()
+            scrapy_headers.update(processed_headers)
+
+            overrides: dict = {"headers": processed_headers}
+            if playwright_request.is_navigation_request():
+                overrides["method"] = method
                if body is not None:
                    overrides["post_data"] = body.decode(encoding)
-                if self.browser_type == "firefox":
-                    # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
-                    overrides["headers"]["host"] = urlparse(pw_request.url).netloc
-            else:
-                overrides = {"headers": pw_request.headers.copy()}
-                # override user agent, for consistency with other requests
-                if headers.get("user-agent"):
-                    overrides["headers"]["user-agent"] = headers["user-agent"]
-            asyncio.create_task(route.continue_(**overrides))
-
-        return request_handler
+
+            await route.continue_(**overrides)
+
+        return _request_handler
44 changes: 44 additions & 0 deletions scrapy_playwright/headers.py
@@ -0,0 +1,44 @@
from urllib.parse import urlparse

from playwright.async_api import Request as PlaywrightRequest
from scrapy.http.headers import Headers


"""
This module includes functions to process request headers.
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
"""


async def use_scrapy_headers(
    browser_type: str,
    playwright_request: PlaywrightRequest,
    scrapy_headers: Headers,
) -> dict:
    """Scrapy headers take precedence over Playwright headers for navigation requests.
    For non-navigation requests, only User-Agent is taken from the Scrapy headers."""

    headers = scrapy_headers.to_unicode_dict()

    # Scrapy's user agent has priority over Playwright's
    headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))

    if playwright_request.is_navigation_request():
        if browser_type == "firefox":
            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
            headers["host"] = urlparse(playwright_request.url).netloc
        return headers
    else:
        # override user agent, for consistency with other requests
        if headers.get("user-agent"):
            playwright_request.headers["user-agent"] = headers["user-agent"]
        return playwright_request.headers


async def use_playwright_headers(
    browser_type: str,
    playwright_request: PlaywrightRequest,
    scrapy_headers: Headers,
) -> dict:
    """Return headers from the Playwright request, unaltered"""
    return playwright_request.headers
47 changes: 47 additions & 0 deletions tests/test_playwright_requests.py
@@ -259,6 +259,53 @@ async def test_user_agent(self):
        headers = {key.lower(): value for key, value in headers.items()}
        assert headers["user-agent"] == "foobar"

    @pytest.mark.asyncio
    async def test_use_playwright_headers(self):
        """Ignore Scrapy headers"""
        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["user-agent"] == self.browser_type
                assert "asdf" not in headers

    @pytest.mark.asyncio
    async def test_use_custom_headers(self):
        """Custom header processing function"""

        async def important_headers(*args, **kwargs) -> dict:
            return {"foo": "bar"}

        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {key.lower(): value for key, value in headers.items()}
                assert headers["foo"] == "bar"
                assert headers.get("user-agent") not in (self.browser_type, "foobar")
                assert "asdf" not in headers

    @pytest.mark.asyncio
    async def test_event_handler_dialog_callable(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: