25 changes: 18 additions & 7 deletions README.md
@@ -24,7 +24,7 @@ to integrate `asyncio`-based projects such as `Playwright`.

* Python >= 3.7
* Scrapy >= 2.0 (!= 2.4.0)
-* Playwright >= 1.8.0a1
+* Playwright >= 1.15


## Installation
@@ -97,13 +97,17 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
the default value will be used (30000 ms at the time of writing this).
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-set-default-navigation-timeout).

-* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
+* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Optional[Union[Callable, str]]`, default `scrapy_playwright.headers.use_scrapy_headers`)

A function (or the path to a function) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
-additional default headers will be sent as well). Coroutine functions (`async def`) are
+additional default headers could be sent as well). Coroutine functions (`async def`) are
supported.

+This will be called at least once for each Scrapy request (receiving said request and the
+corresponding Playwright request), but it could be called additional times if the given
+resource generates more requests (e.g. to retrieve assets like images or scripts).
+
The function must return a `dict` object, and receives the following keyword arguments:

```python
@@ -117,10 +121,11 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header
is overridden, for consistency.

-There is another built-in function available: `scrapy_playwright.headers.use_playwright_headers`,
-which will return the headers from the Playwright request unmodified.
-When using this alternative, please keep in mind that headers passed via the `Request.headers`
-attribute or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
+Setting `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` will give complete control of the headers to
+Playwright, i.e. headers from Scrapy requests will be ignored and only headers set by
+Playwright will be sent.
+When doing this, please keep in mind that headers passed via the `Request.headers` attribute
+or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
attribute).
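
For illustration, here is a minimal sketch of a custom processing function under the semantics described above (the `inject_header` name and the `myproject.headers` import path are hypothetical; the argument names match the built-in processors in `scrapy_playwright.headers`):

```python
# myproject/headers.py -- hypothetical module holding a custom header processor.
# Keeps Playwright's own headers but forces one extra header on every request.
async def inject_header(browser_type, playwright_request, scrapy_headers) -> dict:
    headers = await playwright_request.all_headers()  # Playwright >= 1.15 API
    headers["x-custom"] = "value"
    return headers
```

The function can then be referenced from the settings module either as a callable or by import path, e.g. `PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "myproject.headers.inject_header"`.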

* `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` (type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting)
@@ -562,6 +567,12 @@ for more information about deprecations and removals.

### Currently deprecated features

+* `scrapy_playwright.headers.use_playwright_headers` function
+
+  Deprecated since
+  [`v0.0.16`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16),
+  set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
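
For reference, migrating off the deprecated function is a one-line settings change (a sketch of a project `settings.py`):

```python
# Before (deprecated since v0.0.16):
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "scrapy_playwright.headers.use_playwright_headers"

# After: give Playwright full control of request headers.
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = None
```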

* `scrapy_playwright.page.PageCoroutine` class

Deprecated since
7 changes: 7 additions & 0 deletions changelog.md
@@ -1,5 +1,12 @@
# scrapy-playwright changelog


+### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-NN-NN)
+
+* Use new headers API introduced in Playwright 1.15 (bump required Playwright version)
+* Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
+
+
### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08)

* Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting
51 changes: 34 additions & 17 deletions scrapy_playwright/handler.py
@@ -28,7 +28,7 @@
from twisted.internet.defer import Deferred, inlineCallbacks
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

-from scrapy_playwright.headers import use_scrapy_headers
+from scrapy_playwright.headers import use_scrapy_headers, use_playwright_headers
from scrapy_playwright.page import PageMethod


@@ -61,10 +61,22 @@ def __init__(self, crawler: Crawler) -> None:
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)

-        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
-            self.process_request_headers = load_object(
-                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
-            )
+        if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
+            if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
+                self.process_request_headers = None  # use headers from the Playwright request
+            else:
+                self.process_request_headers = load_object(
+                    crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
+                )
+                if self.process_request_headers is use_playwright_headers:
+                    warnings.warn(
+                        "The 'scrapy_playwright.headers.use_playwright_headers' function is"
+                        " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
+                        " instead.",
+                        category=ScrapyDeprecationWarning,
+                        stacklevel=1,
+                    )
+                    self.process_request_headers = None
else:
self.process_request_headers = use_scrapy_headers
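
Note the switch from `crawler.settings.get(...)` to a membership check: `get()` cannot distinguish an explicit `None` from an unset key, whereas `in` lets `None` act as a deliberate sentinel. A standalone sketch of the distinction, using a plain `dict` as a stand-in for Scrapy's dict-like `Settings`:

```python
# Plain dict standing in for Scrapy's Settings object (illustrative only).
settings = {"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None}

print(settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"))  # None -- looks unset
print("PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings)    # True -- explicitly set
```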

@@ -233,7 +245,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
with suppress(AttributeError):
request.meta["playwright_security_details"] = await response.security_details()

-        headers = Headers(response.headers)
+        headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
body, encoding = _encode_body(headers=headers, text=body_str)
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
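
The switch from the `headers` property to `await response.all_headers()` uses the headers API introduced in Playwright 1.15: the property exposes only a subset of the headers, while `all_headers()` asynchronously resolves the complete set. A minimal standalone sketch (the URL is illustrative, not part of this PR):

```python
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto("https://example.org")  # illustrative URL
        print(response.headers)              # immediate, but possibly incomplete
        print(await response.all_headers())  # full header set (Playwright >= 1.15)
        await browser.close()

asyncio.run(main())
```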
@@ -317,15 +329,18 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
self.stats.inc_value("playwright/request_count/aborted")
return None

-            processed_headers = await _await_if_necessary(
-                self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
-            )
+            overrides: dict = {}

-            # the request that reaches the callback should contain the headers that were sent
-            scrapy_headers.clear()
-            scrapy_headers.update(processed_headers)
+            if self.process_request_headers is not None:
+                overrides["headers"] = await _await_if_necessary(
+                    self.process_request_headers(
+                        self.browser_type, playwright_request, scrapy_headers
+                    )
+                )
+                # the request that reaches the callback should contain the final headers
+                scrapy_headers.clear()
+                scrapy_headers.update(overrides["headers"])

-            overrides: dict = {"headers": processed_headers}
if playwright_request.is_navigation_request():
overrides["method"] = method
if body is not None:
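
The `overrides` dict built above is ultimately handed to Playwright's route continuation (below the visible part of this hunk), so an empty dict now means "continue with whatever Playwright would send on its own". A hedged sketch of that pattern, assuming the handler continues the route as upstream scrapy-playwright does:

```python
# Sketch only: how the overrides dict typically reaches Playwright.
# route is a playwright.async_api.Route; the optional keys built above
# ("headers", "method", and presumably a body override) map onto the
# keyword arguments of route.continue_().
await route.continue_(**overrides)
```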
@@ -351,20 +366,22 @@ async def _await_if_necessary(obj):


def _make_request_logger(context_name: str) -> Callable:
-    def _log_request(request: PlaywrightRequest) -> None:
+    async def _log_request(request: PlaywrightRequest) -> None:
+        referrer = await request.header_value("referer")
logger.debug(
f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
f"(resource type: {request.resource_type}, referrer: {referrer})"
)

return _log_request


def _make_response_logger(context_name: str) -> Callable:
-    def _log_request(response: PlaywrightResponse) -> None:
+    async def _log_request(response: PlaywrightResponse) -> None:
+        referrer = await response.header_value("referer")
logger.debug(
f"[Context={context_name}] Response: <{response.status} {response.url}> "
f"(referrer: {response.headers.get('referer')})"
f"(referrer: {referrer})"
)

return _log_request
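
These logging helpers are now coroutine functions so they can await `Request.header_value()`; Playwright's async API accepts coroutines as event listeners. A usage sketch (the context name and `page` object are illustrative):

```python
# Sketch: attaching the loggers as Playwright event listeners on a page.
page.on("request", _make_request_logger("default"))
page.on("response", _make_response_logger("default"))
```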
31 changes: 18 additions & 13 deletions scrapy_playwright/headers.py
@@ -2,10 +2,11 @@
This module includes functions to process request headers.
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
"""

+import warnings
from urllib.parse import urlparse

from playwright.async_api import Request as PlaywrightRequest
+from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http.headers import Headers


@@ -17,30 +18,34 @@ async def use_scrapy_headers(
"""Scrapy headers take precedence over Playwright headers for navigation requests.
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""

-    headers = scrapy_headers.to_unicode_dict()
+    scrapy_headers_str = scrapy_headers.to_unicode_dict()
+    playwright_headers = await playwright_request.all_headers()

# Scrapy's user agent has priority over Playwright's
-    headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
+    scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))

if playwright_request.is_navigation_request():
if browser_type == "firefox":
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
headers["host"] = urlparse(playwright_request.url).netloc
return headers
scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc
return scrapy_headers_str

# override user agent, for consistency with other requests
if headers.get("user-agent"):
return {
**playwright_request.headers,
"user-agent": headers["user-agent"],
}
return playwright_request.headers
if scrapy_headers_str.get("user-agent"):
playwright_headers["user-agent"] = scrapy_headers_str["user-agent"]
return playwright_headers


async def use_playwright_headers(
browser_type: str,
playwright_request: PlaywrightRequest,
scrapy_headers: Headers,
) -> dict:
"""Return headers from the Playwright request, unaltered"""
return playwright_request.headers
warnings.warn(
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead.",
category=ScrapyDeprecationWarning,
stacklevel=1,
)
return await playwright_request.all_headers()
2 changes: 1 addition & 1 deletion setup.py
@@ -35,6 +35,6 @@
python_requires=">=3.7",
install_requires=[
"scrapy>=2.0,!=2.4.0",
"playwright>=1.8.0a1",
"playwright>=1.15",
],
)
153 changes: 153 additions & 0 deletions tests/test_headers.py
@@ -0,0 +1,153 @@
import json
import platform
import sys
import warnings

import pytest
from scrapy import Spider, Request
from scrapy.http.headers import Headers

from tests import make_handler
from tests.mockserver import MockServer

from scrapy_playwright.headers import use_playwright_headers


@pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added in Python 3.8")
@pytest.mark.asyncio
async def test_use_playwright_headers_deprecated():
from unittest.mock import AsyncMock

headers = {"foo": "bar", "a": "b"}
playwright_request = AsyncMock()
playwright_request.all_headers.return_value = headers
with warnings.catch_warnings(record=True) as warning_list:
processed_headers = await use_playwright_headers("foobar", playwright_request, Headers({}))
assert processed_headers == headers
assert str(warning_list[0].message) == (
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead."
)


class MixinProcessHeadersTestCase:
@pytest.mark.asyncio
async def test_user_agent(self):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"USER_AGENT": None,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
# if Scrapy's user agent is None, use the one from the Browser
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type

# if Scrapy's user agent is set to some value, use it
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == "foobar"

@pytest.mark.asyncio
async def test_use_playwright_headers(self):
"""Ignore Scrapy headers"""
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type
assert "asdf" not in headers

@pytest.mark.asyncio
async def test_use_playwright_headers_deprecated(self):
"""Ignore Scrapy headers"""
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": use_playwright_headers,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
}
with warnings.catch_warnings(record=True) as warning_list:
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type
assert "asdf" not in headers

assert str(warning_list[0].message) == (
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead."
)

@pytest.mark.asyncio
async def test_use_custom_headers(self):
"""Custom header processing function"""

async def important_headers(*args, **kwargs) -> dict:
return {"foo": "bar"}

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["foo"] == "bar"
assert headers.get("user-agent") not in (self.browser_type, "foobar")
assert "asdf" not in headers


class TestProcessHeadersChromium(MixinProcessHeadersTestCase):
browser_type = "chromium"


class TestProcessHeadersFirefox(MixinProcessHeadersTestCase):
browser_type = "firefox"


@pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
class TestProcessHeadersWebkit(MixinProcessHeadersTestCase):
browser_type = "webkit"