diff --git a/README.md b/README.md index ca454008..c15e1757 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ to integrate `asyncio`-based projects such as `Playwright`. * Python >= 3.7 * Scrapy >= 2.0 (!= 2.4.0) -* Playwright >= 1.8.0a1 +* Playwright >= 1.15 ## Installation @@ -97,13 +97,17 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" the default value will be used (30000 ms at the time of writing this). See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-set-default-navigation-timeout). -* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`) +* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Optional[Union[Callable, str]]`, default `scrapy_playwright.headers.use_scrapy_headers`) A function (or the path to a function) that processes headers for a given request and returns a dictionary with the headers to be used (note that, depending on the browser, - additional default headers will be sent as well). Coroutine functions (`async def`) are + additional default headers could be sent as well). Coroutine functions (`async def`) are supported. + This will be called at least once for each Scrapy request (receiving said request and the + corresponding Playwright request), but it could be called additional times if the given + resource generates more requests (e.g. to retrieve assets like images or scripts). + The function must return a `dict` object, and receives the following keyword arguments: ```python @@ -117,10 +121,11 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header is overridden, for consistency. 
- There is another built-in function available: `scrapy_playwright.headers.use_playwright_headers`, - which will return the headers from the Playwright request unmodified. - When using this alternative, please keep in mind that headers passed via the `Request.headers` - attribute or set by Scrapy components are ignored (including cookies set via the `Request.cookies` + Setting `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` will give complete control of the headers to + Playwright, i.e. headers from Scrapy requests will be ignored and only headers set by + Playwright will be sent. + When doing this, please keep in mind that headers passed via the `Request.headers` attribute + or set by Scrapy components are ignored (including cookies set via the `Request.cookies` attribute). * `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` (type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting) @@ -562,6 +567,12 @@ for more information about deprecations and removals. ### Currently deprecated features +* `scrapy_playwright.headers.use_playwright_headers` function + + Deprecated since + [`v0.0.16`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16), + set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead + * `scrapy_playwright.page.PageCoroutine` class Deprecated since diff --git a/changelog.md b/changelog.md index 357b2063..11a83722 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,12 @@ # scrapy-playwright changelog + +### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-NN-NN) + +* Use new headers API introduced in Playwright 1.15 (bump required Playwright version) +* Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead + + ### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08) * Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting diff --git a/scrapy_playwright/handler.py 
b/scrapy_playwright/handler.py index 888108c4..64453697 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -28,7 +28,7 @@ from twisted.internet.defer import Deferred, inlineCallbacks from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding -from scrapy_playwright.headers import use_scrapy_headers +from scrapy_playwright.headers import use_scrapy_headers, use_playwright_headers from scrapy_playwright.page import PageMethod @@ -61,10 +61,22 @@ def __init__(self, crawler: Crawler) -> None: crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") ) - if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"): - self.process_request_headers = load_object( - crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] - ) + if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings: + if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: + self.process_request_headers = None # use headers from the Playwright request + else: + self.process_request_headers = load_object( + crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] + ) + if self.process_request_headers is use_playwright_headers: + warnings.warn( + "The 'scrapy_playwright.headers.use_playwright_headers' function is" + " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'" + " instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + self.process_request_headers = None else: self.process_request_headers = use_scrapy_headers @@ -233,7 +245,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res with suppress(AttributeError): request.meta["playwright_security_details"] = await response.security_details() - headers = Headers(response.headers) + headers = Headers(await response.all_headers()) headers.pop("Content-Encoding", None) body, encoding = _encode_body(headers=headers, text=body_str) respcls = responsetypes.from_args(headers=headers, url=page.url, body=body) @@ -317,15 +329,18 @@ async 
def _request_handler(route: Route, playwright_request: PlaywrightRequest) self.stats.inc_value("playwright/request_count/aborted") return None - processed_headers = await _await_if_necessary( - self.process_request_headers(self.browser_type, playwright_request, scrapy_headers) - ) + overrides: dict = {} - # the request that reaches the callback should contain the headers that were sent - scrapy_headers.clear() - scrapy_headers.update(processed_headers) + if self.process_request_headers is not None: + overrides["headers"] = await _await_if_necessary( + self.process_request_headers( + self.browser_type, playwright_request, scrapy_headers + ) + ) + # the request that reaches the callback should contain the final headers + scrapy_headers.clear() + scrapy_headers.update(overrides["headers"]) - overrides: dict = {"headers": processed_headers} if playwright_request.is_navigation_request(): overrides["method"] = method if body is not None: @@ -351,20 +366,22 @@ async def _await_if_necessary(obj): def _make_request_logger(context_name: str) -> Callable: - def _log_request(request: PlaywrightRequest) -> None: + async def _log_request(request: PlaywrightRequest) -> None: + referrer = await request.header_value("referer") logger.debug( f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> " - f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})" + f"(resource type: {request.resource_type}, referrer: {referrer})" ) return _log_request def _make_response_logger(context_name: str) -> Callable: - def _log_request(response: PlaywrightResponse) -> None: + async def _log_request(response: PlaywrightResponse) -> None: + referrer = await response.header_value("referer") logger.debug( f"[Context={context_name}] Response: <{response.status} {response.url}> " - f"(referrer: {response.headers.get('referer')})" + f"(referrer: {referrer})" ) return _log_request diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py 
index d450db3a..330fa6d7 100644 --- a/scrapy_playwright/headers.py +++ b/scrapy_playwright/headers.py @@ -2,10 +2,11 @@ This module includes functions to process request headers. Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information. """ - +import warnings from urllib.parse import urlparse from playwright.async_api import Request as PlaywrightRequest +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http.headers import Headers @@ -17,24 +18,22 @@ async def use_scrapy_headers( """Scrapy headers take precedence over Playwright headers for navigation requests. For non-navigation requests, only User-Agent is taken from the Scrapy headers.""" - headers = scrapy_headers.to_unicode_dict() + scrapy_headers_str = scrapy_headers.to_unicode_dict() + playwright_headers = await playwright_request.all_headers() # Scrapy's user agent has priority over Playwright's - headers.setdefault("user-agent", playwright_request.headers.get("user-agent")) + scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent")) if playwright_request.is_navigation_request(): if browser_type == "firefox": # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET - headers["host"] = urlparse(playwright_request.url).netloc - return headers + scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc + return scrapy_headers_str # override user agent, for consistency with other requests - if headers.get("user-agent"): - return { - **playwright_request.headers, - "user-agent": headers["user-agent"], - } - return playwright_request.headers + if scrapy_headers_str.get("user-agent"): + playwright_headers["user-agent"] = scrapy_headers_str["user-agent"] + return playwright_headers async def use_playwright_headers( @@ -42,5 +41,11 @@ async def use_playwright_headers( playwright_request: PlaywrightRequest, scrapy_headers: Headers, ) -> dict: - """Return headers from the Playwright request, unaltered""" - return 
playwright_request.headers + warnings.warn( + "The 'scrapy_playwright.headers.use_playwright_headers' function is" + " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'" + " instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + return await playwright_request.all_headers() diff --git a/setup.py b/setup.py index deeffc7a..b22c5529 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,6 @@ python_requires=">=3.7", install_requires=[ "scrapy>=2.0,!=2.4.0", - "playwright>=1.8.0a1", + "playwright>=1.15", ], ) diff --git a/tests/test_headers.py b/tests/test_headers.py new file mode 100644 index 00000000..9a6bc346 --- /dev/null +++ b/tests/test_headers.py @@ -0,0 +1,153 @@ +import json +import platform +import sys +import warnings + +import pytest +from scrapy import Spider, Request +from scrapy.http.headers import Headers + +from tests import make_handler +from tests.mockserver import MockServer + +from scrapy_playwright.headers import use_playwright_headers + + +@pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added on Python 3.8") +@pytest.mark.asyncio +async def test_use_playwright_headers_deprecated(): + from unittest.mock import AsyncMock + + headers = {"foo": "bar", "a": "b"} + playwright_request = AsyncMock() + playwright_request.all_headers.return_value = headers + with warnings.catch_warnings(record=True) as warning_list: + processed_headers = await use_playwright_headers("foobar", playwright_request, Headers({})) + assert processed_headers == headers + assert str(warning_list[0].message) == ( + "The 'scrapy_playwright.headers.use_playwright_headers' function is" + " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'" + " instead." 
+ ) + + +class MixinProcessHeadersTestCase: + @pytest.mark.asyncio + async def test_user_agent(self): + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "USER_AGENT": None, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + # if Scrapy's user agent is None, use the one from the Browser + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + + # if Scrapy's user agent is set to some value, use it + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == "foobar" + + @pytest.mark.asyncio + async def test_use_playwright_headers(self): + """Ignore Scrapy headers""" + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, + "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + assert "asdf" not in headers + + @pytest.mark.asyncio + async def 
test_use_playwright_headers_deprecated(self): + """Ignore Scrapy headers""" + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": use_playwright_headers, + "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000, + } + with warnings.catch_warnings(record=True) as warning_list: + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + assert "asdf" not in headers + + assert str(warning_list[0].message) == ( + "The 'scrapy_playwright.headers.use_playwright_headers' function is" + " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'" + " instead." 
+ ) + + @pytest.mark.asyncio + async def test_use_custom_headers(self): + """Custom header processing function""" + + async def important_headers(*args, **kwargs) -> dict: + return {"foo": "bar"} + + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["foo"] == "bar" + assert headers.get("user-agent") not in (self.browser_type, "foobar") + assert "asdf" not in headers + + +class TestProcessHeadersChromium(MixinProcessHeadersTestCase): + browser_type = "chromium" + + +class TestProcessHeadersFirefox(MixinProcessHeadersTestCase): + browser_type = "firefox" + + +@pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") +class TestProcessHeadersWebkit(MixinProcessHeadersTestCase): + browser_type = "webkit" diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index adb65770..d207c14d 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -1,4 +1,3 @@ -import json import logging import platform import subprocess @@ -182,18 +181,19 @@ async def test_timeout_error(self, caplog): f"Closing page due to failed request: {req} ({type(excinfo.value)})", ) in caplog.record_tuples - @pytest.mark.skipif(sys.version_info < (3, 8), reason="Fails on py37") + @pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added on Python 3.8") @patch("scrapy_playwright.handler.logger") @pytest.mark.asyncio async def 
test_route_continue_exception(self, logger): - """This test fails on Python 3.7 in the "logger.warning.assert_called_with" assertion, - but the "Captured log call" section shows the exact message that is expected :shrug: - """ + from unittest.mock import AsyncMock + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: req_handler = handler._make_request_handler("GET", Headers({}), body=None) route = MagicMock() - playwright_request = MagicMock() + playwright_request = AsyncMock() playwright_request.url = "https//example.org" + playwright_request.is_navigation_request = MagicMock(return_value=True) + playwright_request.all_headers.return_value = {} # safe error, only warn exc = Exception("Target page, context or browser has been closed") @@ -273,85 +273,6 @@ async def test_page_method_pdf(self): assert pdf_file.file.read() == req.meta["playwright_page_methods"]["pdf"].result assert get_mimetype(pdf_file) == "application/pdf" - @pytest.mark.asyncio - async def test_user_agent(self): - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, - "USER_AGENT": None, - } - async with make_handler(settings_dict) as handler: - with MockServer() as server: - # if Scrapy's user agent is None, use the one from the Browser - req = Request( - url=server.urljoin("/headers"), - meta={"playwright": True}, - ) - resp = await handler._download_request(req, Spider("foo")) - headers = json.loads(resp.css("pre::text").get()) - headers = {key.lower(): value for key, value in headers.items()} - assert headers["user-agent"] == self.browser_type - - # if Scrapy's user agent is set to some value, use it - req = Request( - url=server.urljoin("/headers"), - meta={"playwright": True}, - headers={"User-Agent": "foobar"}, - ) - resp = await handler._download_request(req, Spider("foo")) - headers = json.loads(resp.css("pre::text").get()) - headers = {key.lower(): value for key, value in 
headers.items()} - assert headers["user-agent"] == "foobar" - - @pytest.mark.asyncio - async def test_use_playwright_headers(self): - """Ignore Scrapy headers""" - from scrapy_playwright.headers import use_playwright_headers - - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, - "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": use_playwright_headers, - } - async with make_handler(settings_dict) as handler: - with MockServer() as server: - req = Request( - url=server.urljoin("/headers"), - meta={"playwright": True}, - headers={"User-Agent": "foobar", "Asdf": "qwerty"}, - ) - resp = await handler._download_request(req, Spider("foo")) - headers = json.loads(resp.css("pre::text").get()) - headers = {key.lower(): value for key, value in headers.items()} - assert headers["user-agent"] == self.browser_type - assert "asdf" not in headers - - @pytest.mark.asyncio - async def test_use_custom_headers(self): - """Custom header processing function""" - - async def important_headers(*args, **kwargs) -> dict: - return {"foo": "bar"} - - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, - "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers, - } - async with make_handler(settings_dict) as handler: - with MockServer() as server: - req = Request( - url=server.urljoin("/headers"), - meta={"playwright": True}, - headers={"User-Agent": "foobar", "Asdf": "qwerty"}, - ) - resp = await handler._download_request(req, Spider("foo")) - headers = json.loads(resp.css("pre::text").get()) - headers = {key.lower(): value for key, value in headers.items()} - assert headers["foo"] == "bar" - assert headers.get("user-agent") not in (self.browser_type, "foobar") - assert "asdf" not in headers - @pytest.mark.asyncio async def test_event_handler_dialog_callable(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": 
self.browser_type}) as handler: diff --git a/tox.ini b/tox.ini index c4370382..1dfd43b1 100644 --- a/tox.ini +++ b/tox.ini @@ -3,8 +3,6 @@ envlist = bandit,black,flake8,typing,pylint,py [testenv] deps = - playwright>=1.8.0a1 - scrapy>=2.0,!=2.4.0 pytest==6.2.5 pytest-asyncio==0.10 pytest-cov==3.0.0