25 changes: 18 additions & 7 deletions README.md
@@ -24,7 +24,7 @@ to integrate `asyncio`-based projects such as `Playwright`.

* Python >= 3.7
* Scrapy >= 2.0 (!= 2.4.0)
-* Playwright >= 1.8.0a1
+* Playwright >= 1.15


## Installation
@@ -97,13 +97,17 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
the default value will be used (30000 ms at the time of writing this).
See the docs for [BrowserContext.set_default_navigation_timeout](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-set-default-navigation-timeout).

-* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Union[Callable, str]`, default `scrapy_playwright.headers.use_scrapy_headers`)
+* `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` (type `Optional[Union[Callable, str]]`, default `scrapy_playwright.headers.use_scrapy_headers`)

A function (or the path to a function) that processes headers for a given request
and returns a dictionary with the headers to be used (note that, depending on the browser,
-additional default headers will be sent as well). Coroutine functions (`async def`) are
+additional default headers could be sent as well). Coroutine functions (`async def`) are
supported.

+This will be called at least once for each Scrapy request (receiving said request and the
+corresponding Playwright request), but it could be called additional times if the given
+resource generates more requests (e.g. to retrieve assets like images or scripts).
+
The function must return a `dict` object, and receives the following keyword arguments:

```python
@@ -117,10 +121,11 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
For non-navigation requests (e.g. images, stylesheets, scripts, etc), only the `User-Agent` header
is overridden, for consistency.

-There is another built-in function available: `scrapy_playwright.headers.use_playwright_headers`,
-which will return the headers from the Playwright request unmodified.
-When using this alternative, please keep in mind that headers passed via the `Request.headers`
-attribute or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
+Setting `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` will give complete control of the headers to
+Playwright, i.e. headers from Scrapy requests will be ignored and only headers set by
+Playwright will be sent.
+When doing this, please keep in mind that headers passed via the `Request.headers` attribute
+or set by Scrapy components are ignored (including cookies set via the `Request.cookies`
attribute).
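
For illustration, here is a minimal sketch of a custom processing function under the semantics described above (the `inject_header` name and the `myproject.headers` import path are hypothetical; the argument names match the built-in processors in `scrapy_playwright.headers`):

```python
# myproject/headers.py -- hypothetical module holding a custom header processor.
# Keeps Playwright's own headers but forces one extra header on every request.
async def inject_header(browser_type, playwright_request, scrapy_headers) -> dict:
    headers = await playwright_request.all_headers()  # Playwright >= 1.15 API
    headers["x-custom"] = "value"
    return headers
```

The function can then be referenced from the settings module either as a callable or by import path, e.g. `PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "myproject.headers.inject_header"`.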

* `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` (type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting)
@@ -562,6 +567,12 @@ for more information about deprecations and removals.

### Currently deprecated features

+* `scrapy_playwright.headers.use_playwright_headers` function
+
+  Deprecated since
+  [`v0.0.16`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16),
+  set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
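
For reference, migrating off the deprecated function is a one-line settings change (a sketch of a project `settings.py`):

```python
# Before (deprecated since v0.0.16):
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = "scrapy_playwright.headers.use_playwright_headers"

# After: give Playwright full control of request headers.
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = None
```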

* `scrapy_playwright.page.PageCoroutine` class

Deprecated since
7 changes: 7 additions & 0 deletions changelog.md
@@ -1,5 +1,12 @@
# scrapy-playwright changelog


+### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-NN-NN)
+
+* Use new headers API introduced in Playwright 1.15 (bump required Playwright version)
+* Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
+
+
### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08)

* Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting
51 changes: 34 additions & 17 deletions scrapy_playwright/handler.py
@@ -28,7 +28,7 @@
from twisted.internet.defer import Deferred, inlineCallbacks
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

-from scrapy_playwright.headers import use_scrapy_headers
+from scrapy_playwright.headers import use_scrapy_headers, use_playwright_headers
from scrapy_playwright.page import PageMethod


@@ -61,10 +61,22 @@ def __init__(self, crawler: Crawler) -> None:
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)

-        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
-            self.process_request_headers = load_object(
-                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
-            )
+        if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
+            if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
+                self.process_request_headers = None  # use headers from the Playwright request
+            else:
+                self.process_request_headers = load_object(
+                    crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
+                )
+                if self.process_request_headers is use_playwright_headers:
+                    warnings.warn(
+                        "The 'scrapy_playwright.headers.use_playwright_headers' function is"
+                        " deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
+                        " instead.",
+                        category=ScrapyDeprecationWarning,
+                        stacklevel=1,
+                    )
+                    self.process_request_headers = None
else:
self.process_request_headers = use_scrapy_headers
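
Note the switch from `crawler.settings.get(...)` to a membership check: `get()` cannot distinguish an explicit `None` from an unset key, whereas `in` lets `None` act as a deliberate sentinel. A standalone sketch of the distinction, using a plain `dict` as a stand-in for Scrapy's dict-like `Settings`:

```python
# Plain dict standing in for Scrapy's Settings object (illustrative only).
settings = {"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None}

print(settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"))  # None -- looks unset
print("PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings)    # True -- explicitly set
```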

@@ -233,7 +245,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
with suppress(AttributeError):
request.meta["playwright_security_details"] = await response.security_details()

-        headers = Headers(response.headers)
+        headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
body, encoding = _encode_body(headers=headers, text=body_str)
respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
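
The switch from the `headers` property to `await response.all_headers()` uses the headers API introduced in Playwright 1.15: the property exposes only a subset of the headers, while `all_headers()` asynchronously resolves the complete set. A minimal standalone sketch (the URL is illustrative, not part of this PR):

```python
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto("https://example.org")  # illustrative URL
        print(response.headers)              # immediate, but possibly incomplete
        print(await response.all_headers())  # full header set (Playwright >= 1.15)
        await browser.close()

asyncio.run(main())
```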
@@ -317,15 +329,18 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
self.stats.inc_value("playwright/request_count/aborted")
return None

-            processed_headers = await _await_if_necessary(
-                self.process_request_headers(self.browser_type, playwright_request, scrapy_headers)
-            )
+            overrides: dict = {}

-            # the request that reaches the callback should contain the headers that were sent
-            scrapy_headers.clear()
-            scrapy_headers.update(processed_headers)
+            if self.process_request_headers is not None:
+                overrides["headers"] = await _await_if_necessary(
+                    self.process_request_headers(
+                        self.browser_type, playwright_request, scrapy_headers
+                    )
+                )
+                # the request that reaches the callback should contain the final headers
+                scrapy_headers.clear()
+                scrapy_headers.update(overrides["headers"])

-            overrides: dict = {"headers": processed_headers}
if playwright_request.is_navigation_request():
overrides["method"] = method
if body is not None:
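
The `overrides` dict built above is ultimately handed to Playwright's route continuation (below the visible part of this hunk), so an empty dict now means "continue with whatever Playwright would send on its own". A hedged sketch of that pattern, assuming the handler continues the route as upstream scrapy-playwright does:

```python
# Sketch only: how the overrides dict typically reaches Playwright.
# route is a playwright.async_api.Route; the optional keys built above
# ("headers", "method", and presumably a body override) map onto the
# keyword arguments of route.continue_().
await route.continue_(**overrides)
```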
@@ -351,20 +366,22 @@ async def _await_if_necessary(obj):


def _make_request_logger(context_name: str) -> Callable:
-    def _log_request(request: PlaywrightRequest) -> None:
+    async def _log_request(request: PlaywrightRequest) -> None:
+        referrer = await request.header_value("referer")
logger.debug(
f"[Context={context_name}] Request: <{request.method.upper()} {request.url}> "
f"(resource type: {request.resource_type}, referrer: {request.headers.get('referer')})"
f"(resource type: {request.resource_type}, referrer: {referrer})"
)

return _log_request


def _make_response_logger(context_name: str) -> Callable:
-    def _log_request(response: PlaywrightResponse) -> None:
+    async def _log_request(response: PlaywrightResponse) -> None:
+        referrer = await response.header_value("referer")
logger.debug(
f"[Context={context_name}] Response: <{response.status} {response.url}> "
f"(referrer: {response.headers.get('referer')})"
f"(referrer: {referrer})"
)

return _log_request
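
These logging helpers are now coroutine functions so they can await `Request.header_value()`; Playwright's async API accepts coroutines as event listeners. A usage sketch (the context name and `page` object are illustrative):

```python
# Sketch: attaching the loggers as Playwright event listeners on a page.
page.on("request", _make_request_logger("default"))
page.on("response", _make_response_logger("default"))
```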
31 changes: 18 additions & 13 deletions scrapy_playwright/headers.py
@@ -2,10 +2,11 @@
This module includes functions to process request headers.
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
"""

+import warnings
from urllib.parse import urlparse

from playwright.async_api import Request as PlaywrightRequest
+from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http.headers import Headers


@@ -17,30 +18,34 @@ async def use_scrapy_headers(
"""Scrapy headers take precedence over Playwright headers for navigation requests.
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""

-    headers = scrapy_headers.to_unicode_dict()
+    scrapy_headers_str = scrapy_headers.to_unicode_dict()
+    playwright_headers = await playwright_request.all_headers()

# Scrapy's user agent has priority over Playwright's
-    headers.setdefault("user-agent", playwright_request.headers.get("user-agent"))
+    scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))

if playwright_request.is_navigation_request():
if browser_type == "firefox":
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
headers["host"] = urlparse(playwright_request.url).netloc
return headers
scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc
return scrapy_headers_str

# override user agent, for consistency with other requests
if headers.get("user-agent"):
return {
**playwright_request.headers,
"user-agent": headers["user-agent"],
}
return playwright_request.headers
if scrapy_headers_str.get("user-agent"):
playwright_headers["user-agent"] = scrapy_headers_str["user-agent"]
return playwright_headers


async def use_playwright_headers(
browser_type: str,
playwright_request: PlaywrightRequest,
scrapy_headers: Headers,
) -> dict:
"""Return headers from the Playwright request, unaltered"""
return playwright_request.headers
warnings.warn(
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead.",
category=ScrapyDeprecationWarning,
stacklevel=1,
)
return await playwright_request.all_headers()
2 changes: 1 addition & 1 deletion setup.py
@@ -35,6 +35,6 @@
python_requires=">=3.7",
install_requires=[
"scrapy>=2.0,!=2.4.0",
"playwright>=1.8.0a1",
"playwright>=1.15",
],
)
153 changes: 153 additions & 0 deletions tests/test_headers.py
@@ -0,0 +1,153 @@
import json
import platform
import sys
import warnings

import pytest
from scrapy import Spider, Request
from scrapy.http.headers import Headers

from tests import make_handler
from tests.mockserver import MockServer

from scrapy_playwright.headers import use_playwright_headers


@pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added in Python 3.8")
@pytest.mark.asyncio
async def test_use_playwright_headers_deprecated():
from unittest.mock import AsyncMock

headers = {"foo": "bar", "a": "b"}
playwright_request = AsyncMock()
playwright_request.all_headers.return_value = headers
with warnings.catch_warnings(record=True) as warning_list:
processed_headers = await use_playwright_headers("foobar", playwright_request, Headers({}))
assert processed_headers == headers
assert str(warning_list[0].message) == (
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead."
)


class MixinProcessHeadersTestCase:
@pytest.mark.asyncio
async def test_user_agent(self):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"USER_AGENT": None,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
# if Scrapy's user agent is None, use the one from the Browser
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type

# if Scrapy's user agent is set to some value, use it
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == "foobar"

@pytest.mark.asyncio
async def test_use_playwright_headers(self):
"""Ignore Scrapy headers"""
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type
assert "asdf" not in headers

@pytest.mark.asyncio
async def test_use_playwright_headers_deprecated(self):
"""Ignore Scrapy headers"""
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": use_playwright_headers,
"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
}
with warnings.catch_warnings(record=True) as warning_list:
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["user-agent"] == self.browser_type
assert "asdf" not in headers

assert str(warning_list[0].message) == (
"The 'scrapy_playwright.headers.use_playwright_headers' function is"
" deprecated, please set 'PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None'"
" instead."
)

@pytest.mark.asyncio
async def test_use_custom_headers(self):
"""Custom header processing function"""

async def important_headers(*args, **kwargs) -> dict:
return {"foo": "bar"}

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["foo"] == "bar"
assert headers.get("user-agent") not in (self.browser_type, "foobar")
assert "asdf" not in headers


class TestProcessHeadersChromium(MixinProcessHeadersTestCase):
browser_type = "chromium"


class TestProcessHeadersFirefox(MixinProcessHeadersTestCase):
browser_type = "firefox"


@pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
class TestProcessHeadersWebkit(MixinProcessHeadersTestCase):
browser_type = "webkit"