diff --git a/README.md b/README.md index d1532980..f206eb50 100644 --- a/README.md +++ b/README.md @@ -233,9 +233,15 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" callback=self.parse_headers, meta={"playwright": True, "playwright_page": page}, ) - ``` +* `playwright_page_goto_kwargs` (type `dict`, default `{}`) + + A dictionary with keyword arguments to be passed to the page's + [`goto` method](https://playwright.dev/python/docs/api/class-page#page-goto) + when navigating to a URL. The `url` key is ignored if present; the request's + URL is used instead. + * `playwright_security_details` (type `Optional[dict]`, read only) A dictionary with [security information](https://playwright.dev/python/docs/api/class-response#response-security-details) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index ff3f620c..842feabf 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -288,7 +288,9 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res request.meta["playwright_page"] = page start_time = time() - response = await page.goto(request.url) + page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {} + page_goto_kwargs.pop("url", None) + response = await page.goto(url=request.url, **page_goto_kwargs) await self._apply_page_methods(page, request) body_str = await page.content() request.meta["download_latency"] = time() - start_time diff --git a/tests/mockserver.py b/tests/mockserver.py index 0fe3fb82..5bf217ff 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -6,10 +6,19 @@ from pathlib import Path from subprocess import Popen, PIPE from threading import Thread +from typing import Optional from urllib.parse import urljoin class StaticMockServer: + """A web server that serves the contents of the sibling "site" directory. + To be used as a context manager: + + with StaticMockServer() as server: + url = server.urljoin("/index.html") + ... 
+ """ + def __enter__(self): self.proc = Popen( [sys.executable, "-u", "-m", "http.server", "0", "--bind", "127.0.0.1"], @@ -58,6 +67,8 @@ def do_GET(self): class MockServer: + """A context manager web server using the _RequestHandler class to handle requests.""" + def __enter__(self): self.httpd = HTTPServer(("127.0.0.1", 0), _RequestHandler) self.address, self.port = self.httpd.server_address @@ -69,7 +80,7 @@ def __exit__(self, exc_type, exc_value, traceback): self.httpd.shutdown() self.thread.join() - def urljoin(self, url: str) -> str: + def urljoin(self, url: Optional[str] = None) -> str: return urljoin(f"http://{self.address}:{self.port}", url) diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index ed3fcd87..59cb64d5 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -1,3 +1,4 @@ +import json import logging import platform import subprocess @@ -321,15 +322,33 @@ async def test_event_handler_dialog_missing(self, caplog): async def test_response_attributes(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with MockServer() as server: - spider = DialogSpider() req = Request( - url=server.urljoin("/index.html"), + url=server.urljoin(), meta={"playwright": True}, ) - response = await handler._download_request(req, spider) + response = await handler._download_request(req, Spider("spider_name")) assert response.ip_address == ip_address(server.address) + @pytest.mark.asyncio + async def test_page_goto_kwargs_referer(self): + if self.browser_type != "chromium": + pytest.skip("referer as goto kwarg seems to work only with chromium :shrug:") + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with MockServer() as server: + fake_referer = server.urljoin("/fake/referer") + req = Request( + url=server.urljoin("/headers"), + meta={ + "playwright": True, + "playwright_page_goto_kwargs": {"referer": fake_referer}, + }, + ) + 
response = await handler._download_request(req, Spider("spider_name")) + + headers = json.loads(response.css("pre::text").get()) + assert headers["Referer"] == fake_referer + @pytest.mark.asyncio async def test_abort_requests(self): async def should_abort_request_async(request):