diff --git a/README.md b/README.md index 5a9ba1c4..a8bf66b7 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,14 @@ class AwesomeSpider(scrapy.Spider): yield {"url": response.url} ``` +### Notes about the User-Agent header + +By default, outgoing requests include the `User-Agent` set by Scrapy (either with the +`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute). +This could cause some sites to react in unexpected ways, for instance if the user agent +does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser, +set the Scrapy user agent to `None`. + ## Receiving the Page object in the callback diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 6785f34c..9ebcfcec 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -217,12 +217,13 @@ def _make_request_handler( ) -> Callable: def request_handler(route: Route, pw_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" + headers.setdefault("user-agent", pw_request.headers.get("user-agent")) if pw_request.url == url: overrides: dict = {"method": method, "headers": headers} if body is not None: overrides["post_data"] = body.decode(encoding) - # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET if self.browser_type == "firefox": + # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET overrides["headers"]["host"] = urlparse(pw_request.url).netloc else: overrides = {"headers": pw_request.headers.copy()} diff --git a/tests/mockserver.py b/tests/mockserver.py index 328224f0..d29daf0e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,3 +1,4 @@ +import json import re import sys import time @@ -40,11 +41,20 @@ def do_POST(self): self.wfile.write(body) def do_GET(self): - """Take a long time to reply""" - time.sleep(2) + body = "{}" + if self.path == "/headers": + body = json.dumps(dict(self.headers), indent=4) + else: + delay_match = re.match(r"^/delay/(\d+)$", self.path) + if delay_match: + delay = int(delay_match.group(1)) + print(f"Sleeping {delay} seconds...") + time.sleep(delay) + body = json.dumps({"delay": delay}) self.send_response(200) + self.send_header("Content-Type", "application/json") self.end_headers() - self.wfile.write(b"Hello world!") + self.wfile.write(body.encode()) class MockServer: @@ -59,5 +69,12 @@ def __exit__(self, exc_type, exc_value, traceback): self.httpd.shutdown() self.thread.join() - def urljoin(self, url): + def urljoin(self, url: str) -> str: return urljoin("http://{}:{}".format(self.address, self.port), url) + + +if __name__ == "__main__": + with MockServer() as server: + print(f"Listening at http://{server.address}:{server.port}") + while True: + pass diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index c63c5cec..38c9087e 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -1,3 +1,4 @@ +import json import logging import platform import subprocess @@ -57,7 +58,7 @@ async def test_post_request(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with MockServer() as server: req = FormRequest( - server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"} + server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"} ) resp = await handler._download_request(req, Spider("foo")) @@ -124,7 +125,7 @@ async def test_timeout(self): } async with make_handler(settings_dict) as handler: with MockServer() as server: - req = Request(server.urljoin("/index.html"), meta={"playwright": True}) + req = Request(server.urljoin("/delay/2"), meta={"playwright": True}) with pytest.raises(TimeoutError): await handler._download_request(req, Spider("foo")) @@ -193,6 +194,36 @@ async def test_page_coroutine_pdf(self): assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result assert get_mimetype(pdf_file) == "application/pdf" + @pytest.mark.asyncio + async def test_user_agent(self): + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "USER_AGENT": None, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + # if Scrapy's user agent is None, use the one from the Browser + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + + # if Scrapy's user agent is set to some value, use it + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == "foobar" + @pytest.mark.asyncio async def test_event_handler_dialog_callable(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: