From 1268ffd97c6f690dedaf386aa88e20f8f4fc87a3 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 30 Sep 2021 10:15:45 -0300 Subject: [PATCH 1/3] Use the browser's user agent if Scrapy's is None --- scrapy_playwright/handler.py | 3 ++- tests/mockserver.py | 24 +++++++++++++++--- tests/test_playwright_requests.py | 41 +++++++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 7758f60b..02812f8c 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -200,12 +200,13 @@ def _make_request_handler( ) -> Callable: def request_handler(route: Route, pw_request: PlaywrightRequest) -> None: """Override request headers, method and body.""" + headers.setdefault("user-agent", pw_request.headers.get("user-agent")) if pw_request.url == url: overrides: dict = {"method": method, "headers": headers} if body is not None: overrides["post_data"] = body.decode(encoding) - # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET if self.browser_type == "firefox": + # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET overrides["headers"]["host"] = urlparse(pw_request.url).netloc else: overrides = {"headers": pw_request.headers.copy()} diff --git a/tests/mockserver.py b/tests/mockserver.py index 328224f0..1605e1c9 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,3 +1,4 @@ +import json import re import sys import time @@ -40,11 +41,19 @@ def do_POST(self): self.wfile.write(body) def do_GET(self): - """Take a long time to reply""" - time.sleep(2) + body = "{}" + if self.path == "/headers": + body = json.dumps(dict(self.headers), indent=4) + else: + delay_match = re.match(r"^/delay/(\d+)$", self.path) + if delay_match: + delay = int(delay_match.group(1)) + print(f"Sleeping {delay} seconds...") + time.sleep(delay) + body = json.dumps({"slept": delay}) self.send_response(200) self.end_headers() - self.wfile.write(b"Hello world!") + self.wfile.write(body.encode()) class MockServer: @@ -59,5 +68,12 @@ def __exit__(self, exc_type, exc_value, traceback): self.httpd.shutdown() self.thread.join() - def urljoin(self, url): + def urljoin(self, url: str) -> str: return urljoin("http://{}:{}".format(self.address, self.port), url) + + +if __name__ == "__main__": + with MockServer() as server: + print(f"Listening at http://{server.address}:{server.port}") + while True: + pass diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 0ea47ef9..c0e63030 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -1,3 +1,4 @@ +import json import platform import subprocess from tempfile import NamedTemporaryFile @@ -56,7 +57,7 @@ async def test_post_request(self): with MockServer() as server: req = FormRequest( - server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"} + server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"} ) resp = await handler._download_request(req, Spider("foo")) @@ -142,7 +143,7 @@ async def test_timeout(self): await handler._launch_browser() with MockServer() as server: - req = Request(server.urljoin("/index.html"), meta={"playwright": True}) + req = Request(server.urljoin("/delay/2"), meta={"playwright": True}) with pytest.raises(TimeoutError): await handler._download_request(req, Spider("foo")) @@ -238,6 +239,42 @@ async def test_page_coroutine_pdf(self): await handler.browser.close() + @pytest.mark.asyncio + async def test_user_agent(self): + crawler = get_crawler( + settings_dict={ + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "USER_AGENT": None, + } + ) + handler = ScrapyPlaywrightDownloadHandler(crawler) + await handler._launch_browser() + + with MockServer() as server: + # if Scrapy's user agent is None, use the one from the Browser + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == self.browser_type + + # if Scrapy's user agent is set to some value, use it + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar"}, + ) + resp = await handler._download_request(req, Spider("foo")) + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["user-agent"] == "foobar" + + await handler.browser.close() + class TestCaseChromium(MixinTestCase): browser_type = "chromium" From 4899be132d2dc5d3f7677e7d35c64f86d4ac73dc Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 30 Sep 2021 11:52:47 -0300 Subject: [PATCH 2/3] Add note about the user agent to the readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index c3aa4a19..1edaf09b 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,14 @@ class AwesomeSpider(scrapy.Spider): yield {"url": response.url} ``` +### Notes about the User-Agent header + +By default, outgoing requests include the `User-Agent` set by Scrapy (either with the +`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute). +This could cause some sites to react in unexpected ways, for instance if the user agent +does not match the Browser being used. If you prefer to send the `User-Agent` from the Browser, +set the Scrapy user agent to `None`. + ## Receiving the Page object in the callback From a936b1d8ce0a202d648f5d5a34348401d25a26b1 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 30 Sep 2021 12:05:02 -0300 Subject: [PATCH 3/3] Content-Type header in tests --- tests/mockserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 1605e1c9..d29daf0e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -50,8 +50,9 @@ def do_GET(self): delay = int(delay_match.group(1)) print(f"Sleeping {delay} seconds...") time.sleep(delay) - body = json.dumps({"slept": delay}) + body = json.dumps({"delay": delay}) self.send_response(200) + self.send_header("Content-Type", "application/json") self.end_headers() self.wfile.write(body.encode())