Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@ class AwesomeSpider(scrapy.Spider):
yield {"url": response.url}
```

### Notes about the User-Agent header

By default, outgoing requests include the `User-Agent` set by Scrapy (either with the
`USER_AGENT` or `DEFAULT_REQUEST_HEADERS` settings or via the `Request.headers` attribute).
This could cause some sites to react in unexpected ways, for instance if the user agent
does not match the browser being used. If you prefer to send the `User-Agent` header from the browser,
set the Scrapy user agent to `None`.


## Receiving the Page object in the callback

Expand Down
3 changes: 2 additions & 1 deletion scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,13 @@ def _make_request_handler(
) -> Callable:
def request_handler(route: Route, pw_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
headers.setdefault("user-agent", pw_request.headers.get("user-agent"))
if pw_request.url == url:
overrides: dict = {"method": method, "headers": headers}
if body is not None:
overrides["post_data"] = body.decode(encoding)
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
if self.browser_type == "firefox":
# otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
overrides["headers"]["host"] = urlparse(pw_request.url).netloc
else:
overrides = {"headers": pw_request.headers.copy()}
Expand Down
25 changes: 21 additions & 4 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import re
import sys
import time
Expand Down Expand Up @@ -40,11 +41,20 @@ def do_POST(self):
self.wfile.write(body)

def do_GET(self):
    """Reply to a GET request with a JSON body that depends on the path.

    Routes:
      * ``/headers``   -> the incoming request headers as a JSON object
      * ``/delay/<n>`` -> sleep ``n`` seconds, then return ``{"delay": n}``
      * anything else  -> an empty JSON object
    """
    body = "{}"
    if self.path == "/headers":
        body = json.dumps(dict(self.headers), indent=4)
    else:
        delay_match = re.match(r"^/delay/(\d+)$", self.path)
        if delay_match:
            delay = int(delay_match.group(1))
            print(f"Sleeping {delay} seconds...")
            time.sleep(delay)
            body = json.dumps({"delay": delay})
    self.send_response(200)
    self.send_header("Content-Type", "application/json")
    self.end_headers()
    # Write the body exactly once; the fused diff also wrote b"Hello world!"
    # and slept 2 seconds unconditionally — both belong to the pre-change code.
    self.wfile.write(body.encode())


class MockServer:
Expand All @@ -59,5 +69,12 @@ def __exit__(self, exc_type, exc_value, traceback):
self.httpd.shutdown()
self.thread.join()

def urljoin(self, url: str) -> str:
    """Resolve *url* against this mock server's base address.

    Delegates to :func:`urllib.parse.urljoin` (the module-level import);
    the diff residue kept both the old and new signature lines — only the
    annotated one belongs in the file.
    """
    return urljoin("http://{}:{}".format(self.address, self.port), url)


if __name__ == "__main__":
    # Run the mock server standalone for manual testing.
    with MockServer() as server:
        print(f"Listening at http://{server.address}:{server.port}")
        try:
            while True:
                # Sleep instead of `pass`: the bare busy-loop pegs a CPU core.
                time.sleep(1)
        except KeyboardInterrupt:
            # Fall through so the context manager shuts the server down cleanly.
            pass
35 changes: 33 additions & 2 deletions tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import platform
import subprocess
Expand Down Expand Up @@ -57,7 +58,7 @@ async def test_post_request(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
req = FormRequest(
server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"}
server.urljoin("/delay/2"), meta={"playwright": True}, formdata={"foo": "bar"}
)
resp = await handler._download_request(req, Spider("foo"))

Expand Down Expand Up @@ -124,7 +125,7 @@ async def test_timeout(self):
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(server.urljoin("/index.html"), meta={"playwright": True})
req = Request(server.urljoin("/delay/2"), meta={"playwright": True})
with pytest.raises(TimeoutError):
await handler._download_request(req, Spider("foo"))

Expand Down Expand Up @@ -193,6 +194,36 @@ async def test_page_coroutine_pdf(self):
assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
assert get_mimetype(pdf_file) == "application/pdf"

@pytest.mark.asyncio
async def test_user_agent(self):
    """The browser's User-Agent is sent unless Scrapy supplies its own."""
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "USER_AGENT": None,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:

            async def fetch_user_agent(**request_kwargs) -> str:
                """Hit /headers and return the echoed user-agent header."""
                request = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    **request_kwargs,
                )
                response = await handler._download_request(request, Spider("foo"))
                raw_headers = json.loads(response.css("pre::text").get())
                lowered = {name.lower(): value for name, value in raw_headers.items()}
                return lowered["user-agent"]

            # if Scrapy's user agent is None, use the one from the Browser
            assert await fetch_user_agent() == self.browser_type

            # if Scrapy's user agent is set to some value, use it
            assert await fetch_user_agent(headers={"User-Agent": "foobar"}) == "foobar"

@pytest.mark.asyncio
async def test_event_handler_dialog_callable(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
Expand Down