Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
)
headers = Headers()
else:
await self._set_redirect_meta(request=request, response=response)
headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
await self._apply_page_methods(page, request)
Expand Down Expand Up @@ -334,6 +335,23 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
ip_address=server_ip_address,
)

async def _set_redirect_meta(self, request: Request, response: PlaywrightResponse) -> None:
    """Record the redirect chain that led to ``response`` in ``request.meta``.

    The chain is discovered by following ``redirected_from`` links, starting at
    ``response.request`` and walking backwards until a request with no
    predecessor is reached.

    When at least one redirect happened, three keys are written to
    ``request.meta``:

    * ``redirect_times``: number of redirects that were followed.
    * ``redirect_urls``: URLs of the redirected requests, oldest first.
    * ``redirect_reasons``: status of each redirected request's response,
      oldest first; ``None`` where a request has no response.

    When no redirect happened, ``request.meta`` is left untouched.
    """
    redirect_urls: list = []
    redirect_reasons: list = []
    redirected = response.request.redirected_from
    while redirected is not None:
        redirect_urls.append(redirected.url)
        redirected_response = await redirected.response()
        # A redirected request may have no response object; keep the slot so
        # that URLs and reasons stay aligned index by index.
        redirect_reasons.append(
            None if redirected_response is None else redirected_response.status
        )
        redirected = redirected.redirected_from
    if redirect_urls:
        # The walk above collected entries newest-first; expose them oldest-first.
        request.meta["redirect_times"] = len(redirect_urls)
        request.meta["redirect_urls"] = list(reversed(redirect_urls))
        request.meta["redirect_reasons"] = list(reversed(redirect_reasons))

async def _apply_page_methods(self, page: Page, request: Request) -> None:
page_methods = request.meta.get("playwright_page_methods") or ()
if isinstance(page_methods, dict):
Expand Down
26 changes: 19 additions & 7 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def urljoin(self, url):


class _RequestHandler(BaseHTTPRequestHandler):
def do_POST(self):
def do_POST(self) -> None:
"""Echo back the request body"""
content_length = int(self.headers["Content-Length"])
body = self.rfile.read(content_length)
Expand All @@ -49,21 +49,33 @@ def do_POST(self):
self.wfile.write(b"Request body: ")
self.wfile.write(body)

def do_GET(self) -> None:
    """Serve the mock server's GET endpoints.

    * ``/headers``: replies with the request headers as JSON.
    * ``/redirect2``: 302 redirect to ``/redirect``.
    * ``/redirect``: 301 redirect to ``/headers``.
    * ``/delay/<seconds>``: sleeps for the given number of seconds, then
      replies with ``{"delay": <seconds>}`` as JSON.
    * anything else: replies with a JSON error body and status 400.
    """
    if self.path == "/headers":
        self._send_json(dict(self.headers))
    elif self.path == "/redirect2":
        self.send_response(302)
        self.send_header("Location", "/redirect")
        self.end_headers()
    elif self.path == "/redirect":
        self.send_response(301)
        self.send_header("Location", "/headers")
        self.end_headers()
    else:
        delay_match = re.match(r"^/delay/(\d+)$", self.path)
        if delay_match:
            delay = int(delay_match.group(1))
            print(f"Sleeping {delay} seconds...")
            time.sleep(delay)
            # _send_json emits the status line itself, so no separate
            # send_response() call is made here.
            self._send_json({"delay": delay})
        else:
            self._send_json({"error": "unknown path"}, status=400)

def _send_json(self, body: dict, status: int = 200) -> None:
    """Send ``body`` as a JSON response.

    Writes the status line, a ``Content-Type: application/json`` header and
    then ``body`` serialized with ``json.dumps(..., indent=4)`` and encoded
    as UTF-8.

    :param body: mapping to serialize as the response payload.
    :param status: HTTP status code to send (defaults to 200).
    """
    self.send_response(status)
    self.send_header("Content-Type", "application/json")
    self.end_headers()
    # Serialize before writing: ``body`` is a dict and has no ``encode()``.
    body_bytes = json.dumps(body, indent=4).encode("utf8")
    self.wfile.write(body_bytes)


class MockServer:
Expand Down
18 changes: 18 additions & 0 deletions tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,24 @@ async def init_page(page, request, unused_arg):
assert f"[Context=default] Page init callback exception for {req!r}" in log_entry[2]
assert "init_page() missing 1 required positional argument: 'unused_arg'" in log_entry[2]

@pytest.mark.asyncio
async def test_redirect(self):
    """Check that a two-hop redirect chain is reported in the response meta.

    ``/redirect2`` answers 302 to ``/redirect``, which answers 301 to
    ``/headers``; the final response must carry both hops, oldest first.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with MockServer() as server:
            req = Request(
                url=server.urljoin("/redirect2"),
                meta={"playwright": True},
            )
            response = await handler._download_request(req, Spider("spider_name"))

        assert response.url == server.urljoin("/headers")
        assert response.meta["redirect_times"] == 2
        assert response.meta["redirect_reasons"] == [302, 301]
        assert response.meta["redirect_urls"] == [
            server.urljoin("/redirect2"),
            server.urljoin("/redirect"),
        ]


class TestCaseChromium(MixinTestCase):
browser_type = "chromium"
Expand Down