diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index bc202fc3..e6b5297d 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -303,6 +303,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res ) headers = Headers() else: + await self._set_redirect_meta(request=request, response=response) headers = Headers(await response.all_headers()) headers.pop("Content-Encoding", None) await self._apply_page_methods(page, request) @@ -334,6 +335,23 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res ip_address=server_ip_address, ) + async def _set_redirect_meta(self, request: Request, response: PlaywrightResponse) -> None: + redirect_times: int = 0 + redirect_urls: list = [] + redirect_reasons: list = [] + redirected = response.request.redirected_from + while redirected is not None: + redirect_times += 1 + redirect_urls.append(redirected.url) + redirected_response = await redirected.response() + reason = None if redirected_response is None else redirected_response.status + redirect_reasons.append(reason) + redirected = redirected.redirected_from + if redirect_times: + request.meta["redirect_times"] = redirect_times + request.meta["redirect_urls"] = list(reversed(redirect_urls)) + request.meta["redirect_reasons"] = list(reversed(redirect_reasons)) + async def _apply_page_methods(self, page: Page, request: Request) -> None: page_methods = request.meta.get("playwright_page_methods") or () if isinstance(page_methods, dict): diff --git a/tests/mockserver.py b/tests/mockserver.py index 5bf217ff..86106c43 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -40,7 +40,7 @@ def urljoin(self, url): class _RequestHandler(BaseHTTPRequestHandler): - def do_POST(self): + def do_POST(self) -> None: """Echo back the request body""" content_length = int(self.headers["Content-Length"]) body = self.rfile.read(content_length) @@ -49,21 +49,33 @@ def do_POST(self): self.wfile.write(b"Request body: ") self.wfile.write(body) - def do_GET(self): - body = "{}" + def do_GET(self) -> None: if self.path == "/headers": - body = json.dumps(dict(self.headers), indent=4) + self._send_json(dict(self.headers)) + elif self.path == "/redirect2": + self.send_response(302) + self.send_header("Location", "/redirect") + self.end_headers() + elif self.path == "/redirect": + self.send_response(301) + self.send_header("Location", "/headers") + self.end_headers() else: delay_match = re.match(r"^/delay/(\d+)$", self.path) if delay_match: delay = int(delay_match.group(1)) print(f"Sleeping {delay} seconds...") time.sleep(delay) - body = json.dumps({"delay": delay}) - self.send_response(200) + self._send_json({"delay": delay}) + else: + self._send_json({"error": "unknown path"}, status=400) + + def _send_json(self, body: dict, status: int = 200) -> None: + self.send_response(status) self.send_header("Content-Type", "application/json") self.end_headers() - self.wfile.write(body.encode()) + body_bytes = json.dumps(body, indent=4).encode("utf8") + self.wfile.write(body_bytes) class MockServer: diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index d739819a..5ba7abff 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -341,6 +341,24 @@ async def init_page(page, request, unused_arg): assert f"[Context=default] Page init callback exception for {req!r}" in log_entry[2] assert "init_page() missing 1 required positional argument: 'unused_arg'" in log_entry[2] + @pytest.mark.asyncio + async def test_redirect(self): + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/redirect2"), + meta={"playwright": True}, + ) + response = await handler._download_request(req, Spider("spider_name")) + + assert response.url == server.urljoin("/headers") + assert response.meta["redirect_times"] == 2 + assert response.meta["redirect_reasons"] == [302, 301] + assert response.meta["redirect_urls"] == [ + server.urljoin("/redirect2"), + server.urljoin("/redirect"), + ] + class TestCaseChromium(MixinTestCase): browser_type = "chromium"