diff --git a/README.md b/README.md index 08e68f1f..53524ff2 100644 --- a/README.md +++ b/README.md @@ -316,16 +316,15 @@ This is useful when you need to perform certain actions on a page, like scrollin down or clicking links, and you want everything to count as a single Scrapy Response, containing the final result. -### Supported actions +### `PageCoroutine` class * `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`: - _Represents a coroutine to be awaited on a `playwright.page.Page` object, + Represents a coroutine to be awaited on a `playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc. `method` should be the name of the coroutine, `*args` and `**kwargs` - are passed to the function call._ - - _The coroutine result will be stored in the `PageCoroutine.result` attribute_ + are passed to the function call. The return value of the coroutine call + will be stored in the `PageCoroutine.result` attribute. For instance, ```python @@ -339,8 +338,21 @@ Response, containing the final result. ``` +### Supported coroutines + +Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page) +to see available coroutines. + +### Impact on Response objects + +Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last +action performed on a page. If you issue a `PageCoroutine` with an action that results in +a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the +new URL, which might be different from the request's URL. + + ## Page events -A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers` +A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key. Keys are the name of the event to be handled (`dialog`, `download`, etc). 
Values can be either callables or strings (in which case a spider method with the name will be looked up). diff --git a/examples/storage.py b/examples/storage.py index b5b935f5..bc093ff2 100644 --- a/examples/storage.py +++ b/examples/storage.py @@ -5,7 +5,7 @@ class StorageSpider(Spider): """ - Set and get storage state + Set and get storage state. Also get the server's IP address. """ name = "storage" @@ -24,7 +24,11 @@ def start_requests(self): async def parse(self, response): page = response.meta["playwright_page"] - return {"url": response.url, "storage_state": await page.context.storage_state()} + return { + "url": response.url, + "storage_state": await page.context.storage_state(), + "ip_address": response.ip_address, + } if __name__ == "__main__": diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 593a9935..7dc61273 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -3,6 +3,7 @@ import warnings from collections import defaultdict from contextlib import suppress +from ipaddress import ip_address from time import time from typing import Callable, Dict, Optional, Type, TypeVar @@ -231,6 +232,11 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res await page.close() self.stats.inc_value("playwright/page_count/closed") + server_ip_address = None + with suppress(AttributeError, KeyError, TypeError, ValueError): + server_addr = await response.server_addr() + server_ip_address = ip_address(server_addr["ipAddress"]) + headers = Headers(response.headers) headers.pop("Content-Encoding", None) encoding = _get_response_encoding(headers, body_str) or "utf-8" @@ -244,6 +250,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res request=request, flags=["playwright"], encoding=encoding, + ip_address=server_ip_address, ) def _increment_request_stats(self, request: PlaywrightRequest) -> None: diff --git a/tests/test_playwright_requests.py 
b/tests/test_playwright_requests.py index ecb6cc48..1711b2b2 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -2,6 +2,7 @@ import logging import platform import subprocess +from ipaddress import ip_address from tempfile import NamedTemporaryFile import pytest @@ -372,6 +373,19 @@ async def test_event_handler_dialog_missing(self, caplog): ) in caplog.record_tuples assert getattr(spider, "dialog_message", None) is None + @pytest.mark.asyncio + async def test_response_attributes(self): + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with MockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={"playwright": True}, + ) + response = await handler._download_request(req, spider) + + assert response.ip_address == ip_address(server.address) + class TestCaseChromium(MixinTestCase): browser_type = "chromium"