Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -316,16 +316,15 @@ This is useful when you need to perform certain actions on a page, like scrollin
down or clicking links, and you want everything to count as a single Scrapy
Response, containing the final result.

### Supported actions
### `PageCoroutine` class

* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:

_Represents a coroutine to be awaited on a `playwright.page.Page` object,
Represents a coroutine to be awaited on a `playwright.page.Page` object,
such as "click", "screenshot", "evaluate", etc.
`method` should be the name of the coroutine, `*args` and `**kwargs`
are passed to the function call._

_The coroutine result will be stored in the `PageCoroutine.result` attribute_
are passed to the function call. The return value of the coroutine call
will be stored in the `PageCoroutine.result` attribute.

For instance,
```python
Expand All @@ -339,8 +338,21 @@ Response, containing the final result.
```


### Supported coroutines

Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
to see available coroutines.

### Impact on Response objects

Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last
action performed on a page. If you issue a `PageCoroutine` with an action that results in
a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the
new URL, which might be different from the request's URL.


## Page events
A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
Keys are the name of the event to be handled (`dialog`, `download`, etc).
Values can be either callables or strings (in which case a spider method with the name will be looked up).
Expand Down
8 changes: 6 additions & 2 deletions examples/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class StorageSpider(Spider):
"""
Set and get storage state
Set and get storage state. Also get the server's IP address.
"""

name = "storage"
Expand All @@ -24,7 +24,11 @@ def start_requests(self):

async def parse(self, response):
page = response.meta["playwright_page"]
return {"url": response.url, "storage_state": await page.context.storage_state()}
return {
"url": response.url,
"storage_state": await page.context.storage_state(),
"ip_address": response.ip_address,
}


if __name__ == "__main__":
Expand Down
7 changes: 7 additions & 0 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import warnings
from collections import defaultdict
from contextlib import suppress
from ipaddress import ip_address
from time import time
from typing import Callable, Dict, Optional, Type, TypeVar

Expand Down Expand Up @@ -231,6 +232,11 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
await page.close()
self.stats.inc_value("playwright/page_count/closed")

server_ip_address = None
with suppress(AttributeError, KeyError, ValueError):
server_addr = await response.server_addr()
server_ip_address = ip_address(server_addr["ipAddress"])

headers = Headers(response.headers)
headers.pop("Content-Encoding", None)
encoding = _get_response_encoding(headers, body_str) or "utf-8"
Expand All @@ -244,6 +250,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
request=request,
flags=["playwright"],
encoding=encoding,
ip_address=server_ip_address,
)

def _increment_request_stats(self, request: PlaywrightRequest) -> None:
Expand Down
14 changes: 14 additions & 0 deletions tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import platform
import subprocess
from ipaddress import ip_address
from tempfile import NamedTemporaryFile

import pytest
Expand Down Expand Up @@ -372,6 +373,19 @@ async def test_event_handler_dialog_missing(self, caplog):
) in caplog.record_tuples
assert getattr(spider, "dialog_message", None) is None

@pytest.mark.asyncio
async def test_response_attributes(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
spider = DialogSpider()
req = Request(
url=server.urljoin("/index.html"),
meta={"playwright": True},
)
response = await handler._download_request(req, spider)

assert response.ip_address == ip_address(server.address)


class TestCaseChromium(MixinTestCase):
browser_type = "chromium"
Expand Down