Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,50 @@ Response, containing the final result.
```


## Page events
A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
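Keys are the names of the events to be handled (`dialog`, `download`, etc).
Values can be either callables or strings (in which case a spider method with that name will be looked up).
If the spider has no attribute with that name, a warning is logged and the handler for that event is ignored.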

Example:

```python
import logging

import scrapy
from playwright.async_api import Dialog, Response as PlaywrightResponse


async def handle_dialog(dialog: Dialog) -> None:
    """Log the dialog message and dismiss the dialog.

    This is a plain function (not a spider method), so it is attached to the
    page as-is and receives only the event payload.
    """
    logging.info(f"Handled dialog with message: {dialog.message}")
    await dialog.dismiss()


class EventSpider(scrapy.Spider):
    name = "event"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            meta=dict(
                playwright=True,
                playwright_page_event_handlers={
                    # a callable is registered directly as the handler
                    "dialog": handle_dialog,
                    # a string is resolved to the spider method with that name
                    "response": "handle_response",
                },
            ),
        )

    async def handle_response(self, response: PlaywrightResponse) -> None:
        """Log the URL of the response passed to the handler."""
        logging.info(f"Received response with URL {response.url}")
```

See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of
the accepted events and the arguments passed to their handlers.

**Note**: keep in mind that, unless they are
[removed later](https://playwright.dev/python/docs/events/#addingremoving-event-listener),
these handlers will remain attached to the page and will be called for subsequent
downloads using the same page. This is usually not a problem, since by default
requests are performed in single-use pages.


## Examples

**Click on a link, save the resulting page as PDF**
Expand Down
51 changes: 51 additions & 0 deletions examples/events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from playwright.async_api import Dialog, Response as PlaywrightResponse
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine


class EventsSpider(Spider):
    """
    Example spider that attaches handlers to Playwright page events
    """

    name = "events"

    def start_requests(self):
        # Evaluating an alert() in the page is what makes the "dialog" event fire.
        coroutines = [
            PageCoroutine("evaluate", "alert('foobar');"),
        ]
        # One handler is given as a bound method, the other as a spider method name.
        event_handlers = {
            "dialog": self.handle_dialog,
            "response": "handle_response",
        }
        meta = {
            "playwright": True,
            "playwright_page_coroutines": coroutines,
            "playwright_page_event_handlers": event_handlers,
        }
        yield Request(url="https://example.org", meta=meta)

    async def handle_dialog(self, dialog: Dialog) -> None:
        """Log the dialog message, then dismiss the dialog."""
        self.logger.info(f"Handled dialog with message: {dialog.message}")
        await dialog.dismiss()

    async def handle_response(self, response: PlaywrightResponse) -> None:
        """Log the URL of the response passed to the handler."""
        self.logger.info(f"Received response with URL {response.url}")

    def parse(self, response):
        """Return a single item holding the response URL."""
        return dict(url=response.url)


if __name__ == "__main__":
    # Only https is routed through the Playwright download handler here;
    # the http entry is left disabled.
    download_handlers = {
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    }
    crawler_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": download_handlers,
    }
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(EventsSpider)
    process.start()
15 changes: 15 additions & 0 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,21 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
page = request.meta.get("playwright_page")
if not isinstance(page, Page):
page = await self._create_page(request)

# attach event handlers
event_handlers = request.meta.get("playwright_page_event_handlers") or {}
for event, handler in event_handlers.items():
if callable(handler):
page.on(event, handler)
elif isinstance(handler, str):
try:
page.on(event, getattr(spider, handler))
except AttributeError:
logger.warning(
f"Spider '{spider.name}' does not have a '{handler}' attribute,"
f" ignoring handler for event '{event}'"
)

await page.unroute("**")
await page.route(
"**",
Expand Down
94 changes: 93 additions & 1 deletion tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import platform
import subprocess
from tempfile import NamedTemporaryFile

import pytest
from playwright.async_api import Page as PlaywrightPage, TimeoutError
from playwright.async_api import Dialog, Page as PlaywrightPage, TimeoutError
from scrapy import Spider, Request, FormRequest
from scrapy.http.response.html import HtmlResponse
from scrapy.utils.test import get_crawler
Expand All @@ -22,6 +23,16 @@ def get_mimetype(file):
).stdout.strip()


class DialogSpider(Spider):
    """Test spider exposing a handler for the "dialog" page event"""

    name = "dialog"

    async def handle_dialog(self, dialog: Dialog) -> None:
        """Store the dialog message on the spider, then dismiss the dialog."""
        message = dialog.message
        self.dialog_message = message
        await dialog.dismiss()


class MixinTestCase:
@pytest.mark.asyncio
async def test_basic_response(self):
Expand Down Expand Up @@ -238,6 +249,87 @@ async def test_page_coroutine_pdf(self):

await handler.browser.close()

@pytest.mark.asyncio
async def test_event_handler_dialog_callable(self):
    """A callable registered for the "dialog" event receives the alert message."""
    settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}
    crawler = get_crawler(settings_dict=settings)
    handler = ScrapyPlaywrightDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        spider = DialogSpider()
        url = server.urljoin("/index.html")
        meta = {
            "playwright": True,
            "playwright_page_coroutines": [
                PageCoro("evaluate", "alert('foobar');"),
            ],
            # the handler is passed as a bound method
            "playwright_page_event_handlers": {
                "dialog": spider.handle_dialog,
            },
        }
        request = Request(url=url, meta=meta)
        await handler._download_request(request, spider)

    assert spider.dialog_message == "foobar"

    await handler.browser.close()

@pytest.mark.asyncio
async def test_event_handler_dialog_str(self):
    """A handler given as a spider method name receives the alert message."""
    settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}
    crawler = get_crawler(settings_dict=settings)
    handler = ScrapyPlaywrightDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        spider = DialogSpider()
        url = server.urljoin("/index.html")
        meta = {
            "playwright": True,
            "playwright_page_coroutines": [
                PageCoro("evaluate", "alert('foobar');"),
            ],
            # the handler is passed as the name of a spider method
            "playwright_page_event_handlers": {
                "dialog": "handle_dialog",
            },
        }
        request = Request(url=url, meta=meta)
        await handler._download_request(request, spider)

    assert spider.dialog_message == "foobar"

    await handler.browser.close()

@pytest.mark.asyncio
async def test_event_handler_dialog_missing(self, caplog):
    """A handler name the spider does not have is reported with a warning."""
    settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}
    crawler = get_crawler(settings_dict=settings)
    handler = ScrapyPlaywrightDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        spider = DialogSpider()
        url = server.urljoin("/index.html")
        meta = {
            "playwright": True,
            # DialogSpider defines no attribute with this name
            "playwright_page_event_handlers": {
                "dialog": "missing_method",
            },
        }
        request = Request(url=url, meta=meta)
        await handler._download_request(request, spider)

    expected_record = (
        "scrapy-playwright",
        logging.WARNING,
        "Spider 'dialog' does not have a 'missing_method' attribute,"
        " ignoring handler for event 'dialog'",
    )
    assert expected_record in caplog.record_tuples
    # no handler was attached, so the spider never recorded a message
    assert getattr(spider, "dialog_message", None) is None

    await handler.browser.close()


class TestCaseChromium(MixinTestCase):
browser_type = "chromium"
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ deps =
pytest-cov>=2.8
pytest-twisted>=1.11
commands =
playwright install
py.test --reactor=asyncio \
--cov-report=term-missing --cov-report=html --cov-report=xml \
--cov=scrapy_playwright {posargs: scrapy_playwright tests}
Expand Down