diff --git a/README.md b/README.md
index 5847f7d2..f66af883 100644
--- a/README.md
+++ b/README.md
@@ -271,6 +271,50 @@ Response, containing the final result.
 ```
 
+## Page events
+A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
+[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
+Keys are the names of the events to be handled (`dialog`, `download`, etc.).
+Values can be either callables or strings (in which case a spider method with that name will be looked up).
+
+Example:
+
+```python
+from playwright.async_api import Dialog, Response as PlaywrightResponse
+
+async def handle_dialog(dialog: Dialog) -> None:
+    logging.info(f"Handled dialog with message: {dialog.message}")
+    await dialog.dismiss()
+
+class EventSpider(scrapy.Spider):
+    name = "event"
+
+    def start_requests(self):
+        yield scrapy.Request(
+            url="https://example.org",
+            meta=dict(
+                playwright=True,
+                playwright_page_event_handlers={
+                    "dialog": handle_dialog,
+                    "response": "handle_response",
+                },
+            ),
+        )
+
+    async def handle_response(self, response: PlaywrightResponse) -> None:
+        logging.info(f"Received response with URL {response.url}")
+```
+
+See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of
+the accepted events and the arguments passed to their handlers.
+
+**Note**: keep in mind that, unless they are
+[removed later](https://playwright.dev/python/docs/events/#addingremoving-event-listener),
+these handlers will remain attached to the page and will be called for subsequent
+downloads using the same page. This is usually not a problem, since by default
+requests are performed in single-use pages.
+
+
 ## Examples
 
 **Click on a link, save the resulting page as PDF**
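The **Note** above means an attached handler lives as long as the page it was registered on. When a page is reused across requests (for instance when it is exposed to the callback, e.g. via the `playwright_include_page` meta key), the handler can be detached again with Playwright's `remove_listener()`, which is the API covered by the events documentation linked above. A minimal sketch, not part of the diff, with illustrative names and assuming the page is made available in `response.meta["playwright_page"]`:

```python
import logging

import scrapy
from playwright.async_api import Dialog


async def handle_dialog(dialog: Dialog) -> None:
    logging.info(f"Handled dialog with message: {dialog.message}")
    await dialog.dismiss()


class SingleShotDialogSpider(scrapy.Spider):
    """Illustrative spider: detaches the dialog handler after the first response."""

    name = "single_shot_dialog"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            meta={
                "playwright": True,
                # assumption: this meta key exposes the Page object to the callback
                "playwright_include_page": True,
                "playwright_page_event_handlers": {"dialog": handle_dialog},
            },
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # remove_listener() is the upstream Playwright/pyee API for detaching handlers
        page.remove_listener("dialog", handle_dialog)
        await page.close()
        return {"url": response.url}
```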
diff --git a/examples/events.py b/examples/events.py
new file mode 100644
index 00000000..3a83e634
--- /dev/null
+++ b/examples/events.py
@@ -0,0 +1,51 @@
+from playwright.async_api import Dialog, Response as PlaywrightResponse
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerProcess
+from scrapy_playwright.page import PageCoroutine
+
+
+class EventsSpider(Spider):
+    """
+    Handle page events
+    """
+
+    name = "events"
+
+    def start_requests(self):
+        yield Request(
+            url="https://example.org",
+            meta={
+                "playwright": True,
+                "playwright_page_coroutines": [
+                    PageCoroutine("evaluate", "alert('foobar');"),
+                ],
+                "playwright_page_event_handlers": {
+                    "dialog": self.handle_dialog,
+                    "response": "handle_response",
+                },
+            },
+        )
+
+    async def handle_dialog(self, dialog: Dialog) -> None:
+        self.logger.info(f"Handled dialog with message: {dialog.message}")
+        await dialog.dismiss()
+
+    async def handle_response(self, response: PlaywrightResponse) -> None:
+        self.logger.info(f"Received response with URL {response.url}")
+
+    def parse(self, response):
+        return {"url": response.url}
+
+
+if __name__ == "__main__":
+    process = CrawlerProcess(
+        settings={
+            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+            "DOWNLOAD_HANDLERS": {
+                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+                # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            },
+        }
+    )
+    process.crawl(EventsSpider)
+    process.start()
diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py
index 7758f60b..dfc94e1d 100644
--- a/scrapy_playwright/handler.py
+++ b/scrapy_playwright/handler.py
@@ -131,6 +131,21 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
         page = request.meta.get("playwright_page")
         if not isinstance(page, Page):
             page = await self._create_page(request)
+
+        # attach event handlers
+        event_handlers = request.meta.get("playwright_page_event_handlers") or {}
+        for event, handler in event_handlers.items():
+            if callable(handler):
+                page.on(event, handler)
+            elif isinstance(handler, str):
+                try:
+                    page.on(event, getattr(spider, handler))
+                except AttributeError:
+                    logger.warning(
+                        f"Spider '{spider.name}' does not have a '{handler}' attribute,"
+                        f" ignoring handler for event '{event}'"
+                    )
+
         await page.unroute("**")
         await page.route(
             "**",
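For reference, the behaviour exercised by the tests below can be reproduced with Playwright alone: once a `dialog` listener is attached, `alert()` suspends the `evaluate()` call until the handler accepts or dismisses the dialog, which is why the handlers in this patch `await dialog.dismiss()`. A standalone sketch using only the upstream API (the URL is a placeholder):

```python
import asyncio

from playwright.async_api import Dialog, async_playwright


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()

        async def handle_dialog(dialog: Dialog) -> None:
            print(f"Dialog says: {dialog.message}")
            # With a listener attached, the dialog must be dismissed (or accepted),
            # otherwise the evaluate() call below would never return.
            await dialog.dismiss()

        page.on("dialog", handle_dialog)
        await page.goto("https://example.org")
        await page.evaluate("alert('foobar');")
        await browser.close()


asyncio.run(main())
```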
diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py
index 0ea47ef9..5ef7341e 100644
--- a/tests/test_playwright_requests.py
+++ b/tests/test_playwright_requests.py
@@ -1,9 +1,10 @@
+import logging
 import platform
 import subprocess
 from tempfile import NamedTemporaryFile
 
 import pytest
-from playwright.async_api import Page as PlaywrightPage, TimeoutError
+from playwright.async_api import Dialog, Page as PlaywrightPage, TimeoutError
 from scrapy import Spider, Request, FormRequest
 from scrapy.http.response.html import HtmlResponse
 from scrapy.utils.test import get_crawler
@@ -22,6 +23,16 @@ def get_mimetype(file):
     ).stdout.strip()
 
 
+class DialogSpider(Spider):
+    """A spider with a method to handle the "dialog" page event"""
+
+    name = "dialog"
+
+    async def handle_dialog(self, dialog: Dialog) -> None:
+        self.dialog_message = dialog.message
+        await dialog.dismiss()
+
+
 class MixinTestCase:
     @pytest.mark.asyncio
     async def test_basic_response(self):
@@ -238,6 +249,87 @@ async def test_page_coroutine_pdf(self):
         await handler.browser.close()
 
+    @pytest.mark.asyncio
+    async def test_event_handler_dialog_callable(self):
+        crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
+        handler = ScrapyPlaywrightDownloadHandler(crawler)
+        await handler._launch_browser()
+
+        with StaticMockServer() as server:
+            spider = DialogSpider()
+            req = Request(
+                url=server.urljoin("/index.html"),
+                meta={
+                    "playwright": True,
+                    "playwright_page_coroutines": [
+                        PageCoro("evaluate", "alert('foobar');"),
+                    ],
+                    "playwright_page_event_handlers": {
+                        "dialog": spider.handle_dialog,
+                    },
+                },
+            )
+            await handler._download_request(req, spider)
+
+        assert spider.dialog_message == "foobar"
+
+        await handler.browser.close()
+
+    @pytest.mark.asyncio
+    async def test_event_handler_dialog_str(self):
+        crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
+        handler = ScrapyPlaywrightDownloadHandler(crawler)
+        await handler._launch_browser()
+
+        with StaticMockServer() as server:
+            spider = DialogSpider()
+            req = Request(
+                url=server.urljoin("/index.html"),
+                meta={
+                    "playwright": True,
+                    "playwright_page_coroutines": [
+                        PageCoro("evaluate", "alert('foobar');"),
+                    ],
+                    "playwright_page_event_handlers": {
+                        "dialog": "handle_dialog",
+                    },
+                },
+            )
+            await handler._download_request(req, spider)
+
+        assert spider.dialog_message == "foobar"
+
+        await handler.browser.close()
+
+    @pytest.mark.asyncio
+    async def test_event_handler_dialog_missing(self, caplog):
+        crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
+        handler = ScrapyPlaywrightDownloadHandler(crawler)
+        await handler._launch_browser()
+
+        with StaticMockServer() as server:
+            spider = DialogSpider()
+            req = Request(
+                url=server.urljoin("/index.html"),
+                meta={
+                    "playwright": True,
+                    "playwright_page_event_handlers": {
+                        "dialog": "missing_method",
+                    },
+                },
+            )
+            await handler._download_request(req, spider)
+
+        assert (
+            "scrapy-playwright",
+            logging.WARNING,
+            "Spider 'dialog' does not have a 'missing_method' attribute,"
+            " ignoring handler for event 'dialog'",
+        ) in caplog.record_tuples
+        assert getattr(spider, "dialog_message", None) is None
+
+        await handler.browser.close()
+
 
 class TestCaseChromium(MixinTestCase):
     browser_type = "chromium"
 
diff --git a/tox.ini b/tox.ini
index 608686c4..422941a7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,6 +10,7 @@ deps =
     pytest-cov>=2.8
     pytest-twisted>=1.11
 commands =
+    playwright install
     py.test --reactor=asyncio \
         --cov-report=term-missing --cov-report=html --cov-report=xml \
         --cov=scrapy_playwright {posargs: scrapy_playwright tests}
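The `playwright install` command added to the tox `commands` section downloads the browser binaries the new tests rely on. The same step can also be run directly from a shell or, if a Python entry point is preferred, via the module runner; a sketch, assuming the `playwright` package is already installed in the active environment (`chromium` is one of the supported browser names):

```python
import subprocess
import sys

# Delegate to Playwright's CLI, equivalent to running `playwright install chromium`.
subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
```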