From 67ab268402ef0d2324579508e334bf448dc0f511 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 24 Sep 2021 14:38:45 -0300 Subject: [PATCH 1/4] Event handlers for Page objects --- README.md | 33 +++++++++++ examples/events.py | 47 ++++++++++++++++ scrapy_playwright/handler.py | 15 +++++ tests/test_playwright_requests.py | 94 ++++++++++++++++++++++++++++++- tox.ini | 1 + 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 examples/events.py diff --git a/README.md b/README.md index 5847f7d2..eb1af1b6 100644 --- a/README.md +++ b/README.md @@ -271,6 +271,39 @@ Response, containing the final result. ``` +## Page events +A dictionary with event names as keys and callables as values can be passed as the +`playwright_page_events` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) +key, to attach handlers to Page events. For instance: + +```python +from playwright.async_api import Dialog + +async def handle_dialog(dialog: Dialog) -> None: + logging.info(f"Handled dialog with message: {dialog.message}") + await dialog.dismiss() + +class EventSpider(scrapy.Spider): + name = "event" + + def start_requests(self): + yield scrapy.Request( + url="https://example.org", + meta=dict( + playwright=True, + playwright_page_events={ + "dialog": handle_dialog, + }, + ), + ) +``` + +Se the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of +the accepted avents and the arguments passed to their handlers. + +**Note**: keep in mind that these handlers will remain attached for subsequent downloads using the same page. 
+ + ## Examples **Click on a link, save the resulting page as PDF** diff --git a/examples/events.py b/examples/events.py new file mode 100644 index 00000000..86c79265 --- /dev/null +++ b/examples/events.py @@ -0,0 +1,47 @@ +from playwright.async_api import Dialog +from scrapy import Spider, Request +from scrapy.crawler import CrawlerProcess +from scrapy_playwright.page import PageCoroutine + + +class EventsSpider(Spider): + """ + Handle page events + """ + + name = "events" + + def start_requests(self): + yield Request( + url="https://example.org", + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoroutine("evaluate", "alert('foobar');"), + ], + "playwright_page_events": { + "dialog": self.handle_dialog, + }, + }, + ) + + async def handle_dialog(self, dialog: Dialog) -> None: + self.logger.info(f"Handled dialog with message: {dialog.message}") + await dialog.dismiss() + + def parse(self, response): + return {"url": response.url} + + +if __name__ == "__main__": + process = CrawlerProcess( + settings={ + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "DOWNLOAD_HANDLERS": { + "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + }, + } + ) + process.crawl(EventsSpider) + process.start() diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 7758f60b..cf9dd7d6 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -131,6 +131,21 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: page = request.meta.get("playwright_page") if not isinstance(page, Page): page = await self._create_page(request) + + # attach event handlers + event_handlers = request.meta.get("playwright_page_events") or {} + for event, handler in event_handlers.items(): + if callable(handler): + page.on(event, handler) + elif isinstance(handler, str): + try: + page.on(event, 
getattr(spider, handler)) + except AttributeError: + logger.warning( + f"Spider '{spider.name}' does not have a '{handler}' attribute," + f" ignoring handler for event '{event}'" + ) + await page.unroute("**") await page.route( "**", diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 0ea47ef9..b1bcf307 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -1,9 +1,10 @@ +import logging import platform import subprocess from tempfile import NamedTemporaryFile import pytest -from playwright.async_api import Page as PlaywrightPage, TimeoutError +from playwright.async_api import Dialog, Page as PlaywrightPage, TimeoutError from scrapy import Spider, Request, FormRequest from scrapy.http.response.html import HtmlResponse from scrapy.utils.test import get_crawler @@ -22,6 +23,16 @@ def get_mimetype(file): ).stdout.strip() +class DialogSpider(Spider): + """A spider with a method to handle the "dialog" page event""" + + name = "dialog" + + async def handle_dialog(self, dialog: Dialog) -> None: + self.dialog_message = dialog.message + await dialog.dismiss() + + class MixinTestCase: @pytest.mark.asyncio async def test_basic_response(self): @@ -238,6 +249,87 @@ async def test_page_coroutine_pdf(self): await handler.browser.close() + @pytest.mark.asyncio + async def test_event_handler_dialog_callable(self): + crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) + handler = ScrapyPlaywrightDownloadHandler(crawler) + await handler._launch_browser() + + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("evaluate", "alert('foobar');"), + ], + "playwright_page_events": { + "dialog": spider.handle_dialog, + }, + }, + ) + await handler._download_request(req, spider) + + assert spider.dialog_message == "foobar" + + await handler.browser.close() 
+ + @pytest.mark.asyncio + async def test_event_handler_dialog_str(self): + crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) + handler = ScrapyPlaywrightDownloadHandler(crawler) + await handler._launch_browser() + + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("evaluate", "alert('foobar');"), + ], + "playwright_page_events": { + "dialog": "handle_dialog", + }, + }, + ) + await handler._download_request(req, spider) + + assert spider.dialog_message == "foobar" + + await handler.browser.close() + + @pytest.mark.asyncio + async def test_event_handler_dialog_missing(self, caplog): + crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) + handler = ScrapyPlaywrightDownloadHandler(crawler) + await handler._launch_browser() + + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_events": { + "dialog": "missing_method", + }, + }, + ) + await handler._download_request(req, spider) + + assert ( + "scrapy-playwright", + logging.WARNING, + "Spider 'dialog' does not have a 'missing_method' attribute," + " ignoring handler for event 'dialog'", + ) in caplog.record_tuples + assert getattr(spider, "dialog_message", None) is None + + await handler.browser.close() + class TestCaseChromium(MixinTestCase): browser_type = "chromium" diff --git a/tox.ini b/tox.ini index 608686c4..422941a7 100644 --- a/tox.ini +++ b/tox.ini @@ -10,6 +10,7 @@ deps = pytest-cov>=2.8 pytest-twisted>=1.11 commands = + playwright install py.test --reactor=asyncio \ --cov-report=term-missing --cov-report=html --cov-report=xml \ --cov=scrapy_playwright {posargs: scrapy_playwright tests} From e4ed354474eb56f1ef1f2a6cb902013a252a5aef Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 1 
Oct 2021 00:51:13 -0300 Subject: [PATCH 2/4] Rename meta key --- README.md | 10 +++++++--- scrapy_playwright/handler.py | 2 +- tests/test_playwright_requests.py | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index eb1af1b6..11b817b3 100644 --- a/README.md +++ b/README.md @@ -273,7 +273,7 @@ Response, containing the final result. ## Page events A dictionary with event names as keys and callables as values can be passed as the -`playwright_page_events` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) +`playwright_page_event_handlers` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key, to attach handlers to Page events. For instance: ```python @@ -291,7 +291,7 @@ class EventSpider(scrapy.Spider): url="https://example.org", meta=dict( playwright=True, - playwright_page_events={ + playwright_page_event_handlers={ "dialog": handle_dialog, }, ), @@ -301,7 +301,11 @@ class EventSpider(scrapy.Spider): Se the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of the accepted avents and the arguments passed to their handlers. -**Note**: keep in mind that these handlers will remain attached for subsequent downloads using the same page. +**Note**: keep in mind that, unless they are +[removed later](https://playwright.dev/python/docs/events/#addingremoving-event-listener), +these handlers will remain attached to the page and will be called for subsequent +downloads using the same page. This is usually not a problem, since by default +requests are performed in single-use pages. 
## Examples diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index cf9dd7d6..dfc94e1d 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -133,7 +133,7 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: page = await self._create_page(request) # attach event handlers - event_handlers = request.meta.get("playwright_page_events") or {} + event_handlers = request.meta.get("playwright_page_event_handlers") or {} for event, handler in event_handlers.items(): if callable(handler): page.on(event, handler) diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index b1bcf307..5ef7341e 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -264,7 +264,7 @@ async def test_event_handler_dialog_callable(self): "playwright_page_coroutines": [ PageCoro("evaluate", "alert('foobar');"), ], - "playwright_page_events": { + "playwright_page_event_handlers": { "dialog": spider.handle_dialog, }, }, @@ -290,7 +290,7 @@ async def test_event_handler_dialog_str(self): "playwright_page_coroutines": [ PageCoro("evaluate", "alert('foobar');"), ], - "playwright_page_events": { + "playwright_page_event_handlers": { "dialog": "handle_dialog", }, }, @@ -313,7 +313,7 @@ async def test_event_handler_dialog_missing(self, caplog): url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_events": { + "playwright_page_event_handlers": { "dialog": "missing_method", }, }, From 930b2aa6d17147306aa6dc00d610146dd45af3d6 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 1 Oct 2021 01:10:52 -0300 Subject: [PATCH 3/4] Update readme --- README.md | 13 ++++++++++--- examples/events.py | 8 ++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 11b817b3..54525bec 100644 --- a/README.md +++ b/README.md @@ -272,9 +272,12 @@ Response, containing the final result. 
## Page events -A dictionary with event names as keys and callables as values can be passed as the -`playwright_page_event_handlers` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) -key, to attach handlers to Page events. For instance: +A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers` +[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key. +Keys are the names of the events to be handled (`dialog`, `download`, etc.). +Values can be either callables or strings (in which case a spider method with that name will be looked up). + +Example: ```python from playwright.async_api import Dialog @@ -293,9 +296,13 @@ class EventSpider(scrapy.Spider): playwright=True, playwright_page_event_handlers={ "dialog": handle_dialog, + "response": "handle_response", }, ), ) + + async def handle_response(self, response: PlaywrightResponse) -> None: + logging.info(f"Received response with URL {response.url}") ``` Se the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of diff --git a/examples/events.py b/examples/events.py index 86c79265..3a83e634 100644 --- a/examples/events.py +++ b/examples/events.py @@ -1,4 +1,4 @@ -from playwright.async_api import Dialog +from playwright.async_api import Dialog, Response as PlaywrightResponse from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess from scrapy_playwright.page import PageCoroutine @@ -19,8 +19,9 @@ def start_requests(self): "playwright_page_coroutines": [ PageCoroutine("evaluate", "alert('foobar');"), ], - "playwright_page_events": { + "playwright_page_event_handlers": { "dialog": self.handle_dialog, + "response": "handle_response", }, }, ) @@ -29,6 +30,9 @@ async def handle_dialog(self, dialog: Dialog) -> None: self.logger.info(f"Handled dialog with message: {dialog.message}") await dialog.dismiss() + async def
handle_response(self, response: PlaywrightResponse) -> None: + self.logger.info(f"Received response with URL {response.url}") + def parse(self, response): return {"url": response.url} From 9eb25563e31c6401707e5663890ee5b5e5275f7c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta <1731933+elacuesta@users.noreply.github.com> Date: Fri, 1 Oct 2021 10:53:58 -0300 Subject: [PATCH 4/4] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 54525bec..f66af883 100644 --- a/README.md +++ b/README.md @@ -305,8 +305,8 @@ class EventSpider(scrapy.Spider): logging.info(f"Received response with URL {response.url}") ``` -Se the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of -the accepted avents and the arguments passed to their handlers. +See the [upstream `Page` docs](https://playwright.dev/python/docs/api/class-page/) for a list of +the accepted events and the arguments passed to their handlers. **Note**: keep in mind that, unless they are [removed later](https://playwright.dev/python/docs/events/#addingremoving-event-listener),