From 5879bad7fe797d617592b8e14f6efb1703a5c032 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 16 Mar 2022 23:51:57 -0300 Subject: [PATCH 01/12] Use ScrapyDeprecationWarning instead of DeprecationWarning --- scrapy_playwright/handler.py | 3 ++- tests/test_browser_contexts.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 80c4f896..23877594 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -19,6 +19,7 @@ from scrapy import Spider, signals from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import Request, Response from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes @@ -96,7 +97,7 @@ def __init__(self, crawler: Crawler) -> None: "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use" " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in" " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context", - category=DeprecationWarning, + category=ScrapyDeprecationWarning, stacklevel=2, ) self.context_kwargs: defaultdict = defaultdict(dict) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 1a13ce9d..5be5c1be 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -4,6 +4,7 @@ import pytest from scrapy import Spider, Request +from scrapy.exceptions import ScrapyDeprecationWarning from tests import make_handler from tests.mockserver import StaticMockServer @@ -153,7 +154,7 @@ async def test_deprecated_setting(self): } with warnings.catch_warnings(record=True) as warning_list: async with make_handler(settings) as handler: - assert warning_list[0].category is DeprecationWarning + assert warning_list[0].category is ScrapyDeprecationWarning assert str(warning_list[0].message) == ( "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use" " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in" From 37dcfa8b09825de502d6b943c8b89effd35843cd Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 17 Mar 2022 00:11:01 -0300 Subject: [PATCH 02/12] Deprecate PageCoroutine in favor of PageMethod --- scrapy_playwright/handler.py | 47 ++++++++++++++++++++++-------------- scrapy_playwright/page.py | 28 ++++++++++++++++++--- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 34afe957..3c15c2e1 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -31,7 +31,7 @@ from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding from scrapy_playwright.headers import use_scrapy_headers -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod __all__ = ["ScrapyPlaywrightDownloadHandler"] @@ -248,23 +248,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res start_time = time() response = await page.goto(request.url) - page_coroutines = request.meta.get("playwright_page_coroutines") or () - if isinstance(page_coroutines, dict): - page_coroutines = page_coroutines.values() - for pc in page_coroutines: - if isinstance(pc, PageCoroutine): - try: - method = getattr(page, pc.method) - except AttributeError: - logger.warning(f"Ignoring {repr(pc)}: could not find coroutine") - else: - result = method(*pc.args, **pc.kwargs) - pc.result = await result if isawaitable(result) else result - await page.wait_for_load_state(timeout=self.default_navigation_timeout) - else: - logger.warning( - f"Ignoring {repr(pc)}: expected PageCoroutine, got {repr(type(pc))}" - ) + await self._apply_page_methods(page, request) body_str = await page.content() request.meta["download_latency"] = time() - start_time @@ -299,6 +283,33 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res ip_address=server_ip_address, ) + async def _apply_page_methods(self, page: Page, request: Request) -> None: + page_methods = request.meta.get("playwright_page_methods") or () + + if not page_methods and "playwright_page_coroutines" in request.meta: + page_methods = request.meta["playwright_page_coroutines"] + warnings.warn( + "The 'playwright_page_coroutines' request meta key is deprecated," + " please use 'playwright_page_methods' instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + + if isinstance(page_methods, dict): + page_methods = page_methods.values() + for pm in page_methods: + if isinstance(pm, PageMethod): + try: + method = getattr(page, pm.method) + except AttributeError: + logger.warning(f"Ignoring {repr(pm)}: could not find method") + else: + result = method(*pm.args, **pm.kwargs) + pm.result = await result if isawaitable(result) else result + await page.wait_for_load_state(timeout=self.default_navigation_timeout) + else: + logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}") + def _increment_request_stats(self, request: PlaywrightRequest) -> None: stats_prefix = "playwright/request_count" self.stats.inc_value(stats_prefix) diff --git a/scrapy_playwright/page.py b/scrapy_playwright/page.py index cfe646ad..5c0bd7c8 100644 --- a/scrapy_playwright/page.py +++ b/scrapy_playwright/page.py @@ -1,7 +1,14 @@ -class PageCoroutine: +import warnings + +from scrapy.exceptions import ScrapyDeprecationWarning + +__all__ = ["PageMethod"] + + +class PageMethod: """ - Represents a coroutine to be awaited on a Playwright page, - such as "click", "screenshot" or "evaluate" + Represents a method to be called (and awaited if necessary) on a + Playwright page, such as "click", "screenshot", "evaluate", etc. """ def __init__(self, method: str, *args, **kwargs) -> None: @@ -14,3 +21,18 @@ def __str__(self): return "<%s for method '%s'>" % (self.__class__.__name__, self.method) __repr__ = __str__ + + +class PageCoroutine(PageMethod): + def __init__(self, method: str, *args, **kwargs) -> None: + warnings.warn( + f"The {_qualname(self.__class__)} class is deprecated," + f" please use {_qualname(PageMethod)} instead.", + category=ScrapyDeprecationWarning, + stacklevel=2, + ) + super().__init__(method, *args, **kwargs) + + +def _qualname(cls: type) -> str: + return f"{cls.__module__}.{cls.__qualname__}" From 72ff95d7a7413e00c3d0749cc6fb6914055a8845 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 17 Mar 2022 00:23:26 -0300 Subject: [PATCH 03/12] Rename PageCoroutine -> PageMethod in tests --- ...age_coroutines.py => test_page_methods.py} | 44 ++++++++-------- tests/test_playwright_requests.py | 52 ++++++++++--------- 2 files changed, 50 insertions(+), 46 deletions(-) rename tests/{test_page_coroutines.py => test_page_methods.py} (63%) diff --git a/tests/test_page_coroutines.py b/tests/test_page_methods.py similarity index 63% rename from tests/test_page_coroutines.py rename to tests/test_page_methods.py index 8f60f186..052898f0 100644 --- a/tests/test_page_coroutines.py +++ b/tests/test_page_methods.py @@ -5,33 +5,33 @@ from scrapy import Spider, Request from scrapy.http.response.html import HtmlResponse -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod from tests import make_handler from tests.mockserver import StaticMockServer @pytest.mark.asyncio -async def test_page_coroutines(): - screenshot = PageCoroutine("screenshot", "foo", 123, path="/tmp/file", type="png") +async def test_page_methods(): + screenshot = PageMethod("screenshot", "foo", 123, path="/tmp/file", type="png") assert screenshot.method == "screenshot" assert screenshot.args == ("foo", 123) assert screenshot.kwargs == {"path": "/tmp/file", "type": "png"} assert screenshot.result is None - assert str(screenshot) == "" + assert str(screenshot) == "" -class MixinPageCoroutineTestCase: +class MixinPageMethodTestCase: @pytest.mark.asyncio - async def test_page_non_page_coroutine(self, caplog): + async def test_page_non_page_method(self, caplog): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with StaticMockServer() as server: req = Request( url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": [ - "not-a-page-coroutine", + "playwright_page_methods": [ + "not-a-page-method", 5, None, ], @@ -45,25 +45,25 @@ async def test_page_non_page_coroutine(self, caplog): assert resp.status == 200 assert "playwright" in resp.flags - for obj in req.meta["playwright_page_coroutines"]: + for obj in req.meta["playwright_page_methods"]: assert ( "scrapy-playwright", logging.WARNING, - f"Ignoring {repr(obj)}: expected PageCoroutine, got {repr(type(obj))}", + f"Ignoring {repr(obj)}: expected PageMethod, got {repr(type(obj))}", ) in caplog.record_tuples @pytest.mark.asyncio - async def test_page_mixed_page_coroutines(self, caplog): + async def test_page_mixed_page_methods(self, caplog): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with StaticMockServer() as server: req = Request( url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": { - "does_not_exist": PageCoroutine("does_not_exist"), - "is_closed": PageCoroutine("is_closed"), # not awaitable - "title": PageCoroutine("title"), # awaitable + "playwright_page_methods": { + "does_not_exist": PageMethod("does_not_exist"), + "is_closed": PageMethod("is_closed"), # not awaitable + "title": PageMethod("title"), # awaitable }, }, ) @@ -75,24 +75,24 @@ async def test_page_mixed_page_coroutines(self, caplog): assert resp.status == 200 assert "playwright" in resp.flags - does_not_exist = req.meta["playwright_page_coroutines"]["does_not_exist"] + does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"] assert ( "scrapy-playwright", logging.WARNING, - f"Ignoring {repr(does_not_exist)}: could not find coroutine", + f"Ignoring {repr(does_not_exist)}: could not find method", ) in caplog.record_tuples - assert not req.meta["playwright_page_coroutines"]["is_closed"].result - assert req.meta["playwright_page_coroutines"]["title"].result == "Awesome site" + assert not req.meta["playwright_page_methods"]["is_closed"].result + assert req.meta["playwright_page_methods"]["title"].result == "Awesome site" -class TestPageCoroutineChromium(MixinPageCoroutineTestCase): +class TestPageMethodChromium(MixinPageMethodTestCase): browser_type = "chromium" -class TestPageCoroutineFirefox(MixinPageCoroutineTestCase): +class TestPageMethodFirefox(MixinPageMethodTestCase): browser_type = "firefox" @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") -class TestPageCoroutineWebkit(MixinPageCoroutineTestCase): +class TestPageMethodWebkit(MixinPageMethodTestCase): browser_type = "webkit" diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 1711b2b2..d3f74cc9 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -10,7 +10,7 @@ from scrapy import Spider, Request, FormRequest from scrapy.http.response.html import HtmlResponse -from scrapy_playwright.page import PageCoroutine as PageCoro +from scrapy_playwright.page import PageMethod as PageMethod from tests import make_handler from tests.mockserver import MockServer, StaticMockServer @@ -70,14 +70,14 @@ async def test_post_request(self): assert "Request body: foo=bar" in resp.text @pytest.mark.asyncio - async def test_page_coroutine_navigation(self): + async def test_page_method_navigation(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with StaticMockServer() as server: req = Request( url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")], + "playwright_page_methods": [PageMethod("click", "a.lorem_ipsum")], }, ) resp = await handler._download_request(req, Spider("foo")) @@ -92,7 +92,7 @@ async def test_page_coroutine_navigation(self): assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit." @pytest.mark.asyncio - async def test_page_coroutine_infinite_scroll(self): + async def test_page_method_infinite_scroll(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with StaticMockServer() as server: req = Request( @@ -100,12 +100,16 @@ async def test_page_coroutine_infinite_scroll(self): headers={"User-Agent": "scrapy-playwright"}, meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoro("wait_for_selector", selector="div.quote"), - PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"), - PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"), + "playwright_page_methods": [ + PageMethod("wait_for_selector", selector="div.quote"), + PageMethod( + "evaluate", "window.scrollBy(0, document.body.scrollHeight)" + ), + PageMethod("wait_for_selector", selector="div.quote:nth-child(11)"), + PageMethod( + "evaluate", "window.scrollBy(0, document.body.scrollHeight)" + ), + PageMethod("wait_for_selector", selector="div.quote:nth-child(21)"), ], }, ) @@ -179,8 +183,8 @@ async def test_context_kwargs(self): url=server.urljoin("/scroll.html"), meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoro("wait_for_selector", selector="div.quote", timeout=1000), + "playwright_page_methods": [ + PageMethod("wait_for_selector", selector="div.quote", timeout=1000), ], }, ) @@ -188,7 +192,7 @@ async def test_context_kwargs(self): await handler._download_request(req, Spider("foo")) @pytest.mark.asyncio - async def test_page_coroutine_screenshot(self): + async def test_page_method_screenshot(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with NamedTemporaryFile(mode="w+b") as png_file: with StaticMockServer() as server: @@ -196,19 +200,19 @@ async def test_page_coroutine_screenshot(self): url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": { - "png": PageCoro("screenshot", path=png_file.name, type="png"), + "playwright_page_methods": { + "png": PageMethod("screenshot", path=png_file.name, type="png"), }, }, ) await handler._download_request(req, Spider("foo")) png_file.file.seek(0) - assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result + assert png_file.file.read() == req.meta["playwright_page_methods"]["png"].result assert get_mimetype(png_file) == "image/png" @pytest.mark.asyncio - async def test_page_coroutine_pdf(self): + async def test_page_method_pdf(self): if self.browser_type != "chromium": pytest.skip("PDF generation is supported only in Chromium") @@ -219,15 +223,15 @@ async def test_page_coroutine_pdf(self): url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": { - "pdf": PageCoro("pdf", path=pdf_file.name), + "playwright_page_methods": { + "pdf": PageMethod("pdf", path=pdf_file.name), }, }, ) await handler._download_request(req, Spider("foo")) pdf_file.file.seek(0) - assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result + assert pdf_file.file.read() == req.meta["playwright_page_methods"]["pdf"].result assert get_mimetype(pdf_file) == "application/pdf" @pytest.mark.asyncio @@ -316,8 +320,8 @@ async def test_event_handler_dialog_callable(self): url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoro("evaluate", "alert('foobar');"), + "playwright_page_methods": [ + PageMethod("evaluate", "alert('foobar');"), ], "playwright_page_event_handlers": { "dialog": spider.handle_dialog, @@ -337,8 +341,8 @@ async def test_event_handler_dialog_str(self): url=server.urljoin("/index.html"), meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoro("evaluate", "alert('foobar');"), + "playwright_page_methods": [ + PageMethod("evaluate", "alert('foobar');"), ], "playwright_page_event_handlers": { "dialog": "handle_dialog", From df0ed7b2c408b9e42c638ac5f983497fad44bc1e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 17 Mar 2022 00:25:58 -0300 Subject: [PATCH 04/12] Rename PageCoroutine -> PageMethod in examples --- examples/cookies.py | 6 +++--- examples/events.py | 6 +++--- examples/exception.py | 6 +++--- examples/post.py | 6 +++--- examples/scroll.py | 13 +++++++------ examples/storage.py | 6 +++--- 6 files changed, 22 insertions(+), 21 deletions(-) diff --git a/examples/cookies.py b/examples/cookies.py index 5e4c8786..f31ac3a6 100644 --- a/examples/cookies.py +++ b/examples/cookies.py @@ -2,7 +2,7 @@ from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class CookieSpider(Spider): @@ -18,8 +18,8 @@ def start_requests(self): cookies={"foo": "bar"}, meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoroutine( + "playwright_page_methods": [ + PageMethod( "screenshot", path=Path(__file__).parent / "cookies.png", full_page=True ), ], diff --git a/examples/events.py b/examples/events.py index 3a83e634..b821b0cf 100644 --- a/examples/events.py +++ b/examples/events.py @@ -1,7 +1,7 @@ from playwright.async_api import Dialog, Response as PlaywrightResponse from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class EventsSpider(Spider): @@ -16,8 +16,8 @@ def start_requests(self): url="https://example.org", meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoroutine("evaluate", "alert('foobar');"), + "playwright_page_methods": [ + PageMethod("evaluate", "alert('foobar');"), ], "playwright_page_event_handlers": { "dialog": self.handle_dialog, diff --git a/examples/exception.py b/examples/exception.py index f9f7fe1b..a060b00a 100644 --- a/examples/exception.py +++ b/examples/exception.py @@ -3,7 +3,7 @@ from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class HandleTimeoutMiddleware: @@ -13,8 +13,8 @@ def process_exception(self, request, exception, spider): url="https://httpbin.org/get", meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoroutine( + "playwright_page_methods": [ + PageMethod( "screenshot", path=Path(__file__).parent / "recovered.png", full_page=True ), ], diff --git a/examples/post.py b/examples/post.py index 7f1d92c6..e2d2f244 100644 --- a/examples/post.py +++ b/examples/post.py @@ -2,7 +2,7 @@ from scrapy import Spider, FormRequest from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class PostSpider(Spider): @@ -18,8 +18,8 @@ def start_requests(self): formdata={"foo": "bar"}, meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoroutine( + "playwright_page_methods": [ + PageMethod( "screenshot", path=Path(__file__).parent / "post.png", full_page=True ), ], diff --git a/examples/scroll.py b/examples/scroll.py index 75e53adf..de762844 100644 --- a/examples/scroll.py +++ b/examples/scroll.py @@ -2,7 +2,7 @@ from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class ScrollSpider(Spider): @@ -18,11 +18,11 @@ def start_requests(self): cookies={"foo": "bar", "asdf": "qwerty"}, meta={ "playwright": True, - "playwright_page_coroutines": [ - PageCoroutine("wait_for_selector", "div.quote"), - PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page - PageCoroutine( + "playwright_page_methods": [ + PageMethod("wait_for_selector", "div.quote"), + PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), + PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page + PageMethod( "screenshot", path=Path(__file__).parent / "scroll.png", full_page=True ), ], @@ -41,6 +41,7 @@ def parse(self, response): # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", }, + "LOG_LEVEL": "INFO", } ) process.crawl(ScrollSpider) diff --git a/examples/storage.py b/examples/storage.py index bc093ff2..7c67ee83 100644 --- a/examples/storage.py +++ b/examples/storage.py @@ -1,6 +1,6 @@ from scrapy import Spider, Request from scrapy.crawler import CrawlerProcess -from scrapy_playwright.page import PageCoroutine +from scrapy_playwright.page import PageMethod class StorageSpider(Spider): @@ -16,8 +16,8 @@ def start_requests(self): meta={ "playwright": True, "playwright_include_page": True, - "playwright_page_coroutines": [ - PageCoroutine("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"), + "playwright_page_methods": [ + PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"), ], }, ) From 0075d73e7cc439e7169473b51a57b2a1ea3f800b Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 17 Mar 2022 00:30:33 -0300 Subject: [PATCH 05/12] Remove unnecessary import alias --- tests/test_playwright_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index d3f74cc9..43e259a7 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -10,7 +10,7 @@ from scrapy import Spider, Request, FormRequest from scrapy.http.response.html import HtmlResponse -from scrapy_playwright.page import PageMethod as PageMethod +from scrapy_playwright.page import PageMethod from tests import make_handler from tests.mockserver import MockServer, StaticMockServer From 31dd51a256e1f29b7141d0ba325f3d4f0d78f9c2 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Thu, 17 Mar 2022 01:41:19 -0300 Subject: [PATCH 06/12] Test deprecation messages --- tests/test_page_methods.py | 82 ++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/tests/test_page_methods.py b/tests/test_page_methods.py index 052898f0..0a3baa57 100644 --- a/tests/test_page_methods.py +++ b/tests/test_page_methods.py @@ -1,11 +1,12 @@ import logging import platform +import warnings import pytest from scrapy import Spider, Request from scrapy.http.response.html import HtmlResponse -from scrapy_playwright.page import PageMethod +from scrapy_playwright.page import PageMethod, PageCoroutine from tests import make_handler from tests.mockserver import StaticMockServer @@ -21,7 +22,24 @@ async def test_page_methods(): assert str(screenshot) == "" +@pytest.mark.asyncio +async def test_deprecated_class(): + with warnings.catch_warnings(record=True) as warning_list: + PageCoroutine("screenshot", "foo", 123, path="/tmp/file", type="png") + assert str(warning_list[0].message) == ( + "The scrapy_playwright.page.PageCoroutine class is deprecated, " + "please use scrapy_playwright.page.PageMethod instead." + ) + + class MixinPageMethodTestCase: + def assert_correct_response(self, response: HtmlResponse, request: Request) -> None: + assert isinstance(response, HtmlResponse) + assert response.request is request + assert response.url == request.url + assert response.status == 200 + assert "playwright" in response.flags + @pytest.mark.asyncio async def test_page_non_page_method(self, caplog): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: @@ -39,18 +57,13 @@ async def test_page_non_page_method(self, caplog): ) resp = await handler._download_request(req, Spider("foo")) - assert isinstance(resp, HtmlResponse) - assert resp.request is req - assert resp.url == server.urljoin("/index.html") - assert resp.status == 200 - assert "playwright" in resp.flags - - for obj in req.meta["playwright_page_methods"]: - assert ( - "scrapy-playwright", - logging.WARNING, - f"Ignoring {repr(obj)}: expected PageMethod, got {repr(type(obj))}", - ) in caplog.record_tuples + self.assert_correct_response(resp, req) + for obj in req.meta["playwright_page_methods"]: + assert ( + "scrapy-playwright", + logging.WARNING, + f"Ignoring {repr(obj)}: expected PageMethod, got {repr(type(obj))}", + ) in caplog.record_tuples @pytest.mark.asyncio async def test_page_mixed_page_methods(self, caplog): @@ -69,20 +82,37 @@ async def test_page_mixed_page_methods(self, caplog): ) resp = await handler._download_request(req, Spider("foo")) - assert isinstance(resp, HtmlResponse) - assert resp.request is req - assert resp.url == server.urljoin("/index.html") - assert resp.status == 200 - assert "playwright" in resp.flags + self.assert_correct_response(resp, req) + does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"] + assert ( + "scrapy-playwright", + logging.WARNING, + f"Ignoring {repr(does_not_exist)}: could not find method", + ) in caplog.record_tuples + assert not req.meta["playwright_page_methods"]["is_closed"].result + assert req.meta["playwright_page_methods"]["title"].result == "Awesome site" - does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"] - assert ( - "scrapy-playwright", - logging.WARNING, - f"Ignoring {repr(does_not_exist)}: could not find method", - ) in caplog.record_tuples - assert not req.meta["playwright_page_methods"]["is_closed"].result - assert req.meta["playwright_page_methods"]["title"].result == "Awesome site" + @pytest.mark.asyncio + async def test_deprecated(self, caplog): + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageMethod("is_closed"), + ], + }, + ) + with warnings.catch_warnings(record=True) as warning_list: + resp = await handler._download_request(req, Spider("foo")) + + self.assert_correct_response(resp, req) + assert str(warning_list[0].message) == ( + "The 'playwright_page_coroutines' request meta key is deprecated," + " please use 'playwright_page_methods' instead." + ) class TestPageMethodChromium(MixinPageMethodTestCase): From f853309338507868ce6446e8fadef38c4f1ccc34 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 20 Mar 2022 12:19:05 -0300 Subject: [PATCH 07/12] Update PageCoroutine/PageMethod docs --- README.md | 57 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index c2256c5d..6498f791 100644 --- a/README.md +++ b/README.md @@ -311,31 +311,30 @@ async def parse_in_new_context(self, response): ``` -## Page coroutines +## Executing actions on pages A sorted iterable (`list`, `tuple` or `dict`, for instance) could be passed -in the `playwright_page_coroutines` +in the `playwright_page_methods` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key to request coroutines to be awaited on the `Page` before returning the final `Response` to the callback. This is useful when you need to perform certain actions on a page, like scrolling -down or clicking links, and you want everything to count as a single Scrapy -Response, containing the final result. +down or clicking links, and you want to handle only the final result in your callback. -### `PageCoroutine` class +### `PageMethod` class -* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`: +* `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`: - Represents a coroutine to be awaited on a `playwright.page.Page` object, - such as "click", "screenshot", "evaluate", etc. - `method` should be the name of the coroutine, `*args` and `**kwargs` - are passed to the function call. The return value of the coroutine call - will be stored in the `PageCoroutine.result` attribute. + Represents a method to be called (and awaited if necessary) on a + `playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc. + `method` is the name of the method, `*args` and `**kwargs` + are passed when calling such method. The return value + will be stored in the `PageMethod.result` attribute. For instance, ```python - PageCoroutine("screenshot", path="quotes.png", fullPage=True) + PageMethod("screenshot", path="quotes.png", fullPage=True) ``` produces the same effect as: @@ -345,15 +344,15 @@ Response, containing the final result. ``` -### Supported coroutines +### Supported methods Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page) -to see available coroutines +to see available methods. ### Impact on Response objects Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last -action performed on a page. If you issue a `PageCoroutine` with an action that results in +action performed on a page. If you issue a `PageMethod` with an action that results in a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the new URL, which might be different from the request's URL. @@ -415,15 +414,15 @@ class ClickAndSavePdfSpider(scrapy.Spider): url="https://example.org", meta=dict( playwright=True, - playwright_page_coroutines={ - "click": PageCoroutine("click", selector="a"), - "pdf": PageCoroutine("pdf", path="/tmp/file.pdf"), + playwright_page_methods={ + "click": PageMethod("click", selector="a"), + "pdf": PageMethod("pdf", path="/tmp/file.pdf"), }, ), ) def parse(self, response): - pdf_bytes = response.meta["playwright_page_coroutines"]["pdf"].result + pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result with open("iana.pdf", "wb") as fp: fp.write(pdf_bytes) yield {"url": response.url} # response.url is "https://www.iana.org/domains/reserved" @@ -441,10 +440,10 @@ class ScrollSpider(scrapy.Spider): meta=dict( playwright=True, playwright_include_page=True, - playwright_page_coroutines=[ - PageCoroutine("wait_for_selector", "div.quote"), - PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page + playwright_page_methods=[ + PageMethod("wait_for_selector", "div.quote"), + PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), + PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page ], ), ) @@ -482,3 +481,15 @@ For more examples, please see the scripts in the [examples](examples) directory. Deprecated since [`v0.0.4`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4), use the `PLAYWRIGHT_CONTEXTS` setting instead + +* `scrapy_playwright.page.PageCoroutine` class + + Deprecated since + [`v0.0.13`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13), + use `scrapy_playwright.page.PageMethod` instead + +* `playwright_page_coroutines` Request meta key + + Deprecated since + [`v0.0.13`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13), + use `playwright_page_methods` instead From 2a65ad1a13688c62d2139fe42750b1f1b0210b0a Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 20 Mar 2022 12:22:31 -0300 Subject: [PATCH 08/12] Rename test --- tests/test_page_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_page_methods.py b/tests/test_page_methods.py index 0a3baa57..5b8fa0c1 100644 --- a/tests/test_page_methods.py +++ b/tests/test_page_methods.py @@ -93,7 +93,7 @@ async def test_page_mixed_page_methods(self, caplog): assert req.meta["playwright_page_methods"]["title"].result == "Awesome site" @pytest.mark.asyncio - async def test_deprecated(self, caplog): + async def test_deprecated_request_meta_key(self, caplog): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: with StaticMockServer() as server: req = Request( From 3de053779d671b7e6ee5971231b0adbd54cf9741 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 21 Mar 2022 23:33:31 -0300 Subject: [PATCH 09/12] Update PageMethod docs --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6498f791..6a4d0da9 100644 --- a/README.md +++ b/README.md @@ -334,13 +334,34 @@ down or clicking links, and you want to handle only the final result in your cal For instance, ```python - PageMethod("screenshot", path="quotes.png", fullPage=True) + def start_requests(self): + yield Request( + url="https://example.org", + meta={ + "playwright": True, + "playwright_page_methods": [ + PageMethod("screenshot", path="example.png", fullPage=True), + ], + }, + ) + + def parse(self, response): + screenshot = response.meta["playwright_page_methods"][0] + # screenshot.result contains the image's bytes ``` produces the same effect as: ```python - # 'page' is a playwright.async_api.Page object - await page.screenshot(path="quotes.png", fullPage=True) + def start_requests(self): + yield Request( + url="https://example.org", + meta={"playwright": True, "playwright_include_page": True}, + ) + + async def parse(self, response): + page = response.meta["playwright_page"] + await page.screenshot(path="example.png", full_page=True) + await page.close() ``` @@ -358,6 +379,7 @@ new URL, which might be different from the request's URL. ## Page events + A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key. Keys are the name of the event to be handled (`dialog`, `download`, etc). From 16c1ac460abdc7fb541dfa26d6a700342cb8d30c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 23 Mar 2022 00:00:32 -0300 Subject: [PATCH 10/12] Make pylint happy --- tests/test_page_methods.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_page_methods.py b/tests/test_page_methods.py index 5b8fa0c1..617fb08e 100644 --- a/tests/test_page_methods.py +++ b/tests/test_page_methods.py @@ -32,14 +32,15 @@ async def test_deprecated_class(): ) -class MixinPageMethodTestCase: - def assert_correct_response(self, response: HtmlResponse, request: Request) -> None: - assert isinstance(response, HtmlResponse) - assert response.request is request - assert response.url == request.url - assert response.status == 200 - assert "playwright" in response.flags +def assert_correct_response(response: HtmlResponse, request: Request) -> None: + assert isinstance(response, HtmlResponse) + assert response.request is request + assert response.url == request.url + assert response.status == 200 + assert "playwright" in response.flags + +class MixinPageMethodTestCase: @pytest.mark.asyncio async def test_page_non_page_method(self, caplog): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: From 3920bc96d677aad1c235401132604f1582f4414b Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 23 Mar 2022 09:29:37 -0300 Subject: [PATCH 11/12] Fix function call --- tests/test_page_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_page_methods.py b/tests/test_page_methods.py index 617fb08e..24e0a5b3 100644 --- a/tests/test_page_methods.py +++ b/tests/test_page_methods.py @@ -58,7 +58,7 @@ async def test_page_non_page_method(self, caplog): ) resp = await handler._download_request(req, Spider("foo")) - self.assert_correct_response(resp, req) + assert_correct_response(resp, req) for obj in req.meta["playwright_page_methods"]: assert ( "scrapy-playwright", @@ -83,7 +83,7 @@ async def test_page_mixed_page_methods(self, caplog): ) resp = await handler._download_request(req, Spider("foo")) - self.assert_correct_response(resp, req) + assert_correct_response(resp, req) does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"] assert ( "scrapy-playwright", @@ -109,7 +109,7 @@ async def test_deprecated_request_meta_key(self, caplog): with warnings.catch_warnings(record=True) as warning_list: resp = await handler._download_request(req, Spider("foo")) - self.assert_correct_response(resp, req) + assert_correct_response(resp, req) assert str(warning_list[0].message) == ( "The 'playwright_page_coroutines' request meta key is deprecated," " please use 'playwright_page_methods' instead." From bd6b1cf502eb37c06f0b5fef666870041f8c8895 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 25 Mar 2022 23:56:05 -0300 Subject: [PATCH 12/12] Update readme and changelog --- README.md | 81 ++++++++++++++++++++++++++++------------------------ changelog.md | 5 ++++ 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 557a0e5f..185de13a 100644 --- a/README.md +++ b/README.md @@ -339,45 +339,45 @@ down or clicking links, and you want to handle only the final result in your cal ### `PageMethod` class -* `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`: +#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`: - Represents a method to be called (and awaited if necessary) on a - `playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc. - `method` is the name of the method, `*args` and `**kwargs` - are passed when calling such method. The return value - will be stored in the `PageMethod.result` attribute. +Represents a method to be called (and awaited if necessary) on a +`playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc. +`method` is the name of the method, `*args` and `**kwargs` +are passed when calling such method. The return value +will be stored in the `PageMethod.result` attribute. - For instance, - ```python - def start_requests(self): - yield Request( - url="https://example.org", - meta={ - "playwright": True, - "playwright_page_methods": [ - PageMethod("screenshot", path="example.png", fullPage=True), - ], - }, - ) +For instance, +```python +def start_requests(self): + yield Request( + url="https://example.org", + meta={ + "playwright": True, + "playwright_page_methods": [ + PageMethod("screenshot", path="example.png", fullPage=True), + ], + }, + ) - def parse(self, response): - screenshot = response.meta["playwright_page_methods"][0] - # screenshot.result contains the image's bytes - ``` +def parse(self, response): + screenshot = response.meta["playwright_page_methods"][0] + # screenshot.result contains the image's bytes +``` - produces the same effect as: - ```python - def start_requests(self): - yield Request( - url="https://example.org", - meta={"playwright": True, "playwright_include_page": True}, - ) +produces the same effect as: +```python +def start_requests(self): + yield Request( + url="https://example.org", + meta={"playwright": True, "playwright_include_page": True}, + ) - async def parse(self, response): - page = response.meta["playwright_page"] - await page.screenshot(path="example.png", full_page=True) - await page.close() - ``` +async def parse(self, response): + page = response.meta["playwright_page"] + await page.screenshot(path="example.png", full_page=True) + await page.close() +``` ### Supported methods @@ -508,7 +508,14 @@ For more examples, please see the scripts in the [examples](examples) directory. Refer to the [Proxy support](#proxy-support) section for more information. -##  Deprecations +##  Deprecation policy + +Deprecated features will be supported for at least six months +following the release that deprecated them. After that, they +may be removed at any time. See the [changelog](changelog.md) +for more information about deprecations and removals. + +### Currently deprecated features * `PLAYWRIGHT_CONTEXT_ARGS` setting (type `dict`, default `{}`) @@ -522,11 +529,11 @@ For more examples, please see the scripts in the [examples](examples) directory. * `scrapy_playwright.page.PageCoroutine` class Deprecated since - [`v0.0.13`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13), + [`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14), use `scrapy_playwright.page.PageMethod` instead * `playwright_page_coroutines` Request meta key Deprecated since - [`v0.0.13`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13), + [`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14), use `playwright_page_methods` instead diff --git a/changelog.md b/changelog.md index c3e0728f..b1c25b74 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,10 @@ # scrapy-playwright changelog +### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26) + +* Renamed `PageCoroutine` to `PageMethod` (`PageCoroutine` is now deprecated) + + ### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24) * PageCoroutine checks