diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index dfc94e1d..6785f34c 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -115,12 +115,14 @@ async def _create_page(self, request: Request) -> Page: @inlineCallbacks def close(self) -> Deferred: yield super().close() - for context in self.contexts.copy().values(): - yield deferred_from_coro(context.close()) + yield deferred_from_coro(self._close()) + + async def _close(self) -> None: + self.contexts.clear() if getattr(self, "browser", None): logger.info("Closing browser") - yield deferred_from_coro(self.browser.close()) - yield deferred_from_coro(self.playwright_context_manager.__aexit__()) + await self.browser.close() + await self.playwright_context_manager.__aexit__() def download_request(self, request: Request, spider: Spider) -> Deferred: if request.meta.get("playwright"): diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..66b35fd1 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,20 @@ +from contextlib import asynccontextmanager + +from scrapy.utils.test import get_crawler + + +@asynccontextmanager +async def make_handler(settings_dict: dict): + """Convenience function to obtain an initialized handler and close it gracefully""" + from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler + + crawler = get_crawler(settings_dict=settings_dict) + handler = ScrapyPlaywrightDownloadHandler(crawler=crawler) + try: + await handler._launch_browser() + except: # noqa (E722) + pass + else: + yield handler + finally: + await handler._close() diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index ebca42d9..07bcaacf 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -3,10 +3,8 @@ import pytest from scrapy import Spider, Request -from scrapy.utils.test import get_crawler - -from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler +from tests import make_handler from tests.mockserver import StaticMockServer @@ -29,63 +27,56 @@ async def test_contexts_startup(self): }, }, } - handler = ScrapyPlaywrightDownloadHandler(get_crawler(settings_dict=settings)) - await handler._launch_browser() - - with StaticMockServer() as server: - meta = { - "playwright": True, - "playwright_include_page": True, - "playwright_context": "first", - } - req = Request(server.urljoin("/index.html"), meta=meta) - resp = await handler._download_request(req, Spider("foo")) - - page = resp.meta["playwright_page"] - storage_state = await page.context.storage_state() - cookie = storage_state["cookies"][0] - assert cookie["name"] == "foo" - assert cookie["value"] == "bar" - assert cookie["domain"] == "example.org" - - await page.close() - await handler.browser.close() + async with make_handler(settings) as handler: + with StaticMockServer() as server: + meta = { + "playwright": True, + "playwright_include_page": True, + "playwright_context": "first", + } + req = Request(server.urljoin("/index.html"), meta=meta) + resp = await handler._download_request(req, Spider("foo")) + + page = resp.meta["playwright_page"] + storage_state = await page.context.storage_state() + await page.context.close() + await page.close() + cookie = storage_state["cookies"][0] + assert cookie["name"] == "foo" + assert cookie["value"] == "bar" + assert cookie["domain"] == "example.org" @pytest.mark.asyncio async def test_contexts_dynamic(self): - settings = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type} - handler = ScrapyPlaywrightDownloadHandler(get_crawler(settings_dict=settings)) - await handler._launch_browser() - - with StaticMockServer() as server: - meta = { - "playwright": True, - "playwright_include_page": True, - "playwright_context": "new", - "playwright_context_kwargs": { - "storage_state": { - "cookies": [ - { - "url": "https://example.org", - "name": "asdf", - "value": "qwerty", - }, - ], + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + + with StaticMockServer() as server: + meta = { + "playwright": True, + "playwright_include_page": True, + "playwright_context": "new", + "playwright_context_kwargs": { + "storage_state": { + "cookies": [ + { + "url": "https://example.org", + "name": "asdf", + "value": "qwerty", + }, + ], + }, }, - }, - } - req = Request(server.urljoin("/index.html"), meta=meta) - resp = await handler._download_request(req, Spider("foo")) - - page = resp.meta["playwright_page"] - storage_state = await page.context.storage_state() - cookie = storage_state["cookies"][0] - assert cookie["name"] == "asdf" - assert cookie["value"] == "qwerty" - assert cookie["domain"] == "example.org" - - await page.close() - await handler.browser.close() + } + req = Request(server.urljoin("/index.html"), meta=meta) + resp = await handler._download_request(req, Spider("foo")) + + page = resp.meta["playwright_page"] + storage_state = await page.context.storage_state() + await page.close() + cookie = storage_state["cookies"][0] + assert cookie["name"] == "asdf" + assert cookie["value"] == "qwerty" + assert cookie["domain"] == "example.org" @pytest.mark.asyncio async def test_deprecated_setting(self): @@ -104,33 +95,29 @@ async def test_deprecated_setting(self): }, } with warnings.catch_warnings(record=True) as warning_list: - handler = ScrapyPlaywrightDownloadHandler(get_crawler(settings_dict=settings)) - await handler._launch_browser() - - assert warning_list[0].category is DeprecationWarning - assert str(warning_list[0].message) == ( - "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use" - " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in" - " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context" - ) - - with StaticMockServer() as server: - meta = { - "playwright": True, - "playwright_include_page": True, - } - req = Request(server.urljoin("/index.html"), meta=meta) - resp = await handler._download_request(req, Spider("foo")) - - page = resp.meta["playwright_page"] - storage_state = await page.context.storage_state() - cookie = storage_state["cookies"][0] - assert cookie["name"] == "asdf" - assert cookie["value"] == "qwerty" - assert cookie["domain"] == "example.org" - - await page.close() - await handler.browser.close() + async with make_handler(settings) as handler: + assert warning_list[0].category is DeprecationWarning + assert str(warning_list[0].message) == ( + "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use" + " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in" + " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context" + ) + + with StaticMockServer() as server: + meta = { + "playwright": True, + "playwright_include_page": True, + } + req = Request(server.urljoin("/index.html"), meta=meta) + resp = await handler._download_request(req, Spider("foo")) + + page = resp.meta["playwright_page"] + storage_state = await page.context.storage_state() + await page.close() + cookie = storage_state["cookies"][0] + assert cookie["name"] == "asdf" + assert cookie["value"] == "qwerty" + assert cookie["domain"] == "example.org" class TestCaseMultipleContextsChromium(MixinTestCaseMultipleContexts): diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index 5ef7341e..c63c5cec 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -7,11 +7,10 @@ from playwright.async_api import Dialog, Page as PlaywrightPage, TimeoutError from scrapy import Spider, Request, FormRequest from scrapy.http.response.html import HtmlResponse -from scrapy.utils.test import get_crawler -from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler from scrapy_playwright.page import PageCoroutine as PageCoro +from tests import make_handler from tests.mockserver import MockServer, StaticMockServer @@ -36,289 +35,221 @@ async def handle_dialog(self, dialog: Dialog) -> None: class MixinTestCase: @pytest.mark.asyncio async def test_basic_response(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - meta = {"playwright": True, "playwright_include_page": True} - req = Request(server.urljoin("/index.html"), meta=meta) - resp = await handler._download_request(req, Spider("foo")) - - assert isinstance(resp, HtmlResponse) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "playwright" in resp.flags - assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"] - assert isinstance(resp.meta["playwright_page"], PlaywrightPage) - assert resp.meta["playwright_page"].url == resp.url - - await resp.meta["playwright_page"].close() - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + meta = {"playwright": True, "playwright_include_page": True} + req = Request(server.urljoin("/index.html"), meta=meta) + resp = await handler._download_request(req, Spider("foo")) + + assert isinstance(resp, HtmlResponse) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "playwright" in resp.flags + assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"] + assert isinstance(resp.meta["playwright_page"], PlaywrightPage) + assert resp.meta["playwright_page"].url == resp.url + + await resp.meta["playwright_page"].close() @pytest.mark.asyncio async def test_post_request(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with MockServer() as server: - req = FormRequest( - server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"} - ) - resp = await handler._download_request(req, Spider("foo")) - - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "playwright" in resp.flags - assert "Request body: foo=bar" in resp.text - - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with MockServer() as server: + req = FormRequest( + server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"} + ) + resp = await handler._download_request(req, Spider("foo")) + + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "playwright" in resp.flags + assert "Request body: foo=bar" in resp.text @pytest.mark.asyncio async def test_page_coroutine_navigation(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")], - }, - ) - resp = await handler._download_request(req, Spider("foo")) - - assert isinstance(resp, HtmlResponse) - assert resp.request is req - assert resp.url == server.urljoin("/lorem_ipsum.html") - assert resp.status == 200 - assert "playwright" in resp.flags - assert resp.css("title::text").get() == "Lorem Ipsum" - text = resp.css("p::text").get() - assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit." - - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")], + }, + ) + resp = await handler._download_request(req, Spider("foo")) + + assert isinstance(resp, HtmlResponse) + assert resp.request is req + assert resp.url == server.urljoin("/lorem_ipsum.html") + assert resp.status == 200 + assert "playwright" in resp.flags + assert resp.css("title::text").get() == "Lorem Ipsum" + text = resp.css("p::text").get() + assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit." @pytest.mark.asyncio async def test_page_coroutine_infinite_scroll(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/scroll.html"), - headers={"User-Agent": "scrapy-playwright"}, - meta={ - "playwright": True, - "playwright_page_coroutines": [ - PageCoro("wait_for_selector", selector="div.quote"), - PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"), - PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), - PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"), - ], - }, - ) - resp = await handler._download_request(req, Spider("foo")) - - assert isinstance(resp, HtmlResponse) - assert resp.request is req - assert resp.url == server.urljoin("/scroll.html") - assert resp.status == 200 - assert "playwright" in resp.flags - assert len(resp.css("div.quote")) == 30 - - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/scroll.html"), + headers={"User-Agent": "scrapy-playwright"}, + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("wait_for_selector", selector="div.quote"), + PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), + PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"), + PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), + PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"), + ], + }, + ) + resp = await handler._download_request(req, Spider("foo")) + + assert isinstance(resp, HtmlResponse) + assert resp.request is req + assert resp.url == server.urljoin("/scroll.html") + assert resp.status == 200 + assert "playwright" in resp.flags + assert len(resp.css("div.quote")) == 30 @pytest.mark.asyncio async def test_timeout(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler( - settings_dict={ - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000, - } - ) - ) - await handler._launch_browser() - - with MockServer() as server: - req = Request(server.urljoin("/index.html"), meta={"playwright": True}) - with pytest.raises(TimeoutError): - await handler._download_request(req, Spider("foo")) - - await handler.browser.close() + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request(server.urljoin("/index.html"), meta={"playwright": True}) + with pytest.raises(TimeoutError): + await handler._download_request(req, Spider("foo")) @pytest.mark.asyncio async def test_context_kwargs(self): - handler = ScrapyPlaywrightDownloadHandler( - get_crawler( - settings_dict={ - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_CONTEXTS": { - "default": {"java_script_enabled": False}, + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": { + "default": {"java_script_enabled": False}, + }, + } + async with make_handler(settings_dict) as handler: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/scroll.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("wait_for_selector", selector="div.quote", timeout=1000), + ], }, - } - ) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/scroll.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": [ - PageCoro("wait_for_selector", selector="div.quote", timeout=1000), - ], - }, - ) - with pytest.raises(TimeoutError): - await handler._download_request(req, Spider("foo")) - - await handler.browser.close() + ) + with pytest.raises(TimeoutError): + await handler._download_request(req, Spider("foo")) @pytest.mark.asyncio async def test_page_coroutine_screenshot(self): - png_file = NamedTemporaryFile(mode="w+b") - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": { - "png": PageCoro("screenshot", path=png_file.name, type="png"), - }, - }, - ) - await handler._download_request(req, Spider("foo")) - - assert get_mimetype(png_file) == "image/png" - - png_file.file.seek(0) - assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result - - png_file.close() - - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with NamedTemporaryFile(mode="w+b") as png_file: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": { + "png": PageCoro("screenshot", path=png_file.name, type="png"), + }, + }, + ) + await handler._download_request(req, Spider("foo")) + + png_file.file.seek(0) + assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result + assert get_mimetype(png_file) == "image/png" @pytest.mark.asyncio async def test_page_coroutine_pdf(self): if self.browser_type != "chromium": pytest.skip("PDF generation is supported only in Chromium") - pdf_file = NamedTemporaryFile(mode="w+b") - handler = ScrapyPlaywrightDownloadHandler( - get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - ) - await handler._launch_browser() - - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": { - "pdf": PageCoro("pdf", path=pdf_file.name), - }, - }, - ) - await handler._download_request(req, Spider("foo")) - - assert get_mimetype(pdf_file) == "application/pdf" - - pdf_file.file.seek(0) - assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result - - pdf_file.close() - - await handler.browser.close() + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with NamedTemporaryFile(mode="w+b") as pdf_file: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": { + "pdf": PageCoro("pdf", path=pdf_file.name), + }, + }, + ) + await handler._download_request(req, Spider("foo")) + + pdf_file.file.seek(0) + assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result + assert get_mimetype(pdf_file) == "application/pdf" @pytest.mark.asyncio async def test_event_handler_dialog_callable(self): - crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - handler = ScrapyPlaywrightDownloadHandler(crawler) - await handler._launch_browser() - - with StaticMockServer() as server: - spider = DialogSpider() - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": [ - PageCoro("evaluate", "alert('foobar');"), - ], - "playwright_page_event_handlers": { - "dialog": spider.handle_dialog, + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("evaluate", "alert('foobar');"), + ], + "playwright_page_event_handlers": { + "dialog": spider.handle_dialog, + }, }, - }, - ) - await handler._download_request(req, spider) - - assert spider.dialog_message == "foobar" + ) + await handler._download_request(req, spider) - await handler.browser.close() + assert spider.dialog_message == "foobar" @pytest.mark.asyncio async def test_event_handler_dialog_str(self): - crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - handler = ScrapyPlaywrightDownloadHandler(crawler) - await handler._launch_browser() - - with StaticMockServer() as server: - spider = DialogSpider() - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_coroutines": [ - PageCoro("evaluate", "alert('foobar');"), - ], - "playwright_page_event_handlers": { - "dialog": "handle_dialog", + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_coroutines": [ + PageCoro("evaluate", "alert('foobar');"), + ], + "playwright_page_event_handlers": { + "dialog": "handle_dialog", + }, }, - }, - ) - await handler._download_request(req, spider) + ) + await handler._download_request(req, spider) - assert spider.dialog_message == "foobar" - - await handler.browser.close() + assert spider.dialog_message == "foobar" @pytest.mark.asyncio async def test_event_handler_dialog_missing(self, caplog): - crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) - handler = ScrapyPlaywrightDownloadHandler(crawler) - await handler._launch_browser() - - with StaticMockServer() as server: - spider = DialogSpider() - req = Request( - url=server.urljoin("/index.html"), - meta={ - "playwright": True, - "playwright_page_event_handlers": { - "dialog": "missing_method", + async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: + with StaticMockServer() as server: + spider = DialogSpider() + req = Request( + url=server.urljoin("/index.html"), + meta={ + "playwright": True, + "playwright_page_event_handlers": { + "dialog": "missing_method", + }, }, - }, - ) - await handler._download_request(req, spider) + ) + await handler._download_request(req, spider) assert ( "scrapy-playwright", @@ -328,8 +259,6 @@ async def test_event_handler_dialog_missing(self, caplog): ) in caplog.record_tuples assert getattr(spider, "dialog_message", None) is None - await handler.browser.close() - class TestCaseChromium(MixinTestCase): browser_type = "chromium"