From 9c1a773893c4d258a154196bc1fea124d94ea36d Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 14 May 2022 15:37:30 -0300 Subject: [PATCH 01/10] Launch persistent context --- examples/persistent_context.py | 35 +++++++++++++++++++ scrapy_playwright/handler.py | 63 ++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 15 deletions(-) create mode 100644 examples/persistent_context.py diff --git a/examples/persistent_context.py b/examples/persistent_context.py new file mode 100644 index 00000000..906389ef --- /dev/null +++ b/examples/persistent_context.py @@ -0,0 +1,35 @@ +from pathlib import Path + +from scrapy import Spider, Request + + +class PersistentContextSpider(Spider): + """Use a persistent browser context""" + + name = "persistent_context" + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "DOWNLOAD_HANDLERS": { + "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + }, + "" + "PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS": { + "user_data_dir": str(Path.home() / "playwright-persistent-context"), + "java_script_enabled": False, + "extra_http_headers": {"Asdf": "Qwerty"}, + "user_agent": "foobar", + }, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, + } + + def start_requests(self): + yield Request(url="https://httpbin.org/get", meta={"playwright": True}) + + def parse(self, response): + content = response.css("pre::text").get() + print(content) + return { + "url": response.url, + "context": response.meta["playwright_context"], + } diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 64453697..e1a25f3f 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -41,6 +41,10 @@ logger = logging.getLogger("scrapy-playwright") +DEFAULT_CONTEXT_NAME = "default" +PERSISTENT_CONTEXT_NAME = "persistent" + + class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler): 
def __init__(self, crawler: Crawler) -> None: super().__init__(settings=crawler.settings, crawler=crawler) @@ -80,9 +84,22 @@ def __init__(self, crawler: Crawler) -> None: else: self.process_request_headers = use_scrapy_headers - self.context_kwargs: dict = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS") + # if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS is present we only launch one context self.contexts: Dict[str, BrowserContext] = {} + self.persistent_context: bool = False self.context_semaphores: Dict[str, asyncio.Semaphore] = {} + self.context_kwargs: dict = {} + if crawler.settings.get("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS"): + self.persistent_context = True + ctx_kwargs = crawler.settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS") + self.context_kwargs[PERSISTENT_CONTEXT_NAME] = ctx_kwargs + if crawler.settings.getdict("PLAYWRIGHT_CONTEXTS"): + logger.info( + "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" + " are set, ignoring PLAYWRIGHT_CONTEXTS" + ) + else: + self.context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS") self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"): @@ -99,20 +116,32 @@ def _engine_started(self) -> Deferred: async def _launch_browser(self) -> None: self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() - logger.info("Launching browser") - browser_launcher = getattr(self.playwright, self.browser_type).launch - self.browser = await browser_launcher(**self.launch_options) - logger.info(f"Browser {self.browser_type} launched") - contexts = await asyncio.gather( - *[ - self._create_browser_context(name, kwargs) - for name, kwargs in self.context_kwargs.items() - ] + browser_type = getattr(self.playwright, self.browser_type) + if self.persistent_context: + logger.info("Launching single persistent context") + self.contexts[PERSISTENT_CONTEXT_NAME] = await 
browser_type.launch_persistent_context( + **self.context_kwargs[PERSISTENT_CONTEXT_NAME] + ) + if self.default_navigation_timeout is not None: + self.contexts[PERSISTENT_CONTEXT_NAME].set_default_navigation_timeout( + self.default_navigation_timeout + ) + logger.info("Persistent context launched") + else: + logger.info(f"Launching browser {self.browser_type}") + self.browser = await browser_type.launch(**self.launch_options) + logger.info("Launching startup context(s)") + contexts = await asyncio.gather( + *[ + self._create_browser_context(name, kwargs) + for name, kwargs in self.context_kwargs.items() + ] + ) + self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) + self.context_semaphores.update( + {name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts} ) - self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) - self.context_semaphores = { - name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts - } + logger.info(f"Browser {self.browser_type} launched") async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext: context = await self.browser.new_context(**context_kwargs) @@ -125,7 +154,11 @@ async def _create_browser_context(self, name: str, context_kwargs: dict) -> Brow async def _create_page(self, request: Request) -> Page: """Create a new page in a context, also creating a new context if necessary.""" - context_name = request.meta.setdefault("playwright_context", "default") + if self.persistent_context: + context_name = request.meta["playwright_context"] = PERSISTENT_CONTEXT_NAME + else: + context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) + context = self.contexts.get(context_name) if context is None: context_kwargs = request.meta.get("playwright_context_kwargs") or {} From c5385d0c578bd524670376ea72c39beda7cc2d5c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 14 May 2022 15:47:38 -0300 Subject: [PATCH 
02/10] Launch persistent context (docs) --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index c15e1757..0b3171b7 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,14 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" this will be used by all requests which do not explicitly specify a context. See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context). +* `PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS` (type `Optional[dict]`, default `None`) + + A dictionary of keyword arguments to be passed to + [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context). + Use this setting to launch a **single** persistent context. + This setting takes precedence over `PLAYWRIGHT_CONTEXTS`, + i.e. `PLAYWRIGHT_CONTEXTS` is ignored if both are passed. + * `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`) The timeout used when requesting pages by Playwright. If `None` or unset, @@ -258,6 +266,10 @@ class AwesomeSpiderWithPage(scrapy.Spider): Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts) to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings). +**Note:** this section does not apply if using the +`PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS` setting. In that case, +only one context is used for all operations. 
+ ### Choosing a specific context for a request Pass the name of the desired context in the `playwright_context` meta key: From bc7fe9480605aa2bb76bdeae5ef164da18ddc56c Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 14 May 2022 17:35:07 -0300 Subject: [PATCH 03/10] Reduce duplication --- scrapy_playwright/handler.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index e1a25f3f..a5a69bb7 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -84,7 +84,7 @@ def __init__(self, crawler: Crawler) -> None: else: self.process_request_headers = use_scrapy_headers - # if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS is present we only launch one context + # context-related settings self.contexts: Dict[str, BrowserContext] = {} self.persistent_context: bool = False self.context_semaphores: Dict[str, asyncio.Semaphore] = {} @@ -114,21 +114,22 @@ def _engine_started(self) -> Deferred: return deferred_from_coro(self._launch_browser()) async def _launch_browser(self) -> None: + """Start the browser instance and the configured contexts. Alternatively, + start only one persistent context if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS was set. 
+ """ + logger.info(f"Launching browser {self.browser_type}") self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() browser_type = getattr(self.playwright, self.browser_type) if self.persistent_context: logger.info("Launching single persistent context") - self.contexts[PERSISTENT_CONTEXT_NAME] = await browser_type.launch_persistent_context( + context = await browser_type.launch_persistent_context( **self.context_kwargs[PERSISTENT_CONTEXT_NAME] ) - if self.default_navigation_timeout is not None: - self.contexts[PERSISTENT_CONTEXT_NAME].set_default_navigation_timeout( - self.default_navigation_timeout - ) + self._init_browser_context(PERSISTENT_CONTEXT_NAME, context) + self.contexts[PERSISTENT_CONTEXT_NAME] = context logger.info("Persistent context launched") else: - logger.info(f"Launching browser {self.browser_type}") self.browser = await browser_type.launch(**self.launch_options) logger.info("Launching startup context(s)") contexts = await asyncio.gather( @@ -142,15 +143,19 @@ async def _launch_browser(self) -> None: {name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts} ) logger.info(f"Browser {self.browser_type} launched") + self.stats.set_value("playwright/page_count", self._get_total_page_count()) async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext: context = await self.browser.new_context(**context_kwargs) + self._init_browser_context(name, context) + return context + + def _init_browser_context(self, name: str, context: BrowserContext) -> None: context.on("close", self._make_close_browser_context_callback(name)) logger.debug("Browser context started: '%s'", name) self.stats.inc_value("playwright/context_count") if self.default_navigation_timeout is not None: context.set_default_navigation_timeout(self.default_navigation_timeout) - return context async def _create_page(self, request: Request) -> Page: """Create a new 
page in a context, also creating a new context if necessary.""" @@ -177,6 +182,7 @@ async def _create_page(self, request: Request) -> Page: len(context.pages), self._get_total_page_count(), ) + self._set_max_concurrent_page_count() if self.default_navigation_timeout is not None: page.set_default_navigation_timeout(self.default_navigation_timeout) @@ -190,11 +196,13 @@ async def _create_page(self, request: Request) -> Page: return page def _get_total_page_count(self): - count = sum([len(context.pages) for context in self.contexts.values()]) + return sum([len(context.pages) for context in self.contexts.values()]) + + def _set_max_concurrent_page_count(self): + count = self._get_total_page_count() current_max_count = self.stats.get_value("playwright/page_count/max_concurrent") if current_max_count is None or count > current_max_count: self.stats.set_value("playwright/page_count/max_concurrent", count) - return count @inlineCallbacks def close(self) -> Deferred: @@ -202,6 +210,7 @@ def close(self) -> Deferred: yield deferred_from_coro(self._close()) async def _close(self) -> None: + await asyncio.gather(*[context.close() for context in self.contexts.values()]) self.contexts.clear() self.context_semaphores.clear() if getattr(self, "browser", None): From c06c791d768982e1cba916cd1e86646e4f2f7072 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 15 May 2022 00:20:38 -0300 Subject: [PATCH 04/10] Launch persistent context (tests) --- README.md | 9 +++++++ scrapy_playwright/handler.py | 4 ++-- tests/test_browser_contexts.py | 44 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0b3171b7..82e3fc0a 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,15 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" This setting takes precedence over `PLAYWRIGHT_CONTEXTS`, i.e. `PLAYWRIGHT_CONTEXTS` is ignored if both are passed. 
+ Example: + ```python + PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS = { + "user_data_dir": "/path/to/dir", # mandatory + "java_script_enabled": False, + "headless": False, + } + ``` + * `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`) The timeout used when requesting pages by Playwright. If `None` or unset, diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index a5a69bb7..4d712a4f 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -93,8 +93,8 @@ def __init__(self, crawler: Crawler) -> None: self.persistent_context = True ctx_kwargs = crawler.settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS") self.context_kwargs[PERSISTENT_CONTEXT_NAME] = ctx_kwargs - if crawler.settings.getdict("PLAYWRIGHT_CONTEXTS"): - logger.info( + if crawler.settings.get("PLAYWRIGHT_CONTEXTS"): + logger.warning( "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" " are set, ignoring PLAYWRIGHT_CONTEXTS" ) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 0ff96542..c195f784 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -1,9 +1,15 @@ import asyncio +import logging import platform +import tempfile +from pathlib import Path +from uuid import uuid4 import pytest from scrapy import Spider, Request +from scrapy_playwright.handler import PERSISTENT_CONTEXT_NAME + from tests import make_handler from tests.mockserver import StaticMockServer @@ -97,6 +103,44 @@ async def test_contexts_startup(self): assert cookie["value"] == "bar" assert cookie["domain"] == "example.org" + @pytest.mark.asyncio + async def test_persistent_context(self, caplog): + temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" + settings = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, + "PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS": { + "user_data_dir": temp_dir, + }, + "PLAYWRIGHT_CONTEXTS": { + "foo": {}, + }, + } + assert 
not Path(temp_dir).exists() + async with make_handler(settings) as handler: + assert handler.persistent_context + assert Path(temp_dir).is_dir() + assert PERSISTENT_CONTEXT_NAME in handler.contexts + assert len(handler.contexts) == 1 + assert len(handler.context_semaphores) == 1 + assert getattr(handler, "browser", None) is None + + with StaticMockServer() as server: + meta = { + "playwright": True, + "playwright_context": "will-be-ignored", + } + req = Request(server.urljoin("/index.html"), meta=meta) + resp = await handler._download_request(req, Spider("foo")) + assert resp.meta["playwright_context"] == PERSISTENT_CONTEXT_NAME + + assert ( + "scrapy-playwright", + logging.WARNING, + "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" + " are set, ignoring PLAYWRIGHT_CONTEXTS", + ) in caplog.record_tuples + @pytest.mark.asyncio async def test_contexts_dynamic(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: From 10d101b98778b0e20acb9857b1b15221963376f9 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 15 May 2022 20:38:02 -0300 Subject: [PATCH 05/10] Refactor browser context creation --- scrapy_playwright/handler.py | 118 ++++++++++++++++++--------------- tests/__init__.py | 2 +- tests/test_browser_contexts.py | 4 -- 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 4d712a4f..a62d0896 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -2,12 +2,14 @@ import logging import warnings from contextlib import suppress +from dataclasses import dataclass from ipaddress import ip_address from time import time from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union from playwright.async_api import ( BrowserContext, + BrowserType, Page, PlaywrightContextManager, Request as PlaywrightRequest, @@ -41,36 +43,45 @@ logger = logging.getLogger("scrapy-playwright") 
+DEFAULT_BROWSER_TYPE = "chromium" DEFAULT_CONTEXT_NAME = "default" PERSISTENT_CONTEXT_NAME = "persistent" +@dataclass +class BrowserContextWrapper: + context: BrowserContext + semaphore: asyncio.Semaphore + + class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler): def __init__(self, crawler: Crawler) -> None: - super().__init__(settings=crawler.settings, crawler=crawler) + settings = crawler.settings + super().__init__(settings=settings, crawler=crawler) verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") crawler.signals.connect(self._engine_started, signals.engine_started) self.stats = crawler.stats - self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium" - self.max_pages_per_context: int = crawler.settings.getint( + self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE + self.max_pages_per_context: int = settings.getint( "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" - ) or crawler.settings.getint("CONCURRENT_REQUESTS") - self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {} + ) or settings.getint("CONCURRENT_REQUESTS") + self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {} self.default_navigation_timeout: Optional[float] = None - if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings: + if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings: with suppress(TypeError, ValueError): self.default_navigation_timeout = float( - crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") + settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") ) - if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings: - if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: + # header-related settings + if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings: + if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: self.process_request_headers = None # use headers from the Playwright request else: 
self.process_request_headers = load_object( - crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] + settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] ) if self.process_request_headers is use_playwright_headers: warnings.warn( @@ -85,25 +96,24 @@ def __init__(self, crawler: Crawler) -> None: self.process_request_headers = use_scrapy_headers # context-related settings - self.contexts: Dict[str, BrowserContext] = {} + self.contexts: Dict[str, BrowserContextWrapper] = {} self.persistent_context: bool = False - self.context_semaphores: Dict[str, asyncio.Semaphore] = {} self.context_kwargs: dict = {} - if crawler.settings.get("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS"): + if settings.get("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS"): self.persistent_context = True - ctx_kwargs = crawler.settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS") + ctx_kwargs = settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS") self.context_kwargs[PERSISTENT_CONTEXT_NAME] = ctx_kwargs - if crawler.settings.get("PLAYWRIGHT_CONTEXTS"): + if settings.get("PLAYWRIGHT_CONTEXTS"): logger.warning( "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" " are set, ignoring PLAYWRIGHT_CONTEXTS" ) else: - self.context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS") + self.context_kwargs = settings.getdict("PLAYWRIGHT_CONTEXTS") self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None - if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"): - self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"]) + if settings.get("PLAYWRIGHT_ABORT_REQUEST"): + self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"]) @classmethod def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler: @@ -111,51 +121,54 @@ def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHa def _engine_started(self) -> Deferred: """Launch the browser. 
Use the engine_started signal as it supports returning deferreds.""" - return deferred_from_coro(self._launch_browser()) + return deferred_from_coro(self._launch()) - async def _launch_browser(self) -> None: - """Start the browser instance and the configured contexts. Alternatively, - start only one persistent context if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS was set. + async def _launch(self) -> None: + """Start a browser instance and configured contexts. + Start only one persistent context if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS was set. """ - logger.info(f"Launching browser {self.browser_type}") + logger.info(f"Launching browser {self.browser_type_name}") self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() - browser_type = getattr(self.playwright, self.browser_type) + self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name) if self.persistent_context: logger.info("Launching single persistent context") - context = await browser_type.launch_persistent_context( - **self.context_kwargs[PERSISTENT_CONTEXT_NAME] + self.contexts[PERSISTENT_CONTEXT_NAME] = await self._create_browser_context( + name=PERSISTENT_CONTEXT_NAME, + context_kwargs=self.context_kwargs[PERSISTENT_CONTEXT_NAME], + persistent=True, ) - self._init_browser_context(PERSISTENT_CONTEXT_NAME, context) - self.contexts[PERSISTENT_CONTEXT_NAME] = context logger.info("Persistent context launched") else: - self.browser = await browser_type.launch(**self.launch_options) + self.browser = await self.browser_type.launch(**self.launch_options) logger.info("Launching startup context(s)") contexts = await asyncio.gather( *[ - self._create_browser_context(name, kwargs) + self._create_browser_context(name=name, context_kwargs=kwargs) for name, kwargs in self.context_kwargs.items() ] ) self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) - self.context_semaphores.update( - {name: 
asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts} - ) - logger.info(f"Browser {self.browser_type} launched") + logger.info("Startup context(s) launched") + logger.info(f"Browser {self.browser_type_name} launched") self.stats.set_value("playwright/page_count", self._get_total_page_count()) - async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext: - context = await self.browser.new_context(**context_kwargs) - self._init_browser_context(name, context) - return context - - def _init_browser_context(self, name: str, context: BrowserContext) -> None: + async def _create_browser_context( + self, name: str, context_kwargs: dict, persistent: bool = False + ) -> BrowserContextWrapper: + if persistent: + context = await self.browser_type.launch_persistent_context(**context_kwargs) + else: + context = await self.browser.new_context(**context_kwargs) context.on("close", self._make_close_browser_context_callback(name)) logger.debug("Browser context started: '%s'", name) self.stats.inc_value("playwright/context_count") if self.default_navigation_timeout is not None: context.set_default_navigation_timeout(self.default_navigation_timeout) + return BrowserContextWrapper( + context=context, + semaphore=asyncio.Semaphore(value=self.max_pages_per_context), + ) async def _create_page(self, request: Request) -> Page: """Create a new page in a context, also creating a new context if necessary.""" @@ -167,19 +180,17 @@ async def _create_page(self, request: Request) -> Page: context = self.contexts.get(context_name) if context is None: context_kwargs = request.meta.get("playwright_context_kwargs") or {} - context = await self._create_browser_context(context_name, context_kwargs) - self.contexts[context_name] = context - self.context_semaphores[context_name] = asyncio.Semaphore( - value=self.max_pages_per_context + context = self.contexts[context_name] = await self._create_browser_context( + name=context_name, 
context_kwargs=context_kwargs, persistent=False ) - await self.context_semaphores[context_name].acquire() - page = await context.new_page() + await context.semaphore.acquire() + page = await context.context.new_page() self.stats.inc_value("playwright/page_count") logger.debug( "[Context=%s] New page created, page count is %i (%i for all contexts)", context_name, - len(context.pages), + len(context.context.pages), self._get_total_page_count(), ) self._set_max_concurrent_page_count() @@ -196,7 +207,7 @@ async def _create_page(self, request: Request) -> Page: return page def _get_total_page_count(self): - return sum([len(context.pages) for context in self.contexts.values()]) + return sum([len(ctx.context.pages) for ctx in self.contexts.values()]) def _set_max_concurrent_page_count(self): count = self._get_total_page_count() @@ -210,9 +221,8 @@ def close(self) -> Deferred: yield deferred_from_coro(self._close()) async def _close(self) -> None: - await asyncio.gather(*[context.close() for context in self.contexts.values()]) + await asyncio.gather(*[ctx.context.close() for ctx in self.contexts.values()]) self.contexts.clear() - self.context_semaphores.clear() if getattr(self, "browser", None): logger.info("Closing browser") await self.browser.close() @@ -344,8 +354,8 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None: def _make_close_page_callback(self, context_name: str) -> Callable: def close_page_callback() -> None: - if context_name in self.context_semaphores: - self.context_semaphores[context_name].release() + if context_name in self.contexts: + self.contexts[context_name].semaphore.release() return close_page_callback @@ -354,8 +364,6 @@ def close_browser_context_callback() -> None: logger.debug("Browser context closed: '%s'", name) if name in self.contexts: self.contexts.pop(name) - if name in self.context_semaphores: - self.context_semaphores.pop(name) return close_browser_context_callback @@ -376,7 +384,7 @@ async def 
_request_handler(route: Route, playwright_request: PlaywrightRequest) if self.process_request_headers is not None: overrides["headers"] = await _await_if_necessary( self.process_request_headers( - self.browser_type, playwright_request, scrapy_headers + self.browser_type_name, playwright_request, scrapy_headers ) ) # the request that reaches the callback should contain the final headers diff --git a/tests/__init__.py b/tests/__init__.py index 5f2a34bb..f5385194 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -11,7 +11,7 @@ async def make_handler(settings_dict: dict): crawler = get_crawler(settings_dict=settings_dict) handler = ScrapyPlaywrightDownloadHandler(crawler=crawler) try: - await handler._launch_browser() + await handler._launch() except: # noqa (E722), pylint: disable=bare-except pass else: diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index c195f784..8fbde126 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -83,7 +83,6 @@ async def test_contexts_startup(self): } async with make_handler(settings) as handler: assert len(handler.contexts) == 1 - assert len(handler.context_semaphores) == 1 with StaticMockServer() as server: meta = { @@ -122,7 +121,6 @@ async def test_persistent_context(self, caplog): assert Path(temp_dir).is_dir() assert PERSISTENT_CONTEXT_NAME in handler.contexts assert len(handler.contexts) == 1 - assert len(handler.context_semaphores) == 1 assert getattr(handler, "browser", None) is None with StaticMockServer() as server: @@ -145,7 +143,6 @@ async def test_persistent_context(self, caplog): async def test_contexts_dynamic(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: assert len(handler.contexts) == 0 - assert len(handler.context_semaphores) == 0 with StaticMockServer() as server: meta = { @@ -168,7 +165,6 @@ async def test_contexts_dynamic(self): resp = await handler._download_request(req, Spider("foo")) assert 
len(handler.contexts) == 1 - assert len(handler.context_semaphores) == 1 page = resp.meta["playwright_page"] storage_state = await page.context.storage_state() From 2ff494f2030a4857b5ec73b522bae74be6bd5da6 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 15 May 2022 21:20:02 -0300 Subject: [PATCH 06/10] Allow persistent and non-persistent contexts at the same time --- README.md | 41 ++++++------- examples/persistent_context.py | 17 ++++-- scrapy_playwright/handler.py | 103 +++++++++++++++------------------ tests/test_browser_contexts.py | 48 ++++++--------- 4 files changed, 96 insertions(+), 113 deletions(-) diff --git a/README.md b/README.md index 82e3fc0a..49aa51c7 100644 --- a/README.md +++ b/README.md @@ -78,35 +78,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" It should be a mapping of (name, keyword arguments). For instance: ```python { - "first": { + "foobar": { "context_arg1": "value", "context_arg2": "value", }, - "second": { + "default": { + "context_arg1": "value", + "context_arg2": "value", + }, + "persistent": { + "user_data_dir": "/path/to/dir", # will be a persistent context + "context_arg1": "value", }, } ``` - A default context (called `default`) is created if no contexts are defined, - this will be used by all requests which do not explicitly specify a context. - See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context). -* `PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS` (type `Optional[dict]`, default `None`) + See the section on [Multiple browser contexts](#multiple-browser-contexts) + for more information. - A dictionary of keyword arguments to be passed to - [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context). - Use this setting to launch a **single** persistent context. - This setting takes precedence over `PLAYWRIGHT_CONTEXTS`, - i.e. 
`PLAYWRIGHT_CONTEXTS` is ignored if both are passed. - - Example: - ```python - PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS = { - "user_data_dir": "/path/to/dir", # mandatory - "java_script_enabled": False, - "headless": False, - } - ``` + See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context). * `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`) @@ -290,6 +280,17 @@ yield scrapy.Request( ) ``` +### Default context + +If a request does not explicitly indicate a context via the `playwright_context` +meta key, it falls back to using a general context called `default`. This `default` +context can also be customized on startup via the `PLAYWRIGHT_CONTEXTS` setting. + +### Persistent contexts + +Pass a value for the `user_data_dir` keyword argument to launch a context as +**persistent** (see [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context)). + ### Creating a context during a crawl If the context specified in the `playwright_context` meta key does not exist, it will be created. 
diff --git a/examples/persistent_context.py b/examples/persistent_context.py index 906389ef..f4d1cddb 100644 --- a/examples/persistent_context.py +++ b/examples/persistent_context.py @@ -14,17 +14,22 @@ class PersistentContextSpider(Spider): # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", }, "" - "PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS": { - "user_data_dir": str(Path.home() / "playwright-persistent-context"), - "java_script_enabled": False, - "extra_http_headers": {"Asdf": "Qwerty"}, - "user_agent": "foobar", + "PLAYWRIGHT_CONTEXTS": { + "foobar": { + "user_data_dir": str(Path.home() / "playwright-persistent-context"), + "java_script_enabled": False, + "extra_http_headers": {"Asdf": "Qwerty"}, + "user_agent": "foobar", + } }, "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, } def start_requests(self): - yield Request(url="https://httpbin.org/get", meta={"playwright": True}) + yield Request( + url="https://httpbin.org/get", + meta={"playwright": True, "playwright_context": "foobar"}, + ) def parse(self, response): content = response.css("pre::text").get() diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index a62d0896..dda76f9b 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -8,6 +8,7 @@ from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union from playwright.async_api import ( + Browser, BrowserContext, BrowserType, Page, @@ -45,13 +46,14 @@ DEFAULT_BROWSER_TYPE = "chromium" DEFAULT_CONTEXT_NAME = "default" -PERSISTENT_CONTEXT_NAME = "persistent" +PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir" @dataclass class BrowserContextWrapper: context: BrowserContext semaphore: asyncio.Semaphore + persistent: bool class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler): @@ -62,6 +64,9 @@ def __init__(self, crawler: Crawler) -> None: crawler.signals.connect(self._engine_started, signals.engine_started) self.stats = crawler.stats + self.browser_launch_lock = 
asyncio.Lock() + self.context_launch_lock = asyncio.Lock() + self.browser: Optional[Browser] = None self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE self.max_pages_per_context: int = settings.getint( "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" @@ -97,19 +102,7 @@ def __init__(self, crawler: Crawler) -> None: # context-related settings self.contexts: Dict[str, BrowserContextWrapper] = {} - self.persistent_context: bool = False - self.context_kwargs: dict = {} - if settings.get("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS"): - self.persistent_context = True - ctx_kwargs = settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS") - self.context_kwargs[PERSISTENT_CONTEXT_NAME] = ctx_kwargs - if settings.get("PLAYWRIGHT_CONTEXTS"): - logger.warning( - "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" - " are set, ignoring PLAYWRIGHT_CONTEXTS" - ) - else: - self.context_kwargs = settings.getdict("PLAYWRIGHT_CONTEXTS") + self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS") self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None if settings.get("PLAYWRIGHT_ABORT_REQUEST"): @@ -124,65 +117,63 @@ def _engine_started(self) -> Deferred: return deferred_from_coro(self._launch()) async def _launch(self) -> None: - """Start a browser instance and configured contexts. - Start only one persistent context if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS was set. 
- """ - logger.info(f"Launching browser {self.browser_type_name}") + """Launch Playwright manager and configured startup context(s).""" self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name) - if self.persistent_context: - logger.info("Launching single persistent context") - self.contexts[PERSISTENT_CONTEXT_NAME] = await self._create_browser_context( - name=PERSISTENT_CONTEXT_NAME, - context_kwargs=self.context_kwargs[PERSISTENT_CONTEXT_NAME], - persistent=True, - ) - logger.info("Persistent context launched") - else: - self.browser = await self.browser_type.launch(**self.launch_options) - logger.info("Launching startup context(s)") - contexts = await asyncio.gather( - *[ - self._create_browser_context(name=name, context_kwargs=kwargs) - for name, kwargs in self.context_kwargs.items() - ] - ) - self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) - logger.info("Startup context(s) launched") - logger.info(f"Browser {self.browser_type_name} launched") + logger.info("Launching startup context(s)") + contexts = await asyncio.gather( + *[ + self._create_browser_context(name=name, context_kwargs=kwargs) + for name, kwargs in self.context_kwargs.items() + ] + ) + self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) + logger.info("Startup context(s) launched") self.stats.set_value("playwright/page_count", self._get_total_page_count()) + async def _maybe_launch_browser(self) -> None: + async with self.browser_launch_lock: + if self.browser is None: + logger.info(f"Launching browser {self.browser_type.name}") + self.browser = await self.browser_type.launch(**self.launch_options) + logger.info(f"Browser {self.browser_type.name} launched") + async def _create_browser_context( - self, name: str, context_kwargs: dict, persistent: bool = False + self, name: str, context_kwargs: Optional[dict] ) -> 
BrowserContextWrapper: - if persistent: + """Create a new context, also launching a browser if necessary.""" + context_kwargs = context_kwargs or {} + if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY): context = await self.browser_type.launch_persistent_context(**context_kwargs) + persistent = True else: + await self._maybe_launch_browser() + assert self.browser # nosec[assert_used] (typing) context = await self.browser.new_context(**context_kwargs) - context.on("close", self._make_close_browser_context_callback(name)) - logger.debug("Browser context started: '%s'", name) + persistent = False + context.on("close", self._make_close_browser_context_callback(name, persistent)) + logger.debug(f"Browser context started: '{name}' (persistent={persistent})") self.stats.inc_value("playwright/context_count") if self.default_navigation_timeout is not None: context.set_default_navigation_timeout(self.default_navigation_timeout) return BrowserContextWrapper( context=context, semaphore=asyncio.Semaphore(value=self.max_pages_per_context), + persistent=persistent, ) async def _create_page(self, request: Request) -> Page: """Create a new page in a context, also creating a new context if necessary.""" - if self.persistent_context: - context_name = request.meta["playwright_context"] = PERSISTENT_CONTEXT_NAME - else: - context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) - - context = self.contexts.get(context_name) - if context is None: - context_kwargs = request.meta.get("playwright_context_kwargs") or {} - context = self.contexts[context_name] = await self._create_browser_context( - name=context_name, context_kwargs=context_kwargs, persistent=False - ) + context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) + # this block needs to be locked because several attempts to launch a context + # with the same name could happen at the same time from different requests + async with self.context_launch_lock: + context = 
self.contexts.get(context_name) + if context is None: + context = self.contexts[context_name] = await self._create_browser_context( + name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs") + ) await context.semaphore.acquire() page = await context.context.new_page() @@ -223,7 +214,7 @@ def close(self) -> Deferred: async def _close(self) -> None: await asyncio.gather(*[ctx.context.close() for ctx in self.contexts.values()]) self.contexts.clear() - if getattr(self, "browser", None): + if self.browser is not None: logger.info("Closing browser") await self.browser.close() await self.playwright_context_manager.__aexit__() @@ -359,9 +350,9 @@ def close_page_callback() -> None: return close_page_callback - def _make_close_browser_context_callback(self, name: str) -> Callable: + def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable: def close_browser_context_callback() -> None: - logger.debug("Browser context closed: '%s'", name) + logger.debug(f"Browser context closed: '{name}' (persistent={persistent})") if name in self.contexts: self.contexts.pop(name) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index 8fbde126..dc9880b0 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -1,5 +1,4 @@ import asyncio -import logging import platform import tempfile from pathlib import Path @@ -8,8 +7,6 @@ import pytest from scrapy import Spider, Request -from scrapy_playwright.handler import PERSISTENT_CONTEXT_NAME - from tests import make_handler from tests.mockserver import StaticMockServer @@ -103,41 +100,30 @@ async def test_contexts_startup(self): assert cookie["domain"] == "example.org" @pytest.mark.asyncio - async def test_persistent_context(self, caplog): - temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" + async def test_persistent_contexts(self): + temp_dir_foo = f"{tempfile.gettempdir()}/{uuid4()}" + temp_dir_bar = f"{tempfile.gettempdir()}/{uuid4()}" 
settings = { "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, - "PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS": { - "user_data_dir": temp_dir, - }, "PLAYWRIGHT_CONTEXTS": { - "foo": {}, + "foo": { + "user_data_dir": temp_dir_foo, + }, + "bar": { + "user_data_dir": temp_dir_bar, + }, }, } - assert not Path(temp_dir).exists() + assert not Path(temp_dir_foo).exists() + assert not Path(temp_dir_bar).exists() async with make_handler(settings) as handler: - assert handler.persistent_context - assert Path(temp_dir).is_dir() - assert PERSISTENT_CONTEXT_NAME in handler.contexts - assert len(handler.contexts) == 1 - assert getattr(handler, "browser", None) is None - - with StaticMockServer() as server: - meta = { - "playwright": True, - "playwright_context": "will-be-ignored", - } - req = Request(server.urljoin("/index.html"), meta=meta) - resp = await handler._download_request(req, Spider("foo")) - assert resp.meta["playwright_context"] == PERSISTENT_CONTEXT_NAME - - assert ( - "scrapy-playwright", - logging.WARNING, - "Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS" - " are set, ignoring PLAYWRIGHT_CONTEXTS", - ) in caplog.record_tuples + assert Path(temp_dir_foo).is_dir() + assert Path(temp_dir_bar).is_dir() + assert len(handler.contexts) == 2 + assert handler.contexts["foo"].persistent + assert handler.contexts["bar"].persistent + assert handler.browser is None @pytest.mark.asyncio async def test_contexts_dynamic(self): From 1fc7701872dea5fcf7028a483b3681287d00d41d Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 15 May 2022 21:32:31 -0300 Subject: [PATCH 07/10] Stats: persistent context count --- scrapy_playwright/handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index dda76f9b..a5e49de4 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -147,11 +147,13 @@ async def _create_browser_context( if 
context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY): context = await self.browser_type.launch_persistent_context(**context_kwargs) persistent = True + self.stats.inc_value("playwright/context_count/persistent") else: await self._maybe_launch_browser() assert self.browser # nosec[assert_used] (typing) context = await self.browser.new_context(**context_kwargs) persistent = False + self.stats.inc_value("playwright/context_count/non-persistent") context.on("close", self._make_close_browser_context_callback(name, persistent)) logger.debug(f"Browser context started: '{name}' (persistent={persistent})") self.stats.inc_value("playwright/context_count") From 76777d16f9018417ebd77cbfdd3fc3fe6bc73ff1 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 16 May 2022 01:35:45 -0300 Subject: [PATCH 08/10] Tidy up --- examples/persistent_context.py | 1 - scrapy_playwright/handler.py | 33 +++++++++++++++++---------------- tests/test_browser_contexts.py | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/persistent_context.py b/examples/persistent_context.py index f4d1cddb..b107b008 100644 --- a/examples/persistent_context.py +++ b/examples/persistent_context.py @@ -13,7 +13,6 @@ class PersistentContextSpider(Spider): "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", }, - "" "PLAYWRIGHT_CONTEXTS": { "foobar": { "user_data_dir": str(Path.home() / "playwright-persistent-context"), diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index a5e49de4..201fde89 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -66,7 +66,6 @@ def __init__(self, crawler: Crawler) -> None: self.browser_launch_lock = asyncio.Lock() self.context_launch_lock = asyncio.Lock() - self.browser: Optional[Browser] = None self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE self.max_pages_per_context: int = 
settings.getint( "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT" @@ -83,7 +82,7 @@ def __init__(self, crawler: Crawler) -> None: # header-related settings if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings: if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: - self.process_request_headers = None # use headers from the Playwright request + self.process_request_headers = None else: self.process_request_headers = load_object( settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] @@ -118,25 +117,27 @@ def _engine_started(self) -> Deferred: async def _launch(self) -> None: """Launch Playwright manager and configured startup context(s).""" + logger.info("Starting download handler") self.playwright_context_manager = PlaywrightContextManager() self.playwright = await self.playwright_context_manager.start() self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name) - logger.info("Launching startup context(s)") - contexts = await asyncio.gather( - *[ - self._create_browser_context(name=name, context_kwargs=kwargs) - for name, kwargs in self.context_kwargs.items() - ] - ) - self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) - logger.info("Startup context(s) launched") - self.stats.set_value("playwright/page_count", self._get_total_page_count()) + if self.context_kwargs: + logger.info(f"Launching {len(self.context_kwargs)} startup context(s)") + contexts = await asyncio.gather( + *[ + self._create_browser_context(name=name, context_kwargs=kwargs) + for name, kwargs in self.context_kwargs.items() + ] + ) + self.contexts = dict(zip(self.context_kwargs.keys(), contexts)) + logger.info("Startup context(s) launched") + self.stats.set_value("playwright/page_count", self._get_total_page_count()) async def _maybe_launch_browser(self) -> None: async with self.browser_launch_lock: - if self.browser is None: + if not hasattr(self, "browser"): logger.info(f"Launching browser {self.browser_type.name}") - self.browser = await 
self.browser_type.launch(**self.launch_options) + self.browser: Browser = await self.browser_type.launch(**self.launch_options) logger.info(f"Browser {self.browser_type.name} launched") async def _create_browser_context( @@ -150,7 +151,6 @@ async def _create_browser_context( self.stats.inc_value("playwright/context_count/persistent") else: await self._maybe_launch_browser() - assert self.browser # nosec[assert_used] (typing) context = await self.browser.new_context(**context_kwargs) persistent = False self.stats.inc_value("playwright/context_count/non-persistent") @@ -210,13 +210,14 @@ def _set_max_concurrent_page_count(self): @inlineCallbacks def close(self) -> Deferred: + logger.info("Closing download handler") yield super().close() yield deferred_from_coro(self._close()) async def _close(self) -> None: await asyncio.gather(*[ctx.context.close() for ctx in self.contexts.values()]) self.contexts.clear() - if self.browser is not None: + if hasattr(self, "browser"): logger.info("Closing browser") await self.browser.close() await self.playwright_context_manager.__aexit__() diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index dc9880b0..e6c02eca 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -123,7 +123,7 @@ async def test_persistent_contexts(self): assert len(handler.contexts) == 2 assert handler.contexts["foo"].persistent assert handler.contexts["bar"].persistent - assert handler.browser is None + assert not hasattr(handler, "browser") @pytest.mark.asyncio async def test_contexts_dynamic(self): From a6f58e09a12c46bbec4cd5072a4d90acdfeec4d4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 16 May 2022 01:40:12 -0300 Subject: [PATCH 09/10] Remove outdated section in readme --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 49aa51c7..7a0659db 100644 --- a/README.md +++ b/README.md @@ -265,10 +265,6 @@ class AwesomeSpiderWithPage(scrapy.Spider): 
Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts) to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings). -**Note:** this section doesn not apply if using the -`PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS` setting. In that case, -only one context is used for all operations. - ### Choosing a specific context for a request Pass the name of the desired context in the `playwright_context` meta key: From e119c628576665a1ccb924643fafa51fdcb1ff22 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 17 May 2022 17:52:34 -0300 Subject: [PATCH 10/10] Adjust persistent context tests --- tests/test_browser_contexts.py | 68 ++++++++++++++++++++++++------- tests/test_playwright_requests.py | 22 ---------- 2 files changed, 54 insertions(+), 36 deletions(-) diff --git a/tests/test_browser_contexts.py b/tests/test_browser_contexts.py index e6c02eca..ee34e29a 100644 --- a/tests/test_browser_contexts.py +++ b/tests/test_browser_contexts.py @@ -5,7 +5,9 @@ from uuid import uuid4 import pytest +from playwright.async_api import Browser, TimeoutError as PlaywrightTimeoutError from scrapy import Spider, Request +from scrapy_playwright.page import PageMethod from tests import make_handler from tests.mockserver import StaticMockServer @@ -25,6 +27,28 @@ async def test_contexts_max_pages_setting(self): async with make_handler(settings) as handler: assert handler.max_pages_per_context == 9876 + @pytest.mark.asyncio + async def test_context_kwargs(self): + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": { + "default": {"java_script_enabled": False}, + }, + } + async with make_handler(settings_dict) as handler: + with StaticMockServer() as server: + req = Request( + url=server.urljoin("/scroll.html"), + meta={ + "playwright": True, + "playwright_page_methods": [ + PageMethod("wait_for_selector", selector="div.quote", timeout=1000), + ], + }, + ) + with 
pytest.raises(PlaywrightTimeoutError): + await handler._download_request(req, Spider("foo")) + @pytest.mark.asyncio async def test_contexts_max_pages(self): settings = { @@ -100,30 +124,46 @@ async def test_contexts_startup(self): assert cookie["domain"] == "example.org" @pytest.mark.asyncio - async def test_persistent_contexts(self): - temp_dir_foo = f"{tempfile.gettempdir()}/{uuid4()}" - temp_dir_bar = f"{tempfile.gettempdir()}/{uuid4()}" + async def test_persistent_context(self): + temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" + settings = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, + "PLAYWRIGHT_CONTEXTS": { + "persistent": { + "user_data_dir": temp_dir, + }, + }, + } + assert not Path(temp_dir).exists() + async with make_handler(settings) as handler: + assert Path(temp_dir).is_dir() + assert len(handler.contexts) == 1 + assert handler.contexts["persistent"].persistent + assert not hasattr(handler, "browser") + + @pytest.mark.asyncio + async def test_mixed_persistent_contexts(self): + temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" settings = { "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, "PLAYWRIGHT_CONTEXTS": { - "foo": { - "user_data_dir": temp_dir_foo, + "persistent": { + "user_data_dir": temp_dir, }, - "bar": { - "user_data_dir": temp_dir_bar, + "non-persistent": { + "java_script_enabled": False, }, }, } - assert not Path(temp_dir_foo).exists() - assert not Path(temp_dir_bar).exists() + assert not Path(temp_dir).exists() async with make_handler(settings) as handler: - assert Path(temp_dir_foo).is_dir() - assert Path(temp_dir_bar).is_dir() + assert Path(temp_dir).is_dir() assert len(handler.contexts) == 2 - assert handler.contexts["foo"].persistent - assert handler.contexts["bar"].persistent - assert not hasattr(handler, "browser") + assert handler.contexts["persistent"].persistent + assert not handler.contexts["non-persistent"].persistent + assert 
isinstance(handler.browser, Browser) @pytest.mark.asyncio async def test_contexts_dynamic(self): diff --git a/tests/test_playwright_requests.py b/tests/test_playwright_requests.py index d207c14d..ed3fcd87 100644 --- a/tests/test_playwright_requests.py +++ b/tests/test_playwright_requests.py @@ -208,28 +208,6 @@ async def test_route_continue_exception(self, logger): with pytest.raises(ZeroDivisionError): await req_handler(route, playwright_request) - @pytest.mark.asyncio - async def test_context_kwargs(self): - settings_dict = { - "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, - "PLAYWRIGHT_CONTEXTS": { - "default": {"java_script_enabled": False}, - }, - } - async with make_handler(settings_dict) as handler: - with StaticMockServer() as server: - req = Request( - url=server.urljoin("/scroll.html"), - meta={ - "playwright": True, - "playwright_page_methods": [ - PageMethod("wait_for_selector", selector="div.quote", timeout=1000), - ], - }, - ) - with pytest.raises(PlaywrightTimeoutError): - await handler._download_request(req, Spider("foo")) - @pytest.mark.asyncio async def test_page_method_screenshot(self): async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: