29 changes: 23 additions & 6 deletions README.md
@@ -60,9 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

## Settings

`scrapy-playwright` accepts the following settings:
## Supported settings

* `PLAYWRIGHT_BROWSER_TYPE` (type `str`, default `chromium`)
The browser type to be launched, e.g. `chromium`, `firefox`, `webkit`.
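For example, a minimal sketch launching Firefox instead of the default (the value is one of the types listed above):

```python
# settings.py
PLAYWRIGHT_BROWSER_TYPE = "firefox"
```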
@@ -93,11 +91,16 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
}
```

See the section on [Multiple browser contexts](#multiple-browser-contexts)
for more information.
See the section on [browser contexts](#browser-contexts) for more information.

See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).

* `PLAYWRIGHT_MAX_CONTEXTS` (type `Optional[int]`, default `None`)

Maximum number of concurrent Playwright contexts to allow. If unset or `None`,
no limit is enforced. See the [Maximum concurrent context count](#maximum-concurrent-context-count)
section for more information.
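For example, to cap the crawl at four simultaneous contexts (an illustrative value):

```python
# settings.py
PLAYWRIGHT_MAX_CONTEXTS = 4
```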

* `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`)

The timeout used when requesting pages with Playwright. If `None` or unset,
@@ -260,7 +263,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):
Scrapy request workflow (Scheduler, Middlewares, etc).


## Multiple browser contexts
## Browser contexts

Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts)
to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings).
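A minimal sketch (context names and kwargs are illustrative; each dict is passed to Playwright's `Browser.new_context`):

```python
# settings.py
PLAYWRIGHT_CONTEXTS = {
    "first": {"java_script_enabled": False},
    "second": {"ignore_https_errors": True},
}
```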
@@ -327,6 +330,7 @@ def parse(self, response):
yield scrapy.Request(
url="https://example.org",
callback=self.parse_in_new_context,
errback=self.close_context_on_error,
meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
)

@@ -336,8 +340,21 @@ async def parse_in_new_context(self, response):
await page.context.close() # close the context
await page.close()
return {"title": title}

async def close_context_on_error(self, failure):
page = failure.request.meta["playwright_page"]
await page.context.close()
```

### Maximum concurrent context count

Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the number
of concurrent contexts. Use this setting with caution: it is possible to block
the whole crawl if contexts are not closed after they are no longer
needed (refer to the section above on closing contexts dynamically). Make sure to
define an errback so the context can still be closed even if a request fails.
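Putting the pieces together, a minimal sketch of this pattern, assuming scrapy-playwright is already enabled in the project settings (the spider name, URLs, and per-request context naming are illustrative):

```python
import scrapy


class LimitedContextsSpider(scrapy.Spider):
    name = "limited_contexts"
    custom_settings = {"PLAYWRIGHT_MAX_CONTEXTS": 4}  # illustrative cap

    def start_requests(self):
        for i in range(10):
            yield scrapy.Request(
                url=f"https://example.org?page={i}",
                errback=self.close_context_on_error,
                meta={
                    "playwright": True,
                    "playwright_context": f"context-{i}",  # a separate context per request
                    "playwright_include_page": True,
                },
                dont_filter=True,
            )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        await page.context.close()  # close the context to free a slot
        await page.close()
        return {"url": response.url, "title": title}

    async def close_context_on_error(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.context.close()
```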


## Proxy support

76 changes: 38 additions & 38 deletions examples/contexts.py
@@ -1,18 +1,25 @@
from pathlib import Path

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess


class MultipleContextsSpider(Spider):
"""Handle multiple browser contexts"""

name = "contexts"
custom_settings = {
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"PLAYWRIGHT_MAX_CONTEXTS": 6,
"PLAYWRIGHT_CONTEXTS": {
"first": {
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "first",
},
@@ -23,47 +30,43 @@ class MultipleContextsSpider(Spider):
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "second",
},
],
},
},
"persistent": {
"user_data_dir": str(Path.home() / "playwright-persistent-context"),
"java_script_enabled": False,
},
},
}

def start_requests(self):
# using existing contexts
yield Request(
url="https://httpbin.org/headers",
meta={
"playwright": True,
"playwright_context": "first",
"playwright_include_page": True,
},
dont_filter=True,
)
yield Request(
url="https://httpbin.org/headers",
meta={
"playwright": True,
"playwright_context": "second",
"playwright_include_page": True,
},
dont_filter=True,
)
for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"].keys():
yield Request(
url="https://example.org",
meta={
"playwright": True,
"playwright_context": ctx_name,
"playwright_include_page": True,
},
dont_filter=True,
)
# create a new context
yield Request(
url="https://httpbin.org/headers",
url="https://example.org",
meta={
"playwright": True,
"playwright_context": "third",
"playwright_context_kwargs": {
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "third",
},
@@ -76,10 +79,21 @@ def start_requests(self):
)
# default context
yield Request(
url="https://httpbin.org/headers",
url="https://example.org",
meta={"playwright": True, "playwright_include_page": True},
dont_filter=True,
)
# each request on a different context
for i in range(20):
yield Request(
url=f"https://example.org?foo={i}",
meta={
"playwright": True,
"playwright_context": f"context-{i}",
"playwright_include_page": True,
},
dont_filter=True,
)

async def parse(self, response):
page = response.meta["playwright_page"]
@@ -91,17 +105,3 @@ async def parse(self, response):
"context": context_name,
"cookies": storage_state["cookies"],
}


if __name__ == "__main__":
process = CrawlerProcess(
settings={
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
}
)
process.crawl(MultipleContextsSpider)
process.start()
39 changes: 0 additions & 39 deletions examples/persistent_context.py

This file was deleted.

43 changes: 29 additions & 14 deletions scrapy_playwright/handler.py
@@ -64,13 +64,22 @@ def __init__(self, crawler: Crawler) -> None:
crawler.signals.connect(self._engine_started, signals.engine_started)
self.stats = crawler.stats

self.browser_launch_lock = asyncio.Lock()
self.context_launch_lock = asyncio.Lock()
# browser
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
self.browser_launch_lock = asyncio.Lock()
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}

# contexts
self.max_pages_per_context: int = settings.getint(
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
) or settings.getint("CONCURRENT_REQUESTS")
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
self.context_launch_lock = asyncio.Lock()
self.contexts: Dict[str, BrowserContextWrapper] = {}
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
self.context_semaphore = asyncio.Semaphore(
value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
)

self.default_navigation_timeout: Optional[float] = None
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
@@ -79,7 +88,7 @@ def __init__(self, crawler: Crawler) -> None:
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)

# header-related settings
# headers
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
self.process_request_headers = None
@@ -99,10 +108,6 @@ def __init__(self, crawler: Crawler) -> None:
else:
self.process_request_headers = use_scrapy_headers

# context-related settings
self.contexts: Dict[str, BrowserContextWrapper] = {}
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")

self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
@@ -123,13 +128,13 @@ async def _launch(self) -> None:
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
if self.context_kwargs:
logger.info(f"Launching {len(self.context_kwargs)} startup context(s)")
contexts = await asyncio.gather(
await asyncio.gather(
*[
self._create_browser_context(name=name, context_kwargs=kwargs)
for name, kwargs in self.context_kwargs.items()
]
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
self._set_max_concurrent_context_count()
logger.info("Startup context(s) launched")
self.stats.set_value("playwright/page_count", self._get_total_page_count())

@@ -144,6 +149,8 @@ async def _create_browser_context(
self, name: str, context_kwargs: Optional[dict]
) -> BrowserContextWrapper:
"""Create a new context, also launching a browser if necessary."""
if hasattr(self, "context_semaphore"):
await self.context_semaphore.acquire()
context_kwargs = context_kwargs or {}
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
context = await self.browser_type.launch_persistent_context(**context_kwargs)
@@ -159,11 +166,13 @@ async def _create_browser_context(
self.stats.inc_value("playwright/context_count")
if self.default_navigation_timeout is not None:
context.set_default_navigation_timeout(self.default_navigation_timeout)
return BrowserContextWrapper(
self.contexts[name] = BrowserContextWrapper(
context=context,
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
persistent=persistent,
)
self._set_max_concurrent_context_count()
return self.contexts[name]

async def _create_page(self, request: Request) -> Page:
"""Create a new page in a context, also creating a new context if necessary."""
@@ -173,7 +182,7 @@ async def _create_page(self, request: Request) -> Page:
async with self.context_launch_lock:
context = self.contexts.get(context_name)
if context is None:
context = self.contexts[context_name] = await self._create_browser_context(
context = await self._create_browser_context(
name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
)

@@ -208,6 +217,11 @@ def _set_max_concurrent_page_count(self):
if current_max_count is None or count > current_max_count:
self.stats.set_value("playwright/page_count/max_concurrent", count)

def _set_max_concurrent_context_count(self):
current_max_count = self.stats.get_value("playwright/context_count/max_concurrent")
if current_max_count is None or len(self.contexts) > current_max_count:
self.stats.set_value("playwright/context_count/max_concurrent", len(self.contexts))

@inlineCallbacks
def close(self) -> Deferred:
logger.info("Closing download handler")
@@ -355,9 +369,10 @@ def close_page_callback() -> None:

def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable:
def close_browser_context_callback() -> None:
self.contexts.pop(name, None)
if hasattr(self, "context_semaphore"):
self.context_semaphore.release()
logger.debug(f"Browser context closed: '{name}' (persistent={persistent})")
if name in self.contexts:
self.contexts.pop(name)

return close_browser_context_callback
