Skip to content
28 changes: 23 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,18 +78,25 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
It should be a mapping of (name, keyword arguments). For instance:
```python
{
"first": {
"foobar": {
"context_arg1": "value",
"context_arg2": "value",
},
"second": {
"default": {
"context_arg1": "value",
"context_arg2": "value",
},
"persistent": {
"user_data_dir": "/path/to/dir", # will be a persistent context
"context_arg1": "value",
},
}
```
A default context (called `default`) is created if no contexts are defined;
it is used by all requests that do not explicitly specify a context.
See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).

See the section on [Multiple browser contexts](#multiple-browser-contexts)
for more information.

See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).

* `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`)

Expand Down Expand Up @@ -269,6 +276,17 @@ yield scrapy.Request(
)
```

### Default context

If a request does not explicitly indicate a context via the `playwright_context`
meta key, it falls back to using a general context called `default`. This `default`
context can also be customized on startup via the `PLAYWRIGHT_CONTEXTS` setting.

### Persistent contexts

Pass a value for the `user_data_dir` keyword argument to launch a context as
**persistent** (see [`BrowserType.launch_persistent_context`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context)).

### Creating a context during a crawl

If the context specified in the `playwright_context` meta key does not exist, it will be created.
Expand Down
39 changes: 39 additions & 0 deletions examples/persistent_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from pathlib import Path

from scrapy import Spider, Request


class PersistentContextSpider(Spider):
    """Example spider whose request runs in a persistent browser context.

    The ``foobar`` context is declared in ``PLAYWRIGHT_CONTEXTS`` with a
    ``user_data_dir`` entry, and the only start request selects that context
    through the ``playwright_context`` meta key.
    """

    name = "persistent_context"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "PLAYWRIGHT_CONTEXTS": {
            "foobar": {
                # profile directory kept under the current user's home directory
                "user_data_dir": str(Path.home().joinpath("playwright-persistent-context")),
                "java_script_enabled": False,
                "extra_http_headers": {"Asdf": "Qwerty"},
                "user_agent": "foobar",
            }
        },
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
    }

    def start_requests(self):
        """Yield one Playwright-enabled request bound to the ``foobar`` context."""
        request_meta = {"playwright": True, "playwright_context": "foobar"}
        yield Request(url="https://httpbin.org/get", meta=request_meta)

    def parse(self, response):
        """Print the text of the ``<pre>`` element and return the URL and context name."""
        pre_text = response.css("pre::text").get()
        print(pre_text)
        item = {"url": response.url}
        item["context"] = response.meta["playwright_context"]
        return item
166 changes: 105 additions & 61 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
import logging
import warnings
from contextlib import suppress
from dataclasses import dataclass
from ipaddress import ip_address
from time import time
from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union

from playwright.async_api import (
Browser,
BrowserContext,
BrowserType,
Page,
PlaywrightContextManager,
Request as PlaywrightRequest,
Expand Down Expand Up @@ -41,32 +44,48 @@
logger = logging.getLogger("scrapy-playwright")


DEFAULT_BROWSER_TYPE = "chromium"
DEFAULT_CONTEXT_NAME = "default"
PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir"


@dataclass
class BrowserContextWrapper:
    """Bundle a Playwright browser context with the bookkeeping the handler keeps for it."""

    # the wrapped Playwright context; pages are created from it
    context: BrowserContext
    # acquired before a page is created in this context and released by the
    # page-close callback; created with the handler's max_pages_per_context value
    semaphore: asyncio.Semaphore
    # True when the context was started via launch_persistent_context
    persistent: bool


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
settings = crawler.settings
super().__init__(settings=settings, crawler=crawler)
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
crawler.signals.connect(self._engine_started, signals.engine_started)
self.stats = crawler.stats

self.browser_type: str = crawler.settings.get("PLAYWRIGHT_BROWSER_TYPE") or "chromium"
self.max_pages_per_context: int = crawler.settings.getint(
self.browser_launch_lock = asyncio.Lock()
self.context_launch_lock = asyncio.Lock()
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
self.max_pages_per_context: int = settings.getint(
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
) or crawler.settings.getint("CONCURRENT_REQUESTS")
self.launch_options: dict = crawler.settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
) or settings.getint("CONCURRENT_REQUESTS")
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}

self.default_navigation_timeout: Optional[float] = None
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
with suppress(TypeError, ValueError):
self.default_navigation_timeout = float(
crawler.settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)

if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
self.process_request_headers = None # use headers from the Playwright request
# header-related settings
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
self.process_request_headers = None
else:
self.process_request_headers = load_object(
crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
)
if self.process_request_headers is use_playwright_headers:
warnings.warn(
Expand All @@ -80,70 +99,94 @@ def __init__(self, crawler: Crawler) -> None:
else:
self.process_request_headers = use_scrapy_headers

self.context_kwargs: dict = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")
self.contexts: Dict[str, BrowserContext] = {}
self.context_semaphores: Dict[str, asyncio.Semaphore] = {}
# context-related settings
self.contexts: Dict[str, BrowserContextWrapper] = {}
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")

self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])

@classmethod
def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
    """Build a handler instance by passing ``crawler`` straight to the constructor."""
    return cls(crawler)

def _engine_started(self) -> Deferred:
"""Launch the browser. Use the engine_started signal as it supports returning deferreds."""
return deferred_from_coro(self._launch_browser())
return deferred_from_coro(self._launch())

async def _launch_browser(self) -> None:
async def _launch(self) -> None:
"""Launch Playwright manager and configured startup context(s)."""
logger.info("Starting download handler")
self.playwright_context_manager = PlaywrightContextManager()
self.playwright = await self.playwright_context_manager.start()
logger.info("Launching browser")
browser_launcher = getattr(self.playwright, self.browser_type).launch
self.browser = await browser_launcher(**self.launch_options)
logger.info(f"Browser {self.browser_type} launched")
contexts = await asyncio.gather(
*[
self._create_browser_context(name, kwargs)
for name, kwargs in self.context_kwargs.items()
]
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
self.context_semaphores = {
name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts
}

async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext:
context = await self.browser.new_context(**context_kwargs)
context.on("close", self._make_close_browser_context_callback(name))
logger.debug("Browser context started: '%s'", name)
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
if self.context_kwargs:
logger.info(f"Launching {len(self.context_kwargs)} startup context(s)")
contexts = await asyncio.gather(
*[
self._create_browser_context(name=name, context_kwargs=kwargs)
for name, kwargs in self.context_kwargs.items()
]
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
logger.info("Startup context(s) launched")
self.stats.set_value("playwright/page_count", self._get_total_page_count())

async def _maybe_launch_browser(self) -> None:
    """Launch the browser once; later calls return without doing anything.

    The check and the launch both happen while holding ``browser_launch_lock``,
    so concurrent callers cannot start more than one browser.
    """
    async with self.browser_launch_lock:
        # a previous caller already stored the browser on the handler
        if hasattr(self, "browser"):
            return
        logger.info(f"Launching browser {self.browser_type.name}")
        self.browser: Browser = await self.browser_type.launch(**self.launch_options)
        logger.info(f"Browser {self.browser_type.name} launched")

async def _create_browser_context(
self, name: str, context_kwargs: Optional[dict]
) -> BrowserContextWrapper:
"""Create a new context, also launching a browser if necessary."""
context_kwargs = context_kwargs or {}
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
context = await self.browser_type.launch_persistent_context(**context_kwargs)
persistent = True
self.stats.inc_value("playwright/context_count/persistent")
else:
await self._maybe_launch_browser()
context = await self.browser.new_context(**context_kwargs)
persistent = False
self.stats.inc_value("playwright/context_count/non-persistent")
context.on("close", self._make_close_browser_context_callback(name, persistent))
logger.debug(f"Browser context started: '{name}' (persistent={persistent})")
self.stats.inc_value("playwright/context_count")
if self.default_navigation_timeout is not None:
context.set_default_navigation_timeout(self.default_navigation_timeout)
return context
return BrowserContextWrapper(
context=context,
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
persistent=persistent,
)

async def _create_page(self, request: Request) -> Page:
"""Create a new page in a context, also creating a new context if necessary."""
context_name = request.meta.setdefault("playwright_context", "default")
context = self.contexts.get(context_name)
if context is None:
context_kwargs = request.meta.get("playwright_context_kwargs") or {}
context = await self._create_browser_context(context_name, context_kwargs)
self.contexts[context_name] = context
self.context_semaphores[context_name] = asyncio.Semaphore(
value=self.max_pages_per_context
)
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
# this block needs to be locked because several attempts to launch a context
# with the same name could happen at the same time from different requests
async with self.context_launch_lock:
context = self.contexts.get(context_name)
if context is None:
context = self.contexts[context_name] = await self._create_browser_context(
name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
)

await self.context_semaphores[context_name].acquire()
page = await context.new_page()
await context.semaphore.acquire()
page = await context.context.new_page()
self.stats.inc_value("playwright/page_count")
logger.debug(
"[Context=%s] New page created, page count is %i (%i for all contexts)",
context_name,
len(context.pages),
len(context.context.pages),
self._get_total_page_count(),
)
self._set_max_concurrent_page_count()
if self.default_navigation_timeout is not None:
page.set_default_navigation_timeout(self.default_navigation_timeout)

Expand All @@ -157,21 +200,24 @@ async def _create_page(self, request: Request) -> Page:
return page

def _get_total_page_count(self):
count = sum([len(context.pages) for context in self.contexts.values()])
return sum([len(ctx.context.pages) for ctx in self.contexts.values()])

def _set_max_concurrent_page_count(self):
count = self._get_total_page_count()
current_max_count = self.stats.get_value("playwright/page_count/max_concurrent")
if current_max_count is None or count > current_max_count:
self.stats.set_value("playwright/page_count/max_concurrent", count)
return count

@inlineCallbacks
def close(self) -> Deferred:
    """Shut the handler down: close the parent handler first, then Playwright resources."""
    logger.info("Closing download handler")
    # wait for the parent download handler to finish closing
    yield super().close()
    # run the async cleanup coroutine and wait for it as a Deferred
    yield deferred_from_coro(self._close())

async def _close(self) -> None:
await asyncio.gather(*[ctx.context.close() for ctx in self.contexts.values()])
self.contexts.clear()
self.context_semaphores.clear()
if getattr(self, "browser", None):
if hasattr(self, "browser"):
logger.info("Closing browser")
await self.browser.close()
await self.playwright_context_manager.__aexit__()
Expand Down Expand Up @@ -302,18 +348,16 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:

def _make_close_page_callback(self, context_name: str) -> Callable:
def close_page_callback() -> None:
if context_name in self.context_semaphores:
self.context_semaphores[context_name].release()
if context_name in self.contexts:
self.contexts[context_name].semaphore.release()

return close_page_callback

def _make_close_browser_context_callback(self, name: str) -> Callable:
def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable:
def close_browser_context_callback() -> None:
logger.debug("Browser context closed: '%s'", name)
logger.debug(f"Browser context closed: '{name}' (persistent={persistent})")
if name in self.contexts:
self.contexts.pop(name)
if name in self.context_semaphores:
self.context_semaphores.pop(name)

return close_browser_context_callback

Expand All @@ -334,7 +378,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
if self.process_request_headers is not None:
overrides["headers"] = await _maybe_await(
self.process_request_headers(
self.browser_type, playwright_request, scrapy_headers
self.browser_type_name, playwright_request, scrapy_headers
)
)
# the request that reaches the callback should contain the final headers
Expand Down
2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ async def make_handler(settings_dict: dict):
crawler = get_crawler(settings_dict=settings_dict)
handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
try:
await handler._launch_browser()
await handler._launch()
except: # noqa (E722), pylint: disable=bare-except
pass
else:
Expand Down
Loading