29 changes: 23 additions & 6 deletions README.md
@@ -60,9 +60,7 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

## Settings

`scrapy-playwright` accepts the following settings:
## Supported settings

* `PLAYWRIGHT_BROWSER_TYPE` (type `str`, default `chromium`)
The browser type to be launched, e.g. `chromium`, `firefox`, `webkit`.
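For example, a minimal sketch launching Firefox instead of the default (the value is one of the types listed above):

```python
# settings.py
PLAYWRIGHT_BROWSER_TYPE = "firefox"
```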
@@ -93,11 +91,16 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
}
```

See the section on [Multiple browser contexts](#multiple-browser-contexts)
for more information.
See the section on [browser contexts](#browser-contexts) for more information.

See also the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browser-new-context).

* `PLAYWRIGHT_MAX_CONTEXTS` (type `Optional[int]`, default `None`)

Maximum number of concurrent Playwright contexts to allow. If unset or `None`,
no limit is enforced. See the [Maximum concurrent context count](#maximum-concurrent-context-count)
section for more information.
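For example, to cap the crawl at four simultaneous contexts (an illustrative value):

```python
# settings.py
PLAYWRIGHT_MAX_CONTEXTS = 4
```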

* `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[float]`, default `None`)

The timeout used when requesting pages with Playwright. If `None` or unset,
@@ -260,7 +263,7 @@ class AwesomeSpiderWithPage(scrapy.Spider):
Scrapy request workflow (Scheduler, Middlewares, etc).


## Multiple browser contexts
## Browser contexts

Multiple [browser contexts](https://playwright.dev/python/docs/browser-contexts)
to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings).
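A minimal sketch (context names and kwargs are illustrative; each dict is passed to Playwright's `Browser.new_context`):

```python
# settings.py
PLAYWRIGHT_CONTEXTS = {
    "first": {"java_script_enabled": False},
    "second": {"ignore_https_errors": True},
}
```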
@@ -327,6 +330,7 @@ def parse(self, response):
yield scrapy.Request(
url="https://example.org",
callback=self.parse_in_new_context,
errback=self.close_context_on_error,
meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
)

@@ -336,8 +340,21 @@ async def parse_in_new_context(self, response):
await page.context.close() # close the context
await page.close()
return {"title": title}

async def close_context_on_error(self, failure):
page = failure.request.meta["playwright_page"]
await page.context.close()
```

### Maximum concurrent context count

Specify a value for the `PLAYWRIGHT_MAX_CONTEXTS` setting to limit the number
of concurrent contexts. Use this setting with caution: it is possible to block
the whole crawl if contexts are not closed after they are no longer
needed (refer to the section above on closing contexts dynamically). Make sure to
define an errback so the context can still be closed even if a request fails.
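Putting the pieces together, a minimal sketch of this pattern, assuming scrapy-playwright is already enabled in the project settings (the spider name, URLs, and per-request context naming are illustrative):

```python
import scrapy


class LimitedContextsSpider(scrapy.Spider):
    name = "limited_contexts"
    custom_settings = {"PLAYWRIGHT_MAX_CONTEXTS": 4}  # illustrative cap

    def start_requests(self):
        for i in range(10):
            yield scrapy.Request(
                url=f"https://example.org?page={i}",
                errback=self.close_context_on_error,
                meta={
                    "playwright": True,
                    "playwright_context": f"context-{i}",  # a separate context per request
                    "playwright_include_page": True,
                },
                dont_filter=True,
            )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        await page.context.close()  # close the context to free a slot
        await page.close()
        return {"url": response.url, "title": title}

    async def close_context_on_error(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.context.close()
```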


## Proxy support

76 changes: 38 additions & 38 deletions examples/contexts.py
@@ -1,18 +1,25 @@
from pathlib import Path

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess


class MultipleContextsSpider(Spider):
"""Handle multiple browser contexts"""

name = "contexts"
custom_settings = {
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"PLAYWRIGHT_MAX_CONTEXTS": 6,
"PLAYWRIGHT_CONTEXTS": {
"first": {
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "first",
},
@@ -23,47 +30,43 @@ class MultipleContextsSpider(Spider):
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "second",
},
],
},
},
"persistent": {
"user_data_dir": str(Path.home() / "playwright-persistent-context"),
"java_script_enabled": False,
},
},
}

def start_requests(self):
# using existing contexts
yield Request(
url="https://httpbin.org/headers",
meta={
"playwright": True,
"playwright_context": "first",
"playwright_include_page": True,
},
dont_filter=True,
)
yield Request(
url="https://httpbin.org/headers",
meta={
"playwright": True,
"playwright_context": "second",
"playwright_include_page": True,
},
dont_filter=True,
)
for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"].keys():
yield Request(
url="https://example.org",
meta={
"playwright": True,
"playwright_context": ctx_name,
"playwright_include_page": True,
},
dont_filter=True,
)
# create a new context
yield Request(
url="https://httpbin.org/headers",
url="https://example.org",
meta={
"playwright": True,
"playwright_context": "third",
"playwright_context_kwargs": {
"storage_state": {
"cookies": [
{
"url": "https://httpbin.org/headers",
"url": "https://example.org",
"name": "context",
"value": "third",
},
@@ -76,10 +79,21 @@ def start_requests(self):
)
# default context
yield Request(
url="https://httpbin.org/headers",
url="https://example.org",
meta={"playwright": True, "playwright_include_page": True},
dont_filter=True,
)
# each request on a different context
for i in range(20):
yield Request(
url=f"https://example.org?foo={i}",
meta={
"playwright": True,
"playwright_context": f"context-{i}",
"playwright_include_page": True,
},
dont_filter=True,
)

async def parse(self, response):
page = response.meta["playwright_page"]
@@ -91,17 +105,3 @@ async def parse(self, response):
"context": context_name,
"cookies": storage_state["cookies"],
}


if __name__ == "__main__":
process = CrawlerProcess(
settings={
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
# "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
}
)
process.crawl(MultipleContextsSpider)
process.start()
39 changes: 0 additions & 39 deletions examples/persistent_context.py

This file was deleted.

43 changes: 29 additions & 14 deletions scrapy_playwright/handler.py
@@ -64,13 +64,22 @@ def __init__(self, crawler: Crawler) -> None:
crawler.signals.connect(self._engine_started, signals.engine_started)
self.stats = crawler.stats

self.browser_launch_lock = asyncio.Lock()
self.context_launch_lock = asyncio.Lock()
# browser
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
self.browser_launch_lock = asyncio.Lock()
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}

# contexts
self.max_pages_per_context: int = settings.getint(
"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"
) or settings.getint("CONCURRENT_REQUESTS")
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
self.context_launch_lock = asyncio.Lock()
self.contexts: Dict[str, BrowserContextWrapper] = {}
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")
if settings.getint("PLAYWRIGHT_MAX_CONTEXTS"):
self.context_semaphore = asyncio.Semaphore(
value=settings.getint("PLAYWRIGHT_MAX_CONTEXTS")
)

self.default_navigation_timeout: Optional[float] = None
if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in settings:
@@ -79,7 +88,7 @@ def __init__(self, crawler: Crawler) -> None:
settings.get("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT")
)

# header-related settings
# headers
if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in settings:
if settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
self.process_request_headers = None
@@ -99,10 +108,6 @@ def __init__(self, crawler: Crawler) -> None:
else:
self.process_request_headers = use_scrapy_headers

# context-related settings
self.contexts: Dict[str, BrowserContextWrapper] = {}
self.context_kwargs: dict = settings.getdict("PLAYWRIGHT_CONTEXTS")

self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(settings["PLAYWRIGHT_ABORT_REQUEST"])
@@ -123,13 +128,13 @@ async def _launch(self) -> None:
self.browser_type: BrowserType = getattr(self.playwright, self.browser_type_name)
if self.context_kwargs:
logger.info(f"Launching {len(self.context_kwargs)} startup context(s)")
contexts = await asyncio.gather(
await asyncio.gather(
*[
self._create_browser_context(name=name, context_kwargs=kwargs)
for name, kwargs in self.context_kwargs.items()
]
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
self._set_max_concurrent_context_count()
logger.info("Startup context(s) launched")
self.stats.set_value("playwright/page_count", self._get_total_page_count())

@@ -144,6 +149,8 @@ async def _create_browser_context(
self, name: str, context_kwargs: Optional[dict]
) -> BrowserContextWrapper:
"""Create a new context, also launching a browser if necessary."""
if hasattr(self, "context_semaphore"):
await self.context_semaphore.acquire()
context_kwargs = context_kwargs or {}
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
context = await self.browser_type.launch_persistent_context(**context_kwargs)
@@ -159,11 +166,13 @@ async def _create_browser_context(
self.stats.inc_value("playwright/context_count")
if self.default_navigation_timeout is not None:
context.set_default_navigation_timeout(self.default_navigation_timeout)
return BrowserContextWrapper(
self.contexts[name] = BrowserContextWrapper(
context=context,
semaphore=asyncio.Semaphore(value=self.max_pages_per_context),
persistent=persistent,
)
self._set_max_concurrent_context_count()
return self.contexts[name]

async def _create_page(self, request: Request) -> Page:
"""Create a new page in a context, also creating a new context if necessary."""
@@ -173,7 +182,7 @@ async def _create_page(self, request: Request) -> Page:
async with self.context_launch_lock:
context = self.contexts.get(context_name)
if context is None:
context = self.contexts[context_name] = await self._create_browser_context(
context = await self._create_browser_context(
name=context_name, context_kwargs=request.meta.get("playwright_context_kwargs")
)

@@ -208,6 +217,11 @@ def _set_max_concurrent_page_count(self):
if current_max_count is None or count > current_max_count:
self.stats.set_value("playwright/page_count/max_concurrent", count)

def _set_max_concurrent_context_count(self):
current_max_count = self.stats.get_value("playwright/context_count/max_concurrent")
if current_max_count is None or len(self.contexts) > current_max_count:
self.stats.set_value("playwright/context_count/max_concurrent", len(self.contexts))

@inlineCallbacks
def close(self) -> Deferred:
logger.info("Closing download handler")
@@ -355,9 +369,10 @@ def close_page_callback() -> None:

def _make_close_browser_context_callback(self, name: str, persistent: bool) -> Callable:
def close_browser_context_callback() -> None:
self.contexts.pop(name, None)
if hasattr(self, "context_semaphore"):
self.context_semaphore.release()
logger.debug(f"Browser context closed: '{name}' (persistent={persistent})")
if name in self.contexts:
self.contexts.pop(name)

return close_browser_context_callback
