Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,38 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

A predicate function (or the path to a function) that receives a
`playwright.async_api.Request` object and must return `True` if the
request should be aborted, `False` otherwise.
request should be aborted, `False` otherwise. Coroutine functions
(`async def`) are supported.

For instance, the following prevents the download of images:
For instance, the following are all equivalent, and prevent the download of images:
```python
PLAYWRIGHT_ABORT_REQUEST = lambda req: req.resource_type == "image"
```

Note that all requests will appear in the DEBUG level logs, however there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.
```python
def should_abort_request(req):
    """Return True if the request is for an image and should be aborted."""
    return req.resource_type == "image"

PLAYWRIGHT_ABORT_REQUEST = should_abort_request
```

```python
# project/utils.py
def should_abort_request(req):
    """Return True if the request is for an image and should be aborted."""
    return req.resource_type == "image"

# settings.py
PLAYWRIGHT_ABORT_REQUEST = "project.utils.should_abort_request"

```

Please note:

* All requests will appear in the DEBUG level logs; however, there will
be no corresponding response log lines for aborted requests. Aborted requests
are counted in the `playwright/request_count/aborted` job stats item.
* Passing callable objects is only supported when using Scrapy>=2.4. With prior
versions, only strings containing object paths are supported.


## Basic usage
Expand Down
22 changes: 13 additions & 9 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import logging
import warnings
from contextlib import suppress
from inspect import isawaitable
from ipaddress import ip_address
from time import time
from typing import Callable, Dict, Generator, Optional, Tuple, Type, TypeVar
from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union

from playwright.async_api import (
BrowserContext,
Expand Down Expand Up @@ -93,7 +92,7 @@ def __init__(self, crawler: Crawler) -> None:
self.contexts: Dict[str, BrowserContext] = {}
self.context_semaphores: Dict[str, asyncio.Semaphore] = {}

self.abort_request: Optional[Callable[[PlaywrightRequest], bool]] = None
self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])

Expand Down Expand Up @@ -290,8 +289,9 @@ async def _apply_page_methods(self, page: Page, request: Request) -> None:
except AttributeError:
logger.warning(f"Ignoring {repr(pm)}: could not find method")
else:
result = method(*pm.args, **pm.kwargs)
pm.result = await result if isawaitable(result) else result
pm.result = method(*pm.args, **pm.kwargs)
if isinstance(pm.result, Awaitable):
pm.result = await pm.result
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(f"Ignoring {repr(pm)}: expected PageMethod, got {repr(type(pm))}")
Expand Down Expand Up @@ -332,10 +332,14 @@ def _make_request_handler(
) -> Callable:
async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
"""Override request headers, method and body."""
if self.abort_request and self.abort_request(playwright_request):
await route.abort()
self.stats.inc_value("playwright/request_count/aborted")
return None
if self.abort_request:
should_abort = self.abort_request(playwright_request)
if isinstance(should_abort, Awaitable):
should_abort = await should_abort
if should_abort:
await route.abort()
self.stats.inc_value("playwright/request_count/aborted")
return None

processed_headers = await self.process_request_headers(
self.browser_type, playwright_request, scrapy_headers
Expand Down
49 changes: 30 additions & 19 deletions tests/test_playwright_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,25 +433,36 @@ async def test_response_attributes(self):

@pytest.mark.asyncio
async def test_abort_requests(self):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_ABORT_REQUEST": lambda req: req.resource_type == "image",
}
async with make_handler(settings_dict) as handler:
with StaticMockServer() as server:
req = Request(
url=server.urljoin("/gallery.html"),
meta={"playwright": True},
)
await handler._download_request(req, Spider("foo"))

req_prefix = "playwright/request_count"
resp_prefix = "playwright/response_count"
assert handler.stats.get_value(f"{req_prefix}/resource_type/document") == 1
assert handler.stats.get_value(f"{req_prefix}/resource_type/image") == 3
assert handler.stats.get_value(f"{resp_prefix}/resource_type/document") == 1
assert handler.stats.get_value(f"{resp_prefix}/resource_type/image") is None
assert handler.stats.get_value(f"{req_prefix}/aborted") == 3
async def should_abort_request_async(request):
return request.resource_type == "image"

def should_abort_request_sync(request):
return request.resource_type == "image"

for predicate in (
lambda request: request.resource_type == "image",
should_abort_request_async,
should_abort_request_sync,
):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_ABORT_REQUEST": predicate,
}
async with make_handler(settings_dict) as handler:
with StaticMockServer() as server:
req = Request(
url=server.urljoin("/gallery.html"),
meta={"playwright": True},
)
await handler._download_request(req, Spider("foo"))

req_prefix = "playwright/request_count"
resp_prefix = "playwright/response_count"
assert handler.stats.get_value(f"{req_prefix}/resource_type/document") == 1
assert handler.stats.get_value(f"{req_prefix}/resource_type/image") == 3
assert handler.stats.get_value(f"{resp_prefix}/resource_type/document") == 1
assert handler.stats.get_value(f"{resp_prefix}/resource_type/image") is None
assert handler.stats.get_value(f"{req_prefix}/aborted") == 3


class TestCaseChromium(MixinTestCase):
Expand Down