Skip to content

Commit

Permalink
Allow disabling the AutoThrottle extension for a given slot (#6246)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Mar 12, 2024
1 parent 642af40 commit d7581c6
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 17 deletions.
12 changes: 12 additions & 0 deletions docs/topics/autothrottle.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ effect, but there are some important differences:

AutoThrottle doesn't have these issues.

Disabling throttling on a downloader slot
=========================================

It is possible to disable AutoThrottle for a specific download slot at run time
by setting its ``throttle`` attribute to ``False``, e.g. using
:setting:`DOWNLOAD_SLOTS`.

Note, however, that AutoThrottle still determines the starting delay of every
slot by setting the ``download_delay`` attribute on the running spider. Because
AutoThrottle never adjusts the delay of a slot whose ``throttle`` attribute is
``False``, such a slot keeps that starting delay unless you set a custom value
for its ``delay`` attribute, e.g. using :setting:`DOWNLOAD_SLOTS`.

Throttling algorithm
====================

Expand Down
12 changes: 10 additions & 2 deletions docs/topics/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,7 @@ The default HTTPS handler uses HTTP/1.1. To use HTTP/2:
.. setting:: DOWNLOAD_SLOTS

DOWNLOAD_SLOTS
----------------
--------------

Default: ``{}``

Expand All @@ -844,7 +844,12 @@ Allows to define concurrency/delay parameters on per slot (domain) basis:
.. code-block:: python
DOWNLOAD_SLOTS = {
"quotes.toscrape.com": {"concurrency": 1, "delay": 2, "randomize_delay": False},
"quotes.toscrape.com": {
"concurrency": 1,
"delay": 2,
"randomize_delay": False,
"throttle": False,
},
"books.toscrape.com": {"delay": 3, "randomize_delay": False},
}
Expand All @@ -856,6 +861,9 @@ Allows to define concurrency/delay parameters on per slot (domain) basis:
- :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`: ``concurrency``
- :setting:`RANDOMIZE_DOWNLOAD_DELAY`: ``randomize_delay``

There is no global setting for ``throttle``, whose default value is
``None``. Only an explicit ``False`` disables AutoThrottle for the slot.


.. setting:: DOWNLOAD_TIMEOUT

Expand Down
19 changes: 15 additions & 4 deletions scrapy/core/downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from collections import deque
from datetime import datetime
from time import time
from typing import TYPE_CHECKING, Any, Deque, Dict, Set, Tuple, cast
from typing import TYPE_CHECKING, Any, Deque, Dict, Optional, Set, Tuple, cast

from twisted.internet import task
from twisted.internet.defer import Deferred
Expand All @@ -24,10 +24,18 @@
class Slot:
"""Downloader slot"""

def __init__(self, concurrency: int, delay: float, randomize_delay: bool):
def __init__(
self,
concurrency: int,
delay: float,
randomize_delay: bool,
*,
throttle: Optional[bool] = None,
):
self.concurrency: int = concurrency
self.delay: float = delay
self.randomize_delay: bool = randomize_delay
self.throttle = throttle

self.active: Set[Request] = set()
self.queue: Deque[Tuple[Request, Deferred]] = deque()
Expand All @@ -52,13 +60,15 @@ def __repr__(self) -> str:
return (
f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})"
f"randomize_delay={self.randomize_delay!r}, "
f"throttle={self.throttle!r})"
)

def __str__(self) -> str:
return (
f"<downloader.Slot concurrency={self.concurrency!r} "
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
f"throttle={self.throttle!r} "
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
f"len(transferring)={len(self.transferring)} "
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
Expand Down Expand Up @@ -127,7 +137,8 @@ def _get_slot(self, request: Request, spider: Spider) -> Tuple[str, Slot]:
slot_settings.get("delay", delay),
)
randomize_delay = slot_settings.get("randomize_delay", self.randomize_delay)
new_slot = Slot(conc, delay, randomize_delay)
throttle = slot_settings.get("throttle", None)
new_slot = Slot(conc, delay, randomize_delay, throttle=throttle)
self.slots[key] = new_slot

return key, self.slots[key]
Expand Down
2 changes: 1 addition & 1 deletion scrapy/extensions/throttle.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _response_downloaded(
) -> None:
key, slot = self._get_slot(request, spider)
latency = request.meta.get("download_latency")
if latency is None or slot is None:
if latency is None or slot is None or slot.throttle is False:
return

olddelay = slot.delay
Expand Down
3 changes: 2 additions & 1 deletion tests/test_core_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ class SlotTest(unittest.TestCase):
def test_repr(self):
slot = Slot(concurrency=8, delay=0.1, randomize_delay=True)
self.assertEqual(
repr(slot), "Slot(concurrency=8, delay=0.10, randomize_delay=True)"
repr(slot),
"Slot(concurrency=8, delay=0.10, randomize_delay=True, throttle=None)",
)
29 changes: 28 additions & 1 deletion tests/test_downloaderslotssettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy import Request
from scrapy.core.downloader import Downloader, Slot
from scrapy.crawler import CrawlerRunner
from scrapy.http import Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MetaSpider

Expand All @@ -20,6 +22,7 @@ class DownloaderSlotsSettingsTestSpider(MetaSpider):
"concurrency": 1,
"delay": 2,
"randomize_delay": False,
"throttle": False,
},
"books.toscrape.com": {"delay": 3, "randomize_delay": False},
},
Expand Down Expand Up @@ -70,3 +73,27 @@ def test_delay(self):
}

self.assertTrue(max(list(error_delta.values())) < tolerance)


def test_params():
    """Check that every ``DOWNLOAD_SLOTS`` parameter reaches the created slot.

    A slot built by ``Downloader._get_slot`` for a domain configured in
    ``DOWNLOAD_SLOTS`` is compared, attribute by attribute, with a ``Slot``
    constructed directly from the same parameters.
    """
    params = {
        "concurrency": 1,
        "delay": 2,
        "randomize_delay": False,
        "throttle": False,
    }
    crawler = get_crawler(
        settings_dict={"DOWNLOAD_SLOTS": {"example.com": params}},
    )
    downloader = Downloader(crawler)
    downloader._slot_gc_loop.stop()  # Prevent an unclean reactor.

    # Only the slot matters here; the slot key is discarded.
    _, actual = downloader._get_slot(Request("https://example.com"), spider=None)
    expected = Slot(**params)

    for param in params:
        assert getattr(expected, param) == getattr(
            actual, param
        ), f"Slot.{param}: {getattr(expected, param)!r} != {getattr(actual, param)!r}"
19 changes: 11 additions & 8 deletions tests/test_extension_throttle.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,16 +157,17 @@ class _TestSpider(Spider):


@pytest.mark.parametrize(
("meta", "slot"),
("meta", "slot", "throttle"),
(
({}, None),
({"download_latency": 1.0}, None),
({"download_slot": "foo"}, None),
({"download_slot": "foo"}, "foo"),
({"download_latency": 1.0, "download_slot": "foo"}, None),
({}, None, None),
({"download_latency": 1.0}, None, None),
({"download_slot": "foo"}, None, None),
({"download_slot": "foo"}, "foo", None),
({"download_latency": 1.0, "download_slot": "foo"}, None, None),
({"download_latency": 1.0, "download_slot": "foo"}, "foo", False),
),
)
def test_skipped(meta, slot):
def test_skipped(meta, slot, throttle):
crawler = get_crawler()
at = build_from_crawler(AutoThrottle, crawler)
spider = TestSpider()
Expand All @@ -177,7 +178,9 @@ def test_skipped(meta, slot):
crawler.engine.downloader = Mock()
crawler.engine.downloader.slots = {}
if slot is not None:
crawler.engine.downloader.slots[slot] = object()
_slot = Mock()
_slot.throttle = throttle
crawler.engine.downloader.slots[slot] = _slot
at._adjust_delay = None # Raise exception if called.

at._response_downloaded(None, request, spider)
Expand Down

0 comments on commit d7581c6

Please sign in to comment.