Stop response download from signal handler #4559

Merged
merged 5 commits on May 27, 2020
Changes from 2 commits
13 changes: 13 additions & 0 deletions docs/faq.rst
@@ -371,6 +371,19 @@ Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switching to a
different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.


.. _faq-stop-response-download:

How can I cancel the download of a given response?
--------------------------------------------------

In some situations, it might be useful to stop the download of a certain response.
For instance, you may only need the first part of a large response and want to save
resources by avoiding the download of the whole body.
In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
the :ref:`topics-stop-response-download` topic for additional information and examples.
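
As a quick, minimal sketch (the spider name and URL below are made up; see the linked
topic for the full example), such a handler only needs to raise the exception::

    import scrapy
    from scrapy.exceptions import StopDownload


    class HeadOnlySpider(scrapy.Spider):
        # Illustrative spider: stops every download after the first received chunk
        name = "head_only"
        start_urls = ["https://example.com/"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # fail=False routes the partial response to the regular callback
            raise StopDownload(fail=False)

        def parse(self, response):
            yield {"partial_length": len(response.body)}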


.. _has been reported: https://github.com/scrapy/scrapy/issues/2905
.. _user agents: https://en.wikipedia.org/wiki/User_agent
.. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
46 changes: 39 additions & 7 deletions docs/topics/exceptions.rst
@@ -14,13 +14,6 @@ Built-in Exceptions reference

Here's a list of all exceptions included in Scrapy and their usage.

DropItem
--------

.. exception:: DropItem

The exception that must be raised by item pipeline stages to stop processing an
Item. For more information see :ref:`topics-item-pipeline`.

CloseSpider
-----------
@@ -47,6 +40,14 @@ DontCloseSpider
This exception can be raised in a :signal:`spider_idle` signal handler to
prevent the spider from being closed.

DropItem
--------

.. exception:: DropItem

The exception that must be raised by item pipeline stages to stop processing an
Item. For more information see :ref:`topics-item-pipeline`.

IgnoreRequest
-------------

@@ -77,3 +78,34 @@ NotSupported

This exception is raised to indicate an unsupported feature.

StopDownload
------------

.. versionadded:: 2.2

.. exception:: StopDownload(fail=True)

Raised from a :class:`~scrapy.signals.bytes_received` signal handler to
indicate that no further bytes should be downloaded for a response.

The ``fail`` boolean parameter controls which method will handle the resulting
response:

* If ``fail=True``, the request errback is called. The response object is
available as the ``response`` attribute of the received
:class:`~twisted.python.failure.Failure` object. This is the default behaviour.

* If ``fail=False``, the request callback is called instead.

In both cases, the response could have its body truncated: the body contains
all bytes received up until the exception is raised, including the bytes
received in the signal handler that raises the exception. Also, the response
object is marked with ``"download_stopped"`` in its :attr:`Response.flags`
attribute.

.. note:: ``fail`` is a keyword-only parameter, i.e. raising
``StopDownload(False)`` or ``StopDownload(True)`` will raise
a :class:`TypeError`.

See the documentation for the :class:`~scrapy.signals.bytes_received` signal
and the :ref:`topics-stop-response-download` topic for additional information and examples.
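
For illustration, here is a rough sketch of the default ``fail=True`` flow described
above; the spider name, URL and errback name are made up, and the partial response is
read from the failure object as documented in this section::

    import scrapy
    from scrapy.exceptions import StopDownload


    class ErrbackStopSpider(scrapy.Spider):
        name = "errback_stop"

        def start_requests(self):
            yield scrapy.Request("https://docs.scrapy.org/en/latest/", errback=self.handle_stopped)

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            raise StopDownload()  # fail defaults to True: the errback handles the response

        def handle_stopped(self, failure):
            # The partial response is attached to the failure, as described above
            response = failure.response
            self.logger.info("Stopped after %d bytes, flags: %s", len(response.body), response.flags)

        def parse(self, response):
            pass  # not reached when the download is stopped with fail=True
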
49 changes: 47 additions & 2 deletions docs/topics/request-response.rst
@@ -385,6 +385,51 @@ The meta key is used to set retry times per request. When initialized, the
:reqmeta:`max_retry_times` meta key takes higher precedence over the
:setting:`RETRY_TIMES` setting.


.. _topics-stop-response-download:

Stopping the download of a Response
===================================

Raising a :exc:`~scrapy.exceptions.StopDownload` exception from a
:class:`~scrapy.signals.bytes_received` signal handler will stop the
download of a given response. See the following example::

    import scrapy


    class StopSpider(scrapy.Spider):
        name = "stop"
        start_urls = ["https://docs.scrapy.org/en/latest/"]

        @classmethod
        def from_crawler(cls, crawler):
            spider = super().from_crawler(crawler)
            crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
            return spider

        def parse(self, response):
            # 'last_chars' shows that the full response was not downloaded
            yield {"len": len(response.text), "last_chars": response.text[-40:]}

        def on_bytes_received(self, data, request, spider):
            raise scrapy.exceptions.StopDownload(fail=False)

which produces the following output::

    2020-05-19 17:26:12 [scrapy.core.engine] INFO: Spider opened
    2020-05-19 17:26:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    2020-05-19 17:26:13 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.on_bytes_received
    2020-05-19 17:26:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
    2020-05-19 17:26:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
    {'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
    2020-05-19 17:26:13 [scrapy.core.engine] INFO: Closing spider (finished)

By default, resulting responses are handled by their corresponding errbacks. To
call their callback instead, as in the example above, pass ``fail=False`` to the
:exc:`~scrapy.exceptions.StopDownload` exception.
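
A common variation of the example above is to keep downloading until a certain amount of
data has arrived and only then stop. The threshold and spider name below are arbitrary,
and for simplicity the byte counter is kept per spider rather than per request::

    import scrapy
    from scrapy.exceptions import StopDownload


    class FirstChunkSpider(scrapy.Spider):
        name = "first_chunk"
        start_urls = ["https://docs.scrapy.org/en/latest/"]
        max_bytes = 16384  # illustrative threshold
        received = 0

        @classmethod
        def from_crawler(cls, crawler):
            spider = super().from_crawler(crawler)
            crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # The signal fires for every received chunk; accumulate sizes and stop at the threshold
            self.received += len(data)
            if self.received >= self.max_bytes:
                raise StopDownload(fail=False)

        def parse(self, response):
            yield {"len": len(response.body)}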


.. _topics-request-response-ref-request-subclasses:

Request subclasses
@@ -716,9 +761,9 @@ Response objects
.. versionadded:: 2.1.0

The IP address of the server from which the Response originated.

This attribute is currently only populated by the HTTP 1.1 download
handler, i.e. for ``http(s)`` responses. For other handlers,
:attr:`ip_address` is always ``None``.

.. method:: Response.copy()
11 changes: 9 additions & 2 deletions docs/topics/signals.rst
@@ -373,6 +373,8 @@ request_left_downloader
bytes_received
~~~~~~~~~~~~~~

.. versionadded:: 2.2

.. signal:: bytes_received
.. function:: bytes_received(data, request, spider)

@@ -385,14 +387,19 @@ bytes_received
This signal does not support returning deferreds from its handlers.

:param data: the data received by the download handler
:type spider: :class:`bytes` object
:type data: :class:`bytes` object

:param request: the request that generated the response
:param request: the request that generated the download
:type request: :class:`~scrapy.http.Request` object

:param spider: the spider associated with the response
:type spider: :class:`~scrapy.spiders.Spider` object

.. note:: Handlers of this signal can stop the download of a response while it
is in progress by raising the :exc:`~scrapy.exceptions.StopDownload`
exception. Please refer to the :ref:`topics-stop-response-download` topic
for additional information and examples.
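
As an illustration of the note above, any connected handler (not only a spider method)
can stop a download; the extension and the ``stop_early`` meta key below are made up for
this sketch::

    from scrapy import signals
    from scrapy.exceptions import StopDownload


    class StopEarlyExtension:
        """Illustrative extension: stops downloads for requests marked via their meta dict."""

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.bytes_received, signal=signals.bytes_received)
            return ext

        def bytes_received(self, data, request, spider):
            if request.meta.get("stop_early"):
                raise StopDownload(fail=False)

Such an extension would also need to be enabled through the :setting:`EXTENSIONS` setting.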

Response signals
----------------

57 changes: 32 additions & 25 deletions scrapy/core/downloader/handlers/http11.py
@@ -12,6 +12,7 @@
from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders
@@ -21,7 +22,7 @@
from scrapy import signals
from scrapy.core.downloader.tls import openssl_methods
from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.misc import create_instance, load_object
@@ -431,7 +432,7 @@ def _cancel(_):
    def _cb_bodydone(self, result, request, url):
        headers = Headers(result["txresponse"].headers.getAllRawHeaders())
        respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
        return respcls(
        response = respcls(
            url=url,
            status=int(result["txresponse"].code),
            headers=headers,
@@ -440,6 +441,14 @@ def _cb_bodydone(self, result, request, url):
            certificate=result["certificate"],
            ip_address=result["ip_address"],
        )
        if result.get("failure"):
            # This failure is not the same object that will reach the errback,
            # so we need to temporarily store the response in the exception.
            # It will be moved to the failure in core/scraper.py
            failure = result["failure"]
            failure.value.response = response
            return failure
        return response


@implementer(IBodyProducer)
@@ -477,6 +486,16 @@ def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dat
        self._ip_address = None
        self._crawler = crawler

    def _finish_response(self, flags=None, failure=None):
        self._finished.callback({
            "txresponse": self._txresponse,
            "body": self._bodybuf.getvalue(),
            "flags": flags,
            "certificate": self._certificate,
            "ip_address": self._ip_address,
            "failure": failure,
        })

    def connectionMade(self):
        if self._certificate is None:
            with suppress(AttributeError):
@@ -493,12 +512,19 @@ def dataReceived(self, bodyBytes):
        self._bodybuf.write(bodyBytes)
        self._bytes_received += len(bodyBytes)

        self._crawler.signals.send_catch_log(
        bytes_received_result = self._crawler.signals.send_catch_log(
            signal=signals.bytes_received,
            data=bodyBytes,
            request=self._request,
            spider=self._crawler.spider,
        )
        for handler, result in bytes_received_result:
            if isinstance(result, Failure) and isinstance(result.value, StopDownload):
                logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
                             {"request": self._request, "handler": handler.__qualname__})
                self.transport._producer.loseConnection()
                failure = result if result.value.fail else None
                self._finish_response(flags=["download_stopped"], failure=failure)

        if self._maxsize and self._bytes_received > self._maxsize:
            logger.error("Received (%(bytes)s) bytes larger than download "
@@ -521,36 +547,17 @@ def connectionLost(self, reason):
        if self._finished.called:
            return

        body = self._bodybuf.getvalue()
        if reason.check(ResponseDone):
            self._finished.callback({
                "txresponse": self._txresponse,
                "body": body,
                "flags": None,
                "certificate": self._certificate,
                "ip_address": self._ip_address,
            })
            self._finish_response()
            return

        if reason.check(PotentialDataLoss):
            self._finished.callback({
                "txresponse": self._txresponse,
                "body": body,
                "flags": ["partial"],
                "certificate": self._certificate,
                "ip_address": self._ip_address,
            })
            self._finish_response(flags=["partial"])
            return

        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
            if not self._fail_on_dataloss:
                self._finished.callback({
                    "txresponse": self._txresponse,
                    "body": body,
                    "flags": ["dataloss"],
                    "certificate": self._certificate,
                    "ip_address": self._ip_address,
                })
                self._finish_response(flags=["dataloss"])
                return

        elif not self._fail_on_dataloss_warned:
10 changes: 9 additions & 1 deletion scrapy/core/scraper.py
@@ -11,7 +11,7 @@
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest, StopDownload
from scrapy import signals
from scrapy.http import Request, Response
from scrapy.item import _BaseItem
@@ -147,6 +147,14 @@ def _scrape2(self, request_result, request, spider):

    def call_spider(self, result, request, spider):
        result.request = request
        # StopDownload exceptions: make the partial response an attribute of the failure
        if (
            isinstance(result, Failure)
            and isinstance(result.value, StopDownload)
            and hasattr(result.value, "response")
        ):
            result.response = result.value.response
            delattr(result.value, "response")
        dfd = defer_result(result)
        callback = request.callback or spider.parse
        warn_on_generator_with_return_value(spider, callback)
13 changes: 13 additions & 0 deletions scrapy/exceptions.py
@@ -41,6 +41,18 @@ def __init__(self, reason='cancelled'):
        self.reason = reason


class StopDownload(Exception):
    """
    Stop the download of the body for a given response.
    The 'fail' boolean parameter indicates whether or not the resulting partial response
    should be handled by the request errback. Note that 'fail' is a keyword-only argument.
    """

    def __init__(self, *, fail=True):
        super().__init__()
        self.fail = fail


# Items


@@ -59,6 +71,7 @@ class NotSupported(Exception):

class UsageError(Exception):
    """To indicate a command-line usage error"""

    def __init__(self, *a, **kw):
        self.print_help = kw.pop('print_help', True)
        super(UsageError, self).__init__(*a, **kw)
7 changes: 4 additions & 3 deletions scrapy/utils/signal.py
@@ -5,13 +5,14 @@
from twisted.internet.defer import DeferredList, Deferred
from twisted.python.failure import Failure

from pydispatch.dispatcher import Any, Anonymous, liveReceivers, \
getAllReceivers, disconnect
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
from pydispatch.robustapply import robustApply

from scrapy.exceptions import StopDownload
from scrapy.utils.defer import maybeDeferred_coro
from scrapy.utils.log import failure_to_exc_info


logger = logging.getLogger(__name__)


@@ -23,7 +24,7 @@ def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', _IgnoredException)
    dont_log = (named.pop('dont_log', _IgnoredException), StopDownload)
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):