Add errback parameter to StopDownload
elacuesta committed May 14, 2020
1 parent 0076498 commit dc590fe
Showing 7 changed files with 117 additions and 17 deletions.
12 changes: 8 additions & 4 deletions docs/topics/exceptions.rst
@@ -81,11 +81,15 @@ This exception is raised to indicate an unsupported feature.
StopDownload
-------------

.. exception:: StopDownload()
.. exception:: StopDownload(errback=False)

Raised to indicate that no further bytes should be downloaded for a response.
The resulting partial response will be handled by the request callback, as
any other response, but it will contain ``"download_stopped"`` in its
:attr:`Response.flags` attribute.

The ``errback`` boolean parameter indicates whether the resulting partial response,
which will contain ``"download_stopped"`` in its :attr:`Response.flags` attribute,
should be handled by the request errback. ``errback`` defaults to ``False``, which
means the response is handled by the request callback.
Note that ``errback`` is a keyword-only argument.

See the documentation for the :class:`~scrapy.signals.bytes_received` signal
and the :ref:`topics-stop-response-download` topic for additional information and examples.
9 changes: 6 additions & 3 deletions docs/topics/request-response.rst
@@ -398,8 +398,8 @@ download of a given request. See the following example::
import scrapy


class CancelSpider(scrapy.Spider):
name = "cancel"
class StopSpider(scrapy.Spider):
name = "stop"
start_urls = ["https://docs.scrapy.org/en/latest/"]

@classmethod
@@ -421,12 +421,15 @@ which produces the following output::

2020-05-12 19:02:06 [scrapy.core.engine] INFO: Spider opened
2020-05-12 19:02:06 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-05-12 19:02:06 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler CancelSpider.handler
2020-05-12 19:02:06 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.handler
2020-05-12 19:02:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
2020-05-12 19:02:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
{'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
2020-05-12 19:02:07 [scrapy.core.engine] INFO: Closing spider (finished)

By default, the resulting partial responses are handled by their corresponding callbacks.
If you prefer to redirect them to their errbacks, raise ``StopDownload(errback=True)`` instead.
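
In that case the errback receives the partial response on the ``Failure`` object itself
(``failure.response``), while ``failure.value`` is the :exc:`~scrapy.exceptions.StopDownload`
exception. A minimal errback sketch, assuming the request was created with
``errback=self.errback``::

    def errback(self, failure):
        # failure.value is the StopDownload exception; the partial response
        # is attached to the Failure by the scraper.
        response = failure.response
        self.logger.info("stopped: %d bytes, flags %s", len(response.body), response.flags)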


.. _topics-request-response-ref-request-subclasses:

18 changes: 15 additions & 3 deletions scrapy/core/downloader/handlers/http11.py
@@ -432,7 +432,7 @@ def _cancel(_):
def _cb_bodydone(self, result, request, url):
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
return respcls(
response = respcls(
url=url,
status=int(result["txresponse"].code),
headers=headers,
@@ -441,6 +441,14 @@ def _cb_bodydone(self, result, request, url):
certificate=result["certificate"],
ip_address=result["ip_address"],
)
if result.get("failure"):
# This failure is not the same object that will reach the errback,
# so we need to temporarily store the response in the exception.
# It will be moved to the failure in core/scraper.py
failure = result["failure"]
failure.value.response = response
raise failure
return response
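
Why stash the response on the exception instead of on the failure? Raising inside a
Deferred callback wraps the exception in a new Failure, so only attributes set on the
exception object itself survive into the errback. A standalone Twisted sketch of that
behavior (illustration only, not part of the commit):

    from twisted.internet import defer

    def stop(_result):
        exc = RuntimeError("stop")
        exc.payload = "survives"  # mimics stashing the partial response
        raise exc

    d = defer.Deferred()
    d.addCallback(stop)
    # The Failure delivered here is a new object, but it wraps the same
    # exception instance, so the stashed attribute is still readable.
    d.addErrback(lambda failure: print(failure.value.payload))
    d.callback(None)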


@implementer(IBodyProducer)
@@ -478,13 +486,14 @@ def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dat
self._ip_address = None
self._crawler = crawler

def _finish_response(self, flags=None):
def _finish_response(self, flags=None, failure=None):
self._finished.callback({
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
})

def connectionMade(self):
@@ -514,7 +523,10 @@ def dataReceived(self, bodyBytes):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__})
self.transport._producer.loseConnection()
self._finish_response(flags=["download_stopped"])
self._finish_response(
flags=["download_stopped"],
failure=result if result.value.errback else None,
)

if self._maxsize and self._bytes_received > self._maxsize:
logger.error("Received (%(bytes)s) bytes larger than download "
10 changes: 9 additions & 1 deletion scrapy/core/scraper.py
@@ -11,7 +11,7 @@
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest, StopDownload
from scrapy import signals
from scrapy.http import Request, Response
from scrapy.item import BaseItem
@@ -147,6 +147,14 @@ def _scrape2(self, request_result, request, spider):

def call_spider(self, result, request, spider):
result.request = request
# StopDownload exceptions: make the partial response an attribute of the failure
if (
isinstance(result, Failure)
and isinstance(result.value, StopDownload)
and hasattr(result.value, "response")
):
result.response = result.value.response
delattr(result.value, "response")
dfd = defer_result(result)
callback = request.callback or spider.parse
warn_on_generator_with_return_value(spider, callback)
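
With this change an errback receives a Failure that carries both the originating
request and the partial response; a quick sketch of the resulting contract
(hypothetical handler, illustration only):

    from twisted.python.failure import Failure
    from scrapy.exceptions import StopDownload

    def inspect(failure: Failure):
        assert isinstance(failure.value, StopDownload)
        assert failure.request is not None   # set at the top of call_spider()
        assert failure.response is not None  # moved off the exception just above
        assert not hasattr(failure.value, "response")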
9 changes: 7 additions & 2 deletions scrapy/exceptions.py
@@ -43,9 +43,14 @@ def __init__(self, reason='cancelled'):

class StopDownload(Exception):
"""
Stop the download of the body for a given response
Stop the download of the body for a given response.
The 'errback' boolean parameter indicates whether the resulting partial response
should be handled by the request errback. Note that 'errback' is a keyword-only argument.
"""
pass

def __init__(self, *, errback=False):
super().__init__()
self.errback = errback
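
Because ``errback`` is keyword-only, a positional call fails fast (illustration only):

    from scrapy.exceptions import StopDownload

    StopDownload(errback=True)  # fine
    try:
        StopDownload(True)
    except TypeError as exc:
        print(exc)  # __init__() takes 1 positional argument but 2 were given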


# Items
33 changes: 33 additions & 0 deletions tests/spiders.py
@@ -7,6 +7,8 @@

from twisted.internet import defer

from scrapy import signals
from scrapy.exceptions import StopDownload
from scrapy.http import Request
from scrapy.item import Item
from scrapy.linkextractors import LinkExtractor
@@ -267,3 +269,34 @@ def callback(self, response):

def errback(self, failure):
self.logger.info('[errback] status %i', failure.value.response.status)


class BytesReceivedCallbackSpider(MetaSpider):

full_response_length = 2**18

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.bytes_received, signals.bytes_received)
return spider

def start_requests(self):
body = b"a" * self.full_response_length
url = self.mockserver.url("/alpayload")
yield Request(url, method="POST", body=body, errback=self.errback)

def parse(self, response):
self.meta["response"] = response

def errback(self, failure):
self.meta["failure"] = failure

def bytes_received(self, data, request, spider):
raise StopDownload(errback=False)


class BytesReceivedErrbackSpider(BytesReceivedCallbackSpider):

def bytes_received(self, data, request, spider):
raise StopDownload(errback=True)
43 changes: 39 additions & 4 deletions tests/test_crawl.py
@@ -9,17 +9,31 @@
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.internet.ssl import Certificate
from twisted.python.failure import Failure
from twisted.trial.unittest import TestCase

from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.exceptions import StopDownload
from scrapy.http import Request
from scrapy.http.response import Response
from scrapy.utils.python import to_unicode
from tests.mockserver import MockServer
from tests.spiders import (FollowAllSpider, DelaySpider, SimpleSpider, BrokenStartRequestsSpider,
SingleRequestSpider, DuplicateStartRequestsSpider, CrawlSpiderWithErrback,
AsyncDefSpider, AsyncDefAsyncioSpider, AsyncDefAsyncioReturnSpider,
AsyncDefAsyncioReqsReturnSpider)
from tests.spiders import (
AsyncDefAsyncioReqsReturnSpider,
AsyncDefAsyncioReturnSpider,
AsyncDefAsyncioSpider,
AsyncDefSpider,
BrokenStartRequestsSpider,
BytesReceivedCallbackSpider,
BytesReceivedErrbackSpider,
CrawlSpiderWithErrback,
DelaySpider,
DuplicateStartRequestsSpider,
FollowAllSpider,
SimpleSpider,
SingleRequestSpider,
)


class CrawlTestCase(TestCase):
@@ -457,3 +471,24 @@ def test_dns_server_ip_address(self):
ip_address = crawler.spider.meta['responses'][0].ip_address
self.assertIsInstance(ip_address, IPv4Address)
self.assertEqual(str(ip_address), gethostbyname(expected_netloc))

@defer.inlineCallbacks
def test_stop_download_callback(self):
crawler = self.runner.create_crawler(BytesReceivedCallbackSpider)
yield crawler.crawl(mockserver=self.mockserver)
print("*" * 100)
print(crawler.spider.meta)
print("*" * 100)
self.assertIsNone(crawler.spider.meta.get("failure"))
self.assertIsInstance(crawler.spider.meta["response"], Response)
self.assertLess(len(crawler.spider.meta["response"].text), crawler.spider.full_response_length)

@defer.inlineCallbacks
def test_stop_download_errback(self):
crawler = self.runner.create_crawler(BytesReceivedErrbackSpider)
yield crawler.crawl(mockserver=self.mockserver)
self.assertIsNone(crawler.spider.meta.get("response"))
self.assertIsInstance(crawler.spider.meta["failure"], Failure)
self.assertIsInstance(crawler.spider.meta["failure"].value, StopDownload)
self.assertIsInstance(crawler.spider.meta["failure"].response, Response)
self.assertLess(len(crawler.spider.meta["failure"].response.text), crawler.spider.full_response_length)
