diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst
index 9a6e0d1b6e1..214ac564086 100644
--- a/docs/topics/request-response.rst
+++ b/docs/topics/request-response.rst
@@ -303,6 +303,7 @@ Those are:
 * :reqmeta:`download_timeout`
 * :reqmeta:`download_maxsize`
 * :reqmeta:`download_latency`
+* :reqmeta:`download_fail_on_dataloss`
 * :reqmeta:`proxy`
 
 .. reqmeta:: bindaddress
@@ -330,6 +331,14 @@ started, i.e. HTTP message sent over the network. This meta key only becomes
 available when the response has been downloaded. While most other meta keys are
 used to control Scrapy behavior, this one is supposed to be read-only.
 
+.. reqmeta:: download_fail_on_dataloss
+
+download_fail_on_dataloss
+-------------------------
+
+Whether or not to fail on broken responses. See:
+:setting:`DOWNLOAD_FAIL_ON_DATALOSS`.
+
 .. _topics-request-response-ref-request-subclasses:
 
 Request subclasses
diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index f616742c467..ccdd02c4ede 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -604,6 +604,32 @@ If you want to disable it set to 0.
 
 This feature needs Twisted >= 11.1.
 
+.. setting:: DOWNLOAD_FAIL_ON_DATALOSS
+
+DOWNLOAD_FAIL_ON_DATALOSS
+-------------------------
+
+Default: ``True``
+
+Whether or not to fail on broken responses, that is, responses where the
+declared ``Content-Length`` does not match the content sent by the server,
+or chunked responses that were not properly finished. If ``True``, these
+responses raise a ``ResponseFailed([_DataLoss])`` error. If ``False``, these
+responses are passed through and the flag ``dataloss`` is added to the
+response, i.e.: ``'dataloss' in response.flags`` is ``True``.
+
+Optionally, this can be set on a per-request basis by setting the
+:reqmeta:`download_fail_on_dataloss` Request.meta key to ``False``.
+
+.. note::
+
+   A broken response, or data loss error, may happen under several
+   circumstances, from server misconfiguration to network errors to data
+   corruption. It is up to the user to decide if it makes sense to process
+   broken responses, considering they may contain partial or incomplete content.
+   If :setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``,
+   the ``ResponseFailed([_DataLoss])`` failure will be retried as usual.
+
 .. setting:: DUPEFILTER_CLASS
 
 DUPEFILTER_CLASS
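
A minimal usage sketch of the per-request override documented above; this is
not part of the patch, and the spider name and URL are placeholders:

    import scrapy

    class TolerantSpider(scrapy.Spider):
        name = 'tolerant'

        def start_requests(self):
            # Per-request override of the DOWNLOAD_FAIL_ON_DATALOSS default.
            yield scrapy.Request('http://example.com/flaky',
                                 meta={'download_fail_on_dataloss': False})

        def parse(self, response):
            if 'dataloss' in response.flags:
                # The body may be truncated; decide whether it is still usable.
                self.logger.warning('Broken response from %s', response.url)
            yield {'body_length': len(response.body)}
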
diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py
index b96c8c6fe88..37e83680913 100644
--- a/scrapy/core/downloader/handlers/http11.py
+++ b/scrapy/core/downloader/handlers/http11.py
@@ -12,9 +12,9 @@
 from twisted.web.http_headers import Headers as TxHeaders
 from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
 from twisted.internet.error import TimeoutError
-from twisted.web.http import PotentialDataLoss
+from twisted.web.http import _DataLoss, PotentialDataLoss
 from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
-    HTTPConnectionPool
+    HTTPConnectionPool, ResponseFailed
 from twisted.internet.endpoints import TCP4ClientEndpoint
 
 from scrapy.http import Headers
@@ -51,13 +51,15 @@ def __init__(self, settings):
             warnings.warn(msg)
         self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
         self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
+        self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
         self._disconnect_timeout = 1
 
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
         agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
             maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
-            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
+            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
+            fail_on_dataloss=self._fail_on_dataloss)
         return agent.download_request(request)
 
     def close(self):
@@ -233,13 +235,14 @@ class ScrapyAgent(object):
     _TunnelingAgent = TunnelingAgent
 
     def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
-                 maxsize=0, warnsize=0):
+                 maxsize=0, warnsize=0, fail_on_dataloss=True):
         self._contextFactory = contextFactory
         self._connectTimeout = connectTimeout
         self._bindAddress = bindAddress
         self._pool = pool
         self._maxsize = maxsize
         self._warnsize = warnsize
+        self._fail_on_dataloss = fail_on_dataloss
         self._txresponse = None
 
     def _get_agent(self, request, timeout):
@@ -326,6 +329,7 @@ def _cb_bodyready(self, txresponse, request):
         maxsize = request.meta.get('download_maxsize', self._maxsize)
         warnsize = request.meta.get('download_warnsize', self._warnsize)
         expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
+        fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
 
         if maxsize and expected_size > maxsize:
             error_msg = ("Cancelling download of %(url)s: expected response "
@@ -345,7 +349,8 @@ def _cancel(_):
             txresponse._transport._producer.loseConnection()
 
         d = defer.Deferred(_cancel)
-        txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
+        txresponse.deliverBody(_ResponseReader(
+            d, txresponse, request, maxsize, warnsize, fail_on_dataloss))
 
         # save response for timeouts
         self._txresponse = txresponse
@@ -380,13 +385,16 @@ def stopProducing(self):
 
 class _ResponseReader(protocol.Protocol):
 
-    def __init__(self, finished, txresponse, request, maxsize, warnsize):
+    def __init__(self, finished, txresponse, request, maxsize, warnsize,
+                 fail_on_dataloss):
         self._finished = finished
         self._txresponse = txresponse
         self._request = request
         self._bodybuf = BytesIO()
         self._maxsize = maxsize
         self._warnsize = warnsize
+        self._fail_on_dataloss = fail_on_dataloss
+        self._fail_on_dataloss_warned = False
         self._reached_warnsize = False
         self._bytes_received = 0
 
@@ -415,7 +423,22 @@ def connectionLost(self, reason):
         body = self._bodybuf.getvalue()
         if reason.check(ResponseDone):
             self._finished.callback((self._txresponse, body, None))
-        elif reason.check(PotentialDataLoss):
+            return
+
+        if reason.check(PotentialDataLoss):
             self._finished.callback((self._txresponse, body, ['partial']))
-        else:
-            self._finished.errback(reason)
+            return
+
+        if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
+            if not self._fail_on_dataloss:
+                self._finished.callback((self._txresponse, body, ['dataloss']))
+                return
+
+            elif not self._fail_on_dataloss_warned:
+                logger.warning("Got data loss in %s. If you want to process broken "
+                               "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
+                               " -- This message won't be shown in further requests",
+                               self._txresponse.request.absoluteURI.decode())
+                self._fail_on_dataloss_warned = True
+
+        self._finished.errback(reason)
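
For review context, a condensed sketch of the decision tree that
connectionLost() now implements; classify() is a hypothetical helper, not part
of the patch, and _DataLoss is private Twisted API, so this mirrors the handler
rather than prescribing an interface:

    from twisted.web.client import ResponseDone, ResponseFailed
    from twisted.web.http import PotentialDataLoss, _DataLoss

    def classify(reason, fail_on_dataloss):
        """Map a connectionLost() reason to the handler's outcome."""
        if reason.check(ResponseDone):
            return 'callback', []            # clean finish, no flags
        if reason.check(PotentialDataLoss):
            return 'callback', ['partial']   # no Content-Length to check against
        if reason.check(ResponseFailed) and any(
                r.check(_DataLoss) for r in reason.value.reasons):
            if not fail_on_dataloss:
                return 'callback', ['dataloss']  # pass truncated body through
        return 'errback', None               # fail; retried if RETRY_ENABLED
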
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index e0e39120cce..a5931a3d5d6 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -79,6 +79,8 @@
 DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024m
 DOWNLOAD_WARNSIZE = 32*1024*1024    # 32m
 
+DOWNLOAD_FAIL_ON_DATALOSS = True
+
 DOWNLOADER = 'scrapy.core.downloader.Downloader'
 
 DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
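
The spider- or project-wide equivalent of the per-request key shown earlier
(again a sketch, not part of the patch; the spider name is a placeholder):

    import scrapy

    class LenientSpider(scrapy.Spider):
        name = 'lenient'
        # Spider-wide override; putting the same key in a project's
        # settings.py applies it to every spider.
        custom_settings = {'DOWNLOAD_FAIL_ON_DATALOSS': False}
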
diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py
index e49a514b884..c58aa76f928 100644
--- a/tests/test_downloader_handlers.py
+++ b/tests/test_downloader_handlers.py
@@ -13,9 +13,11 @@
 from twisted.python.filepath import FilePath
 from twisted.internet import reactor, defer, error
 from twisted.web import server, static, util, resource
+from twisted.web._newclient import ResponseFailed
+from twisted.web.http import _DataLoss
 from twisted.web.test.test_webclient import ForeverTakingResource, \
     NoLengthResource, HostHeaderResource, \
-    PayloadResource, BrokenDownloadResource
+    PayloadResource
 from twisted.cred import portal, checkers, credentials
 from w3lib.url import path_to_file_uri
 
@@ -118,6 +120,35 @@ def render(self, request):
         return request.requestHeaders.getRawHeaders(b"content-length")[0]
 
 
+class ChunkedResource(resource.Resource):
+
+    def render(self, request):
+        def response():
+            request.write(b"chunked ")
+            request.write(b"content\n")
+            request.finish()
+        reactor.callLater(0, response)
+        return server.NOT_DONE_YET
+
+
+class BrokenDownloadResource(resource.Resource):
+
+    def render(self, request):
+        def response():
+            request.setHeader(b"Content-Length", b"20")
+            request.write(b"partial")
+            # We have to force a disconnection for HTTP/1.1 clients. Otherwise
+            # the client keeps the connection open, waiting for more data.
+            if hasattr(request.channel, 'loseConnection'):  # twisted >= 16.3.0
+                request.channel.loseConnection()
+            else:
+                request.channel.transport.loseConnection()
+            request.finish()
+
+        reactor.callLater(0, response)
+        return server.NOT_DONE_YET
+
+
 class EmptyContentTypeHeaderResource(resource.Resource):
     """
     A testing resource which renders itself as the value of request body
@@ -149,6 +180,7 @@ def setUp(self):
         r.putChild(b"host", HostHeaderResource())
         r.putChild(b"payload", PayloadResource())
         r.putChild(b"broken", BrokenDownloadResource())
+        r.putChild(b"chunked", ChunkedResource())
         r.putChild(b"contentlength", ContentLengthHeaderResource())
         r.putChild(b"nocontenttype", EmptyContentTypeHeaderResource())
         self.site = server.Site(r, timeout=None)
@@ -341,6 +373,44 @@ def test_download_with_large_maxsize_per_spider(self):
         d.addCallback(self.assertEquals, b"0123456789")
         return d
 
+    def test_download_chunked_content(self):
+        request = Request(self.getURL('chunked'))
+        d = self.download_request(request, Spider('foo'))
+        d.addCallback(lambda r: r.body)
+        d.addCallback(self.assertEquals, b"chunked content\n")
+        return d
+
+    def test_download_broken_content_cause_data_loss(self):
+        request = Request(self.getURL('broken'))
+        d = self.download_request(request, Spider('foo'))
+
+        def checkDataLoss(failure):
+            if failure.check(ResponseFailed):
+                if any(r.check(_DataLoss) for r in failure.value.reasons):
+                    return None
+            return failure
+
+        d.addCallback(lambda _: self.fail("No DataLoss exception"))
+        d.addErrback(checkDataLoss)
+        return d
+
+    def test_download_broken_content_allow_data_loss(self):
+        request = Request(self.getURL('broken'), meta={'download_fail_on_dataloss': False})
+        d = self.download_request(request, Spider('foo'))
+        d.addCallback(lambda r: r.flags)
+        d.addCallback(self.assertEqual, ['dataloss'])
+        return d
+
+    def test_download_broken_content_allow_data_loss_via_setting(self):
+        download_handler = self.download_handler_cls(Settings({
+            'DOWNLOAD_FAIL_ON_DATALOSS': False,
+        }))
+        request = Request(self.getURL('broken'))
+        d = download_handler.download_request(request, Spider('foo'))
+        d.addCallback(lambda r: r.flags)
+        d.addCallback(self.assertEqual, ['dataloss'])
+        return d
+
 
 class Https11TestCase(Http11TestCase):
     scheme = 'https'
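
One further case the suite could cover, sketched here only and not part of the
patch: _cb_bodyready reads request.meta.get('download_fail_on_dataloss',
self._fail_on_dataloss), so a per-request True should win over a handler
configured with the setting off. The test name is hypothetical:

    def test_download_broken_content_meta_overrides_setting(self):
        # Per-request meta takes precedence over the handler-level
        # DOWNLOAD_FAIL_ON_DATALOSS setting, so this should still errback.
        download_handler = self.download_handler_cls(Settings({
            'DOWNLOAD_FAIL_ON_DATALOSS': False,
        }))
        request = Request(self.getURL('broken'),
                          meta={'download_fail_on_dataloss': True})
        d = download_handler.download_request(request, Spider('foo'))

        def checkDataLoss(failure):
            if failure.check(ResponseFailed):
                if any(r.check(_DataLoss) for r in failure.value.reasons):
                    return None
            return failure

        d.addCallback(lambda _: self.fail("No DataLoss exception"))
        d.addErrback(checkDataLoss)
        return d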