Skip to content

Commit

Permalink
Handle data loss gracefully.
Browse files Browse the repository at this point in the history
Websites that return a wrong ``Content-Length`` header may cause a data
loss error. Also when a chunked response is not finished properly.

This change adds a new setting ``DOWNLOAD_FAIL_ON_DATALOSS`` (default:
``True``) and request.meta key ``download_fail_on_dataloss``.
  • Loading branch information
rmax committed Feb 28, 2017
1 parent 0e5ed21 commit 96f5e25
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 10 deletions.
9 changes: 9 additions & 0 deletions docs/topics/request-response.rst
Expand Up @@ -303,6 +303,7 @@ Those are:
* :reqmeta:`download_timeout`
* :reqmeta:`download_maxsize`
* :reqmeta:`download_latency`
* :reqmeta:`download_fail_on_dataloss`
* :reqmeta:`proxy`

.. reqmeta:: bindaddress
Expand Down Expand Up @@ -330,6 +331,14 @@ started, i.e. HTTP message sent over the network. This meta key only becomes
available when the response has been downloaded. While most other meta keys are
used to control Scrapy behavior, this one is supposed to be read-only.

.. reqmeta:: download_fail_on_dataloss

download_fail_on_dataloss
-------------------------

Whether or not to fail on broken responses. See:
:setting:`DOWNLOAD_FAIL_ON_DATALOSS`.

.. _topics-request-response-ref-request-subclasses:

Request subclasses
Expand Down
26 changes: 26 additions & 0 deletions docs/topics/settings.rst
Expand Up @@ -604,6 +604,32 @@ If you want to disable it set to 0.

This feature needs Twisted >= 11.1.

.. setting:: DOWNLOAD_FAIL_ON_DATALOSS

DOWNLOAD_FAIL_ON_DATALOSS
-------------------------

Default: ``True``

Whether or not to fail on broken responses, that is, declared
``Content-Length`` does not match content sent by the server or chunked
response was not properly finished. If ``True``, these responses raise a
``ResponseFailed([_DataLoss])`` error. If ``False``, these responses
are passed through and the flag ``dataloss`` is added to the response, i.e.:
``'dataloss' in response.flags`` is ``True``.

Optionally, this can be set on a per-request basis by setting the
:reqmeta:`download_fail_on_dataloss` Request.meta key to ``False``.

.. note::

A broken response, or data loss error, may happen under several
circumstances, from server misconfiguration to network errors to data
corruption. It is up to the user to decide if it makes sense to process
broken responses considering they may contain partial or incomplete content.
If :setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``,
the ``ResponseFailed([_DataLoss])`` failure will be retried as usual.

.. setting:: DUPEFILTER_CLASS

DUPEFILTER_CLASS
Expand Down
41 changes: 32 additions & 9 deletions scrapy/core/downloader/handlers/http11.py
Expand Up @@ -12,9 +12,9 @@
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from twisted.internet.error import TimeoutError
from twisted.web.http import PotentialDataLoss
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
HTTPConnectionPool
HTTPConnectionPool, ResponseFailed
from twisted.internet.endpoints import TCP4ClientEndpoint

from scrapy.http import Headers
Expand Down Expand Up @@ -51,13 +51,15 @@ def __init__(self, settings):
warnings.warn(msg)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
self._disconnect_timeout = 1

def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss)
return agent.download_request(request)

def close(self):
Expand Down Expand Up @@ -233,13 +235,14 @@ class ScrapyAgent(object):
_TunnelingAgent = TunnelingAgent

def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0):
maxsize=0, warnsize=0, fail_on_dataloss=True):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None

def _get_agent(self, request, timeout):
Expand Down Expand Up @@ -326,6 +329,7 @@ def _cb_bodyready(self, txresponse, request):
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)

if maxsize and expected_size > maxsize:
error_msg = ("Cancelling download of %(url)s: expected response "
Expand All @@ -345,7 +349,8 @@ def _cancel(_):
txresponse._transport._producer.loseConnection()

d = defer.Deferred(_cancel)
txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
txresponse.deliverBody(_ResponseReader(
d, txresponse, request, maxsize, warnsize, fail_on_dataloss))

# save response for timeouts
self._txresponse = txresponse
Expand Down Expand Up @@ -380,13 +385,16 @@ def stopProducing(self):

class _ResponseReader(protocol.Protocol):

def __init__(self, finished, txresponse, request, maxsize, warnsize):
def __init__(self, finished, txresponse, request, maxsize, warnsize,
             fail_on_dataloss):
    """Collect the response body delivered by Twisted for *request*.

    ``finished`` is the Deferred fired with ``(txresponse, body, flags)``
    (or an error) once the body has been fully received.
    """
    self._finished = finished
    self._txresponse = txresponse
    self._request = request
    self._bodybuf = BytesIO()          # accumulates the raw body bytes
    self._maxsize = maxsize            # hard limit: abort download beyond this
    self._warnsize = warnsize          # soft limit: log a warning beyond this
    # When False, a truncated/chunk-broken body is delivered with a
    # 'dataloss' flag instead of failing the download.
    self._fail_on_dataloss = fail_on_dataloss
    # Ensures the "set DOWNLOAD_FAIL_ON_DATALOSS = False" hint is logged once.
    self._fail_on_dataloss_warned = False
    self._reached_warnsize = False
    self._bytes_received = 0

Expand Down Expand Up @@ -415,7 +423,22 @@ def connectionLost(self, reason):
body = self._bodybuf.getvalue()
if reason.check(ResponseDone):
self._finished.callback((self._txresponse, body, None))
elif reason.check(PotentialDataLoss):
return

if reason.check(PotentialDataLoss):
self._finished.callback((self._txresponse, body, ['partial']))
else:
self._finished.errback(reason)
return

if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
self._finished.callback((self._txresponse, body, ['dataloss']))
return

elif not self._fail_on_dataloss_warned:
logger.warn("Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode())
self._fail_on_dataloss_warned = True

self._finished.errback(reason)
2 changes: 2 additions & 0 deletions scrapy/settings/default_settings.py
Expand Up @@ -79,6 +79,8 @@
DOWNLOAD_MAXSIZE = 1024*1024*1024 # 1024m
DOWNLOAD_WARNSIZE = 32*1024*1024 # 32m

DOWNLOAD_FAIL_ON_DATALOSS = True

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
Expand Down
72 changes: 71 additions & 1 deletion tests/test_downloader_handlers.py
Expand Up @@ -13,9 +13,11 @@
from twisted.python.filepath import FilePath
from twisted.internet import reactor, defer, error
from twisted.web import server, static, util, resource
from twisted.web._newclient import ResponseFailed
from twisted.web.http import _DataLoss
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource
PayloadResource
from twisted.cred import portal, checkers, credentials
from w3lib.url import path_to_file_uri

Expand Down Expand Up @@ -118,6 +120,35 @@ def render(self, request):
return request.requestHeaders.getRawHeaders(b"content-length")[0]


class ChunkedResource(resource.Resource):
    """Test resource whose body is sent with chunked transfer encoding.

    No ``Content-Length`` header is set and the body is written in two
    pieces, so Twisted emits it as a chunked HTTP/1.1 response.
    """

    def render(self, request):
        def write_body():
            for part in (b"chunked ", b"content\n"):
                request.write(part)
            request.finish()

        # Defer the writes so render() returns before the body is produced.
        reactor.callLater(0, write_body)
        return server.NOT_DONE_YET


class BrokenDownloadResource(resource.Resource):
    """Test resource that truncates its response body.

    It declares ``Content-Length: 20`` but sends only 7 bytes before
    dropping the connection, simulating a data-loss scenario.
    """

    def render(self, request):
        def abort_early():
            request.setHeader(b"Content-Length", b"20")
            request.write(b"partial")
            # We have to force a disconnection for HTTP/1.1 clients. Otherwise
            # client keeps the connection open waiting for more data.
            channel = request.channel
            if hasattr(channel, 'loseConnection'):
                # Twisted >= 16.3.0 exposes loseConnection on the channel.
                channel.loseConnection()
            else:
                channel.transport.loseConnection()
            request.finish()

        reactor.callLater(0, abort_early)
        return server.NOT_DONE_YET


class EmptyContentTypeHeaderResource(resource.Resource):
"""
A testing resource which renders itself as the value of request body
Expand Down Expand Up @@ -149,6 +180,7 @@ def setUp(self):
r.putChild(b"host", HostHeaderResource())
r.putChild(b"payload", PayloadResource())
r.putChild(b"broken", BrokenDownloadResource())
r.putChild(b"chunked", ChunkedResource())
r.putChild(b"contentlength", ContentLengthHeaderResource())
r.putChild(b"nocontenttype", EmptyContentTypeHeaderResource())
self.site = server.Site(r, timeout=None)
Expand Down Expand Up @@ -341,6 +373,44 @@ def test_download_with_large_maxsize_per_spider(self):
d.addCallback(self.assertEquals, b"0123456789")
return d

def test_download_chunked_content(self):
    """A chunked response must be reassembled into the complete body."""
    request = Request(self.getURL('chunked'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(
        lambda response: self.assertEquals(response.body, b"chunked content\n"))
    return d

def test_download_broken_content_cause_data_loss(self):
    """By default a truncated body must fail with ResponseFailed(_DataLoss)."""
    request = Request(self.getURL('broken'))
    d = self.download_request(request, Spider('foo'))

    def check_data_loss(failure):
        # Swallow the failure only when it is the expected ResponseFailed
        # carrying a _DataLoss reason; anything else propagates.
        is_dataloss = (
            failure.check(ResponseFailed)
            and any(r.check(_DataLoss) for r in failure.value.reasons)
        )
        return None if is_dataloss else failure

    # A successful download means the data loss went undetected.
    d.addCallback(lambda _: self.fail("No DataLoss exception"))
    d.addErrback(check_data_loss)
    return d

def test_download_broken_content_allow_data_loss(self):
    """Opting out via request.meta delivers the partial body, flagged 'dataloss'."""
    meta = {'download_fail_on_dataloss': False}
    request = Request(self.getURL('broken'), meta=meta)
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda response: self.assertEqual(response.flags, ['dataloss']))
    return d

def test_download_broken_content_allow_data_loss_via_setting(self):
    """Disabling DOWNLOAD_FAIL_ON_DATALOSS globally has the same effect as meta."""
    settings = Settings({'DOWNLOAD_FAIL_ON_DATALOSS': False})
    download_handler = self.download_handler_cls(settings)
    d = download_handler.download_request(
        Request(self.getURL('broken')), Spider('foo'))
    d.addCallback(lambda response: self.assertEqual(response.flags, ['dataloss']))
    return d


class Https11TestCase(Http11TestCase):
scheme = 'https'
Expand Down

0 comments on commit 96f5e25

Please sign in to comment.