Skip to content

Commit

Permalink
Handle data loss gracefully.
Browse files Browse the repository at this point in the history
Websites that return a wrong ``Content-Length`` header may cause a data
loss error. Also when a chunked response is not finished properly.

This change adds a new setting ``DOWNLOAD_FAIL_ON_DATALOSS`` (default:
``True``) and request.meta key ``download_fail_on_dataloss``.
  • Loading branch information
rmax committed Feb 28, 2017
1 parent 0e5ed21 commit 96f5e25
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 10 deletions.
9 changes: 9 additions & 0 deletions docs/topics/request-response.rst
Expand Up @@ -303,6 +303,7 @@ Those are:
* :reqmeta:`download_timeout`
* :reqmeta:`download_maxsize`
* :reqmeta:`download_latency`
* :reqmeta:`download_fail_on_dataloss`
* :reqmeta:`proxy`

.. reqmeta:: bindaddress
Expand Down Expand Up @@ -330,6 +331,14 @@ started, i.e. HTTP message sent over the network. This meta key only becomes
available when the response has been downloaded. While most other meta keys are
used to control Scrapy behavior, this one is supposed to be read-only.

.. reqmeta:: download_fail_on_dataloss

download_fail_on_dataloss
-------------------------

Whether or not to fail on broken responses. See:
:setting:`DOWNLOAD_FAIL_ON_DATALOSS`.

.. _topics-request-response-ref-request-subclasses:

Request subclasses
Expand Down
26 changes: 26 additions & 0 deletions docs/topics/settings.rst
Expand Up @@ -604,6 +604,32 @@ If you want to disable it set to 0.

This feature needs Twisted >= 11.1.

.. setting:: DOWNLOAD_FAIL_ON_DATALOSS

DOWNLOAD_FAIL_ON_DATALOSS
-------------------------

Default: ``True``

Whether or not to fail on broken responses, that is, declared
``Content-Length`` does not match content sent by the server or chunked
response was not properly finished. If ``True``, these responses raise a
``ResponseFailed([_DataLoss])`` error. If ``False``, these responses
are passed through and the flag ``dataloss`` is added to the response, i.e.:
``'dataloss' in response.flags`` is ``True``.

Optionally, this can be set on a per-request basis by setting the
:reqmeta:`download_fail_on_dataloss` Request.meta key to ``False``.

.. note::

A broken response, or data loss error, may happen under several
circumstances, from server misconfiguration to network errors to data
corruption. It is up to the user to decide if it makes sense to process
broken responses considering they may contain partial or incomplete content.
If :setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``,
the ``ResponseFailed([_DataLoss])`` failure will be retried as usual.

.. setting:: DUPEFILTER_CLASS

DUPEFILTER_CLASS
Expand Down
41 changes: 32 additions & 9 deletions scrapy/core/downloader/handlers/http11.py
Expand Up @@ -12,9 +12,9 @@
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from twisted.internet.error import TimeoutError
from twisted.web.http import PotentialDataLoss
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.client import Agent, ProxyAgent, ResponseDone, \
HTTPConnectionPool
HTTPConnectionPool, ResponseFailed
from twisted.internet.endpoints import TCP4ClientEndpoint

from scrapy.http import Headers
Expand Down Expand Up @@ -51,13 +51,15 @@ def __init__(self, settings):
warnings.warn(msg)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
self._disconnect_timeout = 1

def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss)
return agent.download_request(request)

def close(self):
Expand Down Expand Up @@ -233,13 +235,14 @@ class ScrapyAgent(object):
_TunnelingAgent = TunnelingAgent

def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0):
maxsize=0, warnsize=0, fail_on_dataloss=True):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None

def _get_agent(self, request, timeout):
Expand Down Expand Up @@ -326,6 +329,7 @@ def _cb_bodyready(self, txresponse, request):
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)

if maxsize and expected_size > maxsize:
error_msg = ("Cancelling download of %(url)s: expected response "
Expand All @@ -345,7 +349,8 @@ def _cancel(_):
txresponse._transport._producer.loseConnection()

d = defer.Deferred(_cancel)
txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
txresponse.deliverBody(_ResponseReader(
d, txresponse, request, maxsize, warnsize, fail_on_dataloss))

# save response for timeouts
self._txresponse = txresponse
Expand Down Expand Up @@ -380,13 +385,16 @@ def stopProducing(self):

class _ResponseReader(protocol.Protocol):

def __init__(self, finished, txresponse, request, maxsize, warnsize):
def __init__(self, finished, txresponse, request, maxsize, warnsize,
             fail_on_dataloss):
    """Collect the response body delivered by Twisted for *request*.

    ``finished`` is the Deferred fired with ``(txresponse, body, flags)``
    (or an error) once the body has been fully received.
    """
    self._finished = finished
    self._txresponse = txresponse
    self._request = request
    self._bodybuf = BytesIO()          # accumulates the raw body bytes
    self._maxsize = maxsize            # hard limit: abort download beyond this
    self._warnsize = warnsize          # soft limit: log a warning beyond this
    # When False, a truncated/chunk-broken body is delivered with a
    # 'dataloss' flag instead of failing the download.
    self._fail_on_dataloss = fail_on_dataloss
    # Ensures the "set DOWNLOAD_FAIL_ON_DATALOSS = False" hint is logged once.
    self._fail_on_dataloss_warned = False
    self._reached_warnsize = False
    self._bytes_received = 0

Expand Down Expand Up @@ -415,7 +423,22 @@ def connectionLost(self, reason):
body = self._bodybuf.getvalue()
if reason.check(ResponseDone):
self._finished.callback((self._txresponse, body, None))
elif reason.check(PotentialDataLoss):
return

if reason.check(PotentialDataLoss):
self._finished.callback((self._txresponse, body, ['partial']))
else:
self._finished.errback(reason)
return

if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
self._finished.callback((self._txresponse, body, ['dataloss']))
return

elif not self._fail_on_dataloss_warned:
logger.warn("Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode())
self._fail_on_dataloss_warned = True

self._finished.errback(reason)
2 changes: 2 additions & 0 deletions scrapy/settings/default_settings.py
Expand Up @@ -79,6 +79,8 @@
DOWNLOAD_MAXSIZE = 1024*1024*1024 # 1024m
DOWNLOAD_WARNSIZE = 32*1024*1024 # 32m

DOWNLOAD_FAIL_ON_DATALOSS = True

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
Expand Down
72 changes: 71 additions & 1 deletion tests/test_downloader_handlers.py
Expand Up @@ -13,9 +13,11 @@
from twisted.python.filepath import FilePath
from twisted.internet import reactor, defer, error
from twisted.web import server, static, util, resource
from twisted.web._newclient import ResponseFailed
from twisted.web.http import _DataLoss
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource
PayloadResource
from twisted.cred import portal, checkers, credentials
from w3lib.url import path_to_file_uri

Expand Down Expand Up @@ -118,6 +120,35 @@ def render(self, request):
return request.requestHeaders.getRawHeaders(b"content-length")[0]


class ChunkedResource(resource.Resource):
    """Test resource whose body is sent with chunked transfer encoding.

    No ``Content-Length`` header is set and the body is written in two
    pieces, so Twisted emits it as a chunked HTTP/1.1 response.
    """

    def render(self, request):
        def write_body():
            for part in (b"chunked ", b"content\n"):
                request.write(part)
            request.finish()

        # Defer the writes so render() returns before the body is produced.
        reactor.callLater(0, write_body)
        return server.NOT_DONE_YET


class BrokenDownloadResource(resource.Resource):
    """Test resource that truncates its response body.

    It declares ``Content-Length: 20`` but sends only 7 bytes before
    dropping the connection, simulating a data-loss scenario.
    """

    def render(self, request):
        def abort_early():
            request.setHeader(b"Content-Length", b"20")
            request.write(b"partial")
            # We have to force a disconnection for HTTP/1.1 clients. Otherwise
            # client keeps the connection open waiting for more data.
            channel = request.channel
            if hasattr(channel, 'loseConnection'):
                # Twisted >= 16.3.0 exposes loseConnection on the channel.
                channel.loseConnection()
            else:
                channel.transport.loseConnection()
            request.finish()

        reactor.callLater(0, abort_early)
        return server.NOT_DONE_YET


class EmptyContentTypeHeaderResource(resource.Resource):
"""
A testing resource which renders itself as the value of request body
Expand Down Expand Up @@ -149,6 +180,7 @@ def setUp(self):
r.putChild(b"host", HostHeaderResource())
r.putChild(b"payload", PayloadResource())
r.putChild(b"broken", BrokenDownloadResource())
r.putChild(b"chunked", ChunkedResource())
r.putChild(b"contentlength", ContentLengthHeaderResource())
r.putChild(b"nocontenttype", EmptyContentTypeHeaderResource())
self.site = server.Site(r, timeout=None)
Expand Down Expand Up @@ -341,6 +373,44 @@ def test_download_with_large_maxsize_per_spider(self):
d.addCallback(self.assertEquals, b"0123456789")
return d

def test_download_chunked_content(self):
    """A chunked response must be reassembled into the complete body."""
    request = Request(self.getURL('chunked'))
    d = self.download_request(request, Spider('foo'))
    d.addCallback(
        lambda response: self.assertEquals(response.body, b"chunked content\n"))
    return d

def test_download_broken_content_cause_data_loss(self):
    """By default a truncated body must fail with ResponseFailed(_DataLoss)."""
    request = Request(self.getURL('broken'))
    d = self.download_request(request, Spider('foo'))

    def check_data_loss(failure):
        # Swallow the failure only when it is the expected ResponseFailed
        # carrying a _DataLoss reason; anything else propagates.
        is_dataloss = (
            failure.check(ResponseFailed)
            and any(r.check(_DataLoss) for r in failure.value.reasons)
        )
        return None if is_dataloss else failure

    # A successful download means the data loss went undetected.
    d.addCallback(lambda _: self.fail("No DataLoss exception"))
    d.addErrback(check_data_loss)
    return d

def test_download_broken_content_allow_data_loss(self):
    """Opting out via request.meta delivers the partial body, flagged 'dataloss'."""
    meta = {'download_fail_on_dataloss': False}
    request = Request(self.getURL('broken'), meta=meta)
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda response: self.assertEqual(response.flags, ['dataloss']))
    return d

def test_download_broken_content_allow_data_loss_via_setting(self):
    """Disabling DOWNLOAD_FAIL_ON_DATALOSS globally has the same effect as meta."""
    settings = Settings({'DOWNLOAD_FAIL_ON_DATALOSS': False})
    download_handler = self.download_handler_cls(settings)
    d = download_handler.download_request(
        Request(self.getURL('broken')), Spider('foo'))
    d.addCallback(lambda response: self.assertEqual(response.flags, ['dataloss']))
    return d


class Https11TestCase(Http11TestCase):
scheme = 'https'
Expand Down

0 comments on commit 96f5e25

Please sign in to comment.