Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allowed partial response on timeout if fail on dataloss is set to false #3492

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 14 additions & 1 deletion scrapy/core/downloader/handlers/http11.py
Expand Up @@ -334,11 +334,19 @@ def download_request(self, request):
# response body is ready to be consumed
d.addCallback(self._cb_bodyready, request)
d.addCallback(self._cb_bodydone, request, url)

fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
# if fail on dataloss is set to false, call the _cb_timedout callback instead of d.cancel to allow a partial response
callback = d.cancel if fail_on_dataloss else self._cb_timedout

# check download timeout
self._timeout_cl = reactor.callLater(timeout, d.cancel)
self._timeout_cl = reactor.callLater(timeout, callback)
d.addBoth(self._cb_timeout, request, url, timeout)
return d

def _cb_timedout(self):
self._txresponse._transport.stopProducing()

def _cb_timeout(self, result, request, url, timeout):
if self._timeout_cl.active():
self._timeout_cl.cancel()
Expand All @@ -348,6 +356,11 @@ def _cb_timeout(self, result, request, url, timeout):
if self._txresponse:
self._txresponse._transport.stopProducing()

fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
# return result if fail on dataloss is set to false
if not fail_on_dataloss:
return result

raise TimeoutError("Getting %s took longer than %s seconds." % (url, timeout))

def _cb_latency(self, result, request, start_time):
Expand Down
25 changes: 24 additions & 1 deletion tests/test_downloader_handlers.py
Expand Up @@ -188,6 +188,16 @@ def response():
return server.NOT_DONE_YET


class InfiniteResource(resource.Resource):
    """Resource that streams body data and never completes the response.

    After the headers are sent it writes 1024 chunks of 1024 ``b"x"``
    bytes and never calls ``request.finish()``, keeping the connection
    open until the client gives up (used to exercise download timeouts).
    """

    def render(self, request):
        def feed_body():
            chunk = b"x" * 1024
            for _ in range(1024):
                request.write(chunk)

        # Defer the body writes to the next reactor iteration, after the
        # headers have gone out.
        reactor.callLater(0, feed_body)
        return server.NOT_DONE_YET


class HttpTestCase(unittest.TestCase):

scheme = 'http'
Expand All @@ -209,6 +219,7 @@ def setUp(self):
r.putChild(b"host", HostHeaderResource())
r.putChild(b"payload", PayloadResource())
r.putChild(b"broken", BrokenDownloadResource())
r.putChild(b"infinite", InfiniteResource())
r.putChild(b"chunked", ChunkedResource())
r.putChild(b"broken-chunked", BrokenChunkedResource())
r.putChild(b"contentlength", ContentLengthHeaderResource())
Expand Down Expand Up @@ -343,6 +354,18 @@ def test_payload(self):
return d


class BodyOnInfiniteResponseTimeoutTestCase(HttpTestCase):
    """Partial responses must be returned on download timeout when
    ``download_fail_on_dataloss`` is disabled."""

    def test_response_body_on_no_dataloss(self):
        """A request to the never-finishing ``/infinite`` resource should,
        after ``download_timeout`` elapses, yield the body received so far
        (status 200, flagged 'dataloss') instead of raising TimeoutError.
        """
        def _check(response):
            self.assertEqual(response.status, 200)
            self.assertEqual(response.flags, ['dataloss'])

        spider = Spider('foo')
        meta = {'download_timeout': 5, 'download_fail_on_dataloss': False}
        request = Request(self.getURL('infinite'), meta=meta)
        return self.download_request(request, spider).addCallback(_check)


class DeprecatedHttpTestCase(HttpTestCase):
"""HTTP 1.0 test case"""
download_handler_cls = HttpDownloadHandler
Expand Down Expand Up @@ -506,6 +529,7 @@ def setUp(self):
super(Https11InvalidDNSId, self).setUp()
self.host = '127.0.0.1'


class Https11InvalidDNSPattern(Https11TestCase):
"""Connect to HTTPS hosts where the certificate are issued to an ip instead of a domain."""

Expand Down Expand Up @@ -662,7 +686,6 @@ def test_download_with_proxy_https_timeout(self):
timeout = yield self.assertFailure(d, error.TimeoutError)
self.assertIn(domain, timeout.osError)


class HttpDownloadHandlerMock(object):
def __init__(self, settings):
pass
Expand Down