Conditional request attribute binding for responses (#4632)
elacuesta committed Aug 17, 2020
1 parent acb3b44 commit 2aa4f3c
Showing 5 changed files with 250 additions and 35 deletions.
5 changes: 5 additions & 0 deletions docs/topics/signals.rst
@@ -423,6 +423,11 @@ response_received
     :param spider: the spider for which the response is intended
     :type spider: :class:`~scrapy.spiders.Spider` object

+    .. note:: The ``request`` argument might not contain the original request that
+        reached the downloader, if a :ref:`topics-downloader-middleware` modifies
+        the :class:`~scrapy.http.Response` object and sets a specific ``request``
+        attribute.
+
 response_downloaded
 ~~~~~~~~~~~~~~~~~~~

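For illustration, a minimal sketch (the middleware name is hypothetical) of the situation the note above describes: a downloader middleware that returns a response whose ``request`` attribute points to a request other than the one that reached the downloader, so ``response_received`` handlers and the spider callback see the replaced request. It mirrors the ``ProcessResponseMiddleware`` used in the tests below.

from scrapy import Request

class ReplaceRequestMiddleware:
    # Hypothetical downloader middleware: bind the response to a
    # different Request than the one that was actually downloaded.
    def process_response(self, request, response, spider):
        return response.replace(request=Request("https://example.org"))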
13 changes: 9 additions & 4 deletions scrapy/core/engine.py
@@ -243,12 +243,17 @@ def _on_success(response):
                     % (type(response), response)
                 )
             if isinstance(response, Response):
-                response.request = request  # tie request to response received
-                logkws = self.logformatter.crawled(request, response, spider)
+                if response.request is None:
+                    response.request = request
+                logkws = self.logformatter.crawled(response.request, response, spider)
                 if logkws is not None:
                     logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
-                self.signals.send_catch_log(signals.response_received,
-                                            response=response, request=request, spider=spider)
+                self.signals.send_catch_log(
+                    signal=signals.response_received,
+                    response=response,
+                    request=response.request,
+                    spider=spider,
+                )
             return response

         def _on_complete(_):
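A sketch of how the engine change above is observable: ``response_received`` now passes ``response.request`` as the ``request`` argument, so with a middleware like the one sketched earlier installed, a handler sees the replaced request. The handler signature follows the documented signal; the function name is hypothetical.

def on_response_received(response, request, spider):
    # With a request-replacing middleware installed, ``request`` here is
    # the replaced request (https://example.org), not the downloaded one.
    spider.logger.info("response_received request: %s", request.url)

# Connected when setting up the crawler, as in the tests below:
# crawler.signals.connect(on_response_received, signal=signals.response_received)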
62 changes: 31 additions & 31 deletions scrapy/core/scraper.py
@@ -12,7 +12,7 @@
 from scrapy.core.spidermw import SpiderMiddlewareManager
 from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
 from scrapy.http import Request, Response
-from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
+from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
 from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
 from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
 from scrapy.utils.spider import iterate_spider_output
@@ -120,40 +120,40 @@ def _scrape_next(self, spider, slot):
         response, request, deferred = slot.next_response_request_deferred()
         self._scrape(response, request, spider).chainDeferred(deferred)

-    def _scrape(self, response, request, spider):
-        """Handle the downloaded response or failure through the spider
-        callback/errback"""
-        if not isinstance(response, (Response, Failure)):
-            raise TypeError(
-                "Incorrect type: expected Response or Failure, got %s: %r"
-                % (type(response), response)
-            )
-
-        dfd = self._scrape2(response, request, spider)  # returns spider's processed output
-        dfd.addErrback(self.handle_spider_error, request, response, spider)
-        dfd.addCallback(self.handle_spider_output, request, response, spider)
+    def _scrape(self, result, request, spider):
+        """
+        Handle the downloaded response or failure through the spider callback/errback
+        """
+        if not isinstance(result, (Response, Failure)):
+            raise TypeError("Incorrect type: expected Response or Failure, got %s: %r" % (type(result), result))
+        dfd = self._scrape2(result, request, spider)  # returns spider's processed output
+        dfd.addErrback(self.handle_spider_error, request, result, spider)
+        dfd.addCallback(self.handle_spider_output, request, result, spider)
         return dfd

-    def _scrape2(self, request_result, request, spider):
-        """Handle the different cases of request's result been a Response or a
-        Failure"""
-        if not isinstance(request_result, Failure):
-            return self.spidermw.scrape_response(
-                self.call_spider, request_result, request, spider)
-        else:
-            dfd = self.call_spider(request_result, request, spider)
-            return dfd.addErrback(
-                self._log_download_errors, request_result, request, spider)
+    def _scrape2(self, result, request, spider):
+        """
+        Handle the different cases of a request's result being a Response or a Failure
+        """
+        if isinstance(result, Response):
+            return self.spidermw.scrape_response(self.call_spider, result, request, spider)
+        else:  # result is a Failure
+            dfd = self.call_spider(result, request, spider)
+            return dfd.addErrback(self._log_download_errors, result, request, spider)

     def call_spider(self, result, request, spider):
-        result.request = request
-        dfd = defer_result(result)
-        callback = request.callback or spider._parse
-        warn_on_generator_with_return_value(spider, callback)
-        warn_on_generator_with_return_value(spider, request.errback)
-        dfd.addCallbacks(callback=callback,
-                         errback=request.errback,
-                         callbackKeywords=request.cb_kwargs)
+        if isinstance(result, Response):
+            if getattr(result, "request", None) is None:
+                result.request = request
+            callback = result.request.callback or spider._parse
+            warn_on_generator_with_return_value(spider, callback)
+            dfd = defer_succeed(result)
+            dfd.addCallback(callback, **result.request.cb_kwargs)
+        else:  # result is a Failure
+            result.request = request
+            warn_on_generator_with_return_value(spider, request.errback)
+            dfd = defer_fail(result)
+            dfd.addErrback(request.errback)
         return dfd.addCallback(iterate_spider_output)

     def handle_spider_error(self, _failure, request, response, spider):
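On the failure path, ``call_spider()`` above still binds the original request unconditionally and takes the errback from it. A minimal sketch (hypothetical spider) of where that shows up:

from scrapy import Request, Spider

class ErrbackSketchSpider(Spider):
    # Hypothetical spider: only illustrates the Failure branch above.
    name = "errback_sketch"

    def start_requests(self):
        yield Request("https://example.com", callback=self.parse, errback=self.on_error)

    def parse(self, response):
        pass

    def on_error(self, failure):
        # call_spider() set result.request = request before invoking this
        # errback, so the original request is always available here.
        self.logger.warning("download failed for %s", failure.request.url)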
3 changes: 3 additions & 0 deletions setup.cfg
@@ -130,6 +130,9 @@ ignore_errors = True
 [mypy-tests.test_pipelines]
 ignore_errors = True

+[mypy-tests.test_request_attribute_binding]
+ignore_errors = True
+
 [mypy-tests.test_request_cb_kwargs]
 ignore_errors = True

202 changes: 202 additions & 0 deletions tests/test_request_attribute_binding.py
@@ -0,0 +1,202 @@
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy import Request, signals
from scrapy.crawler import CrawlerRunner
from scrapy.http.response import Response

from testfixtures import LogCapture

from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider


OVERRIDEN_URL = "https://example.org"


class ProcessResponseMiddleware:
    def process_response(self, request, response, spider):
        return response.replace(request=Request(OVERRIDEN_URL))


class RaiseExceptionRequestMiddleware:
    def process_request(self, request, spider):
        1 / 0
        return request


class CatchExceptionOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
            request=Request(OVERRIDEN_URL),
        )


class CatchExceptionDoNotOverrideRequestMiddleware:
    def process_exception(self, request, exception, spider):
        return Response(
            url="http://localhost/",
            body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
        )


class AlternativeCallbacksSpider(SingleRequestSpider):
    name = "alternative_callbacks_spider"

    def alt_callback(self, response, foo=None):
        self.logger.info("alt_callback was invoked with foo=%s", foo)


class AlternativeCallbacksMiddleware:
    def process_response(self, request, response, spider):
        new_request = request.replace(
            url=OVERRIDEN_URL,
            callback=spider.alt_callback,
            cb_kwargs={"foo": "bar"},
        )
        return response.replace(request=new_request)


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_response_200(self):
        url = self.mockserver.url("/status?n=200")
        crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_response_error(self):
        for status in ("404", "500"):
            url = self.mockserver.url("/status?n={}".format(status))
            crawler = CrawlerRunner().create_crawler(SingleRequestSpider)
            yield crawler.crawl(seed=url, mockserver=self.mockserver)
            failure = crawler.spider.meta["failure"]
            response = failure.value.response
            self.assertEqual(failure.request.url, url)
            self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_raise_exception(self):
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        failure = crawler.spider.meta["failure"]
        self.assertEqual(failure.request.url, url)
        self.assertIsInstance(failure.value, ZeroDivisionError)

    @defer.inlineCallbacks
    def test_downloader_middleware_override_request_in_process_response(self):
        """
        Downloader middleware which returns a response with a specific 'request' attribute.
        * The spider callback should receive the overridden response.request
        * Handlers listening to the response_received signal should receive the overridden response.request
        * The "crawled" log message should show the overridden response.request
        """
        signal_params = {}

        def signal_handler(response, request, spider):
            signal_params["response"] = response
            signal_params["request"] = request

        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".ProcessResponseMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        crawler.signals.connect(signal_handler, signal=signals.response_received)

        with LogCapture() as log:
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.request.url, OVERRIDEN_URL)

        self.assertEqual(signal_params["response"].url, url)
        self.assertEqual(signal_params["request"].url, OVERRIDEN_URL)

        log.check_present(
            ("scrapy.core.engine", "DEBUG", "Crawled (200) <GET {}> (referer: None)".format(OVERRIDEN_URL)),
        )

    @defer.inlineCallbacks
    def test_downloader_middleware_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response with a specific 'request' attribute.
        The spider callback should receive the overridden response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, OVERRIDEN_URL)

    @defer.inlineCallbacks
    def test_downloader_middleware_do_not_override_in_process_exception(self):
        """
        An exception is raised but caught by the next middleware, which
        returns a Response without a specific 'request' attribute.
        The spider callback should receive the original response.request
        """
        url = self.mockserver.url("/status?n=200")
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".RaiseExceptionRequestMiddleware": 590,
                __name__ + ".CatchExceptionDoNotOverrideRequestMiddleware": 595,
            },
        })
        crawler = runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        response = crawler.spider.meta["responses"][0]
        self.assertEqual(response.body, b"Caught ZeroDivisionError")
        self.assertEqual(response.request.url, url)

    @defer.inlineCallbacks
    def test_downloader_middleware_alternative_callback(self):
        """
        Downloader middleware which returns a response with a
        specific 'request' attribute, with an alternative callback
        """
        runner = CrawlerRunner(settings={
            "DOWNLOADER_MIDDLEWARES": {
                __name__ + ".AlternativeCallbacksMiddleware": 595,
            }
        })
        crawler = runner.create_crawler(AlternativeCallbacksSpider)

        with LogCapture() as log:
            url = self.mockserver.url("/status?n=200")
            yield crawler.crawl(seed=url, mockserver=self.mockserver)

        log.check_present(
            ("alternative_callbacks_spider", "INFO", "alt_callback was invoked with foo=bar"),
        )
