Merge pull request #5798 from Gallaecio/no-callback
Implement a NO_CALLBACK value for Request.callback
kmike committed Jan 30, 2023
2 parents da15d93 + 78eaf06 commit b337c98
Showing 13 changed files with 98 additions and 34 deletions.
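
In short: requests created by Scrapy components (rather than by spiders) can now declare that they have no spider callback. A minimal sketch of the new usage, based on the NO_CALLBACK value this commit adds to scrapy.http.request (URL illustrative):

from scrapy import Request
from scrapy.http.request import NO_CALLBACK

# A request issued by a component, not by a spider. NO_CALLBACK makes
# that explicit: if anything ever invokes the callback, it raises
# RuntimeError instead of silently falling back to Spider.parse.
request = Request("https://example.com/robots.txt", callback=NO_CALLBACK)
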
7 changes: 5 additions & 2 deletions docs/topics/item-pipeline.rst
@@ -191,6 +191,7 @@ item.

import scrapy
from itemadapter import ItemAdapter
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.defer import maybe_deferred_to_future


@@ -204,8 +205,10 @@ item.
adapter = ItemAdapter(item)
encoded_item_url = quote(adapter["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url)
response = await maybe_deferred_to_future(spider.crawler.engine.download(request, spider))
request = scrapy.Request(screenshot_url, callback=NO_CALLBACK)
response = await maybe_deferred_to_future(
spider.crawler.engine.download(request, spider)
)

if response.status != 200:
# Error happened, return item.
43 changes: 31 additions & 12 deletions docs/topics/request-response.rst
@@ -32,11 +32,22 @@ Request objects
:type url: str

:param callback: the function that will be called with the response of this
request (once it's downloaded) as its first parameter. For more information
see :ref:`topics-request-response-ref-request-callback-arguments` below.
If a Request doesn't specify a callback, the spider's
:meth:`~scrapy.Spider.parse` method will be used.
Note that if exceptions are raised during processing, errback is called instead.
request (once it's downloaded) as its first parameter.

In addition to a function, the following values are supported:

- ``None`` (default), which indicates that the spider's
:meth:`~scrapy.Spider.parse` method must be used.

- :py:data:`scrapy.http.request.NO_CALLBACK`

.. autodata:: scrapy.http.request.NO_CALLBACK

For more information, see
:ref:`topics-request-response-ref-request-callback-arguments`.

.. note:: If exceptions are raised during processing, ``errback`` is
called instead.

:type callback: collections.abc.Callable
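
To make the accepted callback values concrete, a short hedged sketch (spider class and URLs are illustrative, not from this commit):

from scrapy import Request, Spider
from scrapy.http.request import NO_CALLBACK

class ExampleSpider(Spider):
    name = "example"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # callback=None (default): this parse() method gets the response
        yield Request("https://example.com/a")
        # explicit callable: parse_item() gets the response
        yield Request("https://example.com/b", callback=self.parse_item)

    def parse_item(self, response):
        yield {"url": response.url}

# NO_CALLBACK is for component-issued requests, not spider requests:
component_request = Request("https://example.com/c", callback=NO_CALLBACK)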

@@ -69,16 +80,24 @@ Request objects

1. Using a dict::

request_with_cookies = Request(url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'})
request_with_cookies = Request(
url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'},
)

2. Using a list of dicts::

request_with_cookies = Request(url="http://www.example.com",
cookies=[{'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency'}])
request_with_cookies = Request(
url="http://www.example.com",
cookies=[
{
'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency',
},
],
)

The latter form allows for customizing the ``domain`` and ``path``
attributes of the cookie. This is only useful if the cookies are saved
2 changes: 2 additions & 0 deletions scrapy/downloadermiddlewares/robotstxt.py
@@ -10,6 +10,7 @@

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object
@@ -72,6 +73,7 @@ def robot_parser(self, request, spider):
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={"dont_obey_robotstxt": True},
callback=NO_CALLBACK,
)
dfd = self.crawler.engine.download(robotsreq)
dfd.addCallback(self._parse_robots, netloc, spider)
22 changes: 20 additions & 2 deletions scrapy/http/request/__init__.py
@@ -20,6 +20,24 @@
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")


def NO_CALLBACK(*args, **kwargs):
"""When assigned to the ``callback`` parameter of
:class:`~scrapy.http.Request`, it indicates that the request is not meant
to have a spider callback at all.
This value should be used by :ref:`components <topics-components>` that
create and handle their own requests, e.g. through
:meth:`scrapy.core.engine.ExecutionEngine.download`, so that downloader
middlewares handling such requests can treat them differently from requests
intended for the :meth:`~scrapy.Spider.parse` callback.
"""
raise RuntimeError(
"The NO_CALLBACK callback has been called. This is a special callback "
"value intended for requests whose callback is never meant to be "
"called."
)
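
To illustrate the docstring above, here is a minimal, hypothetical downloader middleware (not part of this commit; name and header are illustrative) that treats component requests differently from spider-bound ones:

from scrapy.http.request import NO_CALLBACK

class SpiderRequestsOnlyMiddleware:
    """Hypothetical middleware: leaves component requests alone and only
    touches requests whose responses are headed for a spider callback."""

    def process_request(self, request, spider):
        if request.callback is NO_CALLBACK:
            return None  # component request: pass through untouched
        request.headers.setdefault("X-From-Spider", spider.name)
        return None
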


class Request(object_ref):
"""Represents an HTTP request, which is usually generated in a Spider and
executed by the Downloader, thus generating a :class:`Response`.
@@ -72,11 +90,11 @@ def __init__(
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority

if callback is not None and not callable(callback):
if not (callable(callback) or callback is None):
raise TypeError(
f"callback must be a callable, got {type(callback).__name__}"
)
if errback is not None and not callable(errback):
if not (callable(errback) or errback is None):
raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
self.callback = callback
self.errback = errback
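
A quick check of the validation above; per the new condition, any non-callable, non-None value raises TypeError (sketch):

from scrapy import Request

try:
    Request("https://example.com", callback="parse")
except TypeError as error:
    print(error)  # callback must be a callable, got str
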
3 changes: 2 additions & 1 deletion scrapy/pipelines/files.py
@@ -22,6 +22,7 @@

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.boto import is_botocore_available
@@ -516,7 +517,7 @@ def inc_stats(self, spider, status):
# Overridable Interface
def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u) for u in urls]
return [Request(u, callback=NO_CALLBACK) for u in urls]

def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
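
Subclasses that override get_media_requests should now preserve callback=NO_CALLBACK; a hedged sketch of a hypothetical override (class name and filtering are illustrative):

from itemadapter import ItemAdapter
from scrapy import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FilesPipeline

class PdfFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        # Keep NO_CALLBACK so media requests are never routed to a
        # spider callback by other components.
        return [
            Request(u, callback=NO_CALLBACK)
            for u in urls
            if u.endswith(".pdf")
        ]
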
3 changes: 2 additions & 1 deletion scrapy/pipelines/images.py
@@ -13,6 +13,7 @@

from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FileException, FilesPipeline

# TODO: from scrapy.pipelines.media import MediaPipeline
@@ -214,7 +215,7 @@ def convert_image(self, image, size=None, response_body=None):

def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.images_urls_field, [])
return [Request(u) for u in urls]
return [Request(u, callback=NO_CALLBACK) for u in urls]

def item_completed(self, results, item, info):
with suppress(KeyError):
12 changes: 10 additions & 2 deletions scrapy/pipelines/media.py
@@ -7,6 +7,7 @@
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure

from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.defer import defer_result, mustbe_deferred
@@ -17,6 +18,10 @@
logger = logging.getLogger(__name__)


def _DUMMY_CALLBACK(response):
return response


class MediaPipeline:

LOG_FAILED_RESULTS = True
@@ -90,9 +95,12 @@ def process_item(self, item, spider):

def _process_request(self, request, info, item):
fp = self._fingerprinter.fingerprint(request)
cb = request.callback or (lambda _: _)
if not request.callback or request.callback is NO_CALLBACK:
cb = _DUMMY_CALLBACK
else:
cb = request.callback
eb = request.errback
request.callback = None
request.callback = NO_CALLBACK
request.errback = None

# Return cached result if request was already seen
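
The net effect of _process_request, sketched in isolation (simplified; not the pipeline's public API): the user-facing callback is detached before download, so everything downstream only ever sees NO_CALLBACK on media requests:

from scrapy import Request
from scrapy.http.request import NO_CALLBACK

def _identity(response):  # stands in for _DUMMY_CALLBACK above
    return response

def detach_callback(request):
    """Simplified extract of the logic in _process_request."""
    if not request.callback or request.callback is NO_CALLBACK:
        cb = _identity
    else:
        cb = request.callback
    request.callback = NO_CALLBACK
    return cb

request = Request("https://example.com/file.bin")
cb = detach_callback(request)
assert request.callback is NO_CALLBACK and cb is _identity
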
8 changes: 8 additions & 0 deletions tests/test_downloadermiddleware_robotstxt.py
@@ -9,6 +9,7 @@
from scrapy.downloadermiddlewares.robotstxt import logger as mw_module_logger
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from tests.test_robotstxt_interface import reppy_available, rerp_available

@@ -57,6 +58,7 @@ def test_robotstxt(self):
return DeferredList(
[
self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
maybeDeferred(self.assertRobotsTxtRequested, "http://site.local"),
self.assertIgnored(Request("http://site.local/admin/main"), middleware),
self.assertIgnored(Request("http://site.local/static/"), middleware),
self.assertIgnored(
@@ -238,6 +240,12 @@ def assertIgnored(self, request, middleware):
maybeDeferred(middleware.process_request, request, spider), IgnoreRequest
)

def assertRobotsTxtRequested(self, base_url):
calls = self.crawler.engine.download.call_args_list
request = calls[0][0][0]
self.assertEqual(request.url, f"{base_url}/robots.txt")
self.assertEqual(request.callback, NO_CALLBACK)


class RobotsTxtMiddlewareWithRerpTest(RobotsTxtMiddlewareTest):
if not rerp_available():
13 changes: 13 additions & 0 deletions tests/test_http_request.py
@@ -14,6 +14,7 @@
Request,
XmlRpcRequest,
)
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.python import to_bytes, to_unicode


@@ -310,6 +311,14 @@ def a_function():
self.assertIs(r4.callback, a_function)
self.assertIs(r4.errback, a_function)

r5 = self.request_class(
url="http://example.com",
callback=NO_CALLBACK,
errback=NO_CALLBACK,
)
self.assertIs(r5.callback, NO_CALLBACK)
self.assertIs(r5.errback, NO_CALLBACK)

def test_callback_and_errback_type(self):
with self.assertRaises(TypeError):
self.request_class("http://example.com", callback="a_function")
@@ -322,6 +331,10 @@ def test_callback_and_errback_type(self):
errback="a_function",
)

def test_no_callback(self):
with self.assertRaises(RuntimeError):
NO_CALLBACK()
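
Because calling NO_CALLBACK raises RuntimeError (as the test above verifies), code that may receive it should compare by identity rather than invoke it blindly; a hedged sketch (the helper is hypothetical):

from scrapy.http.request import NO_CALLBACK

def deliver(request, response):
    # Only invoke real callbacks; NO_CALLBACK raises if actually called.
    if request.callback is None or request.callback is NO_CALLBACK:
        return response
    return request.callback(response)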

def test_from_curl(self):
# Note: more curated tests regarding curl conversion are in
# `test_utils_curl.py`
5 changes: 1 addition & 4 deletions tests/test_pipeline_files.py
@@ -33,10 +33,7 @@
skip_if_no_boto,
)


def _mocked_download_func(request, info):
response = request.meta.get("response")
return response() if callable(response) else response
from .test_pipeline_media import _mocked_download_func


class FilesPipelineTestCase(unittest.TestCase):
9 changes: 1 addition & 8 deletions tests/test_pipeline_images.py
@@ -32,20 +32,13 @@
skip_pillow = None


def _mocked_download_func(request, info):
response = request.meta.get("response")
return response() if callable(response) else response


class ImagesPipelineTestCase(unittest.TestCase):

skip = skip_pillow

def setUp(self):
self.tempdir = mkdtemp()
self.pipeline = ImagesPipeline(
self.tempdir, download_func=_mocked_download_func
)
self.pipeline = ImagesPipeline(self.tempdir)

def tearDown(self):
rmtree(self.tempdir)
2 changes: 2 additions & 0 deletions tests/test_pipeline_media.py
@@ -9,6 +9,7 @@

from scrapy import signals
from scrapy.http import Request, Response
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FileException
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.media import MediaPipeline
@@ -30,6 +31,7 @@


def _mocked_download_func(request, info):
assert request.callback is NO_CALLBACK
response = request.meta.get("response")
return response() if callable(response) else response

3 changes: 1 addition & 2 deletions tox.ini
@@ -4,7 +4,7 @@
# and then run "tox" from this directory.

[tox]
envlist = security,flake8,py,black
envlist = security,flake8,black,typing,py
minversion = 1.7.0

[testenv]
@@ -201,4 +201,3 @@ deps =
black==22.12.0
commands =
black {posargs:--check .}
