Merge pull request #5798 from Gallaecio/no-callback
Implement a NO_CALLBACK value for Request.callback
kmike committed Jan 30, 2023
2 parents da15d93 + 78eaf06 commit b337c98
Showing 13 changed files with 98 additions and 34 deletions.
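
In short: requests created by Scrapy components (rather than by spiders) can now declare that they have no spider callback. A minimal sketch of the new usage, based on the NO_CALLBACK value this commit adds to scrapy.http.request (URL illustrative):

from scrapy import Request
from scrapy.http.request import NO_CALLBACK

# A request issued by a component, not by a spider. NO_CALLBACK makes
# that explicit: if anything ever invokes the callback, it raises
# RuntimeError instead of silently falling back to Spider.parse.
request = Request("https://example.com/robots.txt", callback=NO_CALLBACK)
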
7 changes: 5 additions & 2 deletions docs/topics/item-pipeline.rst
@@ -191,6 +191,7 @@ item.

import scrapy
from itemadapter import ItemAdapter
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.defer import maybe_deferred_to_future


@@ -204,8 +205,10 @@ item.
adapter = ItemAdapter(item)
encoded_item_url = quote(adapter["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url)
response = await maybe_deferred_to_future(spider.crawler.engine.download(request, spider))
request = scrapy.Request(screenshot_url, callback=NO_CALLBACK)
response = await maybe_deferred_to_future(
spider.crawler.engine.download(request, spider)
)

if response.status != 200:
# Error happened, return item.
43 changes: 31 additions & 12 deletions docs/topics/request-response.rst
@@ -32,11 +32,22 @@ Request objects
:type url: str

:param callback: the function that will be called with the response of this
request (once it's downloaded) as its first parameter. For more information
see :ref:`topics-request-response-ref-request-callback-arguments` below.
If a Request doesn't specify a callback, the spider's
:meth:`~scrapy.Spider.parse` method will be used.
Note that if exceptions are raised during processing, errback is called instead.
request (once it's downloaded) as its first parameter.

In addition to a function, the following values are supported:

- ``None`` (default), which indicates that the spider's
:meth:`~scrapy.Spider.parse` method must be used.

- :py:data:`scrapy.http.request.NO_CALLBACK`

.. autodata:: scrapy.http.request.NO_CALLBACK

For more information, see
:ref:`topics-request-response-ref-request-callback-arguments`.

.. note:: If exceptions are raised during processing, ``errback`` is
called instead.

:type callback: collections.abc.Callable
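
To make the accepted callback values concrete, a short hedged sketch (spider class and URLs are illustrative, not from this commit):

from scrapy import Request, Spider
from scrapy.http.request import NO_CALLBACK

class ExampleSpider(Spider):
    name = "example"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # callback=None (default): this parse() method gets the response
        yield Request("https://example.com/a")
        # explicit callable: parse_item() gets the response
        yield Request("https://example.com/b", callback=self.parse_item)

    def parse_item(self, response):
        yield {"url": response.url}

# NO_CALLBACK is for component-issued requests, not spider requests:
component_request = Request("https://example.com/c", callback=NO_CALLBACK)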

@@ -69,16 +80,24 @@ Request objects

1. Using a dict::

request_with_cookies = Request(url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'})
request_with_cookies = Request(
url="http://www.example.com",
cookies={'currency': 'USD', 'country': 'UY'},
)

2. Using a list of dicts::

request_with_cookies = Request(url="http://www.example.com",
cookies=[{'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency'}])
request_with_cookies = Request(
url="http://www.example.com",
cookies=[
{
'name': 'currency',
'value': 'USD',
'domain': 'example.com',
'path': '/currency',
},
],
)

The latter form allows for customizing the ``domain`` and ``path``
attributes of the cookie. This is only useful if the cookies are saved
2 changes: 2 additions & 0 deletions scrapy/downloadermiddlewares/robotstxt.py
@@ -10,6 +10,7 @@

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object
@@ -72,6 +73,7 @@ def robot_parser(self, request, spider):
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={"dont_obey_robotstxt": True},
callback=NO_CALLBACK,
)
dfd = self.crawler.engine.download(robotsreq)
dfd.addCallback(self._parse_robots, netloc, spider)
22 changes: 20 additions & 2 deletions scrapy/http/request/__init__.py
@@ -20,6 +20,24 @@
RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")


def NO_CALLBACK(*args, **kwargs):
"""When assigned to the ``callback`` parameter of
:class:`~scrapy.http.Request`, it indicates that the request is not meant
to have a spider callback at all.
This value should be used by :ref:`components <topics-components>` that
create and handle their own requests, e.g. through
:meth:`scrapy.core.engine.ExecutionEngine.download`, so that downloader
middlewares handling such requests can treat them differently from requests
intended for the :meth:`~scrapy.Spider.parse` callback.
"""
raise RuntimeError(
"The NO_CALLBACK callback has been called. This is a special callback "
"value intended for requests whose callback is never meant to be "
"called."
)
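
To illustrate the docstring above, here is a minimal, hypothetical downloader middleware (not part of this commit; name and header are illustrative) that treats component requests differently from spider-bound ones:

from scrapy.http.request import NO_CALLBACK

class SpiderRequestsOnlyMiddleware:
    """Hypothetical middleware: leaves component requests alone and only
    touches requests whose responses are headed for a spider callback."""

    def process_request(self, request, spider):
        if request.callback is NO_CALLBACK:
            return None  # component request: pass through untouched
        request.headers.setdefault("X-From-Spider", spider.name)
        return None
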


class Request(object_ref):
"""Represents an HTTP request, which is usually generated in a Spider and
executed by the Downloader, thus generating a :class:`Response`.
@@ -72,11 +90,11 @@ def __init__(
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority

if callback is not None and not callable(callback):
if not (callable(callback) or callback is None):
raise TypeError(
f"callback must be a callable, got {type(callback).__name__}"
)
if errback is not None and not callable(errback):
if not (callable(errback) or errback is None):
raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
self.callback = callback
self.errback = errback
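
A quick check of the validation above; per the new condition, any non-callable, non-None value raises TypeError (sketch):

from scrapy import Request

try:
    Request("https://example.com", callback="parse")
except TypeError as error:
    print(error)  # callback must be a callable, got str
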
3 changes: 2 additions & 1 deletion scrapy/pipelines/files.py
@@ -22,6 +22,7 @@

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.boto import is_botocore_available
@@ -516,7 +517,7 @@ def inc_stats(self, spider, status):
# Overridable Interface
def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u) for u in urls]
return [Request(u, callback=NO_CALLBACK) for u in urls]

def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
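
Subclasses that override get_media_requests should now preserve callback=NO_CALLBACK; a hedged sketch of a hypothetical override (class name and filtering are illustrative):

from itemadapter import ItemAdapter
from scrapy import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FilesPipeline

class PdfFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        # Keep NO_CALLBACK so media requests are never routed to a
        # spider callback by other components.
        return [
            Request(u, callback=NO_CALLBACK)
            for u in urls
            if u.endswith(".pdf")
        ]
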
3 changes: 2 additions & 1 deletion scrapy/pipelines/images.py
@@ -13,6 +13,7 @@

from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FileException, FilesPipeline

# TODO: from scrapy.pipelines.media import MediaPipeline
@@ -214,7 +215,7 @@ def convert_image(self, image, size=None, response_body=None):

def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.images_urls_field, [])
return [Request(u) for u in urls]
return [Request(u, callback=NO_CALLBACK) for u in urls]

def item_completed(self, results, item, info):
with suppress(KeyError):
12 changes: 10 additions & 2 deletions scrapy/pipelines/media.py
@@ -7,6 +7,7 @@
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure

from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.defer import defer_result, mustbe_deferred
@@ -17,6 +18,10 @@
logger = logging.getLogger(__name__)


def _DUMMY_CALLBACK(response):
return response


class MediaPipeline:

LOG_FAILED_RESULTS = True
@@ -90,9 +95,12 @@ def process_item(self, item, spider):

def _process_request(self, request, info, item):
fp = self._fingerprinter.fingerprint(request)
cb = request.callback or (lambda _: _)
if not request.callback or request.callback is NO_CALLBACK:
cb = _DUMMY_CALLBACK
else:
cb = request.callback
eb = request.errback
request.callback = None
request.callback = NO_CALLBACK
request.errback = None

# Return cached result if request was already seen
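
The net effect of _process_request, sketched in isolation (simplified; not the pipeline's public API): the user-facing callback is detached before download, so everything downstream only ever sees NO_CALLBACK on media requests:

from scrapy import Request
from scrapy.http.request import NO_CALLBACK

def _identity(response):  # stands in for _DUMMY_CALLBACK above
    return response

def detach_callback(request):
    """Simplified extract of the logic in _process_request."""
    if not request.callback or request.callback is NO_CALLBACK:
        cb = _identity
    else:
        cb = request.callback
    request.callback = NO_CALLBACK
    return cb

request = Request("https://example.com/file.bin")
cb = detach_callback(request)
assert request.callback is NO_CALLBACK and cb is _identity
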
8 changes: 8 additions & 0 deletions tests/test_downloadermiddleware_robotstxt.py
@@ -9,6 +9,7 @@
from scrapy.downloadermiddlewares.robotstxt import logger as mw_module_logger
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from tests.test_robotstxt_interface import reppy_available, rerp_available

@@ -57,6 +58,7 @@ def test_robotstxt(self):
return DeferredList(
[
self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
maybeDeferred(self.assertRobotsTxtRequested, "http://site.local"),
self.assertIgnored(Request("http://site.local/admin/main"), middleware),
self.assertIgnored(Request("http://site.local/static/"), middleware),
self.assertIgnored(
@@ -238,6 +240,12 @@ def assertIgnored(self, request, middleware):
maybeDeferred(middleware.process_request, request, spider), IgnoreRequest
)

def assertRobotsTxtRequested(self, base_url):
calls = self.crawler.engine.download.call_args_list
request = calls[0][0][0]
self.assertEqual(request.url, f"{base_url}/robots.txt")
self.assertEqual(request.callback, NO_CALLBACK)


class RobotsTxtMiddlewareWithRerpTest(RobotsTxtMiddlewareTest):
if not rerp_available():
13 changes: 13 additions & 0 deletions tests/test_http_request.py
@@ -14,6 +14,7 @@
Request,
XmlRpcRequest,
)
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.python import to_bytes, to_unicode


@@ -310,6 +311,14 @@ def a_function():
self.assertIs(r4.callback, a_function)
self.assertIs(r4.errback, a_function)

r5 = self.request_class(
url="http://example.com",
callback=NO_CALLBACK,
errback=NO_CALLBACK,
)
self.assertIs(r5.callback, NO_CALLBACK)
self.assertIs(r5.errback, NO_CALLBACK)

def test_callback_and_errback_type(self):
with self.assertRaises(TypeError):
self.request_class("http://example.com", callback="a_function")
@@ -322,6 +331,10 @@ def test_callback_and_errback_type(self):
errback="a_function",
)

def test_no_callback(self):
with self.assertRaises(RuntimeError):
NO_CALLBACK()
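
Because calling NO_CALLBACK raises RuntimeError (as the test above verifies), code that may receive it should compare by identity rather than invoke it blindly; a hedged sketch (the helper is hypothetical):

from scrapy.http.request import NO_CALLBACK

def deliver(request, response):
    # Only invoke real callbacks; NO_CALLBACK raises if actually called.
    if request.callback is None or request.callback is NO_CALLBACK:
        return response
    return request.callback(response)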

def test_from_curl(self):
# Note: more curated tests regarding curl conversion are in
# `test_utils_curl.py`
5 changes: 1 addition & 4 deletions tests/test_pipeline_files.py
@@ -33,10 +33,7 @@
skip_if_no_boto,
)


def _mocked_download_func(request, info):
response = request.meta.get("response")
return response() if callable(response) else response
from .test_pipeline_media import _mocked_download_func


class FilesPipelineTestCase(unittest.TestCase):
9 changes: 1 addition & 8 deletions tests/test_pipeline_images.py
@@ -32,20 +32,13 @@
skip_pillow = None


def _mocked_download_func(request, info):
response = request.meta.get("response")
return response() if callable(response) else response


class ImagesPipelineTestCase(unittest.TestCase):

skip = skip_pillow

def setUp(self):
self.tempdir = mkdtemp()
self.pipeline = ImagesPipeline(
self.tempdir, download_func=_mocked_download_func
)
self.pipeline = ImagesPipeline(self.tempdir)

def tearDown(self):
rmtree(self.tempdir)
2 changes: 2 additions & 0 deletions tests/test_pipeline_media.py
@@ -9,6 +9,7 @@

from scrapy import signals
from scrapy.http import Request, Response
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FileException
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.media import MediaPipeline
@@ -30,6 +31,7 @@


def _mocked_download_func(request, info):
assert request.callback is NO_CALLBACK
response = request.meta.get("response")
return response() if callable(response) else response

3 changes: 1 addition & 2 deletions tox.ini
@@ -4,7 +4,7 @@
# and then run "tox" from this directory.

[tox]
envlist = security,flake8,py,black
envlist = security,flake8,black,typing,py
minversion = 1.7.0

[testenv]
@@ -201,4 +201,3 @@ deps =
black==22.12.0
commands =
black {posargs:--check .}
