
Implement a NO_CALLBACK value for Request.callback #5798

Merged · 15 commits · Jan 30, 2023
7 changes: 5 additions & 2 deletions docs/topics/item-pipeline.rst
@@ -191,6 +191,7 @@ item.

     import scrapy
     from itemadapter import ItemAdapter
+    from scrapy.http.request import NO_CALLBACK
     from scrapy.utils.defer import maybe_deferred_to_future

@@ -204,8 +205,10 @@ item.
             adapter = ItemAdapter(item)
             encoded_item_url = quote(adapter["url"])
             screenshot_url = self.SPLASH_URL.format(encoded_item_url)
-            request = scrapy.Request(screenshot_url)
-            response = await maybe_deferred_to_future(spider.crawler.engine.download(request, spider))
+            request = scrapy.Request(screenshot_url, callback=NO_CALLBACK)
+            response = await maybe_deferred_to_future(
+                spider.crawler.engine.download(request, spider)
+            )

             if response.status != 200:
                 # Error happened, return item.
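For context, the documented pipeline pattern that this hunk modifies looks roughly as follows when written out in full. This is a sketch reconstructed around the hunks above: the ScreenshotPipeline name, the SPLASH_URL value, and the screenshot-saving tail follow the surrounding Scrapy docs example and are assumptions here, not part of this diff.

    import hashlib
    from urllib.parse import quote

    import scrapy
    from itemadapter import ItemAdapter
    from scrapy.http.request import NO_CALLBACK
    from scrapy.utils.defer import maybe_deferred_to_future


    class ScreenshotPipeline:
        """Pipeline that uses Splash to render a screenshot of every item."""

        SPLASH_URL = "http://localhost:8050/render.png?url={}"

        async def process_item(self, item, spider):
            adapter = ItemAdapter(item)
            encoded_item_url = quote(adapter["url"])
            screenshot_url = self.SPLASH_URL.format(encoded_item_url)
            # callback=NO_CALLBACK marks this as a component-owned request:
            # it is downloaded directly through the engine and never reaches
            # a spider callback.
            request = scrapy.Request(screenshot_url, callback=NO_CALLBACK)
            response = await maybe_deferred_to_future(
                spider.crawler.engine.download(request, spider)
            )

            if response.status != 200:
                # Error happened, return item.
                return item

            # Save the screenshot to a file named after the URL hash.
            url_hash = hashlib.md5(adapter["url"].encode("utf8")).hexdigest()
            with open(f"{url_hash}.png", "wb") as f:
                f.write(response.body)
            return item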
43 changes: 31 additions & 12 deletions docs/topics/request-response.rst
@@ -32,11 +32,22 @@ Request objects
    :type url: str

    :param callback: the function that will be called with the response of this
-       request (once it's downloaded) as its first parameter. For more information
-       see :ref:`topics-request-response-ref-request-callback-arguments` below.
-       If a Request doesn't specify a callback, the spider's
-       :meth:`~scrapy.Spider.parse` method will be used.
-       Note that if exceptions are raised during processing, errback is called instead.
+       request (once it's downloaded) as its first parameter.
+
+       In addition to a function, the following values are supported:
+
+       -   ``None`` (default), which indicates that the spider's
+           :meth:`~scrapy.Spider.parse` method must be used.
+
+       -   :py:data:`scrapy.http.request.NO_CALLBACK`
+
+           .. autodata:: scrapy.http.request.NO_CALLBACK
+
+       For more information, see
+       :ref:`topics-request-response-ref-request-callback-arguments`.
+
+       .. note:: If exceptions are raised during processing, ``errback`` is
+           called instead.

    :type callback: collections.abc.Callable
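In code, the three accepted callback values look like this. A minimal sketch; parse_page stands in for any response-handling function and is hypothetical:

    from scrapy import Request
    from scrapy.http.request import NO_CALLBACK


    def parse_page(response):  # hypothetical standalone callback
        pass


    # 1. An explicit callable handles the response.
    r1 = Request("https://example.com", callback=parse_page)

    # 2. None (the default): the spider's parse() method handles the response.
    r2 = Request("https://example.com")

    # 3. NO_CALLBACK: for component-owned requests that must never reach a
    #    spider callback; calling NO_CALLBACK itself raises RuntimeError.
    r3 = Request("https://example.com", callback=NO_CALLBACK)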

@@ -69,16 +80,24 @@ Request objects

    1. Using a dict::

-        request_with_cookies = Request(url="http://www.example.com",
-                                       cookies={'currency': 'USD', 'country': 'UY'})
+        request_with_cookies = Request(
+            url="http://www.example.com",
+            cookies={'currency': 'USD', 'country': 'UY'},
+        )

    2. Using a list of dicts::

-        request_with_cookies = Request(url="http://www.example.com",
-                                       cookies=[{'name': 'currency',
-                                                 'value': 'USD',
-                                                 'domain': 'example.com',
-                                                 'path': '/currency'}])
+        request_with_cookies = Request(
+            url="http://www.example.com",
+            cookies=[
+                {
+                    'name': 'currency',
+                    'value': 'USD',
+                    'domain': 'example.com',
+                    'path': '/currency',
+                },
+            ],
+        )

    The latter form allows for customizing the ``domain`` and ``path``
    attributes of the cookie. This is only useful if the cookies are saved
2 changes: 2 additions & 0 deletions scrapy/downloadermiddlewares/robotstxt.py
@@ -10,6 +10,7 @@

 from scrapy.exceptions import IgnoreRequest, NotConfigured
 from scrapy.http import Request
+from scrapy.http.request import NO_CALLBACK
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.log import failure_to_exc_info
 from scrapy.utils.misc import load_object

@@ -72,6 +73,7 @@ def robot_parser(self, request, spider):
             robotsurl,
             priority=self.DOWNLOAD_PRIORITY,
             meta={"dont_obey_robotstxt": True},
+            callback=NO_CALLBACK,
         )
         dfd = self.crawler.engine.download(robotsreq)
         dfd.addCallback(self._parse_robots, netloc, spider)
22 changes: 20 additions & 2 deletions scrapy/http/request/__init__.py
@@ -20,6 +20,24 @@

 RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")


+def NO_CALLBACK(*args, **kwargs):
+    """When assigned to the ``callback`` parameter of
+    :class:`~scrapy.http.Request`, it indicates that the request is not meant
+    to have a spider callback at all.
+
+    This value should be used by :ref:`components <topics-components>` that
+    create and handle their own requests, e.g. through
+    :meth:`scrapy.core.engine.ExecutionEngine.download`, so that downloader
+    middlewares handling such requests can treat them differently from
+    requests intended for the :meth:`~scrapy.Spider.parse` callback.
+    """
+    raise RuntimeError(
+        "The NO_CALLBACK callback has been called. This is a special callback "
+        "value intended for requests whose callback is never meant to be "
+        "called."
+    )
+
+
 class Request(object_ref):
     """Represents an HTTP request, which is usually generated in a Spider and
     executed by the Downloader, thus generating a :class:`Response`.

@@ -72,11 +90,11 @@ def __init__(
             raise TypeError(f"Request priority not an integer: {priority!r}")
         self.priority = priority

-        if callback is not None and not callable(callback):
+        if not (callable(callback) or callback is None):
             raise TypeError(
                 f"callback must be a callable, got {type(callback).__name__}"
             )
-        if errback is not None and not callable(errback):
+        if not (callable(errback) or errback is None):
             raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
         self.callback = callback
         self.errback = errback
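As the docstring above suggests, the point of the sentinel is that downloader middlewares can tell component-owned requests apart from spider requests. A hypothetical middleware making that distinction might look like the following sketch; the class name and the skipping logic are invented for illustration, while process_request is the standard downloader-middleware hook:

    from scrapy.http.request import NO_CALLBACK


    class SpiderRequestsOnlyMiddleware:
        """Hypothetical downloader middleware that leaves component-owned
        requests (callback=NO_CALLBACK) untouched."""

        def process_request(self, request, spider):
            if request.callback is NO_CALLBACK:
                # robots.txt fetches, media downloads, etc.: skip the
                # spider-oriented processing this middleware would
                # normally do.
                return None
            # ... processing for spider-bound requests would go here ...
            return None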
3 changes: 2 additions & 1 deletion scrapy/pipelines/files.py
@@ -22,6 +22,7 @@

 from scrapy.exceptions import IgnoreRequest, NotConfigured
 from scrapy.http import Request
+from scrapy.http.request import NO_CALLBACK
 from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
 from scrapy.utils.boto import is_botocore_available

@@ -516,7 +517,7 @@ def inc_stats(self, spider, status):
     # Overridable Interface
     def get_media_requests(self, item, info):
         urls = ItemAdapter(item).get(self.files_urls_field, [])
-        return [Request(u) for u in urls]
+        return [Request(u, callback=NO_CALLBACK) for u in urls]

     def file_downloaded(self, response, request, info, *, item=None):
         path = self.file_path(request, response=response, info=info, item=item)
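One practical consequence for subclasses: code that overrides get_media_requests() should now also mark its requests, along the lines of this sketch. The custom headers here are just an invented example of why one might override the method:

    from itemadapter import ItemAdapter
    from scrapy.http import Request
    from scrapy.http.request import NO_CALLBACK
    from scrapy.pipelines.files import FilesPipeline


    class CustomFilesPipeline(FilesPipeline):
        def get_media_requests(self, item, info):
            for url in ItemAdapter(item).get(self.files_urls_field, []):
                # Keep callback=NO_CALLBACK so that downloader middlewares
                # can recognize these as pipeline-owned requests.
                yield Request(
                    url,
                    headers={"Referer": "https://example.com"},
                    callback=NO_CALLBACK,
                )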
3 changes: 2 additions & 1 deletion scrapy/pipelines/images.py
@@ -13,6 +13,7 @@

 from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import Request
+from scrapy.http.request import NO_CALLBACK
 from scrapy.pipelines.files import FileException, FilesPipeline

 # TODO: from scrapy.pipelines.media import MediaPipeline

@@ -214,7 +215,7 @@ def convert_image(self, image, size=None, response_body=None):
     def get_media_requests(self, item, info):
         urls = ItemAdapter(item).get(self.images_urls_field, [])
-        return [Request(u) for u in urls]
+        return [Request(u, callback=NO_CALLBACK) for u in urls]

     def item_completed(self, results, item, info):
         with suppress(KeyError):
12 changes: 10 additions & 2 deletions scrapy/pipelines/media.py
@@ -7,6 +7,7 @@

 from twisted.internet.defer import Deferred, DeferredList
 from twisted.python.failure import Failure

+from scrapy.http.request import NO_CALLBACK
 from scrapy.settings import Settings
 from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.defer import defer_result, mustbe_deferred

@@ -17,6 +18,10 @@

 logger = logging.getLogger(__name__)


+def _DUMMY_CALLBACK(response):
+    return response
+
+
 class MediaPipeline:

     LOG_FAILED_RESULTS = True

@@ -90,9 +95,12 @@ def process_item(self, item, spider):

     def _process_request(self, request, info, item):
         fp = self._fingerprinter.fingerprint(request)
-        cb = request.callback or (lambda _: _)
+        if not request.callback or request.callback is NO_CALLBACK:
+            cb = _DUMMY_CALLBACK
+        else:
+            cb = request.callback
         eb = request.errback
-        request.callback = None
+        request.callback = NO_CALLBACK
         request.errback = None

         # Return cached result if request was already seen
Review thread attached to the request.callback assignment:

Gallaecio (Member Author):

Since tests were actually passing for files, I had a deeper look at it, and I think tests pass because those Request(u) objects are later processed by this very method, so the callback gets set before the request object leaves the pipeline. So I think no further changes may be necessary specific to files or images.

BurnzZ (Member):

Hm, it might still be cleaner to add the no-callback marker to these requests, as they are not supposed to use the parse callback.

Gallaecio (Member Author):

So we set callback to NO_CALLBACK twice, in get_media_requests and in _process_request (both called from process_item, one after the other)?

I am not against it, I just want to be certain that I made it clear enough that the reason the callback is not set here is that these request objects are processed further before they leave the pipeline, so with the current code there is no risk of anything outside the pipeline itself receiving a request with callback=None.

BurnzZ (Member):

Yes. I think if the reader sees FilesPipeline.get_media_requests() with Request(u, callback=NO_CALLBACK), it helps reinforce the idea that the parse() method is not supposed to be involved here.

Although they could also inspect MediaPipeline._process_request() further and see that NO_CALLBACK is assigned there, they will not have to if FilesPipeline.get_media_requests() already shows it.

Gallaecio (Member Author), Jan 30, 2023:

I'm making the change, then.

I wonder if we should go further, though, by changing _process_request to:

  • Log a deprecation warning if callback is None.
  • Raise an exception if callback is anything other than None or NO_CALLBACK, or log a deprecation warning there too, to avoid a backward-incompatible change. But I think it may be wise to actually break such code, to force users not to set a callback that is being reset in _process_request.

BurnzZ (Member):

+1 to logging a deprecation warning if callback is None.

I'm not quite sure about raising an exception, since there might be some Scrapy project out there that does things differently with their MediaPipeline/FilesPipeline; for example, one that has overridden _process_request to not directly use the downloader.

Gallaecio (Member Author):

The callback is actually not just reset, but stored and used. So maybe my point is moot: we should continue to support callbacks on requests from get_media_requests() as usual. _process_request will make sure that the request leaves the pipeline with callback=NO_CALLBACK, but the original callback will be called nonetheless by the pipeline.
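Taken together with the thread above, the behavior _process_request converges on can be summarized in a small sketch (a simplified illustration, not the actual implementation): whatever callback the request enters with is stored and later invoked by the pipeline itself, while the request leaves the pipeline re-marked with NO_CALLBACK.

    from scrapy.http.request import NO_CALLBACK


    def _dummy_callback(response):  # stand-in for media.py's _DUMMY_CALLBACK
        return response


    def process_request_sketch(request):
        # Store the original callback, falling back to a pass-through for
        # None or NO_CALLBACK.
        if not request.callback or request.callback is NO_CALLBACK:
            cb = _dummy_callback
        else:
            cb = request.callback
        # The request leaves the pipeline marked as component-owned ...
        request.callback = NO_CALLBACK
        request.errback = None
        # ... but the stored callback is what the pipeline itself applies
        # to the download result.
        return cb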
8 changes: 8 additions & 0 deletions tests/test_downloadermiddleware_robotstxt.py
@@ -9,6 +9,7 @@

 from scrapy.downloadermiddlewares.robotstxt import logger as mw_module_logger
 from scrapy.exceptions import IgnoreRequest, NotConfigured
 from scrapy.http import Request, Response, TextResponse
+from scrapy.http.request import NO_CALLBACK
 from scrapy.settings import Settings
 from tests.test_robotstxt_interface import reppy_available, rerp_available

@@ -57,6 +58,7 @@ def test_robotstxt(self):
         return DeferredList(
             [
                 self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
+                maybeDeferred(self.assertRobotsTxtRequested, "http://site.local"),
                 self.assertIgnored(Request("http://site.local/admin/main"), middleware),
                 self.assertIgnored(Request("http://site.local/static/"), middleware),
                 self.assertIgnored(

@@ -238,6 +240,12 @@ def assertIgnored(self, request, middleware):
             maybeDeferred(middleware.process_request, request, spider), IgnoreRequest
         )

+    def assertRobotsTxtRequested(self, base_url):
+        calls = self.crawler.engine.download.call_args_list
+        request = calls[0][0][0]
+        self.assertEqual(request.url, f"{base_url}/robots.txt")
+        self.assertEqual(request.callback, NO_CALLBACK)


 class RobotsTxtMiddlewareWithRerpTest(RobotsTxtMiddlewareTest):
     if not rerp_available():
13 changes: 13 additions & 0 deletions tests/test_http_request.py
@@ -14,6 +14,7 @@
     Request,
     XmlRpcRequest,
 )
+from scrapy.http.request import NO_CALLBACK
 from scrapy.utils.python import to_bytes, to_unicode

@@ -310,6 +311,14 @@ def a_function():
         self.assertIs(r4.callback, a_function)
         self.assertIs(r4.errback, a_function)

+        r5 = self.request_class(
+            url="http://example.com",
+            callback=NO_CALLBACK,
+            errback=NO_CALLBACK,
+        )
+        self.assertIs(r5.callback, NO_CALLBACK)
+        self.assertIs(r5.errback, NO_CALLBACK)

     def test_callback_and_errback_type(self):
         with self.assertRaises(TypeError):
             self.request_class("http://example.com", callback="a_function")

@@ -322,6 +331,10 @@ def test_callback_and_errback_type(self):
                 errback="a_function",
             )

+    def test_no_callback(self):
+        with self.assertRaises(RuntimeError):
+            NO_CALLBACK()

     def test_from_curl(self):
         # Note: more curated tests regarding curl conversion are in
         # `test_utils_curl.py`
5 changes: 1 addition & 4 deletions tests/test_pipeline_files.py
@@ -33,10 +33,7 @@
     skip_if_no_boto,
 )

-
-def _mocked_download_func(request, info):
-    response = request.meta.get("response")
-    return response() if callable(response) else response
+from .test_pipeline_media import _mocked_download_func


 class FilesPipelineTestCase(unittest.TestCase):
9 changes: 1 addition & 8 deletions tests/test_pipeline_images.py
@@ -32,20 +32,13 @@
 skip_pillow = None


-def _mocked_download_func(request, info):
-    response = request.meta.get("response")
-    return response() if callable(response) else response
-
-
 class ImagesPipelineTestCase(unittest.TestCase):

     skip = skip_pillow

     def setUp(self):
         self.tempdir = mkdtemp()
-        self.pipeline = ImagesPipeline(
-            self.tempdir, download_func=_mocked_download_func
-        )
+        self.pipeline = ImagesPipeline(self.tempdir)

     def tearDown(self):
         rmtree(self.tempdir)
2 changes: 2 additions & 0 deletions tests/test_pipeline_media.py
@@ -9,6 +9,7 @@

 from scrapy import signals
 from scrapy.http import Request, Response
+from scrapy.http.request import NO_CALLBACK
 from scrapy.pipelines.files import FileException
 from scrapy.pipelines.images import ImagesPipeline
 from scrapy.pipelines.media import MediaPipeline

@@ -30,6 +31,7 @@

 def _mocked_download_func(request, info):
+    assert request.callback is NO_CALLBACK
     response = request.meta.get("response")
     return response() if callable(response) else response
3 changes: 1 addition & 2 deletions tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.

 [tox]
-envlist = security,flake8,py,black
+envlist = security,flake8,black,typing,py
 minversion = 1.7.0

 [testenv]

@@ -201,4 +201,3 @@ deps =
     black==22.12.0
 commands =
     black {posargs:--check .}
-
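With the reordered envlist, a plain tox run now also executes the new typing environment before the test run. Assuming the PR defines a matching [testenv:typing] section (not shown in this diff view), it can also be run on its own with tox -e typing; that is standard tox usage rather than anything specific to this PR.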