Document how to handle action failures #157

Open
wants to merge 10 commits into base: main
1 change: 1 addition & 0 deletions docs/index.rst
@@ -28,6 +28,7 @@ either :ref:`globally <transparent>` or :ref:`per request <automap>`, or
   usage/retry
   usage/scrapy-poet
   usage/stats
   usage/actions
   usage/fingerprint
   usage/proxy

38 changes: 38 additions & 0 deletions docs/reference/settings.rst
@@ -6,6 +6,44 @@ Settings

:ref:`Settings <topics-settings>` for scrapy-zyte-api.

.. _ZYTE_API_ACTION_ERROR_RETRY_ENABLED:

ZYTE_API_ACTION_ERROR_RETRY_ENABLED
===================================

Default: ``True``

Enables retries of Zyte API requests if responses contain an action error.

Maximum retries and priority adjustment are handled through the same settings
and request metadata keys as regular retries in Scrapy; see
:setting:`RETRY_TIMES <scrapy:RETRY_TIMES>` and
:setting:`RETRY_PRIORITY_ADJUST <scrapy:RETRY_PRIORITY_ADJUST>`.
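
For example, a minimal sketch (the URL and browser action are illustrative,
and ``scrapy`` is assumed to be imported) of tuning these per request from a
spider callback:

.. code-block:: python

    yield scrapy.Request(
        "https://example.com",
        meta={
            "zyte_api_automap": {"actions": [{"action": "scrollBottom"}]},
            "max_retry_times": 5,  # overrides RETRY_TIMES for this request
            "priority_adjust": -1,  # lowers the priority of each retry
        },
    )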

See also :ref:`ZYTE_API_ACTION_ERROR_HANDLING`.

.. _ZYTE_API_ACTION_ERROR_HANDLING:

ZYTE_API_ACTION_ERROR_HANDLING
==============================

Default: ``"pass"``

Determines how to handle Zyte API responses that contain an action error:

- ``"pass"``: Responses are treated as valid responses, i.e. they will reach
your spider callback.

- ``"ignore"``: Responses are dropped, i.e. they will not reach your spider.

- ``"err"``: :class:`~scrapy_zyte_api.exceptions.ActionError` is raised. It
will be processed by downloader middlewares as an exception, and will reach
your spider errback if you set one.

.. autoexception:: scrapy_zyte_api.exceptions.ActionError
   :members:
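
For example, with ``"err"`` handling you can catch
:class:`~scrapy_zyte_api.exceptions.ActionError` in an errback and inspect the
failed actions. A minimal sketch (the spider name, URL, and browser action are
illustrative):

.. code-block:: python

    from scrapy import Request, Spider
    from scrapy_zyte_api.exceptions import ActionError


    class MySpider(Spider):
        name = "myspider"
        custom_settings = {"ZYTE_API_ACTION_ERROR_HANDLING": "err"}

        def start_requests(self):
            yield Request(
                "https://example.com",
                meta={"zyte_api_automap": {"actions": [{"action": "scrollBottom"}]}},
                callback=self.parse,
                errback=self.handle_error,
            )

        def parse(self, response):
            pass  # handle successful responses here

        def handle_error(self, failure):
            if failure.check(ActionError):
                # ActionError exposes the offending response as .response
                actions = failure.value.response.raw_api_response.get("actions", [])
                self.logger.warning("Failed actions: %r", actions)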


.. _ZYTE_API_AUTOMAP_PARAMS:

ZYTE_API_AUTOMAP_PARAMS
47 changes: 47 additions & 0 deletions docs/usage/actions.rst
@@ -0,0 +1,47 @@
.. _actions:

======================
Handling action errors
======================
Member:

    There is a lot of code in this section; I wonder if we should make it
    part of the library itself instead.

    It seems like a general theme: if an action fails, should the response
    be considered a success or a failure? The current default is success,
    but making it a failure seems better (with an option to get the old
    behavior back), as it allows detecting more issues.

Contributor Author:

    deedfff does this for the middleware. Do we want to do something about
    the cache policy, or is it OK to keep that documentation-only?

Even though Zyte API considers a response successful :ref:`even if a browser
action fails <zyte-api-successful-responses>`, scrapy-zyte-api retries such
responses by default. See :ref:`ZYTE_API_ACTION_ERROR_RETRY_ENABLED`.

You can also use :ref:`ZYTE_API_ACTION_ERROR_HANDLING` to determine how such
responses are handled when they are not retried or when retries are exhausted:
treated as a success (default), ignored, or treated as an error.
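
For example, a minimal sketch of settings that disable retries for action
errors and drop the affected responses instead of passing them to the spider:

.. code-block:: python
    :caption: myproject/settings.py

    ZYTE_API_ACTION_ERROR_RETRY_ENABLED = False
    ZYTE_API_ACTION_ERROR_HANDLING = "ignore"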

Action error caching
====================

If you use
:class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you might
want to use a custom :setting:`HTTPCACHE_POLICY <scrapy:HTTPCACHE_POLICY>` to
prevent responses with failed actions (i.e. after retries are exhausted) from
being cached:

.. code-block:: python
    :caption: myproject/extensions.py

    from scrapy import Request
    from scrapy.extensions.httpcache import DummyPolicy
    from scrapy.http import Response
    from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse


    class ZyteAPIFailedActionsPolicy(DummyPolicy):

        def should_cache_response(self, response: Response, request: Request):
            # Do not cache Zyte API responses that report an action error.
            if (
                isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse))
                and any(
                    "error" in action
                    for action in response.raw_api_response.get("actions", [])
                )
            ):
                return False
            return super().should_cache_response(response, request)
And enable it in your settings:

.. code-block:: python
    :caption: myproject/settings.py

    HTTPCACHE_POLICY = "myproject.extensions.ZyteAPIFailedActionsPolicy"
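
Note that the policy only takes effect if the HTTP cache itself is enabled,
i.e. :setting:`HTTPCACHE_ENABLED <scrapy:HTTPCACHE_ENABLED>` is ``True``.
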
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,6 +15,9 @@ files = [
]

[tool.pytest.ini_options]
filterwarnings = [
    "ignore:::twisted",
]
junit_family = "xunit2"
testpaths = [
    "scrapy_zyte_api/",
172 changes: 170 additions & 2 deletions scrapy_zyte_api/_middlewares.py
@@ -1,11 +1,114 @@
 import logging
-from typing import cast
+from typing import Optional, Union, cast

-from scrapy import Request, signals
+from scrapy import Request, Spider, signals
+from scrapy.exceptions import IgnoreRequest
 from scrapy.http import Response
+from scrapy.utils.python import global_object_name
 from zyte_api.aio.errors import RequestError

 from ._params import _ParamParser
+from .exceptions import ActionError
+from .responses import ZyteAPIResponse, ZyteAPITextResponse

try:
    from scrapy.downloadermiddlewares.retry import get_retry_request
except ImportError:
    # Backport get_retry_request for Scrapy < 2.5.0

    from logging import Logger
    from typing import Type

    from scrapy.downloadermiddlewares.retry import logger as retry_logger

    def get_retry_request(
        request: Request,
        *,
        spider: Spider,
        reason: Union[str, Exception, Type[Exception]] = "unspecified",
        max_retry_times: Optional[int] = None,
        priority_adjust: Optional[int] = None,
        logger: Logger = retry_logger,
        stats_base_key: str = "retry",
    ) -> Optional[Request]:
        """
        Returns a new :class:`~scrapy.Request` object to retry the specified
        request, or ``None`` if retries of the specified request have been
        exhausted.

        For example, in a :class:`~scrapy.Spider` callback, you could use it
        as follows::

            def parse(self, response):
                if not response.text:
                    new_request_or_none = get_retry_request(
                        response.request,
                        spider=self,
                        reason='empty',
                    )
                    return new_request_or_none

        *spider* is the :class:`~scrapy.Spider` instance which is asking for
        the retry request. It is used to access the :ref:`settings
        <topics-settings>` and :ref:`stats <topics-stats>`, and to provide
        extra logging context (see :func:`logging.debug`).

        *reason* is a string or an :class:`Exception` object that indicates
        the reason why the request needs to be retried. It is used to name
        retry stats.

        *max_retry_times* is a number that determines the maximum number of
        times that *request* can be retried. If not specified or ``None``,
        the number is read from the :reqmeta:`max_retry_times` meta key of
        the request. If the :reqmeta:`max_retry_times` meta key is not
        defined or ``None``, the number is read from the
        :setting:`RETRY_TIMES` setting.

        *priority_adjust* is a number that determines how the priority of the
        new request changes in relation to *request*. If not specified, the
        number is read from the :setting:`RETRY_PRIORITY_ADJUST` setting.

        *logger* is the logging.Logger object to be used when logging
        messages.

        *stats_base_key* is a string to be used as the base key for the
        retry-related job stats.
        """
        settings = spider.crawler.settings
        assert spider.crawler.stats
        stats = spider.crawler.stats
        retry_times = request.meta.get("retry_times", 0) + 1
        if max_retry_times is None:
            max_retry_times = request.meta.get("max_retry_times")
            if max_retry_times is None:
                max_retry_times = settings.getint("RETRY_TIMES")
        if retry_times <= max_retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
                {"request": request, "retry_times": retry_times, "reason": reason},
                extra={"spider": spider},
            )
            new_request: Request = request.copy()
            new_request.meta["retry_times"] = retry_times
            new_request.dont_filter = True
            if priority_adjust is None:
                priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
            new_request.priority = request.priority + priority_adjust

            if callable(reason):
                reason = reason()
            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value(f"{stats_base_key}/count")
            stats.inc_value(f"{stats_base_key}/reason_count/{reason}")
            return new_request
        stats.inc_value(f"{stats_base_key}/max_reached")
        logger.error(
            "Gave up retrying %(request)s (failed %(retry_times)d times): "
            "%(reason)s",
            {"request": request, "retry_times": retry_times, "reason": reason},
            extra={"spider": spider},
        )
        return None


logger = logging.getLogger(__name__)

@@ -43,6 +146,13 @@
        self._forbidden_domain_start_request_count = 0
        self._total_start_request_count = 0

        self._retry_action_errors = crawler.settings.getbool(
            "ZYTE_API_ACTION_ERROR_RETRY_ENABLED", True
        )
        self._max_retry_times = crawler.settings.getint("RETRY_TIMES")
        self._priority_adjust = crawler.settings.getint("RETRY_PRIORITY_ADJUST")
        self._load_action_error_handling()

        self._max_requests = crawler.settings.getint("ZYTE_API_MAX_REQUESTS")
        if self._max_requests:
            logger.info(
@@ -56,6 +166,18 @@
            self._start_requests_processed, signal=_start_requests_processed
        )

    def _load_action_error_handling(self):
        value = self._crawler.settings.get("ZYTE_API_ACTION_ERROR_HANDLING", "pass")
        if value in ("pass", "ignore", "err"):
            self._action_error_handling = value
        else:
            fallback_value = "pass"
            logger.error(
                f"Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected "
                f"value: {value!r}. Falling back to {fallback_value!r}."
            )
            self._action_error_handling = fallback_value

    def _get_spm_mw(self):
        spm_mw_classes = []

@@ -164,6 +286,52 @@
self._crawler.spider, "failed_forbidden_domain"
)

    def _handle_action_error(self, response):
        if self._action_error_handling == "pass":
            return response
        elif self._action_error_handling == "ignore":
            raise IgnoreRequest
        else:
            assert self._action_error_handling == "err"
            raise ActionError(response)

    def process_response(
        self, request: Request, response: Response, spider: Spider
    ) -> Union[Request, Response]:
        if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)):
            return response

        assert response.raw_api_response is not None
        action_error = any(
            "error" in action
            for action in response.raw_api_response.get("actions", [])
        )
        if not action_error:
            return response

        if not self._retry_action_errors or request.meta.get("dont_retry", False):
            return self._handle_action_error(response)

        return self._retry(
            request, reason="action-error", spider=spider
        ) or self._handle_action_error(response)

    def _retry(
        self,
        request: Request,
        *,
        reason: str,
        spider: Spider,
    ) -> Optional[Request]:
        max_retry_times = request.meta.get("max_retry_times", self._max_retry_times)
        priority_adjust = request.meta.get("priority_adjust", self._priority_adjust)
        return get_retry_request(
            request,
            reason=reason,
            spider=spider,
            max_retry_times=max_retry_times,
            priority_adjust=priority_adjust,
        )


class ScrapyZyteAPISpiderMiddleware(_BaseMiddleware):
    def __init__(self, crawler):
17 changes: 17 additions & 0 deletions scrapy_zyte_api/exceptions.py
@@ -0,0 +1,17 @@
from typing import Union

from .responses import ZyteAPIResponse, ZyteAPITextResponse


class ActionError(ValueError):
    """Exception raised when a Zyte API response contains an action error."""

    def __init__(self, response, *args, **kwargs):
        super().__init__(*args, **kwargs)

        #: Offending Zyte API response.
        #:
        #: You can inspect the outcome of actions in the ``"actions"`` key of
        #: :attr:`response.raw_api_response
        #: <scrapy_zyte_api.responses.ZyteAPITextResponse.raw_api_response>`.
        self.response: Union[ZyteAPIResponse, ZyteAPITextResponse] = response