diff --git a/README.rst b/README.rst index 4791878f..950b1c7e 100644 --- a/README.rst +++ b/README.rst @@ -275,7 +275,8 @@ possible: - ``statusCode`` becomes ``response.status``. -- ``httpResponseHeaders`` becomes ``response.headers``. +- ``httpResponseHeaders`` and ``experimental.responseCookies`` become + ``response.headers``. - ``browserHtml`` and ``httpResponseBody`` are mapped into both ``response.text`` (``str``) and ``response.body`` (``bytes``). @@ -351,6 +352,50 @@ mode (see **Using transparent mode** above) or for a specific request (see **Sending requests with automatically-mapped parameters** above), Zyte API parameters are chosen as follows by default: +- ``Request.url`` becomes ``url``, same as in requests with manually-defined + parameters. + +- If ``Request.method`` is something other than ``"GET"``, it becomes + ``httpRequestMethod``. + +- ``Request.headers`` become ``customHttpRequestHeaders``. + +- ``Request.body`` becomes ``httpRequestBody``. + +- If the ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` Scrapy setting is + ``True``, the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and + provided request metadata does not set dont_merge_cookies_ to ``True``: + + .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED + .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies + + - ``experimental.responseCookies`` is set to ``True``. + + - Cookies from the request `cookie jar`_ become + ``experimental.requestCookies``. + + .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar + + All cookies from the cookie jar are set, regardless of their cookie + domain. This is because Zyte API requests may involve requests to + different domains (e.g. when following cross-domain redirects, or + during browser rendering). + + If the cookies to be set exceed the limit defined in the + ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, + and only as many cookies as the limit allows are set for the target + request. To silence this warning, set ``experimental.requestCookies`` + manually, e.g. to an empty dict. Alternatively, if Zyte API starts + supporting more than 100 request cookies, update the + ``ZYTE_API_MAX_COOKIES`` setting accordingly. + + If you are using a custom downloader middleware to handle request + cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to + its import path to make scrapy-zyte-api work with it. The downloader + middleware is expected to have a ``jars`` property with the same + signature as in the built-in Scrapy downloader middleware for cookie + handling. + - ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. This is subject to change without prior notice in future versions of @@ -392,16 +437,6 @@ parameters are chosen as follows by default: future, Zyte API may be able to handle this decoding automatically, so we would stop setting ``httpResponseHeaders`` to ``True`` by default. -- ``Request.url`` becomes ``url``, same as in requests with manually-defined - parameters. - -- If ``Request.method`` is something other than ``"GET"``, it becomes - ``httpRequestMethod``. - -- ``Request.headers`` become ``customHttpRequestHeaders``. - -- ``Request.body`` becomes ``httpRequestBody``. - For example, the following Scrapy request: .. 
code-block:: python @@ -411,6 +446,7 @@ For example, the following Scrapy request: url="https://httpbin.org/anything", headers={"Content-Type": "application/json"}, body=b'{"foo": "bar"}', + cookies={"a": "b"}, ) Results in a request to the Zyte API data extraction endpoint with the @@ -419,12 +455,27 @@ following parameters: .. code-block:: javascript { + "customHttpRequestHeaders": [ + { + "name": "Content-Type", + "value": "application/json" + } + ], + "experimental": { + "requestCookies": [ + { + "name": "a", + "value": "b", + "domain": "" + } + ], + "responseCookies": true + }, "httpResponseBody": true, "httpResponseHeaders": true, - "url": "https://httpbin.org/anything", + "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", "httpRequestMethod": "POST", - "customHttpRequestHeaders": [{"name": "Content-Type", "value": "application/json"}], - "httpRequestBody": "eyJmb28iOiAiYmFyIn0=" + "url": "https://httpbin.org/anything" } You may set the ``zyte_api_automap`` key in @@ -452,8 +503,11 @@ following parameters: { "browserHtml": true, - "url": "https://quotes.toscrape.com", + "experimental": { + "responseCookies": true + }, "requestHeaders": {"referer": "https://example.com/"}, + "url": "https://quotes.toscrape.com" } When mapping headers, headers not supported by Zyte API are excluded from the @@ -467,7 +521,7 @@ headers are included or excluded from header mapping: .. code-block:: python - ["Cookie", "User-Agent"] + ["User-Agent"] - ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as ``requestHeaders``. It is a ``dict``, where keys are header names and @@ -667,6 +721,8 @@ fingerprinting: - Metadata parameters (``echoData``, ``jobId``) +- Experimental parameters (``experimental``) + Changing the fingerprinting of non-Zyte-API requests ---------------------------------------------------- diff --git a/scrapy_zyte_api/_cookies.py b/scrapy_zyte_api/_cookies.py new file mode 100644 index 00000000..4f360241 --- /dev/null +++ b/scrapy_zyte_api/_cookies.py @@ -0,0 +1,53 @@ +from http.cookiejar import Cookie +from typing import Any, Dict, List + +from scrapy.http import Request +from scrapy.http.cookies import CookieJar + + +def _get_cookie_jar(request: Request, cookie_jars: Dict[Any, CookieJar]) -> CookieJar: + jar_id = request.meta.get("cookiejar") + return cookie_jars[jar_id] + + +def _process_cookies( + api_response: Dict[str, Any], request: Request, cookie_jars: Dict[Any, CookieJar] +): + response_cookies = api_response.get("experimental", {}).get("responseCookies") + if not response_cookies: + return + cookie_jar = _get_cookie_jar(request, cookie_jars) + for response_cookie in response_cookies: + rest = {} + http_only = response_cookie.get("httpOnly", None) + if http_only is not None: + rest["httpOnly"] = http_only + same_site = response_cookie.get("sameSite", None) + if same_site is not None: + rest["sameSite"] = same_site + cookie = Cookie( + version=1, + name=response_cookie["name"], + value=response_cookie["value"], + port=None, + port_specified=False, + domain=response_cookie["domain"], + domain_specified=True, + domain_initial_dot=response_cookie["domain"].startswith("."), + path=response_cookie.get("path", "/"), + path_specified="path" in response_cookie, + secure=response_cookie.get("secure", False), + expires=response_cookie.get("expires", None), + discard=False, + comment=None, + comment_url=None, + rest=rest, + ) + cookie_jar.set_cookie(cookie) + + +def _get_all_cookies( + request: Request, cookie_jars: Dict[Any, CookieJar] +) -> List[Cookie]: + cookie_jar = 
_get_cookie_jar(request, cookie_jars) + return list(cookie_jar.jar) diff --git a/scrapy_zyte_api/_downloader_middleware.py b/scrapy_zyte_api/_downloader_middleware.py index e33e348f..bc452512 100644 --- a/scrapy_zyte_api/_downloader_middleware.py +++ b/scrapy_zyte_api/_downloader_middleware.py @@ -10,7 +10,7 @@ def from_crawler(cls, crawler): return cls(crawler) def __init__(self, crawler) -> None: - self._param_parser = _ParamParser(crawler.settings) + self._param_parser = _ParamParser(crawler) self._crawler = crawler def process_request(self, request, spider): diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 6be292d6..45e7b7c9 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -1,13 +1,16 @@ from base64 import b64decode, b64encode from copy import copy from logging import getLogger -from typing import Any, Dict, Mapping, Optional, Set +from typing import Any, Dict, List, Mapping, Optional, Set from warnings import warn from scrapy import Request +from scrapy.http.cookies import CookieJar from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT +from ._cookies import _get_all_cookies + logger = getLogger(__name__) _DEFAULT_API_PARAMS = { @@ -53,7 +56,10 @@ def _map_custom_http_request_headers( header_parameter="customHttpRequestHeaders", ): if lowercase_k in skip_headers: - if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: + if not ( + lowercase_k == b"cookie" + or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) + ): logger.warning( f"Request {request} defines header {k}, which " f"cannot be mapped into the Zyte API " @@ -89,6 +95,7 @@ def _map_request_headers( lowercase_k == b"accept-language" and decoded_v == DEFAULT_REQUEST_HEADERS["Accept-Language"] ) + or lowercase_k == b"cookie" or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) ): logger.warning( @@ -183,6 +190,64 @@ def _set_http_response_headers_from_request( api_params.pop("httpResponseHeaders") +def _set_http_response_cookies_from_request( + *, + api_params: Dict[str, Any], +): + api_params.setdefault("experimental", {}) + api_params["experimental"].setdefault("responseCookies", True) + if api_params["experimental"]["responseCookies"] is False: + del api_params["experimental"]["responseCookies"] + + +def _set_http_request_cookies_from_request( + *, + api_params: Dict[str, Any], + request: Request, + cookie_jars: Dict[Any, CookieJar], + max_cookies: int, +): + api_params.setdefault("experimental", {}) + if "requestCookies" in api_params["experimental"]: + if api_params["experimental"]["requestCookies"] is False: + del api_params["experimental"]["requestCookies"] + return + output_cookies = [] + input_cookies = _get_all_cookies(request, cookie_jars) + input_cookie_count = len(input_cookies) + if input_cookie_count > max_cookies: + logger.warning( + ( + "Request %(request)r would get %(count)r cookies, but request " + "cookie automatic mapping is limited to %(max)r cookies " + "(see the ZYTE_API_MAX_COOKIES setting), so only %(max)r " + "cookies have been added to this request. To silence this " + "warning, set the request cookies manually through the " + "experimental.requestCookies Zyte API parameter instead. " + "Alternatively, if Zyte API starts supporting more than " + "%(max)r request cookies, update the ZYTE_API_MAX_COOKIES " + "setting accordingly." 
+ ), + { + "request": request, + "count": input_cookie_count, + "max": max_cookies, + }, + ) + input_cookies = input_cookies[:max_cookies] + for input_cookie in input_cookies: + output_cookie = { + "name": input_cookie.name, + "value": input_cookie.value, + "domain": input_cookie.domain, + } + if input_cookie.path_specified: + output_cookie["path"] = input_cookie.path + output_cookies.append(output_cookie) + if output_cookies: + api_params["experimental"]["requestCookies"] = output_cookies + + def _set_http_request_method_from_request( *, api_params: Dict[str, Any], @@ -254,6 +319,9 @@ def _update_api_params_from_request( meta_params: Dict[str, Any], skip_headers: Set[str], browser_headers: Dict[str, str], + cookies_enabled: bool, + cookie_jars: Optional[Dict[Any, CookieJar]], + max_cookies: int, ): _set_http_response_body_from_request(api_params=api_params, request=request) _set_http_response_headers_from_request( @@ -269,6 +337,17 @@ def _update_api_params_from_request( browser_headers=browser_headers, ) _set_http_request_body_from_request(api_params=api_params, request=request) + if cookies_enabled: + assert cookie_jars is not None # typing + _set_http_response_cookies_from_request(api_params=api_params) + _set_http_request_cookies_from_request( + api_params=api_params, + request=request, + cookie_jars=cookie_jars, + max_cookies=max_cookies, + ) + if not api_params["experimental"]: + del api_params["experimental"] _unset_unneeded_api_params( api_params=api_params, request=request, default_params=default_params ) @@ -299,21 +378,33 @@ def _merge_params( param: str, setting: str, request: Request, + context: Optional[List[str]] = None, ): params = copy(default_params) meta_params = copy(meta_params) + context = context or [] for k in list(meta_params): - if meta_params[k] is not None: + if isinstance(meta_params[k], dict): + meta_params[k] = _merge_params( + default_params=params.get(k, {}), + meta_params=meta_params[k], + param=param, + setting=setting, + request=request, + context=context + [k], + ) + if meta_params[k] not in (None, {}): continue meta_params.pop(k) if k in params: params.pop(k) else: + qual_param = ".".join(context + [k]) logger.warning( - f"In request {request} {param!r} parameter {k} is None, " - f"which is a value reserved to unset parameters defined in " - f"the {setting} setting, but the setting does not define such " - f"a parameter." + f"In request {request} {param!r} parameter {qual_param} is " + f"None, which is a value reserved to unset parameters defined " + f"in the {setting} setting, but the setting does not define " + f"such a parameter." 
) params.update(meta_params) return params @@ -358,6 +449,9 @@ def _get_automap_params( default_params: Dict[str, Any], skip_headers: Set[str], browser_headers: Dict[str, str], + cookies_enabled: bool, + cookie_jars: Optional[Dict[Any, CookieJar]], + max_cookies: int, ): meta_params = request.meta.get("zyte_api_automap", default_enabled) if meta_params is False: @@ -384,6 +478,9 @@ def _get_automap_params( meta_params=meta_params, skip_headers=skip_headers, browser_headers=browser_headers, + cookies_enabled=cookies_enabled, + cookie_jars=cookie_jars, + max_cookies=max_cookies, ) return params @@ -398,6 +495,9 @@ def _get_api_params( skip_headers: Set[str], browser_headers: Dict[str, str], job_id: Optional[str], + cookies_enabled: bool, + cookie_jars: Optional[Dict[Any, CookieJar]], + max_cookies: int, ) -> Optional[dict]: """Returns a dictionary of API parameters that must be sent to Zyte API for the specified request, or None if the request should not be sent through @@ -410,6 +510,9 @@ def _get_api_params( default_params=automap_params, skip_headers=skip_headers, browser_headers=browser_headers, + cookies_enabled=cookies_enabled, + cookie_jars=cookie_jars, + max_cookies=max_cookies, ) if api_params is None: return None @@ -430,11 +533,12 @@ def _get_api_params( def _load_default_params(settings, setting): params = settings.getdict(setting) for param in list(params): - if params[param] is not None: + if params[param] not in (None, {}): continue logger.warning( - f"Parameter {param!r} in the {setting} setting is None. Default " - f"parameters should never be None." + f"Parameter {param!r} in the {setting} setting is " + f"{params[param]!r}. Default parameters should never be " + f"{params[param]!r}." ) params.pop(param) return params @@ -459,16 +563,35 @@ def _load_browser_headers(settings): class _ParamParser: - def __init__(self, settings): + def __init__(self, crawler, cookies_enabled=None): + settings = crawler.settings self._automap_params = _load_default_params(settings, "ZYTE_API_AUTOMAP_PARAMS") self._browser_headers = _load_browser_headers(settings) self._default_params = _load_default_params(settings, "ZYTE_API_DEFAULT_PARAMS") self._job_id = settings.get("JOB") self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) self._skip_headers = _load_skip_headers(settings) + self._warn_on_cookies = False + if cookies_enabled is not None: + self._cookies_enabled = cookies_enabled + elif settings.getbool("ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED") is True: + self._cookies_enabled = settings.getbool("COOKIES_ENABLED") + if not self._cookies_enabled: + logger.warning( + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, but it " + "will have no effect because COOKIES_ENABLED is False." 
+ ) + else: + self._cookies_enabled = False + self._warn_on_cookies = settings.getbool("COOKIES_ENABLED") + self._max_cookies = settings.getint("ZYTE_API_MAX_COOKIES", 100) + self._crawler = crawler + self._cookie_jars = None def parse(self, request): - return _get_api_params( + dont_merge_cookies = request.meta.get("dont_merge_cookies", False) + cookies_enabled = self._cookies_enabled and not dont_merge_cookies + params = _get_api_params( request, default_params=self._default_params, transparent_mode=self._transparent_mode, @@ -476,4 +599,35 @@ def parse(self, request): skip_headers=self._skip_headers, browser_headers=self._browser_headers, job_id=self._job_id, + cookies_enabled=cookies_enabled, + cookie_jars=self._cookie_jars, + max_cookies=self._max_cookies, + ) + if not dont_merge_cookies and self._warn_on_cookies: + self._handle_warn_on_cookies(request, params) + return params + + def _handle_warn_on_cookies(self, request, params): + if params and params.get("experimental", {}).get("requestCookies") is not None: + return + if self._cookie_jars is None: + return + input_cookies = _get_all_cookies(request, self._cookie_jars) + if len(input_cookies) <= 0: + return + logger.warning( + ( + "Cookies are enabled for request %(request)r, and there are " + "cookies in the cookiejar, but " + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is False, so automatic " + "mapping will not map cookies for this or any other request. " + "To silence this warning, disable cookies for all requests " + "that use automated mapping, either with the " + "COOKIES_ENABLED setting or with the dont_merge_cookies " + "request metadata key." + ), + { + "request": request, + }, ) + self._warn_on_cookies = False diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py index 6d942fb1..6523df2b 100644 --- a/scrapy_zyte_api/_request_fingerprinter.py +++ b/scrapy_zyte_api/_request_fingerprinter.py @@ -35,12 +35,13 @@ def __init__(self, crawler): crawler=crawler, ) self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary() - self._param_parser = _ParamParser(settings) + self._param_parser = _ParamParser(crawler, cookies_enabled=False) self._skip_keys = ( "customHttpRequestHeaders", "echoData", "jobId", "requestHeaders", + "experimental", ) def _keep_fragments(self, api_params): diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 39d4b08f..663e94eb 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -3,7 +3,7 @@ from copy import deepcopy from typing import Generator, Optional, Union -from scrapy import Spider +from scrapy import Spider, signals from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler from scrapy.exceptions import NotConfigured @@ -71,7 +71,15 @@ def __init__( verify_installed_reactor( "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ) - self._param_parser = _ParamParser(settings) + self._cookies_enabled = settings.getbool("COOKIES_ENABLED") + self._cookie_jars = None + self._cookie_mw_cls = load_object( + settings.get( + "ZYTE_API_COOKIE_MIDDLEWARE", + "scrapy.downloadermiddlewares.cookies.CookiesMiddleware", + ) + ) + self._param_parser = _ParamParser(crawler) self._retry_policy = _load_retry_policy(settings) self._stats = crawler.stats self._session = create_session(connection_pool_size=self._client.n_conn) @@ -83,6 +91,25 @@ def __init__( f"({self._truncate_limit}) is invalid. It must be 0 or a " f"positive integer." 
) + crawler.signals.connect(self.engine_started, signal=signals.engine_started) + if not hasattr(self, "_crawler"): # Scrapy 2.1 and earlier + self._crawler = crawler + + def engine_started(self): + if not self._cookies_enabled: + return + for middleware in self._crawler.engine.downloader.middleware.middlewares: + if isinstance(middleware, self._cookie_mw_cls): + self._cookie_jars = middleware.jars + self._param_parser._cookie_jars = self._cookie_jars + return + middleware_path = ( + f"{self._cookie_mw_cls.__module__}.{self._cookie_mw_cls.__qualname__}" + ) + raise RuntimeError( + f"Could not find a configured downloader middleware that is an " + f"instance of {middleware_path} (see ZYTE_API_COOKIE_MIDDLEWARE)." + ) @staticmethod def _build_client(settings): @@ -189,7 +216,8 @@ async def _download_request( finally: self._update_stats() - return _process_response(api_response, request) + assert self._cookie_jars is not None # typing + return _process_response(api_response, request, self._cookie_jars) def _log_request(self, params): if not self._must_log_request: diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 58bc0afb..bd5d50ab 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,10 +1,13 @@ from base64 import b64decode -from typing import Dict, List, Optional, Tuple, Union +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple, Union from scrapy import Request -from scrapy.http import Response, TextResponse, HtmlResponse +from scrapy.http import HtmlResponse, Response, TextResponse +from scrapy.http.cookies import CookieJar from scrapy.responsetypes import responsetypes +from scrapy_zyte_api._cookies import _process_cookies from scrapy_zyte_api.utils import ( _RESPONSE_HAS_ATTRIBUTES, _RESPONSE_HAS_IP_ADDRESS, @@ -59,15 +62,48 @@ def raw_api_response(self) -> Optional[Dict]: """ return self._raw_api_response + @staticmethod + def _response_cookie_to_header_value(cookie): + result = f"{cookie['name']}={cookie['value']}; Domain={cookie['domain']}" + path = cookie.get("path") + if path is not None: + result += f"; Path={path}" + expires = cookie.get("expires") + if expires is not None: + expires_date = datetime.utcfromtimestamp(expires) + expires_date_string = expires_date.strftime("%a, %d %b %Y %H:%M:%S GMT") + result += f"; Expires={expires_date_string}" + if cookie.get("httpOnly"): + result += "; HttpOnly" + if cookie.get("secure"): + result += "; Secure" + same_site = cookie.get("sameSite") + if same_site is not None: + result += f"; SameSite={same_site}" + return result + @classmethod - def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): - if not init_headers: - return None - return { - h["name"]: h["value"] - for h in init_headers - if h["name"].lower() not in cls.REMOVE_HEADERS - } + def _prepare_headers(cls, api_response: Dict[str, Any]): + result: Dict[str, List[str]] = {} + input_headers: Optional[List[Dict[str, str]]] = api_response.get( + "httpResponseHeaders" + ) + if input_headers: + result = { + h["name"]: [h["value"]] + for h in input_headers + if h["name"].lower() not in cls.REMOVE_HEADERS + } + input_cookies: Optional[List[Dict[str, str]]] = api_response.get( + "experimental", {} + ).get("responseCookies") + if input_cookies: + result["Set-Cookie"] = [] + for cookie in input_cookies: + result["Set-Cookie"].append( + cls._response_cookie_to_header_value(cookie) + ) + return result class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse): @@ -92,7 +128,7 @@ def 
from_api_response(cls, api_response: Dict, *, request: Request = None): encoding=encoding, request=request, flags=["zyte-api"], - headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), + headers=cls._prepare_headers(api_response), raw_api_response=api_response, ) @@ -113,7 +149,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): body=b64decode(api_response.get("httpResponseBody") or ""), request=request, flags=["zyte-api"], - headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), + headers=cls._prepare_headers(api_response), raw_api_response=api_response, ) @@ -126,7 +162,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): def _process_response( - api_response: _API_RESPONSE, request: Request + api_response: _API_RESPONSE, request: Request, cookie_jars: Dict[Any, CookieJar] ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: """Given a Zyte API Response and the ``scrapy.Request`` that asked for it, this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending @@ -139,6 +175,8 @@ def _process_response( # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460 # For now, at least one of them should be present. + _process_cookies(api_response, request, cookie_jars) + if api_response.get("browserHtml"): # Using TextResponse because browserHtml always returns a browser-rendered page # even when requesting files (like images) diff --git a/tests/__init__.py b/tests/__init__.py index 5a35538c..4841aa17 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,9 +2,9 @@ from os import environ from typing import Optional -from scrapy.exceptions import NotConfigured -from scrapy.utils.misc import create_instance -from scrapy.utils.test import get_crawler +from scrapy import Spider +from scrapy.crawler import Crawler +from scrapy.utils.test import get_crawler as _get_crawler from zyte_api.aio.client import AsyncClient from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler @@ -17,26 +17,46 @@ "http": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler", "https": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler", }, + "REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter", "ZYTE_API_KEY": _API_KEY, "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", } UNSET = object() +class DummySpider(Spider): + name = "dummy" + + +def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True): + settings = settings or {} + crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls) + if setup_engine: + setup_crawler_engine(crawler) + return crawler + + +def get_downloader_middleware(crawler, cls): + for middleware in crawler.engine.downloader.middleware.middlewares: + if isinstance(middleware, cls): + return middleware + class_path = f"{cls.__module__}.{cls.__qualname__}" + raise ValueError(f"Cannot find downloader middleware {class_path}") + + +def get_download_handler(crawler, schema): + return crawler.engine.downloader.handlers._get_handler(schema) + + @asynccontextmanager async def make_handler(settings: dict, api_url: Optional[str] = None): - settings = settings or {} - settings["ZYTE_API_KEY"] = "a" + settings = {**SETTINGS, **settings} if api_url is not None: settings["ZYTE_API_URL"] = api_url - crawler = get_crawler(settings_dict=settings) - try: - handler = create_instance( - ScrapyZyteAPIDownloadHandler, - settings=None, - crawler=crawler, - ) - except NotConfigured: # i.e. 
ZYTE_API_ENABLED=False + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + if not isinstance(handler, ScrapyZyteAPIDownloadHandler): + # i.e. ZYTE_API_ENABLED=False handler = None try: yield handler @@ -54,3 +74,19 @@ def set_env(**env_vars): finally: environ.clear() environ.update(old_environ) + + +def setup_crawler_engine(crawler: Crawler): + """Run the crawl steps until engine setup, so that crawler.engine is not + None. + + https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122 + """ + + crawler.crawling = True + crawler.spider = crawler._create_spider() + crawler.engine = crawler._create_engine() + + handler = get_download_handler(crawler, "https") + if hasattr(handler, "engine_started"): + handler.engine_started() diff --git a/tests/mockserver.py b/tests/mockserver.py index 920620cb..289b1430 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -19,7 +19,7 @@ from scrapy_zyte_api.responses import _API_RESPONSE -from . import make_handler +from . import SETTINGS, make_handler def get_ephemeral_port(): @@ -30,6 +30,7 @@ def get_ephemeral_port(): @ensureDeferred async def produce_request_response(mockserver, meta, settings=None): + settings = settings if settings is not None else {**SETTINGS} async with mockserver.make_handler(settings) as handler: req = Request(mockserver.urljoin("/"), meta=meta) resp = await handler.download_request(req, None) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 16526df2..0424986c 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,9 +1,11 @@ import sys from asyncio import iscoroutine +from collections import defaultdict from copy import copy from functools import partial +from http.cookiejar import Cookie from inspect import isclass -from typing import Any, Dict +from typing import Any, Dict, cast from unittest import mock from unittest.mock import patch @@ -11,17 +13,26 @@ from _pytest.logging import LogCaptureFixture # NOQA from pytest_twisted import ensureDeferred from scrapy import Request, Spider +from scrapy.downloadermiddlewares.cookies import CookiesMiddleware from scrapy.exceptions import CloseSpider from scrapy.http import Response, TextResponse +from scrapy.http.cookies import CookieJar from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT -from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred from zyte_api.aio.errors import RequestError +from scrapy_zyte_api._cookies import _get_cookie_jar from scrapy_zyte_api.handler import _ParamParser - -from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS +from scrapy_zyte_api.responses import _process_response + +from . 
import ( + DEFAULT_CLIENT_CONCURRENCY, + SETTINGS, + get_crawler, + get_download_handler, + get_downloader_middleware, +) from .mockserver import DelayedResource, MockServer, produce_request_response @@ -230,13 +241,14 @@ async def parse(self, response): raise CloseSpider crawler = get_crawler( - TestSpider, { **SETTINGS, "CONCURRENT_REQUESTS": concurrency, "CONCURRENT_REQUESTS_PER_DOMAIN": concurrency, "ZYTE_API_URL": server.urljoin("/"), }, + TestSpider, + setup_engine=False, ) await crawler.crawl() @@ -249,6 +261,8 @@ async def parse(self, response): TRANSPARENT_MODE = False SKIP_HEADERS = {b"cookie", b"user-agent"} JOB_ID = None +COOKIES_ENABLED = False +MAX_COOKIES = 100 GET_API_PARAMS_KWARGS = { "default_params": DEFAULT_PARAMS, "transparent_mode": TRANSPARENT_MODE, @@ -256,6 +270,8 @@ async def parse(self, response): "skip_headers": SKIP_HEADERS, "browser_headers": BROWSER_HEADERS, "job_id": JOB_ID, + "cookies_enabled": COOKIES_ENABLED, + "max_cookies": MAX_COOKIES, } @@ -265,16 +281,18 @@ async def test_params_parser_input_default(mockserver): for key in GET_API_PARAMS_KWARGS: actual = getattr(handler._param_parser, f"_{key}") expected = GET_API_PARAMS_KWARGS[key] - assert actual == expected + assert actual == expected, key @ensureDeferred async def test_param_parser_input_custom(mockserver): settings = { "JOB": "1/2/3", + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, "ZYTE_API_AUTOMAP_PARAMS": {"c": "d"}, "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, + "ZYTE_API_MAX_COOKIES": 1, "ZYTE_API_SKIP_HEADERS": {"A"}, "ZYTE_API_TRANSPARENT_MODE": True, } @@ -282,8 +300,10 @@ async def test_param_parser_input_custom(mockserver): parser = handler._param_parser assert parser._automap_params == {"c": "d"} assert parser._browser_headers == {b"b": "b"} + assert parser._cookies_enabled is True assert parser._default_params == {"a": "b"} assert parser._job_id == "1/2/3" + assert parser._max_cookies == 1 assert parser._skip_headers == {b"a"} assert parser._transparent_mode is True @@ -404,9 +424,10 @@ def test_transparent_mode_toggling(setting, meta, expected): :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com", meta=meta) - settings = {"ZYTE_API_TRANSPARENT_MODE": setting} - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + settings = {**SETTINGS, "ZYTE_API_TRANSPARENT_MODE": setting} + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser func = partial(param_parser.parse, request) if isclass(expected): with pytest.raises(expected): @@ -426,7 +447,7 @@ def test_api_disabling_deprecated(meta): request = Request(url="https://example.com") request.meta["zyte_api"] = meta crawler = get_crawler() - param_parser = _ParamParser(crawler.settings) + param_parser = _ParamParser(crawler) with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): api_params = param_parser.parse(request) assert api_params is None @@ -440,7 +461,7 @@ def test_bad_meta_type(key, value): :exc:`ValueError` exception.""" request = Request(url="https://example.com", meta={key: value}) crawler = get_crawler() - param_parser = _ParamParser(crawler.settings) + param_parser = _ParamParser(crawler) with pytest.raises(ValueError): param_parser.parse(request) @@ -457,9 +478,10 @@ async def test_job_id(meta, mockserver): :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
""" request = Request(url="https://example.com", meta={meta: True}) - settings = {"JOB": "1/2/3"} - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + settings: Dict[str, Any] = {**SETTINGS, "JOB": "1/2/3"} + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser api_params = param_parser.parse(request) assert api_params["jobId"] == "1/2/3" @@ -495,12 +517,23 @@ async def test_default_params_none(mockserver, caplog): [ ({}, {}, {}, []), ({}, {"b": 2}, {"b": 2}, []), - ({}, {"b": None}, {}, ["does not define such a parameter"]), + ({}, {"b": None}, {}, ["parameter b is None"]), ({"a": 1}, {}, {"a": 1}, []), ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}, []), - ({"a": 1}, {"b": None}, {"a": 1}, ["does not define such a parameter"]), + ({"a": 1}, {"b": None}, {"a": 1}, ["parameter b is None"]), ({"a": 1}, {"a": 2}, {"a": 2}, []), ({"a": 1}, {"a": None}, {}, []), + ({"a": {"b": 1}}, {}, {"a": {"b": 1}}, []), + ({"a": {"b": 1}}, {"a": {"c": 1}}, {"a": {"b": 1, "c": 1}}, []), + ( + {"a": {"b": 1}}, + {"a": {"c": None}}, + {"a": {"b": 1}}, + ["parameter a.c is None"], + ), + ({"a": {"b": 1}}, {"a": {"b": 2}}, {"a": {"b": 2}}, []), + ({"a": {"b": 1}}, {"a": {"b": None}}, {}, []), + ({"a": {"b": 1, "c": 1}}, {"a": {"b": None}}, {"a": {"c": 1}}, []), ], ) @pytest.mark.parametrize( @@ -531,9 +564,10 @@ def test_default_params_merging( """ request = Request(url="https://example.com") request.meta[meta_key] = meta - settings = {setting_key: setting} - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + settings = {**SETTINGS, setting_key: setting} + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser with caplog.at_level("WARNING"): api_params = param_parser.parse(request) for key in ignore_keys: @@ -584,19 +618,53 @@ def test_default_params_immutability(setting_key, meta_key, setting, meta): request = Request(url="https://example.com") request.meta[meta_key] = meta default_params = copy(setting) - settings = {setting_key: setting} - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + settings = {**SETTINGS, setting_key: setting} + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser param_parser.parse(request) assert default_params == setting -def _test_automap(settings, request_kwargs, meta, expected, warnings, caplog): +def _test_automap( + settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None +): request = Request(url="https://example.com", **request_kwargs) request.meta["zyte_api_automap"] = meta - settings = {**settings, "ZYTE_API_TRANSPARENT_MODE": True} - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + settings = {**SETTINGS, **settings, "ZYTE_API_TRANSPARENT_MODE": True} + crawler = get_crawler(settings) + if "cookies" in request_kwargs: + try: + cookie_middleware = get_downloader_middleware(crawler, CookiesMiddleware) + except ValueError: + pass + else: + cookie_middleware.process_request(request, spider=None) + if cookie_jar: + _cookie_jar = _get_cookie_jar(request, cookie_middleware.jars) + for cookie in cookie_jar: + _cookie = Cookie( + version=1, + name=cookie["name"], + value=cookie["value"], + port=None, + port_specified=False, + domain=cookie["domain"], + domain_specified=True, + 
domain_initial_dot=cookie["domain"].startswith("."), + path=cookie.get("path", "/"), + path_specified="path" in cookie, + secure=cookie.get("secure", False), + expires=cookie.get("expires", None), + discard=False, + comment=None, + comment_url=None, + rest={}, + ) + _cookie_jar.set_cookie(_cookie) + + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser with caplog.at_level("WARNING"): api_params = param_parser.parse(request) api_params.pop("url") @@ -1320,15 +1388,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): # Unsupported headers not present in Scrapy requests by default are # dropped with a warning. # If all headers are unsupported, the header parameter is not even set. - ( - {"Cookie": "a=b"}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), ( {"a": "b"}, {"browserHtml": True}, @@ -1338,15 +1397,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): ["cannot be mapped"], ), # Headers with an empty string as value are not silently ignored. - ( - {"Cookie": ""}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), ( {"a": ""}, {"browserHtml": True}, @@ -1442,10 +1492,9 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): # in the future. ( { - "ZYTE_API_SKIP_HEADERS": ["Cookie"], + "ZYTE_API_SKIP_HEADERS": [], }, { - "Cookie": "", "User-Agent": "", }, {}, @@ -1456,9 +1505,7 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): {"name": "User-Agent", "value": ""}, ], }, - [ - "defines header b'Cookie', which cannot be mapped", - ], + [], ), # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support # for new fields that the requestHeaders parameter may support in the @@ -1484,6 +1531,773 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca _test_automap(settings, {"headers": headers}, meta, expected, warnings, caplog) +REQUEST_INPUT_COOKIES_EMPTY: Dict[str, str] = {} +REQUEST_INPUT_COOKIES_MINIMAL_DICT = {"a": "b"} +REQUEST_INPUT_COOKIES_MINIMAL_LIST = [{"name": "a", "value": "b"}] +REQUEST_INPUT_COOKIES_MAXIMAL = [ + {"name": "c", "value": "d", "domain": "example.com", "path": "/"} +] +REQUEST_OUTPUT_COOKIES_MINIMAL = [{"name": "a", "value": "b", "domain": "example.com"}] +REQUEST_OUTPUT_COOKIES_MAXIMAL = [ + {"name": "c", "value": "d", "domain": ".example.com", "path": "/"} +] + + +@pytest.mark.parametrize( + "settings,cookies,meta,params,expected,warnings,cookie_jar", + [ + # Cookies, both for requests and for responses, are enabled based on + # both ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED (default: False) and + # COOKIES_ENABLED (default: True). 
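+        # Tuple fields: settings, request cookies, request meta,
+        # zyte_api_automap parameters, expected Zyte API parameters, expected
+        # warnings, and extra cookies to preload into the cookie jar.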
+        *(
+            (
+                settings,
+                input_cookies,
+                {},
+                {},
+                {
+                    "httpResponseBody": True,
+                    "httpResponseHeaders": True,
+                },
+                setup_warnings
+                or (
+                    run_time_warnings
+                    if cast(Dict, settings).get("COOKIES_ENABLED", True)
+                    else []
+                ),
+                [],
+            )
+            for input_cookies, run_time_warnings in (
+                (
+                    REQUEST_INPUT_COOKIES_EMPTY,
+                    [],
+                ),
+                (
+                    REQUEST_INPUT_COOKIES_MINIMAL_DICT,
+                    [
+                        "there are cookies in the cookiejar, but ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is False",
+                    ],
+                ),
+            )
+            for settings, setup_warnings in (
+                (
+                    {},
+                    [],
+                ),
+                (
+                    {
+                        "COOKIES_ENABLED": True,
+                    },
+                    [],
+                ),
+                (
+                    {
+                        "COOKIES_ENABLED": False,
+                    },
+                    [],
+                ),
+                (
+                    {
+                        "COOKIES_ENABLED": False,
+                        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+                    },
+                    [
+                        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is True, but it will have no effect because COOKIES_ENABLED is False.",
+                    ],
+                ),
+                (
+                    {
+                        "COOKIES_ENABLED": False,
+                        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": False,
+                    },
+                    [],
+                ),
+            )
+        ),
+        *(
+            (
+                settings,
+                input_cookies,
+                {},
+                {},
+                {
+                    "httpResponseBody": True,
+                    "httpResponseHeaders": True,
+                    "experimental": {
+                        "responseCookies": True,
+                        **cast(Dict, output_cookies),
+                    },
+                },
+                [],
+                [],
+            )
+            for input_cookies, output_cookies in (
+                (
+                    REQUEST_INPUT_COOKIES_EMPTY,
+                    {},
+                ),
+                (
+                    REQUEST_INPUT_COOKIES_MINIMAL_DICT,
+                    {"requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL},
+                ),
+            )
+            for settings in (
+                {
+                    "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+                },
+                {
+                    "COOKIES_ENABLED": True,
+                    "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+                },
+            )
+        ),
+        # Do not warn about request cookies not being mapped if cookies are
+        # manually set.
+        *(
+            (
+                settings,
+                REQUEST_INPUT_COOKIES_MINIMAL_DICT,
+                {},
+                {
+                    "experimental": {
+                        "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL,
+                    }
+                },
+                {
+                    "httpResponseBody": True,
+                    "httpResponseHeaders": True,
+                    "experimental": {
+                        "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL,
+                    },
+                },
+                [],
+                [],
+            )
+            for settings in (
+                {},
+                {
+                    "COOKIES_ENABLED": True,
+                },
+            )
+        ),
+        # dont_merge_cookies=True on request metadata disables cookies.
+        (
+            {
+                "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+            },
+            REQUEST_INPUT_COOKIES_EMPTY,
+            {
+                "dont_merge_cookies": True,
+            },
+            {},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+            },
+            [],
+            [],
+        ),
+        (
+            {
+                "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+            },
+            REQUEST_INPUT_COOKIES_MINIMAL_DICT,
+            {
+                "dont_merge_cookies": True,
+            },
+            {},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+            },
+            [],
+            [],
+        ),
+        # Do not warn about request cookies not being mapped if
+        # dont_merge_cookies=True is set on request metadata.
+        *(
+            (
+                settings,
+                REQUEST_INPUT_COOKIES_MINIMAL_DICT,
+                {
+                    "dont_merge_cookies": True,
+                },
+                {},
+                {
+                    "httpResponseBody": True,
+                    "httpResponseHeaders": True,
+                },
+                [],
+                [
+                    {
+                        "name": "foo",
+                        "value": "bar",
+                        "domain": "example.com",
+                    }
+                ],
+            )
+            for settings in (
+                {},
+                {
+                    "COOKIES_ENABLED": True,
+                },
+            )
+        ),
+        # Cookies can be disabled by setting the corresponding Zyte API
+        # parameter to False.
+ ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "experimental": { + "responseCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "experimental": { + "requestCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": {"responseCookies": True}, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_EMPTY, + {}, + { + "experimental": { + "responseCookies": False, + "requestCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "experimental": { + "responseCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "experimental": { + "requestCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": {"responseCookies": True}, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "experimental": { + "responseCookies": False, + "requestCookies": False, + } + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + [], + ), + # Cookies work for browser requests as well. + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "browserHtml": True, + }, + { + "browserHtml": True, + "experimental": { + "responseCookies": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + }, + [], + [], + ), + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "screenshot": True, + }, + { + "screenshot": True, + "experimental": { + "responseCookies": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MINIMAL, + }, + }, + [], + [], + ), + # Cookies are mapped correctly, both with minimum and maximum cookie + # parameters. + *( + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + input, + {}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + "requestCookies": output, + }, + }, + [], + [], + ) + for input, output in ( + ( + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + REQUEST_OUTPUT_COOKIES_MINIMAL, + ), + ( + REQUEST_INPUT_COOKIES_MINIMAL_LIST, + REQUEST_OUTPUT_COOKIES_MINIMAL, + ), + ( + REQUEST_INPUT_COOKIES_MAXIMAL, + REQUEST_OUTPUT_COOKIES_MAXIMAL, + ), + ) + ), + # requestCookies, if set manually, prevents automatic mapping. + ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + REQUEST_INPUT_COOKIES_MINIMAL_DICT, + {}, + { + "experimental": { + "requestCookies": REQUEST_OUTPUT_COOKIES_MAXIMAL, + }, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + "requestCookies": REQUEST_OUTPUT_COOKIES_MAXIMAL, + }, + }, + [], + [], + ), + # Mapping multiple cookies works. 
+ ( + { + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + }, + {"a": "b", "c": "d"}, + {}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + "requestCookies": [ + {"name": "a", "value": "b", "domain": "example.com"}, + {"name": "c", "value": "d", "domain": "example.com"}, + ], + }, + }, + [], + [], + ), + ], +) +def test_automap_cookies( + settings, cookies, meta, params, expected, warnings, cookie_jar, caplog +): + _test_automap( + settings, + {"cookies": cookies, "meta": meta}, + params, + expected, + warnings, + caplog, + cookie_jar=cookie_jar, + ) + + +@pytest.mark.parametrize( + "meta", + [ + {}, + {"zyte_api_automap": {"browserHtml": True}}, + ], +) +def test_automap_all_cookies(meta): + """Because of scenarios like cross-domain redirects and browser rendering, + Zyte API requests should include all cookie jar cookies, regardless of + the target URL domain.""" + settings: Dict[str, Any] = { + **SETTINGS, + "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, + "ZYTE_API_TRANSPARENT_MODE": True, + } + crawler = get_crawler(settings) + cookie_middleware = get_downloader_middleware(crawler, CookiesMiddleware) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser + + # Start from a cookiejar with an existing cookie for a.example. + pre_request = Request( + url="https://a.example", + meta=meta, + cookies={"a": "b"}, + ) + cookie_middleware.process_request(pre_request, spider=None) + + # Send a request to c.example, with a cookie for b.example, and ensure that + # it includes the cookies for a.example and b.example. + request1 = Request( + url="https://c.example", + meta=meta, + cookies=[ + { + "name": "c", + "value": "d", + "domain": "b.example", + }, + ], + ) + cookie_middleware.process_request(request1, spider=None) + api_params = param_parser.parse(request1) + assert api_params["experimental"]["requestCookies"] == [ + {"name": "a", "value": "b", "domain": "a.example"}, + # https://github.com/scrapy/scrapy/issues/5841 + # {"name": "c", "value": "d", "domain": "b.example"}, + ] + + # Have the response set a cookie for c.example and d.example. + api_response: Dict[str, Any] = { + "url": "https://c.example", + "httpResponseBody": "", + "statusCode": 200, + "experimental": { + "responseCookies": [ + { + "name": "e", + "value": "f", + "domain": ".c.example", + }, + { + "name": "g", + "value": "h", + "domain": ".d.example", + }, + ], + }, + } + assert handler._cookie_jars is not None # typing + response = _process_response(api_response, request1, handler._cookie_jars) + cookie_middleware.process_response(request1, response, spider=None) + + # Send a second request to e.example, and ensure that cookies + # for all other domains are included. 
+    request2 = Request(
+        url="https://e.example",
+        meta=meta,
+    )
+    cookie_middleware.process_request(request2, spider=None)
+    api_params = param_parser.parse(request2)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "e", "value": "f", "domain": ".c.example"},
+        {"name": "g", "value": "h", "domain": ".d.example"},
+        {"name": "a", "value": "b", "domain": "a.example"},
+        # https://github.com/scrapy/scrapy/issues/5841
+        # {"name": "c", "value": "d", "domain": "b.example"},
+    ]
+
+
+@pytest.mark.parametrize(
+    "meta",
+    [
+        {},
+        {"zyte_api_automap": {"browserHtml": True}},
+    ],
+)
+def test_automap_cookie_jar(meta):
+    """Test that cookies from the right jar are used."""
+    request1 = Request(
+        url="https://example.com/1", meta={**meta, "cookiejar": "a"}, cookies={"z": "y"}
+    )
+    request2 = Request(url="https://example.com/2", meta={**meta, "cookiejar": "b"})
+    request3 = Request(
+        url="https://example.com/3", meta={**meta, "cookiejar": "a"}, cookies={"x": "w"}
+    )
+    request4 = Request(url="https://example.com/4", meta={**meta, "cookiejar": "a"})
+    settings: Dict[str, Any] = {
+        **SETTINGS,
+        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+        "ZYTE_API_TRANSPARENT_MODE": True,
+    }
+    crawler = get_crawler(settings)
+    cookie_middleware = get_downloader_middleware(crawler, CookiesMiddleware)
+    handler = get_download_handler(crawler, "https")
+    param_parser = handler._param_parser
+
+    cookie_middleware.process_request(request1, spider=None)
+    api_params = param_parser.parse(request1)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "z", "value": "y", "domain": "example.com"}
+    ]
+
+    cookie_middleware.process_request(request2, spider=None)
+    api_params = param_parser.parse(request2)
+    assert "requestCookies" not in api_params["experimental"]
+
+    cookie_middleware.process_request(request3, spider=None)
+
+    api_params = param_parser.parse(request3)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "x", "value": "w", "domain": "example.com"},
+        {"name": "z", "value": "y", "domain": "example.com"},
+    ]
+
+    cookie_middleware.process_request(request4, spider=None)
+    api_params = param_parser.parse(request4)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "x", "value": "w", "domain": "example.com"},
+        {"name": "z", "value": "y", "domain": "example.com"},
+    ]
+
+
+@pytest.mark.parametrize(
+    "meta",
+    [
+        {},
+        {"zyte_api_automap": {"browserHtml": True}},
+    ],
+)
+def test_automap_cookie_limit(meta, caplog):
+    settings: Dict[str, Any] = {
+        **SETTINGS,
+        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+        "ZYTE_API_MAX_COOKIES": 1,
+        "ZYTE_API_TRANSPARENT_MODE": True,
+    }
+    crawler = get_crawler(settings)
+    cookie_middleware = get_downloader_middleware(crawler, CookiesMiddleware)
+    handler = get_download_handler(crawler, "https")
+    param_parser = handler._param_parser
+    cookiejar = 0
+
+    # Verify that a request with 1 cookie works as expected.
+    request = Request(
+        url="https://example.com/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"z": "y"},
+    )
+    cookiejar += 1
+    cookie_middleware.process_request(request, spider=None)
+    with caplog.at_level("WARNING"):
+        api_params = param_parser.parse(request)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "z", "value": "y", "domain": "example.com"}
+    ]
+    assert not caplog.records
+    caplog.clear()
+
+    # Verify that a request with 2 cookies results in only 1 cookie set and a
+    # warning.
+    request = Request(
+        url="https://example.com/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"z": "y", "x": "w"},
+    )
+    cookiejar += 1
+    cookie_middleware.process_request(request, spider=None)
+    with caplog.at_level("WARNING"):
+        api_params = param_parser.parse(request)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "x", "value": "w", "domain": "example.com"}
+    ]
+    assert "would get 2 cookies" in caplog.text
+    assert "limited to 1 cookies" in caplog.text
+    caplog.clear()
+
+    # Verify that 1 cookie in the cookie jar and 1 cookie in the request count
+    # as 2 cookies, resulting in only 1 cookie set and a warning.
+    pre_request = Request(
+        url="https://example.com/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"z": "y"},
+    )
+    cookie_middleware.process_request(pre_request, spider=None)
+    request = Request(
+        url="https://example.com/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"x": "w"},
+    )
+    cookiejar += 1
+    cookie_middleware.process_request(request, spider=None)
+    with caplog.at_level("WARNING"):
+        api_params = param_parser.parse(request)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "x", "value": "w", "domain": "example.com"}
+    ]
+    assert "would get 2 cookies" in caplog.text
+    assert "limited to 1 cookies" in caplog.text
+    caplog.clear()
+
+    # Verify that unrelated-domain cookies count toward the limit.
+    pre_request = Request(
+        url="https://other.example/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"z": "y"},
+    )
+    cookie_middleware.process_request(pre_request, spider=None)
+    request = Request(
+        url="https://example.com/1",
+        meta={**meta, "cookiejar": cookiejar},
+        cookies={"x": "w"},
+    )
+    cookiejar += 1
+    cookie_middleware.process_request(request, spider=None)
+    with caplog.at_level("WARNING"):
+        api_params = param_parser.parse(request)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "x", "value": "w", "domain": "example.com"}
+    ]
+    assert "would get 2 cookies" in caplog.text
+    assert "limited to 1 cookies" in caplog.text
+    caplog.clear()
+
+
+class CustomCookieJar(CookieJar):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.jar.set_cookie(
+            Cookie(
+                1,
+                "z",
+                "y",
+                None,
+                False,
+                "example.com",
+                True,
+                False,
+                "/",
+                False,
+                False,
+                None,
+                False,
+                None,
+                None,
+                {},
+            )
+        )
+
+
+class CustomCookieMiddleware(CookiesMiddleware):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.jars = defaultdict(CustomCookieJar)
+
+
+def test_automap_custom_cookie_middleware():
+    mw_cls = CustomCookieMiddleware
+    settings = {
+        **SETTINGS,
+        "DOWNLOADER_MIDDLEWARES": {
+            "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": None,
+            f"{mw_cls.__module__}.{mw_cls.__qualname__}": 700,
+        },
+        "ZYTE_API_COOKIE_MIDDLEWARE": f"{mw_cls.__module__}.{mw_cls.__qualname__}",
+        "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True,
+        "ZYTE_API_TRANSPARENT_MODE": True,
+    }
+    crawler = get_crawler(settings)
+    cookie_middleware = get_downloader_middleware(crawler, mw_cls)
+    handler = get_download_handler(crawler, "https")
+    param_parser = handler._param_parser
+
+    request = Request(url="https://example.com/1")
+    cookie_middleware.process_request(request, spider=None)
+    api_params = param_parser.parse(request)
+    assert api_params["experimental"]["requestCookies"] == [
+        {"name": "z", "value": "y", "domain": "example.com"}
+    ]
+
+
 @pytest.mark.parametrize(
     "body,meta,expected,warnings",
     [
@@ -1608,7 +2422,18 @@ def
test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): ( {"browserHtml": True}, {"screenshot": True, "browserHtml": False}, - {"screenshot": True}, + { + "screenshot": True, + }, + [], + ), + ( + {}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, [], ), ], @@ -1620,11 +2445,13 @@ def test_default_params_automap(default_params, meta, expected, warnings, caplog request = Request(url="https://example.com") request.meta["zyte_api_automap"] = meta settings = { + **SETTINGS, "ZYTE_API_AUTOMAP_PARAMS": default_params, "ZYTE_API_TRANSPARENT_MODE": True, } - crawler = get_crawler(settings_dict=settings) - param_parser = _ParamParser(crawler.settings) + crawler = get_crawler(settings) + handler = get_download_handler(crawler, "https") + param_parser = handler._param_parser with caplog.at_level("WARNING"): api_params = param_parser.parse(request) api_params.pop("url") diff --git a/tests/test_request_fingerprinter.py b/tests/test_request_fingerprinter.py index 60ff7a20..9784d342 100644 --- a/tests/test_request_fingerprinter.py +++ b/tests/test_request_fingerprinter.py @@ -1,3 +1,5 @@ +from typing import Any, Dict + import pytest from packaging.version import Version from scrapy import __version__ as SCRAPY_VERSION @@ -6,12 +8,12 @@ pytest.skip("Skipping tests for Scrapy ≥ 2.7", allow_module_level=True) from scrapy import Request -from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS -from scrapy.utils.misc import create_instance, load_object -from scrapy.utils.test import get_crawler +from scrapy.utils.misc import create_instance from scrapy_zyte_api import ScrapyZyteAPIRequestFingerprinter +from . import SETTINGS, get_crawler + def test_cache(): crawler = get_crawler() @@ -31,7 +33,7 @@ def fingerprint(self, request): settings = { "ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS": CustomFingerprinter, } - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(settings) fingerprinter = create_instance( ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler ) @@ -42,15 +44,12 @@ def fingerprint(self, request): def test_fallback_default(): - crawler = get_crawler() - fingerprinter = create_instance( - ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler - ) - fallback_fingerprinter = create_instance( - load_object(REQUEST_FINGERPRINTER_CLASS), - settings=crawler.settings, - crawler=crawler, + crawler = get_crawler(SETTINGS) + fingerprinter = crawler.request_fingerprinter + fallback_fingerprinter = ( + crawler.request_fingerprinter._fallback_request_fingerprinter ) + request = Request("https://example.com") new_fingerprint = fingerprinter.fingerprint(request) old_fingerprint = fallback_fingerprinter.fingerprint(request) @@ -181,7 +180,7 @@ def test_known_fingerprints(url, params, fingerprint): def test_metadata(): settings = {"JOB": "1/2/3"} - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(settings) job_fingerprinter = create_instance( ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler ) @@ -209,18 +208,15 @@ def test_only_end_parameters_matter(): parameters, that the fingerprint is the same if the parameters actually sent to Zyte API are the same.""" - settings = { + settings: Dict[str, Any] = { + **SETTINGS, "ZYTE_API_TRANSPARENT_MODE": True, } - crawler = get_crawler(settings_dict=settings) - transparent_fingerprinter = create_instance( - ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler - ) + crawler 
= get_crawler(settings) + transparent_fingerprinter = crawler.request_fingerprinter - crawler = get_crawler() - default_fingerprinter = create_instance( - ScrapyZyteAPIRequestFingerprinter, settings=crawler.settings, crawler=crawler - ) + crawler = get_crawler(SETTINGS) + default_fingerprinter = crawler.request_fingerprinter request = Request("https://example.com") fingerprint1 = transparent_fingerprinter.fingerprint(request) @@ -228,7 +224,15 @@ def test_only_end_parameters_matter(): raw_request = Request( "https://example.com", - meta={"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}}, + meta={ + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": True, + "experimental": { + "responseCookies": True, + }, + } + }, ) fingerprint3 = transparent_fingerprinter.fingerprint(raw_request) fingerprint4 = default_fingerprinter.fingerprint(raw_request) diff --git a/tests/test_responses.py b/tests/test_responses.py index a1d364e4..aeafb048 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,16 +1,19 @@ from base64 import b64encode +from collections import defaultdict +from functools import partial import pytest from scrapy import Request from scrapy.exceptions import NotSupported from scrapy.http import Response, TextResponse +from scrapy.http.cookies import CookieJar from scrapy_zyte_api.responses import ( _API_RESPONSE, ZyteAPIResponse, ZyteAPITextResponse, - _process_response, ) +from scrapy_zyte_api.responses import _process_response as _unwrapped_process_response from scrapy_zyte_api.utils import _RESPONSE_HAS_IP_ADDRESS, _RESPONSE_HAS_PROTOCOL PAGE_CONTENT = "The cake is a lie!" @@ -18,6 +21,35 @@ URL = "https://example.com" +INPUT_COOKIES = [ + { + "name": "a", + "value": "b", + "domain": ".example.com", + "path": "/", + "expires": 1679893056, + "httpOnly": True, + "secure": True, + }, +] +OUTPUT_COOKIE_HEADERS = { + b"Set-Cookie": [ + ( + b"a=b; " + b"Domain=.example.com; " + b"Path=/; " + b"Expires=Mon, 27 Mar 2023 04:57:36 GMT; " + b"HttpOnly; " + b"Secure" + ) + ] +} + +_process_response = partial( + _unwrapped_process_response, cookie_jars=defaultdict(CookieJar) +) + + def raw_api_response_browser(): return { "url": URL, @@ -29,6 +61,9 @@ def raw_api_response_browser(): {"name": "Content-Length", "value": len(PAGE_CONTENT)}, ], "statusCode": 200, + "experimental": { + "responseCookies": INPUT_COOKIES, + }, } @@ -42,6 +77,9 @@ def raw_api_response_body(): {"name": "Content-Length", "value": len(PAGE_CONTENT)}, ], "statusCode": 200, + "experimental": { + "responseCookies": INPUT_COOKIES, + }, } @@ -56,6 +94,9 @@ def raw_api_response_mixed(): {"name": "Content-Length", "value": len(PAGE_CONTENT_2)}, ], "statusCode": 200, + "experimental": { + "responseCookies": INPUT_COOKIES, + }, } @@ -103,6 +144,7 @@ def test_text_from_api_response(api_response, cls, content_length): expected_headers = { b"Content-Type": [b"text/html"], b"Content-Length": [str(content_length).encode()], + **OUTPUT_COOKIE_HEADERS, } assert response.headers == expected_headers assert response.body == EXPECTED_BODY @@ -184,7 +226,7 @@ def test_response_headers_removal(api_response, cls): """Headers like 'Content-Encoding' should be removed later in the response instance returned to Scrapy. - However, it should still be present inside 'raw_api_response.headers'. + However, they should still be present inside 'raw_api_response.headers'. 
""" additional_headers = [ {"name": "Content-Encoding", "value": "gzip"}, @@ -195,7 +237,10 @@ def test_response_headers_removal(api_response, cls): response = cls.from_api_response(raw_response) - assert response.headers == {b"X-Some-Other-Value": [b"123"]} + assert response.headers == { + b"X-Some-Other-Value": [b"123"], + **OUTPUT_COOKIE_HEADERS, + } assert ( response.raw_api_response["httpResponseHeaders"] == raw_response["httpResponseHeaders"]