From cf0f32cff9df9bdb5d97dd665ae160135d88c184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 May 2024 17:35:54 +0200 Subject: [PATCH] Use a different stat prefix for Zyte API, and improve wording overall to minimize confusion (#120) --- README.rst | 10 ++- docs/index.rst | 4 ++ docs/stats.rst | 12 ++++ scrapy_zyte_smartproxy/middleware.py | 71 ++++++++++++--------- setup.py | 4 +- tests/test_all.py | 95 ++++++++++++++++++++++------ tox.ini | 7 +- 7 files changed, 146 insertions(+), 57 deletions(-) create mode 100644 docs/stats.rst diff --git a/README.rst b/README.rst index e79a7e1..e645df3 100644 --- a/README.rst +++ b/README.rst @@ -14,8 +14,14 @@ scrapy-zyte-smartproxy :target: http://codecov.io/github/scrapy-plugins/scrapy-zyte-smartproxy?branch=master :alt: Code Coverage -scrapy-zyte-smartproxy provides easy use of `Zyte Smart Proxy Manager -`_ (formerly Crawlera) with Scrapy. +scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ to use one of +Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart +Proxy Manager`_ (formerly Crawlera). + +.. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html +.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ Requirements ============ diff --git a/docs/index.rst b/docs/index.rst index 5d2335d..827f501 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,7 @@ scrapy-zyte-smartproxy |version| documentation :hidden: headers + stats settings news @@ -61,6 +62,9 @@ Configuration ZYTE_SMARTPROXY_URL = "http://api.zyte.com:8011" + .. tip:: This URL is logged, so that you can tell which value was used + from crawl logs. + - To use the default Zyte Smart Proxy Manager endpoint, leave it unset. - To use a custom Zyte Smart Proxy Manager endpoint, in case you have a diff --git a/docs/stats.rst b/docs/stats.rst new file mode 100644 index 0000000..1c71f75 --- /dev/null +++ b/docs/stats.rst @@ -0,0 +1,12 @@ +Stats +===== + +This Scrapy plugin tracks some stats. + +Stats for the `proxy mode`_ of `Zyte API`_ and stats for `Zyte Smart +Proxy Manager`_ (formerly Crawlera) have a different prefix, ``zyte_api_proxy`` +and ``zyte_smartproxy`` respectively. + +.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html +.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ diff --git a/scrapy_zyte_smartproxy/middleware.py b/scrapy_zyte_smartproxy/middleware.py index e7f954a..feba0b2 100644 --- a/scrapy_zyte_smartproxy/middleware.py +++ b/scrapy_zyte_smartproxy/middleware.py @@ -86,7 +86,7 @@ def _make_auth_url(self, spider): auth = self.get_proxyauth(spider) if not auth.startswith(b'Basic '): raise ValueError( - 'Zyte Smart Proxy Manager only supports HTTP basic access ' + 'Zyte proxy services only support HTTP basic access ' 'authentication, but %s.%s.get_proxyauth() returned %r' % (self.__module__, self.__class__.__name__, auth) ) @@ -111,7 +111,7 @@ def open_spider(self, spider): if not self.apikey: logger.warning( - "Zyte Smart Proxy Manager cannot be used without an API key", + "Zyte proxy services cannot be used without an API key", extra={'spider': spider}, ) return @@ -120,7 +120,7 @@ def open_spider(self, spider): self._authless_url = _remove_auth(self._auth_url) logger.info( - "Using Zyte Smart Proxy Manager at %s (apikey: %s)" % ( + "Using Zyte proxy service %s with an API key ending in %s" % ( self.url, self.apikey[:7] ), extra={'spider': spider}, @@ -131,8 +131,8 @@ def open_spider(self, spider): spider.download_delay = 0 logger.info( "ZyteSmartProxyMiddleware: disabling download delays in " - "Scrapy to optimize delays introduced by Zyte Smart Proxy " - "Manager. To avoid this behaviour you can use the " + "Scrapy to optimize delays introduced by Zyte proxy services. " + "To avoid this behaviour you can use the " "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " "that this may slow down the crawl significantly", extra={'spider': spider}, @@ -196,7 +196,9 @@ def get_proxyauth(self, spider): return basic_auth_header(self.apikey, '') def _targets_zyte_api(self, request): - auth_url = request.meta["proxy"] + if self._auth_url is None: + return False + auth_url = request.meta.get("proxy", self._auth_url) targets_zyte_api = self._targets.get(auth_url, None) if targets_zyte_api is None: targets_zyte_api = urlparse(auth_url).hostname == "api.zyte.com" @@ -220,6 +222,10 @@ def _translate_headers(self, request, targets_zyte_api): request, ) + def _inc_stat(self, stat, targets_zyte_api, value=1): + prefix = "zyte_api_proxy" if targets_zyte_api else "zyte_smartproxy" + self.crawler.stats.inc_value("{}/{}".format(prefix, stat), value) + def process_request(self, request, spider): if self._is_enabled_for_request(request): if 'proxy' not in request.meta: @@ -246,8 +252,8 @@ def process_request(self, request, spider): user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" from scrapy_zyte_smartproxy import __version__ request.headers[user_agent_header] = 'scrapy-zyte-smartproxy/%s' % __version__ - self.crawler.stats.inc_value('zyte_smartproxy/request') - self.crawler.stats.inc_value('zyte_smartproxy/request/method/%s' % request.method) + self._inc_stat("request", targets_zyte_api=targets_zyte_api) + self._inc_stat("request/method/{}".format(request.method), targets_zyte_api=targets_zyte_api) self._translate_headers(request, targets_zyte_api=targets_zyte_api) self._clean_zyte_smartproxy_headers(request, targets_zyte_api=targets_zyte_api) else: @@ -285,8 +291,10 @@ def _process_error(self, response): def process_response(self, request, response, spider): zyte_smartproxy_error = self._process_error(response) + targets_zyte_api = self._targets_zyte_api(request) + if not self._is_enabled_for_request(request): - return self._handle_not_enabled_response(request, response) + return self._handle_not_enabled_response(request, response, targets_zyte_api=targets_zyte_api) if not self._is_zyte_smartproxy_or_zapi_response(response): return response @@ -299,9 +307,9 @@ def process_response(self, request, response, spider): reason = 'noslaves' else: reason = 'autherror' - self._set_custom_delay(request, next(self.exp_backoff), reason=reason) + self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api) else: - self.crawler.stats.inc_value('zyte_smartproxy/delay/reset_backoff') + self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api) self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) if self._is_auth_error(response): @@ -309,9 +317,9 @@ def process_response(self, request, response, spider): # authenticate users we must retry retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) if retries < self.max_auth_retry_times: - return self._retry_auth(response, request, spider) + return self._retry_auth(response, request, spider, targets_zyte_api=targets_zyte_api) else: - self.crawler.stats.inc_value('zyte_smartproxy/retries/auth/max_reached') + self._inc_stat("retries/auth/max_reached", targets_zyte_api=targets_zyte_api) logger.warning( "Max retries for authentication issues reached, please check auth" " information settings", @@ -325,17 +333,17 @@ def process_response(self, request, response, spider): else: after = response.headers.get('retry-after') if after: - self._set_custom_delay(request, float(after), reason='banned') - self.crawler.stats.inc_value('zyte_smartproxy/response/banned') + self._set_custom_delay(request, float(after), reason='banned', targets_zyte_api=targets_zyte_api) + self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api) else: self._bans[key] = 0 # If placed behind `RedirectMiddleware`, it would not count 3xx responses - self.crawler.stats.inc_value('zyte_smartproxy/response') - self.crawler.stats.inc_value('zyte_smartproxy/response/status/%s' % response.status) + self._inc_stat("response", targets_zyte_api=targets_zyte_api) + self._inc_stat("response/status/{}".format(response.status), targets_zyte_api=targets_zyte_api) if zyte_smartproxy_error: - self.crawler.stats.inc_value('zyte_smartproxy/response/error') - self.crawler.stats.inc_value( - 'zyte_smartproxy/response/error/%s' % zyte_smartproxy_error.decode('utf8')) + self._inc_stat("response/error", targets_zyte_api=targets_zyte_api) + error_msg = zyte_smartproxy_error.decode('utf8') + self._inc_stat("response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api) return response def process_exception(self, request, exception, spider): @@ -344,30 +352,33 @@ def process_exception(self, request, exception, spider): if isinstance(exception, (ConnectionRefusedError, ConnectionDone)): # Handle Zyte Smart Proxy Manager downtime self._clear_dns_cache() - self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused') + targets_zyte_api = self._targets_zyte_api(request) + self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused', targets_zyte_api=targets_zyte_api) - def _handle_not_enabled_response(self, request, response): + def _handle_not_enabled_response(self, request, response, targets_zyte_api): if self._should_enable_for_response(response): domain = self._get_url_domain(request.url) self.enabled_for_domain[domain] = True retryreq = request.copy() retryreq.dont_filter = True - self.crawler.stats.inc_value('zyte_smartproxy/retries/should_have_been_enabled') + self._inc_stat("retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api) return retryreq return response - def _retry_auth(self, response, request, spider): + def _retry_auth(self, response, request, spider, targets_zyte_api): logger.warning( - "Retrying a Zyte Smart Proxy Manager request due to an " - "authentication issue", + ( + "Retrying a request due to an authentication issue with " + "the configured Zyte proxy service" + ), extra={'spider': self.spider}, ) retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + 1 retryreq = request.copy() retryreq.meta['zyte_smartproxy_auth_retry_times'] = retries retryreq.dont_filter = True - self.crawler.stats.inc_value('zyte_smartproxy/retries/auth') + self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api) return retryreq def _clear_dns_cache(self): @@ -402,7 +413,7 @@ def _get_slot(self, request): key = self._get_slot_key(request) return key, self.crawler.engine.downloader.slots.get(key) - def _set_custom_delay(self, request, delay, reason=None): + def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None): """Set custom delay for slot and save original one.""" key, slot = self._get_slot(request) if not slot: @@ -411,8 +422,8 @@ def _set_custom_delay(self, request, delay, reason=None): self._saved_delays[key] = slot.delay slot.delay = delay if reason is not None: - self.crawler.stats.inc_value('zyte_smartproxy/delay/%s' % reason) - self.crawler.stats.inc_value('zyte_smartproxy/delay/%s/total' % reason, delay) + self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api) + self._inc_stat("delay/{}/total".format(reason), value=delay, targets_zyte_api=targets_zyte_api) def _restore_original_delay(self, request): """Restore original delay for slot if it was changed.""" diff --git a/setup.py b/setup.py index 45ad7a5..6b2db5f 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup -with open("README.rst") as f: - readme = f.read() +with open("README.rst", "rb") as f: + readme = f.read().decode("utf-8") setup( diff --git a/tests/test_all.py b/tests/test_all.py index a49ca4f..80c632d 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,6 +1,7 @@ import binascii import os import pytest +from copy import copy from random import choice from unittest import TestCase try: @@ -405,10 +406,12 @@ def test_jobid_header(self): self.assertEqual(req3.headers.get('X-Crawlera-Jobid'), None) self.assertEqual(req3.headers.get('Zyte-JobId'), b'2816') - def test_stats(self): + def _test_stats(self, settings, prefix): self.spider.zyte_smartproxy_enabled = True spider = self.spider - crawler = self._mock_crawler(spider, self.settings) + settings = copy(settings) + settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [555] + crawler = self._mock_crawler(spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) @@ -416,19 +419,19 @@ def test_stats(self): req = Request('http://example.com') assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request'), 1) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request/method/GET'), 1) + self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value('{}/request/method/GET'.format(prefix)), 1) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), 1) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response/status/200'), 1) + self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value('{}/response/status/200'.format(prefix)), 1) req = Request('http://example.com/other', method='POST') assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request'), 2) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/request/method/POST'), 1) + self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 2) + self.assertEqual(crawler.stats.get_value('{}/request/method/POST'.format(prefix)), 1) res = self._mock_zyte_smartproxy_response( req.url, @@ -436,23 +439,74 @@ def test_stats(self): headers={'Zyte-Error': 'somethingbad'} ) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), 2) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response/status/{}'.format(mw.ban_code)), 1) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response/error/somethingbad'), 1) + self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 2) + self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 1) + self.assertEqual(crawler.stats.get_value('{}/response/error'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value('{}/response/error/somethingbad'.format(prefix)), 1) self.assertEqual(res.headers["X-Crawlera-Error"], b"somethingbad") self.assertEqual(res.headers["Zyte-Error"], b"somethingbad") + res = self._mock_zyte_smartproxy_response( req.url, status=mw.ban_code, - headers={'X-Crawlera-Error': 'banned'} + headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"} ) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), 3) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response/status/{}'.format(mw.ban_code)), 2) - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response/banned'), 1) + self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 3) + self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 2) + self.assertEqual(crawler.stats.get_value('{}/response/banned'.format(prefix)), 1) self.assertEqual(res.headers["X-Crawlera-Error"], b"banned") self.assertEqual(res.headers["Zyte-Error"], b"banned") + res = self._mock_zyte_smartproxy_response( + req.url, + status=mw.ban_code, + headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"} + ) + slot_key = "example.com" + crawler.engine.downloader.slots[slot_key] = MockedSlot() + req.meta["download_slot"] = "example.com" + assert mw.process_response(req, res, spider) is res + del req.meta["download_slot"] + self.assertEqual(crawler.stats.get_value('{}/delay/banned'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value('{}/delay/banned/total'.format(prefix)), 1) + + res = self._mock_zyte_smartproxy_response( + req.url, + status=407, + headers={'X-Crawlera-Error': 'bad_proxy_auth'}, + ) + assert isinstance(mw.process_response(req, res, spider), Request) + self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1) + + res = self._mock_zyte_smartproxy_response( + req.url, + status=407, + headers={'X-Crawlera-Error': 'bad_proxy_auth'}, + ) + req.meta["zyte_smartproxy_auth_retry_times"] = 11 + assert mw.process_response(req, res, spider) is res + del req.meta["zyte_smartproxy_auth_retry_times"] + self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value('{}/retries/auth/max_reached'.format(prefix)), 1) + + res = self._mock_zyte_smartproxy_response( + req.url, + status=555, + ) + req.meta["dont_proxy"] = True + assert isinstance(mw.process_response(req, res, spider), Request) + del req.meta["dont_proxy"] + self.assertEqual(crawler.stats.get_value('{}/retries/should_have_been_enabled'.format(prefix)), 1) + + def test_stats_spm(self): + self._test_stats(self.settings, "zyte_smartproxy") + + def test_stats_zapi(self): + settings = copy(self.settings) + settings["ZYTE_SMARTPROXY_URL"] = "http://api.zyte.com:8011" + self._test_stats(settings, "zyte_api_proxy") + def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs): spider.zyte_smartproxy_enabled = zyte_smartproxy_enabled crawler = self._mock_crawler(spider, self.settings) @@ -783,15 +837,15 @@ def test_open_spider_logging(self, mock_logger): mw.open_spider(spider) expected_calls = [ call( - "Using Zyte Smart Proxy Manager at %s (apikey: %s)" % ( + "Using Zyte proxy service %s with an API key ending in %s" % ( self.mwcls.url, 'apikey' ), extra={'spider': spider}, ), call( "ZyteSmartProxyMiddleware: disabling download delays in " - "Scrapy to optimize delays introduced by Zyte Smart Proxy " - "Manager. To avoid this behaviour you can use the " + "Scrapy to optimize delays introduced by Zyte proxy services. " + "To avoid this behaviour you can use the " "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " "that this may slow down the crawl significantly", extra={'spider': spider}, @@ -889,7 +943,7 @@ def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger): mw.open_spider(self.spider) self.assertTrue(mw.enabled) mock_logger.warning.assert_called_with( - "Zyte Smart Proxy Manager cannot be used without an API key", + "Zyte proxy services cannot be used without an API key", extra={'spider': self.spider} ) @@ -902,7 +956,7 @@ def test_no_apikey_warning_force_enable(self, mock_logger): mw.open_spider(self.spider) self.assertFalse(mw.enabled) mock_logger.warning.assert_called_with( - "Zyte Smart Proxy Manager cannot be used without an API key", + "Zyte proxy services cannot be used without an API key", extra={'spider': self.spider} ) @@ -971,6 +1025,7 @@ def test_no_slot(self): # there are no slot named 'example.com' noslaves_req = Request(url, meta={'download_slot': 'example.com'}) + assert mw.process_request(noslaves_req, self.spider) is None headers = {'X-Crawlera-Error': 'noslaves'} noslaves_res = self._mock_zyte_smartproxy_response( diff --git a/tox.ini b/tox.ini index 7353e76..723652b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ # tox.ini [tox] -envlist = min,py27,py34,py35,py36,py37,py38,py39,py310,py311,docs,security +envlist = min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,docs,security [testenv] deps = @@ -14,7 +14,8 @@ basepython = python2.7 deps = Scrapy==1.4.0 six==1.10.0 - Twisted==13.1.0 # https://github.com/scrapy/scrapy/blob/1.4.0/setup.py#L45 + # https://github.com/scrapy/scrapy/blob/1.4.0/setup.py#L45 + Twisted==13.1.0 w3lib==1.17.0 -rtests/requirements.txt @@ -24,7 +25,7 @@ deps = Scrapy six # Latest Twisted that does not install an embedded version of incremental - # that’s incompatible with Python 3.4. + # that is incompatible with Python 3.4. Twisted==16.4.1 w3lib -rtests/requirements.txt