Skip to content

Commit

Permalink
Use a different stat prefix for Zyte API, and improve wording overall to minimize confusion (#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed May 8, 2024
1 parent ffdb38f commit cf0f32c
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 57 deletions.
10 changes: 8 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ scrapy-zyte-smartproxy
:target: http://codecov.io/github/scrapy-plugins/scrapy-zyte-smartproxy?branch=master
:alt: Code Coverage

scrapy-zyte-smartproxy provides easy use of `Zyte Smart Proxy Manager
<https://www.zyte.com/smart-proxy-manager/>`_ (formerly Crawlera) with Scrapy.
scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ to use one of
Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart
Proxy Manager`_ (formerly Crawlera).

.. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html
.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html
.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/

Requirements
============
Expand Down
4 changes: 4 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ scrapy-zyte-smartproxy |version| documentation
:hidden:

headers
stats
settings
news

Expand Down Expand Up @@ -61,6 +62,9 @@ Configuration
ZYTE_SMARTPROXY_URL = "http://api.zyte.com:8011"
.. tip:: This URL is logged, so that you can tell which value was used
from crawl logs.

- To use the default Zyte Smart Proxy Manager endpoint, leave it unset.

- To use a custom Zyte Smart Proxy Manager endpoint, in case you have a
Expand Down
12 changes: 12 additions & 0 deletions docs/stats.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Stats
=====

This Scrapy plugin tracks some stats.

Stats for the `proxy mode`_ of `Zyte API`_ and stats for `Zyte Smart
Proxy Manager`_ (formerly Crawlera) use different prefixes:
``zyte_api_proxy`` and ``zyte_smartproxy``, respectively.

.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html
.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html
.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/
71 changes: 41 additions & 30 deletions scrapy_zyte_smartproxy/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _make_auth_url(self, spider):
auth = self.get_proxyauth(spider)
if not auth.startswith(b'Basic '):
raise ValueError(
'Zyte Smart Proxy Manager only supports HTTP basic access '
'Zyte proxy services only support HTTP basic access '
'authentication, but %s.%s.get_proxyauth() returned %r'
% (self.__module__, self.__class__.__name__, auth)
)
Expand All @@ -111,7 +111,7 @@ def open_spider(self, spider):

if not self.apikey:
logger.warning(
"Zyte Smart Proxy Manager cannot be used without an API key",
"Zyte proxy services cannot be used without an API key",
extra={'spider': spider},
)
return
Expand All @@ -120,7 +120,7 @@ def open_spider(self, spider):
self._authless_url = _remove_auth(self._auth_url)

logger.info(
"Using Zyte Smart Proxy Manager at %s (apikey: %s)" % (
"Using Zyte proxy service %s with an API key ending in %s" % (
self.url, self.apikey[:7]
),
extra={'spider': spider},
Expand All @@ -131,8 +131,8 @@ def open_spider(self, spider):
spider.download_delay = 0
logger.info(
"ZyteSmartProxyMiddleware: disabling download delays in "
"Scrapy to optimize delays introduced by Zyte Smart Proxy "
"Manager. To avoid this behaviour you can use the "
"Scrapy to optimize delays introduced by Zyte proxy services. "
"To avoid this behaviour you can use the "
"ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind "
"that this may slow down the crawl significantly",
extra={'spider': spider},
Expand Down Expand Up @@ -196,7 +196,9 @@ def get_proxyauth(self, spider):
return basic_auth_header(self.apikey, '')

def _targets_zyte_api(self, request):
auth_url = request.meta["proxy"]
if self._auth_url is None:
return False
auth_url = request.meta.get("proxy", self._auth_url)
targets_zyte_api = self._targets.get(auth_url, None)
if targets_zyte_api is None:
targets_zyte_api = urlparse(auth_url).hostname == "api.zyte.com"
Expand All @@ -220,6 +222,10 @@ def _translate_headers(self, request, targets_zyte_api):
request,
)

def _inc_stat(self, stat, targets_zyte_api, value=1):
prefix = "zyte_api_proxy" if targets_zyte_api else "zyte_smartproxy"
self.crawler.stats.inc_value("{}/{}".format(prefix, stat), value)

def process_request(self, request, spider):
if self._is_enabled_for_request(request):
if 'proxy' not in request.meta:
Expand All @@ -246,8 +252,8 @@ def process_request(self, request, spider):
user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client"
from scrapy_zyte_smartproxy import __version__
request.headers[user_agent_header] = 'scrapy-zyte-smartproxy/%s' % __version__
self.crawler.stats.inc_value('zyte_smartproxy/request')
self.crawler.stats.inc_value('zyte_smartproxy/request/method/%s' % request.method)
self._inc_stat("request", targets_zyte_api=targets_zyte_api)
self._inc_stat("request/method/{}".format(request.method), targets_zyte_api=targets_zyte_api)
self._translate_headers(request, targets_zyte_api=targets_zyte_api)
self._clean_zyte_smartproxy_headers(request, targets_zyte_api=targets_zyte_api)
else:
Expand Down Expand Up @@ -285,8 +291,10 @@ def _process_error(self, response):
def process_response(self, request, response, spider):
zyte_smartproxy_error = self._process_error(response)

targets_zyte_api = self._targets_zyte_api(request)

if not self._is_enabled_for_request(request):
return self._handle_not_enabled_response(request, response)
return self._handle_not_enabled_response(request, response, targets_zyte_api=targets_zyte_api)

if not self._is_zyte_smartproxy_or_zapi_response(response):
return response
Expand All @@ -299,19 +307,19 @@ def process_response(self, request, response, spider):
reason = 'noslaves'
else:
reason = 'autherror'
self._set_custom_delay(request, next(self.exp_backoff), reason=reason)
self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api)
else:
self.crawler.stats.inc_value('zyte_smartproxy/delay/reset_backoff')
self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api)
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

if self._is_auth_error(response):
# When Zyte Smart Proxy Manager has issues it might not be able to
# authenticate users we must retry
retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0)
if retries < self.max_auth_retry_times:
return self._retry_auth(response, request, spider)
return self._retry_auth(response, request, spider, targets_zyte_api=targets_zyte_api)
else:
self.crawler.stats.inc_value('zyte_smartproxy/retries/auth/max_reached')
self._inc_stat("retries/auth/max_reached", targets_zyte_api=targets_zyte_api)
logger.warning(
"Max retries for authentication issues reached, please check auth"
" information settings",
Expand All @@ -325,17 +333,17 @@ def process_response(self, request, response, spider):
else:
after = response.headers.get('retry-after')
if after:
self._set_custom_delay(request, float(after), reason='banned')
self.crawler.stats.inc_value('zyte_smartproxy/response/banned')
self._set_custom_delay(request, float(after), reason='banned', targets_zyte_api=targets_zyte_api)
self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api)
else:
self._bans[key] = 0
# If placed behind `RedirectMiddleware`, it would not count 3xx responses
self.crawler.stats.inc_value('zyte_smartproxy/response')
self.crawler.stats.inc_value('zyte_smartproxy/response/status/%s' % response.status)
self._inc_stat("response", targets_zyte_api=targets_zyte_api)
self._inc_stat("response/status/{}".format(response.status), targets_zyte_api=targets_zyte_api)
if zyte_smartproxy_error:
self.crawler.stats.inc_value('zyte_smartproxy/response/error')
self.crawler.stats.inc_value(
'zyte_smartproxy/response/error/%s' % zyte_smartproxy_error.decode('utf8'))
self._inc_stat("response/error", targets_zyte_api=targets_zyte_api)
error_msg = zyte_smartproxy_error.decode('utf8')
self._inc_stat("response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api)
return response

def process_exception(self, request, exception, spider):
Expand All @@ -344,30 +352,33 @@ def process_exception(self, request, exception, spider):
if isinstance(exception, (ConnectionRefusedError, ConnectionDone)):
# Handle Zyte Smart Proxy Manager downtime
self._clear_dns_cache()
self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused')
targets_zyte_api = self._targets_zyte_api(request)
self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused', targets_zyte_api=targets_zyte_api)

def _handle_not_enabled_response(self, request, response):
def _handle_not_enabled_response(self, request, response, targets_zyte_api):
if self._should_enable_for_response(response):
domain = self._get_url_domain(request.url)
self.enabled_for_domain[domain] = True

retryreq = request.copy()
retryreq.dont_filter = True
self.crawler.stats.inc_value('zyte_smartproxy/retries/should_have_been_enabled')
self._inc_stat("retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api)
return retryreq
return response

def _retry_auth(self, response, request, spider, targets_zyte_api):
    """Schedule a retry of *request* after a proxy authentication error.

    Returns a copy of *request* with an incremented
    ``zyte_smartproxy_auth_retry_times`` meta counter and ``dont_filter``
    set so the scheduler's dupefilter accepts the duplicate.
    """
    logger.warning(
        (
            "Retrying a request due to an authentication issue with "
            "the configured Zyte proxy service"
        ),
        # NOTE(review): logs self.spider rather than the *spider*
        # argument — presumably the same object; confirm against callers.
        extra={'spider': self.spider},
    )
    retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + 1
    retryreq = request.copy()
    retryreq.meta['zyte_smartproxy_auth_retry_times'] = retries
    retryreq.dont_filter = True
    self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api)
    return retryreq

def _clear_dns_cache(self):
Expand Down Expand Up @@ -402,7 +413,7 @@ def _get_slot(self, request):
key = self._get_slot_key(request)
return key, self.crawler.engine.downloader.slots.get(key)

def _set_custom_delay(self, request, delay, reason=None):
def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None):
"""Set custom delay for slot and save original one."""
key, slot = self._get_slot(request)
if not slot:
Expand All @@ -411,8 +422,8 @@ def _set_custom_delay(self, request, delay, reason=None):
self._saved_delays[key] = slot.delay
slot.delay = delay
if reason is not None:
self.crawler.stats.inc_value('zyte_smartproxy/delay/%s' % reason)
self.crawler.stats.inc_value('zyte_smartproxy/delay/%s/total' % reason, delay)
self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api)
self._inc_stat("delay/{}/total".format(reason), value=delay, targets_zyte_api=targets_zyte_api)

def _restore_original_delay(self, request):
"""Restore original delay for slot if it was changed."""
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup

with open("README.rst") as f:
readme = f.read()
with open("README.rst", "rb") as f:
readme = f.read().decode("utf-8")


setup(
Expand Down

0 comments on commit cf0f32c

Please sign in to comment.