Commit
Add stats extension based on Redis (#186)
Germey committed Mar 14, 2021
1 parent 95b4751 commit dee413a
Showing 3 changed files with 84 additions and 1 deletion.
3 changes: 3 additions & 0 deletions README.rst
@@ -73,6 +73,9 @@ Use the following settings in your project:
# Ensure all spiders share the same duplicates filter through Redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Enable stats sharing through Redis (see the inspection example below).
STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"
# Default requests serializer is pickle, but it can be changed to any module
# with loads and dumps functions. Note that pickle is not compatible between
# python versions.
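With STATS_CLASS enabled, stats live in a Redis hash rather than in per-process memory, so any Redis client can read them. A minimal sketch of inspecting the shared stats from outside Scrapy, assuming a local Redis instance and a hypothetical spider named "myspider" (the hash name follows the STATS_KEY default, '%(spider)s:stats'):

    import redis

    # Connect to the same Redis instance the crawl uses
    # (host and port here are assumptions).
    server = redis.StrictRedis(host='localhost', port=6379)

    # hgetall returns the whole stats hash as bytes keys/values.
    for key, value in server.hgetall('myspider:stats').items():
        print(key.decode('utf-8'), value.decode('utf-8'))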
4 changes: 3 additions & 1 deletion src/scrapy_redis/defaults.py
@@ -6,6 +6,8 @@

PIPELINE_KEY = '%(spider)s:items'

STATS_KEY = '%(spider)s:stats'

REDIS_CLS = redis.StrictRedis
REDIS_ENCODING = 'utf-8'
# Sane connection defaults.
@@ -20,7 +22,7 @@
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

SCHEDULER_PERSIST = False
START_URLS_KEY = '%(name)s:start_urls'
START_URLS_AS_SET = False
START_URLS_AS_ZSET = False
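These key constants are printf-style templates that scrapy-redis fills in with the spider name at runtime (exactly what _get_key in stats.py does). A quick illustration of the expansion, with a stand-in spider name:

    # Printf-style key templates expand with the spider name at runtime.
    STATS_KEY = '%(spider)s:stats'
    print(STATS_KEY % {'spider': 'myspider'})      # myspider:stats

    START_URLS_KEY = '%(name)s:start_urls'
    print(START_URLS_KEY % {'name': 'myspider'})   # myspider:start_urls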
78 changes: 78 additions & 0 deletions src/scrapy_redis/stats.py
@@ -0,0 +1,78 @@
from scrapy.statscollectors import StatsCollector
from .connection import from_settings as redis_from_settings
from .defaults import STATS_KEY, SCHEDULER_PERSIST


class RedisStatsCollector(StatsCollector):
"""
Stats Collector based on Redis
"""

def __init__(self, crawler, spider=None):
super().__init__(crawler)
self.server = redis_from_settings(crawler.settings)
self.spider = spider
self.spider_name = spider.name if spider else crawler.spidercls.name
self.stats_key = crawler.settings.get('STATS_KEY', STATS_KEY)
self.persist = crawler.settings.get(
'SCHEDULER_PERSIST', SCHEDULER_PERSIST)

def _get_key(self, spider=None):
"""Return the hash name of stats"""
if spider:
self.stats_key % {'spider': spider.name}
if self.spider:
return self.stats_key % {'spider': self.spider.name}
return self.stats_key % {'spider': self.spider_name or 'scrapy'}

@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def get_value(self, key, default=None, spider=None):
"""Return the value of hash stats"""
if self.server.hexists(self._get_key(spider), key):
return int(self.server.hget(self._get_key(spider), key))
else:
return default

def get_stats(self, spider=None):
"""Return the all of the values of hash stats"""
return self.server.hgetall(self._get_key(spider))

def set_value(self, key, value, spider=None):
"""Set the value according to hash key of stats"""
self.server.hset(self._get_key(spider), key, value)

    def set_stats(self, stats, spider=None):
        """Set multiple fields of the stats hash at once."""
        # Note: hmset() is deprecated in redis-py 3.5+;
        # hset(name, mapping=stats) is the modern equivalent.
        self.server.hmset(self._get_key(spider), stats)

def inc_value(self, key, count=1, start=0, spider=None):
"""Set increment of value according to key"""
if not self.server.hexists(self._get_key(spider), key):
self.set_value(key, start)
self.server.hincrby(self._get_key(spider), key, count)

def max_value(self, key, value, spider=None):
"""Set max value between current and new value"""
self.set_value(key, max(self.get_value(key, value), value))

def min_value(self, key, value, spider=None):
"""Set min value between current and new value"""
self.set_value(key, min(self.get_value(key, value), value))

def clear_stats(self, spider=None):
"""Clarn all the hash stats"""
self.server.delete(self._get_key(spider))

def open_spider(self, spider):
"""Set spider to self"""
if spider:
self.spider = spider

def close_spider(self, spider, reason):
"""Clear spider and clear stats"""
self.spider = None
if not self.persist:
self.clear_stats(spider)
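With STATS_CLASS pointing at this collector, the standard Scrapy stats API writes to the shared Redis hash instead of per-process memory. A minimal usage sketch; the spider name and stat keys here are illustrative, not required by scrapy-redis:

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'
        start_urls = ['https://example.com']

        def parse(self, response):
            # These calls go through RedisStatsCollector and land in Redis,
            # so every crawler process sees the same counters.
            self.crawler.stats.inc_value('custom/pages_seen')
            self.crawler.stats.max_value('custom/max_depth',
                                         response.meta.get('depth', 0))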
