From 33153855ea04d95545970970c3be08226b679f10 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Sun, 7 May 2023 22:29:23 +0300 Subject: [PATCH 01/24] periodic_log: implemented as separate extension --- scrapy/extensions/logstats_extended.py | 112 +++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 scrapy/extensions/logstats_extended.py diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/logstats_extended.py new file mode 100644 index 00000000000..7cba0dc2b98 --- /dev/null +++ b/scrapy/extensions/logstats_extended.py @@ -0,0 +1,112 @@ +import logging +from datetime import datetime + +from twisted.internet import task + +from scrapy import signals +from scrapy.exceptions import NotConfigured +from scrapy.utils.serialize import ScrapyJSONEncoder + +logger = logging.getLogger(__name__) + + +class LogStatsExtended: + """Log basic scraping stats periodically""" + + def __init__( + self, stats, interval=60.0, extended=False, ext_include=None, ext_exclude=None + ): + self.stats = stats + self.interval = interval + self.multiplier = 60.0 / self.interval + self.task = None + self.extended = extended + self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4) + self.ext_include = ext_include + self.ext_exclude = ext_exclude + + @classmethod + def from_crawler(cls, crawler): + interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") + extended = crawler.settings.getbool("LOGSTATS_EXTENDED_ENABLED") + ext_include = crawler.settings.getlist("LOGSTATS_EXTENDED_INCLUDE", []) + ext_exclude = crawler.settings.getlist("LOGSTATS_EXTENDED_EXCLUDE", []) + if not interval: + raise NotConfigured + o = cls(crawler.stats, interval, extended, ext_include, ext_exclude) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + return o + + def spider_opened(self, spider): + self.time_prev = datetime.utcnow() + self.delta_prev = {} + self.stats_prev = {} + + self.task = task.LoopingCall(self.log, spider) + self.task.start(self.interval) + + def log(self, spider): + data = {} + data.update(self.log_timing()) + data.update(self.log_delta()) + data.update(self.log_crawler_stats()) + logger.info(self.encoder.encode(data)) + + + def log_delta(self): + num_stats = { + k: v + for k, v in self.stats._stats.items() + if isinstance(v, (int, float)) and self.delta_param_allowed(k) + } + delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()} + self.delta_prev = num_stats + return {"delta": delta} + + def log_timing(self): + now = datetime.utcnow() + time = { + "log_interval": self.interval, + "start_time": self.stats._stats["start_time"], + "utcnow": now, + "log_interval_real": (now - self.time_prev).total_seconds(), + "elapsed": (now - self.stats._stats["start_time"]).total_seconds(), + } + self.time_prev = now + return {"time": time} + + def log_time(self): + num_stats = { + k: v + for k, v in self.stats._stats.items() + if isinstance(v, (int, float)) and self.delta_param_allowed(k) + } + delta = {k: v - self.stats_prev.get(k, 0) for k, v in num_stats.items()} + self.stats_prev = num_stats + return {"delta": delta} + + def log_crawler_stats(self): + return {"stats": self.stats.get_stats()} + + def delta_param_allowed(self, stat_name): + for p in self.ext_exclude: + if p in stat_name: + return False + for p in self.ext_include: + if p in stat_name: + return True + if self.ext_include: + return False + else: + return True + + def spider_closed(self, spider, reason): + data = {} + data.update(self.log_timing()) + data.update(self.log_delta()) + data.update(self.log_crawler_stats()) + logger.info(self.encoder.encode(data)) + + if self.task and self.task.running: + self.task.stop() \ No newline at end of file From 84fb0edd5fc9c994931dc8931bf3d8a44a63aa57 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Sun, 21 May 2023 20:22:35 +0300 Subject: [PATCH 02/24] periodic_log: extension updated --- scrapy/extensions/logstats_extended.py | 94 ++++++++++++++++---------- 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/logstats_extended.py index 7cba0dc2b98..63703dace3c 100644 --- a/scrapy/extensions/logstats_extended.py +++ b/scrapy/extensions/logstats_extended.py @@ -14,26 +14,57 @@ class LogStatsExtended: """Log basic scraping stats periodically""" def __init__( - self, stats, interval=60.0, extended=False, ext_include=None, ext_exclude=None + self, stats, interval=60.0, + ext_stats_enabled=False, + ext_stats_include=None, + ext_stats_exclude=None, + ext_delta_enabled=False, + ext_delta_include=None, + ext_delta_exclude=None, + ext_timing_enabled=False, + ext_timing_include=None, + ext_timing_exclude=None, ): self.stats = stats self.interval = interval self.multiplier = 60.0 / self.interval self.task = None - self.extended = extended self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4) - self.ext_include = ext_include - self.ext_exclude = ext_exclude + self.ext_stats_enabled = ext_stats_enabled + self.ext_stats_include = ext_stats_include + self.ext_stats_exclude = ext_stats_exclude + self.ext_delta_enabled = ext_delta_enabled + self.ext_delta_include = ext_delta_include + self.ext_delta_exclude = ext_delta_exclude + self.ext_timing_enabled = ext_timing_enabled + self.ext_timing_include = ext_timing_include + self.ext_timing_exclude = ext_timing_exclude + @classmethod def from_crawler(cls, crawler): interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") - extended = crawler.settings.getbool("LOGSTATS_EXTENDED_ENABLED") - ext_include = crawler.settings.getlist("LOGSTATS_EXTENDED_INCLUDE", []) - ext_exclude = crawler.settings.getlist("LOGSTATS_EXTENDED_EXCLUDE", []) + ext_stats_enabled = crawler.settings.getbool("LOGSTATS_EXT_STATS_ENABLED") + ext_stats_include = crawler.settings.getlist("LOGSTATS_EXT_STATS_INCLUDE", []) + ext_stats_exclude = crawler.settings.getlist("LOGSTATS_EXT_STATS_EXCLUDE", []) + ext_delta_enabled = crawler.settings.getbool("LOGSTATS_EXT_DELTA_ENABLED") + ext_delta_include = crawler.settings.getlist("LOGSTATS_EXT_DELTA_INCLUDE", []) + ext_delta_exclude = crawler.settings.getlist("LOGSTATS_EXT_DELTA_EXCLUDE", []) + ext_timing_enabled = crawler.settings.getbool("LOGSTATS_EXT_TIMING_ENABLED") if not interval: raise NotConfigured - o = cls(crawler.stats, interval, extended, ext_include, ext_exclude) + if not (ext_stats_enabled or ext_delta_enabled or ext_timing_enabled): + raise NotConfigured + o = cls(crawler.stats, + interval, + ext_stats_enabled, + ext_stats_include, + ext_stats_exclude, + ext_delta_enabled, + ext_delta_include, + ext_delta_exclude, + ext_timing_enabled, + ) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o @@ -48,17 +79,19 @@ def spider_opened(self, spider): def log(self, spider): data = {} - data.update(self.log_timing()) - data.update(self.log_delta()) - data.update(self.log_crawler_stats()) + if self.ext_timing_enabled: + data.update(self.log_timing()) + if self.ext_delta_enabled: + data.update(self.log_delta()) + if self.ext_stats_enabled: + data.update(self.log_crawler_stats()) logger.info(self.encoder.encode(data)) - def log_delta(self): num_stats = { k: v for k, v in self.stats._stats.items() - if isinstance(v, (int, float)) and self.delta_param_allowed(k) + if isinstance(v, (int, float)) and self.param_allowed(k,self.ext_delta_include,self.ext_delta_exclude) } delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()} self.delta_prev = num_stats @@ -76,37 +109,24 @@ def log_timing(self): self.time_prev = now return {"time": time} - def log_time(self): - num_stats = { + def log_crawler_stats(self): + stats = { k: v for k, v in self.stats._stats.items() - if isinstance(v, (int, float)) and self.delta_param_allowed(k) - } - delta = {k: v - self.stats_prev.get(k, 0) for k, v in num_stats.items()} - self.stats_prev = num_stats - return {"delta": delta} + if self.param_allowed(k,self.ext_stats_include, self.ext_stats_exclude)} + return {"stats": stats} - def log_crawler_stats(self): - return {"stats": self.stats.get_stats()} - - def delta_param_allowed(self, stat_name): - for p in self.ext_exclude: + def param_allowed(self, stat_name, include, exclude): + for p in exclude: if p in stat_name: return False - for p in self.ext_include: + for p in include: if p in stat_name: return True - if self.ext_include: - return False - else: - return True + return False - def spider_closed(self, spider, reason): - data = {} - data.update(self.log_timing()) - data.update(self.log_delta()) - data.update(self.log_crawler_stats()) - logger.info(self.encoder.encode(data)) + def spider_closed(self, spider, reason): + self.log(spider) if self.task and self.task.running: - self.task.stop() \ No newline at end of file + self.task.stop() From a2f238d927329ac308edcf09fde4e85dd9f61e44 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Mon, 22 May 2023 23:24:53 +0300 Subject: [PATCH 03/24] periodic_log: settings input in extension updated --- scrapy/extensions/logstats_extended.py | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/logstats_extended.py index 63703dace3c..4f970d3d7a1 100644 --- a/scrapy/extensions/logstats_extended.py +++ b/scrapy/extensions/logstats_extended.py @@ -15,54 +15,52 @@ class LogStatsExtended: def __init__( self, stats, interval=60.0, - ext_stats_enabled=False, - ext_stats_include=None, - ext_stats_exclude=None, - ext_delta_enabled=False, - ext_delta_include=None, - ext_delta_exclude=None, + ext_stats={}, + ext_delta={}, ext_timing_enabled=False, - ext_timing_include=None, - ext_timing_exclude=None, ): self.stats = stats self.interval = interval self.multiplier = 60.0 / self.interval self.task = None self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4) - self.ext_stats_enabled = ext_stats_enabled - self.ext_stats_include = ext_stats_include - self.ext_stats_exclude = ext_stats_exclude - self.ext_delta_enabled = ext_delta_enabled - self.ext_delta_include = ext_delta_include - self.ext_delta_exclude = ext_delta_exclude + self.ext_stats_enabled = bool(ext_stats) + self.ext_stats_include = ext_stats.get("include", []) + self.ext_stats_exclude = ext_stats.get("exclude", []) + self.ext_delta_enabled = bool(ext_delta) + self.ext_delta_include = ext_delta.get("include", []) + self.ext_delta_exclude = ext_delta.get("exclude", []) self.ext_timing_enabled = ext_timing_enabled - self.ext_timing_include = ext_timing_include - self.ext_timing_exclude = ext_timing_exclude @classmethod def from_crawler(cls, crawler): interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") + try: + ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS") + except: + ext_stats = {"enabled": True} if crawler.settings.getbool("PERIODIC_LOG_STATS") else None + + try: + ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA") + except: + ext_delta = {"enabled": True} if crawler.settings.getdict("PERIODIC_LOG_DELTA") else None + ext_timing_enabled = crawler.settings.getbool("PERIODIC_LOG_TIMING_ENABLED", False) ext_stats_enabled = crawler.settings.getbool("LOGSTATS_EXT_STATS_ENABLED") ext_stats_include = crawler.settings.getlist("LOGSTATS_EXT_STATS_INCLUDE", []) ext_stats_exclude = crawler.settings.getlist("LOGSTATS_EXT_STATS_EXCLUDE", []) ext_delta_enabled = crawler.settings.getbool("LOGSTATS_EXT_DELTA_ENABLED") ext_delta_include = crawler.settings.getlist("LOGSTATS_EXT_DELTA_INCLUDE", []) ext_delta_exclude = crawler.settings.getlist("LOGSTATS_EXT_DELTA_EXCLUDE", []) - ext_timing_enabled = crawler.settings.getbool("LOGSTATS_EXT_TIMING_ENABLED") + #ext_timing_enabled = crawler.settings.getbool("LOGSTATS_EXT_TIMING_ENABLED") if not interval: raise NotConfigured - if not (ext_stats_enabled or ext_delta_enabled or ext_timing_enabled): + if not (ext_stats or ext_delta or ext_timing_enabled): raise NotConfigured o = cls(crawler.stats, interval, - ext_stats_enabled, - ext_stats_include, - ext_stats_exclude, - ext_delta_enabled, - ext_delta_include, - ext_delta_exclude, + ext_stats, + ext_delta, ext_timing_enabled, ) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) @@ -117,6 +115,8 @@ def log_crawler_stats(self): return {"stats": stats} def param_allowed(self, stat_name, include, exclude): + if not include and not exclude: + return True for p in exclude: if p in stat_name: return False From db794d351c6218c8b849682bb1aa0048788e7f1d Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Mon, 22 May 2023 23:30:00 +0300 Subject: [PATCH 04/24] periodic_log: not used code deleted --- scrapy/extensions/logstats_extended.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/logstats_extended.py index 4f970d3d7a1..9c9a4fe4a16 100644 --- a/scrapy/extensions/logstats_extended.py +++ b/scrapy/extensions/logstats_extended.py @@ -46,13 +46,6 @@ def from_crawler(cls, crawler): except: ext_delta = {"enabled": True} if crawler.settings.getdict("PERIODIC_LOG_DELTA") else None ext_timing_enabled = crawler.settings.getbool("PERIODIC_LOG_TIMING_ENABLED", False) - ext_stats_enabled = crawler.settings.getbool("LOGSTATS_EXT_STATS_ENABLED") - ext_stats_include = crawler.settings.getlist("LOGSTATS_EXT_STATS_INCLUDE", []) - ext_stats_exclude = crawler.settings.getlist("LOGSTATS_EXT_STATS_EXCLUDE", []) - ext_delta_enabled = crawler.settings.getbool("LOGSTATS_EXT_DELTA_ENABLED") - ext_delta_include = crawler.settings.getlist("LOGSTATS_EXT_DELTA_INCLUDE", []) - ext_delta_exclude = crawler.settings.getlist("LOGSTATS_EXT_DELTA_EXCLUDE", []) - #ext_timing_enabled = crawler.settings.getbool("LOGSTATS_EXT_TIMING_ENABLED") if not interval: raise NotConfigured if not (ext_stats or ext_delta or ext_timing_enabled): @@ -72,10 +65,10 @@ def spider_opened(self, spider): self.delta_prev = {} self.stats_prev = {} - self.task = task.LoopingCall(self.log, spider) + self.task = task.LoopingCall(self.log) self.task.start(self.interval) - def log(self, spider): + def log(self): data = {} if self.ext_timing_enabled: data.update(self.log_timing()) @@ -127,6 +120,6 @@ def param_allowed(self, stat_name, include, exclude): def spider_closed(self, spider, reason): - self.log(spider) + self.log() if self.task and self.task.running: self.task.stop() From 5c91f1bb43c00f9d64d16ecc1a5a976e14895ea5 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Tue, 23 May 2023 23:00:26 +0300 Subject: [PATCH 05/24] periodic_log: added settings to default settings --- scrapy/settings/default_settings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 260ec1701c7..9660e0bcd05 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -240,6 +240,10 @@ NEWSPIDER_MODULE = "" +PERIODIC_LOG_DELTA = None +PERIODIC_LOG_STATS = None +PERIODIC_LOG_TIMING_ENABLED = None + RANDOMIZE_DOWNLOAD_DELAY = True REACTOR_THREADPOOL_MAXSIZE = 10 From a0c84903b703a9405f2807873837ebf274873a60 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Tue, 23 May 2023 23:07:12 +0300 Subject: [PATCH 06/24] periodic_log: codestyle fix (from pre-commit) --- scrapy/extensions/logstats_extended.py | 51 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/logstats_extended.py index 9c9a4fe4a16..158c828681c 100644 --- a/scrapy/extensions/logstats_extended.py +++ b/scrapy/extensions/logstats_extended.py @@ -14,10 +14,12 @@ class LogStatsExtended: """Log basic scraping stats periodically""" def __init__( - self, stats, interval=60.0, - ext_stats={}, - ext_delta={}, - ext_timing_enabled=False, + self, + stats, + interval=60.0, + ext_stats={}, + ext_delta={}, + ext_timing_enabled=False, ): self.stats = stats self.interval = interval @@ -32,30 +34,40 @@ def __init__( self.ext_delta_exclude = ext_delta.get("exclude", []) self.ext_timing_enabled = ext_timing_enabled - @classmethod def from_crawler(cls, crawler): interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") try: ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS") - except: - ext_stats = {"enabled": True} if crawler.settings.getbool("PERIODIC_LOG_STATS") else None + except ValueError: + ext_stats = ( + {"enabled": True} + if crawler.settings.getbool("PERIODIC_LOG_STATS") + else None + ) try: ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA") - except: - ext_delta = {"enabled": True} if crawler.settings.getdict("PERIODIC_LOG_DELTA") else None - ext_timing_enabled = crawler.settings.getbool("PERIODIC_LOG_TIMING_ENABLED", False) + except ValueError: + ext_delta = ( + {"enabled": True} + if crawler.settings.getdict("PERIODIC_LOG_DELTA") + else None + ) + ext_timing_enabled = crawler.settings.getbool( + "PERIODIC_LOG_TIMING_ENABLED", False + ) if not interval: raise NotConfigured if not (ext_stats or ext_delta or ext_timing_enabled): raise NotConfigured - o = cls(crawler.stats, - interval, - ext_stats, - ext_delta, - ext_timing_enabled, - ) + o = cls( + crawler.stats, + interval, + ext_stats, + ext_delta, + ext_timing_enabled, + ) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o @@ -82,7 +94,8 @@ def log_delta(self): num_stats = { k: v for k, v in self.stats._stats.items() - if isinstance(v, (int, float)) and self.param_allowed(k,self.ext_delta_include,self.ext_delta_exclude) + if isinstance(v, (int, float)) + and self.param_allowed(k, self.ext_delta_include, self.ext_delta_exclude) } delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()} self.delta_prev = num_stats @@ -104,7 +117,8 @@ def log_crawler_stats(self): stats = { k: v for k, v in self.stats._stats.items() - if self.param_allowed(k,self.ext_stats_include, self.ext_stats_exclude)} + if self.param_allowed(k, self.ext_stats_include, self.ext_stats_exclude) + } return {"stats": stats} def param_allowed(self, stat_name, include, exclude): @@ -118,7 +132,6 @@ def param_allowed(self, stat_name, include, exclude): return True return False - def spider_closed(self, spider, reason): self.log() if self.task and self.task.running: From b60e0faf22eaee9cb26357c4c6637fba7e20f125 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Wed, 24 May 2023 23:10:30 +0300 Subject: [PATCH 07/24] periodic_log: fixed naming --- scrapy/extensions/{logstats_extended.py => periodic_log.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename scrapy/extensions/{logstats_extended.py => periodic_log.py} (99%) diff --git a/scrapy/extensions/logstats_extended.py b/scrapy/extensions/periodic_log.py similarity index 99% rename from scrapy/extensions/logstats_extended.py rename to scrapy/extensions/periodic_log.py index 158c828681c..3e496096ca1 100644 --- a/scrapy/extensions/logstats_extended.py +++ b/scrapy/extensions/periodic_log.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -class LogStatsExtended: +class PeriodicLog: """Log basic scraping stats periodically""" def __init__( From 639c2bcc47ecd700917aa2a48b9c63f41327f977 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 16 Jun 2023 13:37:26 +0300 Subject: [PATCH 08/24] periodic_log: TypeError except added --- scrapy/extensions/periodic_log.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py index 3e496096ca1..0f01c441c79 100644 --- a/scrapy/extensions/periodic_log.py +++ b/scrapy/extensions/periodic_log.py @@ -45,6 +45,12 @@ def from_crawler(cls, crawler): if crawler.settings.getbool("PERIODIC_LOG_STATS") else None ) + except TypeError: + ext_stats = ( + {"enabled": True} + if crawler.settings.getbool("PERIODIC_LOG_STATS") + else None + ) try: ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA") @@ -54,6 +60,13 @@ def from_crawler(cls, crawler): if crawler.settings.getdict("PERIODIC_LOG_DELTA") else None ) + except TypeError: + ext_stats = ( + {"enabled": True} + if crawler.settings.getbool("PERIODIC_LOG_DELTA") + else None + ) + ext_timing_enabled = crawler.settings.getbool( "PERIODIC_LOG_TIMING_ENABLED", False ) From 6e65eeb07beb6805d7f6d0a139c98f58c41a7bc3 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 16 Jun 2023 14:30:50 +0300 Subject: [PATCH 09/24] periodic_log: tests [wip] added --- tests/test_extension_periodic_log.py | 79 ++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/test_extension_periodic_log.py diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py new file mode 100644 index 00000000000..9f7ec7b230c --- /dev/null +++ b/tests/test_extension_periodic_log.py @@ -0,0 +1,79 @@ +import datetime +import unittest + +from scrapy.crawler import Crawler +from scrapy.exceptions import NotConfigured +from scrapy.extensions.periodic_log import PeriodicLog + +from .spiders import MetaSpider + +stats_dump_1 = { + "log_count/INFO": 10, + "log_count/WARNING": 1, + "start_time": datetime.datetime(2023, 6, 16, 8, 59, 18, 993170), + "scheduler/enqueued/memory": 190, + "scheduler/enqueued": 190, + "scheduler/dequeued/memory": 166, + "scheduler/dequeued": 166, + "downloader/request_count": 166, + "downloader/request_method_count/GET": 166, + "downloader/request_bytes": 56803, + "downloader/response_count": 150, + "downloader/response_status_count/200": 150, + "downloader/response_bytes": 595698, + "httpcompression/response_bytes": 3186068, + "httpcompression/response_count": 150, + "response_received_count": 150, + "request_depth_max": 9, + "dupefilter/filtered": 180, + "item_scraped_count": 140, +} +stats_dump_2 = { + "log_count/INFO": 12, + "log_count/WARNING": 1, + "start_time": datetime.datetime(2023, 6, 16, 8, 59, 18, 993170), + "scheduler/enqueued/memory": 337, + "scheduler/enqueued": 337, + "scheduler/dequeued/memory": 280, + "scheduler/dequeued": 280, + "downloader/request_count": 280, + "downloader/request_method_count/GET": 280, + "downloader/request_bytes": 95754, + "downloader/response_count": 264, + "downloader/response_status_count/200": 264, + "downloader/response_bytes": 1046274, + "httpcompression/response_bytes": 5614484, + "httpcompression/response_count": 264, + "response_received_count": 264, + "request_depth_max": 16, + "dupefilter/filtered": 320, + "item_scraped_count": 248, +} + + +class TestPeriodicLog(unittest.TestCase): + def test_extension_enabled(self): + extension = PeriodicLog.from_crawler( + Crawler( + MetaSpider, + settings={"PERIODIC_LOG_STATS": True, "LOGSTATS_INTERVAL": 60}, + ) + ) + # Test enabled + assert extension + + # Raise not configured if not set by settings + with self.assertRaises(NotConfigured): + PeriodicLog.from_crawler(Crawler(MetaSpider)) + + def test_periodic_log_stats(self): + pass + + def test_log_delta(self): + pass + + def test_settings_include(self): + pass + + def test_settings_exclude(self): + pass From 2ce4856508f7092368349c0ebf296dd1391a229a Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Sun, 25 Jun 2023 12:16:59 +0300 Subject: [PATCH 10/24] periodic_log: Exception handling on init updated --- scrapy/extensions/periodic_log.py | 17 +------ tests/test_extension_periodic_log.py | 76 ++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py index 0f01c441c79..b1f5b8894a3 100644 --- a/scrapy/extensions/periodic_log.py +++ b/scrapy/extensions/periodic_log.py @@ -39,29 +39,16 @@ def from_crawler(cls, crawler): interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") try: ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS") - except ValueError: + except (TypeError, ValueError): ext_stats = ( {"enabled": True} if crawler.settings.getbool("PERIODIC_LOG_STATS") else None ) - except TypeError: - ext_stats = ( - {"enabled": True} - if crawler.settings.getbool("PERIODIC_LOG_STATS") - else None - ) - try: ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA") - except ValueError: + except (TypeError, ValueError): ext_delta = ( - {"enabled": True} - if crawler.settings.getdict("PERIODIC_LOG_DELTA") - else None - ) - except TypeError: - ext_stats = ( {"enabled": True} if crawler.settings.getbool("PERIODIC_LOG_DELTA") else None diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py index 9f7ec7b230c..4dccc687bcc 100644 --- a/tests/test_extension_periodic_log.py +++ b/tests/test_extension_periodic_log.py @@ -51,20 +51,78 @@ } +def extension(settings=None): + return PeriodicLog.from_crawler( + Crawler( + MetaSpider, + settings=settings, + ) + ) + + class TestPeriodicLog(unittest.TestCase): def test_extension_enabled(self): - extension = PeriodicLog.from_crawler( - Crawler( - MetaSpider, - settings={"PERIODIC_LOG_STATS": True, "LOGSTATS_INTERVAL": 60}, - ) - ) - # Test enabled - assert extension + # Expected that settings for this extension loaded succesfully + # And on certain conditions - extension raising NotConfigured + + # "PERIODIC_LOG_STATS": True -> set to {"enabled": True} + # due to TypeError exception from settings.getdict + assert extension({"PERIODIC_LOG_STATS": True, "LOGSTATS_INTERVAL": 60}) + + # "PERIODIC_LOG_STATS": "True" -> set to {"enabled": True} + # due to JSONDecodeError(ValueError) exception from settings.getdict + assert extension({"PERIODIC_LOG_STATS": "True", "LOGSTATS_INTERVAL": 60}) + + # The ame for PERIODIC_LOG_DELTA: + assert extension({"PERIODIC_LOG_DELTA": True, "LOGSTATS_INTERVAL": 60}) + assert extension({"PERIODIC_LOG_DELTA": "True", "LOGSTATS_INTERVAL": 60}) # Raise not configured if not set by settings with self.assertRaises(NotConfigured): - PeriodicLog.from_crawler(Crawler(MetaSpider)) + extension() + + # Regular use cases: + assert extension( + { + "PERIODIC_LOG_STATS": { + "include": [ + "downloader/", + "scheduler/", + "log_count/", + "item_scraped_count/", + ], + "exclude": ["scheduler/"], + } + } + ) + + assert extension( + { + "PERIODIC_LOG_DELTA": {"include": ["downloader/"]}, + "PERIODIC_LOG_TIMING_ENABLED": True, + } + ) + + assert extension( + { + "PERIODIC_LOG_TIMING_ENABLED": True, + } + ) + + assert extension( + { + "PERIODIC_LOG_STATS": { + "include": [ + "downloader/", + "scheduler/", + "log_count/", + "item_scraped_count/", + ], + "exclude": ["scheduler/"], + }, + "PERIODIC_LOG_DELTA": {"include": ["downloader/"]}, + } + ) def test_periodic_log_stats(self): pass From ebce5b4bcb8b0af172dce063d9a220359202cf36 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Tue, 27 Jun 2023 09:22:50 +0300 Subject: [PATCH 11/24] periodic_log: tests for logging deltas added --- tests/test_extension_periodic_log.py | 69 ++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py index 4dccc687bcc..1df030548af 100644 --- a/tests/test_extension_periodic_log.py +++ b/tests/test_extension_periodic_log.py @@ -51,8 +51,16 @@ } +class TestExtPeriodicLog(PeriodicLog): + def set_a(self): + self.stats._stats = stats_dump_1 + + def set_b(self): + self.stats._stats = stats_dump_2 + + def extension(settings=None): - return PeriodicLog.from_crawler( + return TestExtPeriodicLog.from_crawler( Crawler( MetaSpider, settings=settings, @@ -124,14 +132,57 @@ def test_extension_enabled(self): } ) - def test_periodic_log_stats(self): - pass - def test_log_delta(self): - pass + def emulate(settings=None): + ext = extension(settings) + ext.spider_opened(MetaSpider) + ext.set_a() + a = ext.log_delta() + ext.set_a() + b = ext.log_delta() + return ext, a, b + + def check(settings: dict, condition: callable): + ext, a, b = emulate(settings) + assert list(a["delta"].keys()) == [ + k for k, v in ext.stats._stats.items() if condition(k, v) + ] + assert list(b["delta"].keys()) == [ + k for k, v in ext.stats._stats.items() if condition(k, v) + ] + + # Including all + check({"PERIODIC_LOG_DELTA": True}, lambda k, v: isinstance(v, (int, float))) + + # include: + check( + {"PERIODIC_LOG_DELTA": {"include": ["downloader/"]}}, + lambda k, v: isinstance(v, (int, float)) and "downloader/" in k, + ) - def test_settings_include(self): - pass + # include multiple + check( + {"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}}, + lambda k, v: isinstance(v, (int, float)) + and ("downloader/" in k or "scheduler/" in k), + ) - def test_settings_exclude(self): - pass + # exclude + check( + {"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}}, + lambda k, v: isinstance(v, (int, float)) and "downloader/" not in k, + ) + + # exclude multiple + check( + {"PERIODIC_LOG_DELTA": {"exclude": ["downloader/", "scheduler/"]}}, + lambda k, v: isinstance(v, (int, float)) + and ("downloader/" not in k and "scheduler/" not in k), + ) + + # include exclude combined + check( + {"PERIODIC_LOG_DELTA": {"include": ["downloader/"], "exclude": ["bytes"]}}, + lambda k, v: isinstance(v, (int, float)) + and ("downloader/" in k and "scheduler/" not in k), + ) From 315861c31d1bb1b2430883267b6973604b34a147 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Tue, 27 Jun 2023 09:24:03 +0300 Subject: [PATCH 12/24] periodic_log: stats filtering updated --- scrapy/extensions/periodic_log.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py index b1f5b8894a3..1023b3cd518 100644 --- a/scrapy/extensions/periodic_log.py +++ b/scrapy/extensions/periodic_log.py @@ -127,6 +127,8 @@ def param_allowed(self, stat_name, include, exclude): for p in exclude: if p in stat_name: return False + if exclude and not include: + return True for p in include: if p in stat_name: return True From 56c38231b49f4f49fe5e215e46ec80c71755d44e Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Tue, 27 Jun 2023 11:48:51 +0300 Subject: [PATCH 13/24] periodic_log: tests for logging stats added --- tests/test_extension_periodic_log.py | 55 +++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py index 1df030548af..053613510b9 100644 --- a/tests/test_extension_periodic_log.py +++ b/tests/test_extension_periodic_log.py @@ -184,5 +184,58 @@ def check(settings: dict, condition: callable): check( {"PERIODIC_LOG_DELTA": {"include": ["downloader/"], "exclude": ["bytes"]}}, lambda k, v: isinstance(v, (int, float)) - and ("downloader/" in k and "scheduler/" not in k), + and ("downloader/" in k and "bytes" not in k), ) + + def test_log_stats(self): + def emulate(settings=None): + ext = extension(settings) + ext.spider_opened(MetaSpider) + ext.set_a() + a = ext.log_crawler_stats() + ext.set_a() + b = ext.log_crawler_stats() + return ext, a, b + + def check(settings: dict, condition: callable): + ext, a, b = emulate(settings) + assert list(a["stats"].keys()) == [ + k for k, v in ext.stats._stats.items() if condition(k, v) + ] + assert list(b["stats"].keys()) == [ + k for k, v in ext.stats._stats.items() if condition(k, v) + ] + + # Including all + check({"PERIODIC_LOG_STATS": True}, lambda k, v: True) + + # include: + check( + {"PERIODIC_LOG_STATS": {"include": ["downloader/"]}}, + lambda k, v: "downloader/" in k, + ) + + # include multiple + check( + {"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}}, + lambda k, v: "downloader/" in k or "scheduler/" in k, + ) + + # exclude + check( + {"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}}, + lambda k, v: "downloader/" not in k, + ) + + # exclude multiple + check( + {"PERIODIC_LOG_STATS": {"exclude": ["downloader/", "scheduler/"]}}, + lambda k, v: "downloader/" not in k and "scheduler/" not in k, + ) + + # include exclude combined + check( + {"PERIODIC_LOG_STATS": {"include": ["downloader/"], "exclude": ["bytes"]}}, + lambda k, v: "downloader/" in k and "bytes" not in k, + ) + # From 6fd94fdcb3d633015372f8834308b7a0dcde5bf7 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Wed, 28 Jun 2023 19:55:33 +0300 Subject: [PATCH 14/24] periodic_log: tests updated (errors fixed) --- tests/test_extension_periodic_log.py | 56 +++------------------------- 1 file changed, 6 insertions(+), 50 deletions(-) diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py index 053613510b9..242e390b3ea 100644 --- a/tests/test_extension_periodic_log.py +++ b/tests/test_extension_periodic_log.py @@ -2,7 +2,6 @@ import unittest from scrapy.crawler import Crawler -from scrapy.exceptions import NotConfigured from scrapy.extensions.periodic_log import PeriodicLog from .spiders import MetaSpider @@ -85,61 +84,16 @@ def test_extension_enabled(self): assert extension({"PERIODIC_LOG_DELTA": True, "LOGSTATS_INTERVAL": 60}) assert extension({"PERIODIC_LOG_DELTA": "True", "LOGSTATS_INTERVAL": 60}) - # Raise not configured if not set by settings - with self.assertRaises(NotConfigured): - extension() - - # Regular use cases: - assert extension( - { - "PERIODIC_LOG_STATS": { - "include": [ - "downloader/", - "scheduler/", - "log_count/", - "item_scraped_count/", - ], - "exclude": ["scheduler/"], - } - } - ) - - assert extension( - { - "PERIODIC_LOG_DELTA": {"include": ["downloader/"]}, - "PERIODIC_LOG_TIMING_ENABLED": True, - } - ) - - assert extension( - { - "PERIODIC_LOG_TIMING_ENABLED": True, - } - ) - - assert extension( - { - "PERIODIC_LOG_STATS": { - "include": [ - "downloader/", - "scheduler/", - "log_count/", - "item_scraped_count/", - ], - "exclude": ["scheduler/"], - }, - "PERIODIC_LOG_DELTA": {"include": ["downloader/"]}, - } - ) - def test_log_delta(self): def emulate(settings=None): + spider = MetaSpider() ext = extension(settings) - ext.spider_opened(MetaSpider) + ext.spider_opened(spider) ext.set_a() a = ext.log_delta() ext.set_a() b = ext.log_delta() + ext.spider_closed(spider, reason="finished") return ext, a, b def check(settings: dict, condition: callable): @@ -189,12 +143,14 @@ def check(settings: dict, condition: callable): def test_log_stats(self): def emulate(settings=None): + spider = MetaSpider() ext = extension(settings) - ext.spider_opened(MetaSpider) + ext.spider_opened(spider) ext.set_a() a = ext.log_crawler_stats() ext.set_a() b = ext.log_crawler_stats() + ext.spider_closed(spider, reason="finished") return ext, a, b def check(settings: dict, condition: callable): From 8b6a50a935b3dd822f7647c828c17f5409db541f Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 4 Aug 2023 11:12:07 +0300 Subject: [PATCH 15/24] periodic_log: docs added --- docs/topics/extensions.rst | 101 +++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 96e0216b8f0..ae94c55a460 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -383,3 +383,104 @@ For more info see `Debugging in Python`_. This extension only works on POSIX-compliant platforms (i.e. not Windows). .. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/ + +Periodic log extension +~~~~~~~~~~~~~~~~~~~~~~ + +.. class:: PeriodicLog + +Extension provides extended stats data periodically in addition to basic data from Log Stats and Core Stats extensions (as JSON compatible dictionary) like: :: + + 2023-08-04 02:30:57 [scrapy.extensions.logstats] INFO: Crawled 976 pages (at 162 pages/min), scraped 925 items (at 161 items/min) + 2023-08-04 02:30:57 [scrapy.extensions.periodic_log] INFO: { + "delta": { + "downloader/request_bytes": 55582, + "downloader/request_count": 162, + "downloader/request_method_count/GET": 162, + "downloader/response_bytes": 618133, + "downloader/response_count": 162, + "downloader/response_status_count/200": 162, + "item_scraped_count": 161 + }, + "stats": { + "downloader/request_bytes": 338243, + "downloader/request_count": 992, + "downloader/request_method_count/GET": 992, + "downloader/response_bytes": 3836736, + "downloader/response_count": 976, + "downloader/response_status_count/200": 976, + "item_scraped_count": 925, + "log_count/INFO": 21, + "log_count/WARNING": 1, + "scheduler/dequeued": 992, + "scheduler/dequeued/memory": 992, + "scheduler/enqueued": 1050, + "scheduler/enqueued/memory": 1050 + }, + "time": { + "elapsed": 360.008903, + "log_interval": 60.0, + "log_interval_real": 60.006694, + "start_time": "2023-08-03 23:24:57", + "utcnow": "2023-08-03 23:30:57" + } + } + +``"delta"`` section shows numeric difference in stats values between current and previous log entry with period of ``LOGSTATS_INTERVAL`` (60 seconds by default). Its applicable for stats with values types ``int`` and ``float``. +Stats values displayed in this section configured by :setting:`PERIODIC_LOG_DELTA` setting. + +``"stats"`` section shows stats values as is at the moment of current period. +Stats values displayed in this section configured by :setting:`PERIODIC_LOG_STATS` setting. + +``"time"`` This extension produce log entries on startup, periodically, and on end of crawl. As final log entry produced earlier than ``LOGSTATS_INTERVAL`` value - detailed timing data required for more precise stats. + +Configured by :setting:`PERIODIC_LOG_TIMING_ENABLED` + + +Example extension configuration: + +.. code-block:: python + + custom_settings = { + "LOG_LEVEL": "INFO", + "PERIODIC_LOG_STATS": { + "include": ["downloader/", "scheduler/", "log_count/", "item_scraped_count/"], + }, + "PERIODIC_LOG_DELTA": {"include": ["downloader/"]}, + "PERIODIC_LOG_TIMING_ENABLED": True, + "EXTENSIONS": { + "scrapy.extensions.periodic_log.PeriodicLog": 0, + }, + } + +.. setting:: PERIODIC_LOG_DELTA + +PERIODIC_LOG_DELTA +"""""""""""""""""" + +Default: ``None`` + +* ``"PERIODIC_LOG_DELTA": True`` - show deltas for all ``int`` and ``float`` stats values. +* ``"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}`` - include stats deltas for stats with names that have listed substrings in stats names. +* ``"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}`` - include all stats deltas except stats with listed substrings in stats names. + +.. setting:: PERIODIC_LOG_STATS + +PERIODIC_LOG_STATS +"""""""""""""""""" + +Default: ``None`` + +* ``"PERIODIC_LOG_STATS": True`` - show all available stats keys/values +* ``"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}`` - include stats for keys that have listed substrings in stats names. +* ``"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}`` - include all stats deltas except stats with listed substrings in stats names. + + +.. setting:: PERIODIC_LOG_TIMING_ENABLED + +PERIODIC_LOG_TIMING_ENABLED +""""""""""""""""""""""""""" + +Default: ``None`` + +``"PERIODIC_LOG_TIMING_ENABLED": True`` - enables logging of timing data \ No newline at end of file From e9b088f1fb430009e4ea4e5b5a3836b4b94364eb Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 4 Aug 2023 12:49:22 +0300 Subject: [PATCH 16/24] periodic_log: typing --- tests/test_extension_periodic_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py index 242e390b3ea..80f5c317769 100644 --- a/tests/test_extension_periodic_log.py +++ b/tests/test_extension_periodic_log.py @@ -1,4 +1,5 @@ import datetime +import typing import unittest from scrapy.crawler import Crawler @@ -96,7 +97,7 @@ def emulate(settings=None): ext.spider_closed(spider, reason="finished") return ext, a, b - def check(settings: dict, condition: callable): + def check(settings: dict, condition: typing.Callable): ext, a, b = emulate(settings) assert list(a["delta"].keys()) == [ k for k, v in ext.stats._stats.items() if condition(k, v) @@ -153,7 +154,7 @@ def emulate(settings=None): ext.spider_closed(spider, reason="finished") return ext, a, b - def check(settings: dict, condition: callable): + def check(settings: dict, condition: typing.Callable): ext, a, b = emulate(settings) assert list(a["stats"].keys()) == [ k for k, v in ext.stats._stats.items() if condition(k, v) From f05657e54245eff9f912914014c51b37b0bc6b34 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Thu, 10 Aug 2023 21:31:10 +0300 Subject: [PATCH 17/24] periodic_log: interval check moved --- scrapy/extensions/periodic_log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py index 1023b3cd518..214f55777fe 100644 --- a/scrapy/extensions/periodic_log.py +++ b/scrapy/extensions/periodic_log.py @@ -37,6 +37,8 @@ def __init__( @classmethod def from_crawler(cls, crawler): interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") + if not interval: + raise NotConfigured try: ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS") except (TypeError, ValueError): @@ -57,8 +59,6 @@ def from_crawler(cls, crawler): ext_timing_enabled = crawler.settings.getbool( "PERIODIC_LOG_TIMING_ENABLED", False ) - if not interval: - raise NotConfigured if not (ext_stats or ext_delta or ext_timing_enabled): raise NotConfigured o = cls( From 736a4b615c08cc7e610ddb18f5ac782f0aedca61 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Thu, 10 Aug 2023 21:34:35 +0300 Subject: [PATCH 18/24] Update scrapy/settings/default_settings.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- scrapy/settings/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9660e0bcd05..f74b07c20d0 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -242,7 +242,7 @@ PERIODIC_LOG_DELTA = None PERIODIC_LOG_STATS = None -PERIODIC_LOG_TIMING_ENABLED = None +PERIODIC_LOG_TIMING_ENABLED = False RANDOMIZE_DOWNLOAD_DELAY = True From 2f094a7a5ca080ec6b43c1db4e0a70a385a0f756 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 11 Aug 2023 18:48:14 +0300 Subject: [PATCH 19/24] Update docs/topics/extensions.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/topics/extensions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index ae94c55a460..e280224b1dc 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -481,6 +481,6 @@ Default: ``None`` PERIODIC_LOG_TIMING_ENABLED """"""""""""""""""""""""""" -Default: ``None`` +Default: ``False`` -``"PERIODIC_LOG_TIMING_ENABLED": True`` - enables logging of timing data \ No newline at end of file +``True`` enables logging of timing data (i.e. the ``"time"`` section). \ No newline at end of file From e6bd9829bddba9d3301d8c883e80632944a19e79 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 11 Aug 2023 18:48:31 +0300 Subject: [PATCH 20/24] Update docs/topics/extensions.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/topics/extensions.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index e280224b1dc..439e3e06eb2 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -471,9 +471,9 @@ PERIODIC_LOG_STATS Default: ``None`` -* ``"PERIODIC_LOG_STATS": True`` - show all available stats keys/values -* ``"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}`` - include stats for keys that have listed substrings in stats names. -* ``"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}`` - include all stats deltas except stats with listed substrings in stats names. +* ``"PERIODIC_LOG_STATS": True`` - show the current value of all stats. +* ``"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}`` - show current values for stats with names containing any configured substring. +* ``"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}`` - show current values for all stats with names not containing any configured substring. .. setting:: PERIODIC_LOG_TIMING_ENABLED From 3a4a949f9d3d64eb7124545813cf46cf63910149 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 11 Aug 2023 18:48:48 +0300 Subject: [PATCH 21/24] Update docs/topics/extensions.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/topics/extensions.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 439e3e06eb2..e345618b6e8 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -460,9 +460,9 @@ PERIODIC_LOG_DELTA Default: ``None`` -* ``"PERIODIC_LOG_DELTA": True`` - show deltas for all ``int`` and ``float`` stats values. -* ``"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}`` - include stats deltas for stats with names that have listed substrings in stats names. -* ``"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}`` - include all stats deltas except stats with listed substrings in stats names. +* ``"PERIODIC_LOG_DELTA": True`` - show deltas for all ``int`` and ``float`` stat values. +* ``"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}`` - show deltas for stats with names containing any configured substring. +* ``"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}`` - show deltas for all stats with names not containing any configured substring. .. setting:: PERIODIC_LOG_STATS From d67be20b2d1f9345b3b47bff8018e157c654142d Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 11 Aug 2023 18:48:58 +0300 Subject: [PATCH 22/24] Update docs/topics/extensions.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/topics/extensions.rst | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index e345618b6e8..692ad72fe7c 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -426,15 +426,26 @@ Extension provides extended stats data periodically in addition to basic data fr } } -``"delta"`` section shows numeric difference in stats values between current and previous log entry with period of ``LOGSTATS_INTERVAL`` (60 seconds by default). Its applicable for stats with values types ``int`` and ``float``. -Stats values displayed in this section configured by :setting:`PERIODIC_LOG_DELTA` setting. +This extension logs the following configurable sections: -``"stats"`` section shows stats values as is at the moment of current period. -Stats values displayed in this section configured by :setting:`PERIODIC_LOG_STATS` setting. +- ``"delta"`` shows how some numeric stats have changed since the last stats + log message. + + The :setting:`PERIODIC_LOG_DELTA` setting determines the target stats. They + must have ``int`` or ``float`` values. -``"time"`` This extension produce log entries on startup, periodically, and on end of crawl. As final log entry produced earlier than ``LOGSTATS_INTERVAL`` value - detailed timing data required for more precise stats. +- ``"stats"`` shows the current value of some stats. -Configured by :setting:`PERIODIC_LOG_TIMING_ENABLED` + The :setting:`PERIODIC_LOG_STATS` setting determines the target stats. + +- ``"time"`` shows detailed timing data. + + The :setting:`PERIODIC_LOG_TIMING_ENABLED` setting determines whether or + not to show this section. + +This extension logs data at the start, then on a fixed time interval +configurable through the :setting:`LOGSTATS_INTERVAL` setting, and finally +right before the crawl ends. Example extension configuration: From ac1694a9adc301c5bc2a340e034e0702b7d1a3fc Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Fri, 11 Aug 2023 18:49:09 +0300 Subject: [PATCH 23/24] Update docs/topics/extensions.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/topics/extensions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 692ad72fe7c..3c1c7d16db4 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -389,7 +389,7 @@ Periodic log extension .. class:: PeriodicLog -Extension provides extended stats data periodically in addition to basic data from Log Stats and Core Stats extensions (as JSON compatible dictionary) like: :: +This extension periodically logs rich stat data as a JSON object:: 2023-08-04 02:30:57 [scrapy.extensions.logstats] INFO: Crawled 976 pages (at 162 pages/min), scraped 925 items (at 161 items/min) 2023-08-04 02:30:57 [scrapy.extensions.periodic_log] INFO: { From 7e542846e4d9b33f7b6f5b984cea0de13f47e8c2 Mon Sep 17 00:00:00 2001 From: Georgiy Zatserklianyi Date: Mon, 21 Aug 2023 17:13:18 +0300 Subject: [PATCH 24/24] periodic_log: `datetime.utcnow()` changed to `datetime.now(tz=timezone.utc)` --- scrapy/extensions/periodic_log.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py index 214f55777fe..2d557f123a9 100644 --- a/scrapy/extensions/periodic_log.py +++ b/scrapy/extensions/periodic_log.py @@ -1,5 +1,5 @@ import logging -from datetime import datetime +from datetime import datetime, timezone from twisted.internet import task @@ -73,7 +73,7 @@ def from_crawler(cls, crawler): return o def spider_opened(self, spider): - self.time_prev = datetime.utcnow() + self.time_prev = datetime.now(tz=timezone.utc) self.delta_prev = {} self.stats_prev = {} @@ -102,7 +102,7 @@ def log_delta(self): return {"delta": delta} def log_timing(self): - now = datetime.utcnow() + now = datetime.now(tz=timezone.utc) time = { "log_interval": self.interval, "start_time": self.stats._stats["start_time"],