diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst
index 8d4749ab33d..0286581c025 100644
--- a/docs/topics/extensions.rst
+++ b/docs/topics/extensions.rst
@@ -396,3 +396,115 @@ For more info see `Debugging in Python`_.
 This extension only works on POSIX-compliant platforms (i.e. not Windows).
 
 .. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/
+
+Periodic log extension
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. class:: PeriodicLog
+
+This extension periodically logs rich stat data as a JSON object::
+
+    2023-08-04 02:30:57 [scrapy.extensions.logstats] INFO: Crawled 976 pages (at 162 pages/min), scraped 925 items (at 161 items/min)
+    2023-08-04 02:30:57 [scrapy.extensions.periodic_log] INFO: {
+        "delta": {
+            "downloader/request_bytes": 55582,
+            "downloader/request_count": 162,
+            "downloader/request_method_count/GET": 162,
+            "downloader/response_bytes": 618133,
+            "downloader/response_count": 162,
+            "downloader/response_status_count/200": 162,
+            "item_scraped_count": 161
+        },
+        "stats": {
+            "downloader/request_bytes": 338243,
+            "downloader/request_count": 992,
+            "downloader/request_method_count/GET": 992,
+            "downloader/response_bytes": 3836736,
+            "downloader/response_count": 976,
+            "downloader/response_status_count/200": 976,
+            "item_scraped_count": 925,
+            "log_count/INFO": 21,
+            "log_count/WARNING": 1,
+            "scheduler/dequeued": 992,
+            "scheduler/dequeued/memory": 992,
+            "scheduler/enqueued": 1050,
+            "scheduler/enqueued/memory": 1050
+        },
+        "time": {
+            "elapsed": 360.008903,
+            "log_interval": 60.0,
+            "log_interval_real": 60.006694,
+            "start_time": "2023-08-03 23:24:57",
+            "utcnow": "2023-08-03 23:30:57"
+        }
+    }
+
+This extension logs the following configurable sections:
+
+- ``"delta"`` shows how some numeric stats have changed since the last stats
+  log message.
+
+  The :setting:`PERIODIC_LOG_DELTA` setting determines the target stats. They
+  must have ``int`` or ``float`` values.
+
+- ``"stats"`` shows the current value of some stats.
+
+  The :setting:`PERIODIC_LOG_STATS` setting determines the target stats.
+
+- ``"time"`` shows detailed timing data.
+
+  The :setting:`PERIODIC_LOG_TIMING_ENABLED` setting determines whether or
+  not to show this section.
+
+This extension logs data at the start, then on a fixed time interval
+configurable through the :setting:`LOGSTATS_INTERVAL` setting, and finally
+right before the crawl ends.
+
+
+Example extension configuration:
+
+.. code-block:: python
+
+    custom_settings = {
+        "LOG_LEVEL": "INFO",
+        "PERIODIC_LOG_STATS": {
+            "include": ["downloader/", "scheduler/", "log_count/", "item_scraped_count"],
+        },
+        "PERIODIC_LOG_DELTA": {"include": ["downloader/"]},
+        "PERIODIC_LOG_TIMING_ENABLED": True,
+        "EXTENSIONS": {
+            "scrapy.extensions.periodic_log.PeriodicLog": 0,
+        },
+    }
+
+.. setting:: PERIODIC_LOG_DELTA
+
+PERIODIC_LOG_DELTA
+""""""""""""""""""
+
+Default: ``None``
+
+* ``"PERIODIC_LOG_DELTA": True`` - show deltas for all ``int`` and ``float`` stat values.
+* ``"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}`` - show deltas for stats whose names contain any configured substring.
+* ``"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}`` - show deltas for all stats whose names do not contain any configured substring.
+
+.. setting:: PERIODIC_LOG_STATS
+
+PERIODIC_LOG_STATS
+""""""""""""""""""
+
+Default: ``None``
+
+* ``"PERIODIC_LOG_STATS": True`` - show the current value of all stats.
+* ``"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}`` - show current values for stats whose names contain any configured substring.
+* ``"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}`` - show current values for all stats whose names do not contain any configured substring.
+
+
+.. setting:: PERIODIC_LOG_TIMING_ENABLED
+
+PERIODIC_LOG_TIMING_ENABLED
+"""""""""""""""""""""""""""
+
+Default: ``False``
+
+``True`` enables logging of timing data (i.e. the ``"time"`` section).
\ No newline at end of file
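The ``include``/``exclude`` lists accepted by :setting:`PERIODIC_LOG_STATS` and
:setting:`PERIODIC_LOG_DELTA` above are plain substring filters on stat names,
with ``exclude`` taking precedence. A minimal sketch of that behaviour (not part
of the patch; ``allowed()`` is a hypothetical stand-in for the extension's
``param_allowed()`` method):

.. code-block:: python

    def allowed(stat_name, include=(), exclude=()):
        # No filters configured: everything passes.
        if not include and not exclude:
            return True
        # Any excluded substring rejects the stat outright.
        if any(p in stat_name for p in exclude):
            return False
        # With only "exclude" configured, everything else passes.
        if exclude and not include:
            return True
        # Otherwise the stat must match at least one included substring.
        return any(p in stat_name for p in include)


    # Equivalent of {"include": ["downloader/"], "exclude": ["bytes"]}
    assert allowed("downloader/request_count", ["downloader/"], ["bytes"])
    assert not allowed("downloader/request_bytes", ["downloader/"], ["bytes"])
    assert not allowed("item_scraped_count", ["downloader/"], ["bytes"])
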
+* ``"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}`` - show current values for stats with names containing any configured substring. +* ``"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}`` - show current values for all stats with names not containing any configured substring. + + +.. setting:: PERIODIC_LOG_TIMING_ENABLED + +PERIODIC_LOG_TIMING_ENABLED +""""""""""""""""""""""""""" + +Default: ``False`` + +``True`` enables logging of timing data (i.e. the ``"time"`` section). \ No newline at end of file diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py new file mode 100644 index 00000000000..2d557f123a9 --- /dev/null +++ b/scrapy/extensions/periodic_log.py @@ -0,0 +1,140 @@ +import logging +from datetime import datetime, timezone + +from twisted.internet import task + +from scrapy import signals +from scrapy.exceptions import NotConfigured +from scrapy.utils.serialize import ScrapyJSONEncoder + +logger = logging.getLogger(__name__) + + +class PeriodicLog: + """Log basic scraping stats periodically""" + + def __init__( + self, + stats, + interval=60.0, + ext_stats={}, + ext_delta={}, + ext_timing_enabled=False, + ): + self.stats = stats + self.interval = interval + self.multiplier = 60.0 / self.interval + self.task = None + self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4) + self.ext_stats_enabled = bool(ext_stats) + self.ext_stats_include = ext_stats.get("include", []) + self.ext_stats_exclude = ext_stats.get("exclude", []) + self.ext_delta_enabled = bool(ext_delta) + self.ext_delta_include = ext_delta.get("include", []) + self.ext_delta_exclude = ext_delta.get("exclude", []) + self.ext_timing_enabled = ext_timing_enabled + + @classmethod + def from_crawler(cls, crawler): + interval = crawler.settings.getfloat("LOGSTATS_INTERVAL") + if not interval: + raise NotConfigured + try: + ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS") + except (TypeError, ValueError): + ext_stats = ( + {"enabled": True} + if crawler.settings.getbool("PERIODIC_LOG_STATS") + else None + ) + try: + ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA") + except (TypeError, ValueError): + ext_delta = ( + {"enabled": True} + if crawler.settings.getbool("PERIODIC_LOG_DELTA") + else None + ) + + ext_timing_enabled = crawler.settings.getbool( + "PERIODIC_LOG_TIMING_ENABLED", False + ) + if not (ext_stats or ext_delta or ext_timing_enabled): + raise NotConfigured + o = cls( + crawler.stats, + interval, + ext_stats, + ext_delta, + ext_timing_enabled, + ) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + return o + + def spider_opened(self, spider): + self.time_prev = datetime.now(tz=timezone.utc) + self.delta_prev = {} + self.stats_prev = {} + + self.task = task.LoopingCall(self.log) + self.task.start(self.interval) + + def log(self): + data = {} + if self.ext_timing_enabled: + data.update(self.log_timing()) + if self.ext_delta_enabled: + data.update(self.log_delta()) + if self.ext_stats_enabled: + data.update(self.log_crawler_stats()) + logger.info(self.encoder.encode(data)) + + def log_delta(self): + num_stats = { + k: v + for k, v in self.stats._stats.items() + if isinstance(v, (int, float)) + and self.param_allowed(k, self.ext_delta_include, self.ext_delta_exclude) + } + delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()} + self.delta_prev = num_stats + return {"delta": delta} + + def log_timing(self): + now = 
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index feb6e8f6cdb..fa06e5ee8e8 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -242,6 +242,10 @@
 NEWSPIDER_MODULE = ""
 
+PERIODIC_LOG_DELTA = None
+PERIODIC_LOG_STATS = None
+PERIODIC_LOG_TIMING_ENABLED = False
+
 RANDOMIZE_DOWNLOAD_DELAY = True
 
 REACTOR_THREADPOOL_MAXSIZE = 10
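With these defaults the extension stays off: ``from_crawler()`` raises
``NotConfigured`` unless :setting:`LOGSTATS_INTERVAL` is non-zero and at least
one of the three settings requests some output. A hypothetical per-project
override in ``settings.py`` could look like:

.. code-block:: python

    # settings.py of a hypothetical project, overriding the defaults above
    LOG_LEVEL = "INFO"
    PERIODIC_LOG_STATS = {"include": ["downloader/", "scheduler/"]}
    PERIODIC_LOG_DELTA = True  # deltas for every int/float stat
    PERIODIC_LOG_TIMING_ENABLED = True
    EXTENSIONS = {
        "scrapy.extensions.periodic_log.PeriodicLog": 0,
    }
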
diff --git a/tests/test_extension_periodic_log.py b/tests/test_extension_periodic_log.py
new file mode 100644
index 00000000000..80f5c317769
--- /dev/null
+++ b/tests/test_extension_periodic_log.py
@@ -0,0 +1,198 @@
+import datetime
+import typing
+import unittest
+
+from scrapy.crawler import Crawler
+from scrapy.extensions.periodic_log import PeriodicLog
+
+from .spiders import MetaSpider
+
+stats_dump_1 = {
+    "log_count/INFO": 10,
+    "log_count/WARNING": 1,
+    "start_time": datetime.datetime(2023, 6, 16, 8, 59, 18, 993170),
+    "scheduler/enqueued/memory": 190,
+    "scheduler/enqueued": 190,
+    "scheduler/dequeued/memory": 166,
+    "scheduler/dequeued": 166,
+    "downloader/request_count": 166,
+    "downloader/request_method_count/GET": 166,
+    "downloader/request_bytes": 56803,
+    "downloader/response_count": 150,
+    "downloader/response_status_count/200": 150,
+    "downloader/response_bytes": 595698,
+    "httpcompression/response_bytes": 3186068,
+    "httpcompression/response_count": 150,
+    "response_received_count": 150,
+    "request_depth_max": 9,
+    "dupefilter/filtered": 180,
+    "item_scraped_count": 140,
+}
+stats_dump_2 = {
+    "log_count/INFO": 12,
+    "log_count/WARNING": 1,
+    "start_time": datetime.datetime(2023, 6, 16, 8, 59, 18, 993170),
+    "scheduler/enqueued/memory": 337,
+    "scheduler/enqueued": 337,
+    "scheduler/dequeued/memory": 280,
+    "scheduler/dequeued": 280,
+    "downloader/request_count": 280,
+    "downloader/request_method_count/GET": 280,
+    "downloader/request_bytes": 95754,
+    "downloader/response_count": 264,
+    "downloader/response_status_count/200": 264,
+    "downloader/response_bytes": 1046274,
+    "httpcompression/response_bytes": 5614484,
+    "httpcompression/response_count": 264,
+    "response_received_count": 264,
+    "request_depth_max": 16,
+    "dupefilter/filtered": 320,
+    "item_scraped_count": 248,
+}
+
+
+class TestExtPeriodicLog(PeriodicLog):
+    def set_a(self):
+        self.stats._stats = stats_dump_1
+
+    def set_b(self):
+        self.stats._stats = stats_dump_2
+
+
+def extension(settings=None):
+    return TestExtPeriodicLog.from_crawler(
+        Crawler(
+            MetaSpider,
+            settings=settings,
+        )
+    )
+
+
+class TestPeriodicLog(unittest.TestCase):
+    def test_extension_enabled(self):
+        # Expect the extension settings to load successfully, and the
+        # extension to raise NotConfigured under certain conditions.
+
+        # "PERIODIC_LOG_STATS": True -> set to {"enabled": True}
+        # due to TypeError exception from settings.getdict
+        assert extension({"PERIODIC_LOG_STATS": True, "LOGSTATS_INTERVAL": 60})
+
+        # "PERIODIC_LOG_STATS": "True" -> set to {"enabled": True}
+        # due to JSONDecodeError(ValueError) exception from settings.getdict
+        assert extension({"PERIODIC_LOG_STATS": "True", "LOGSTATS_INTERVAL": 60})
+
+        # The same for PERIODIC_LOG_DELTA:
+        assert extension({"PERIODIC_LOG_DELTA": True, "LOGSTATS_INTERVAL": 60})
+        assert extension({"PERIODIC_LOG_DELTA": "True", "LOGSTATS_INTERVAL": 60})
+
+    def test_log_delta(self):
+        def emulate(settings=None):
+            spider = MetaSpider()
+            ext = extension(settings)
+            ext.spider_opened(spider)
+            ext.set_a()
+            a = ext.log_delta()
+            ext.set_b()
+            b = ext.log_delta()
+            ext.spider_closed(spider, reason="finished")
+            return ext, a, b
+
+        def check(settings: dict, condition: typing.Callable):
+            ext, a, b = emulate(settings)
+            assert list(a["delta"].keys()) == [
+                k for k, v in ext.stats._stats.items() if condition(k, v)
+            ]
+            assert list(b["delta"].keys()) == [
+                k for k, v in ext.stats._stats.items() if condition(k, v)
+            ]
+
+        # Including all
+        check({"PERIODIC_LOG_DELTA": True}, lambda k, v: isinstance(v, (int, float)))
+
+        # include:
+        check(
+            {"PERIODIC_LOG_DELTA": {"include": ["downloader/"]}},
+            lambda k, v: isinstance(v, (int, float)) and "downloader/" in k,
+        )
+
+        # include multiple
+        check(
+            {"PERIODIC_LOG_DELTA": {"include": ["downloader/", "scheduler/"]}},
+            lambda k, v: isinstance(v, (int, float))
+            and ("downloader/" in k or "scheduler/" in k),
+        )
+
+        # exclude
+        check(
+            {"PERIODIC_LOG_DELTA": {"exclude": ["downloader/"]}},
+            lambda k, v: isinstance(v, (int, float)) and "downloader/" not in k,
+        )
+
+        # exclude multiple
+        check(
+            {"PERIODIC_LOG_DELTA": {"exclude": ["downloader/", "scheduler/"]}},
+            lambda k, v: isinstance(v, (int, float))
+            and ("downloader/" not in k and "scheduler/" not in k),
+        )
+
+        # include exclude combined
+        check(
+            {"PERIODIC_LOG_DELTA": {"include": ["downloader/"], "exclude": ["bytes"]}},
+            lambda k, v: isinstance(v, (int, float))
+            and ("downloader/" in k and "bytes" not in k),
+        )
+
+    def test_log_stats(self):
+        def emulate(settings=None):
+            spider = MetaSpider()
+            ext = extension(settings)
+            ext.spider_opened(spider)
+            ext.set_a()
+            a = ext.log_crawler_stats()
+            ext.set_b()
+            b = ext.log_crawler_stats()
+            ext.spider_closed(spider, reason="finished")
+            return ext, a, b
+
+        def check(settings: dict, condition: typing.Callable):
+            ext, a, b = emulate(settings)
+            assert list(a["stats"].keys()) == [
+                k for k, v in ext.stats._stats.items() if condition(k, v)
+            ]
+            assert list(b["stats"].keys()) == [
+                k for k, v in ext.stats._stats.items() if condition(k, v)
+            ]
+
+        # Including all
+        check({"PERIODIC_LOG_STATS": True}, lambda k, v: True)
+
+        # include:
+        check(
+            {"PERIODIC_LOG_STATS": {"include": ["downloader/"]}},
+            lambda k, v: "downloader/" in k,
+        )
+
+        # include multiple
+        check(
+            {"PERIODIC_LOG_STATS": {"include": ["downloader/", "scheduler/"]}},
+            lambda k, v: "downloader/" in k or "scheduler/" in k,
+        )
+
+        # exclude
+        check(
+            {"PERIODIC_LOG_STATS": {"exclude": ["downloader/"]}},
+            lambda k, v: "downloader/" not in k,
+        )
+
+        # exclude multiple
+        check(
+            {"PERIODIC_LOG_STATS": {"exclude": ["downloader/", "scheduler/"]}},
+            lambda k, v: "downloader/" not in k and "scheduler/" not in k,
+        )
+
+        # include exclude combined
+        check(
+            {"PERIODIC_LOG_STATS": {"include": ["downloader/"], "exclude": ["bytes"]}},
+            lambda k, v: "downloader/" in k and "bytes" not in k,
+        )
+#
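For completeness, a self-contained way to see the periodic JSON log in action;
the spider below and its target site are purely illustrative and not part of
this patch:

.. code-block:: python

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class QuotesSpider(scrapy.Spider):
        """Throwaway spider that just generates some stats to log."""

        name = "quotes"
        start_urls = ["https://quotes.toscrape.com/"]

        def parse(self, response):
            for text in response.css("div.quote span.text::text").getall():
                yield {"text": text}


    process = CrawlerProcess(
        settings={
            "LOG_LEVEL": "INFO",
            "LOGSTATS_INTERVAL": 60.0,
            "PERIODIC_LOG_STATS": {"include": ["downloader/", "item_scraped_count"]},
            "PERIODIC_LOG_DELTA": {"include": ["downloader/"]},
            "PERIODIC_LOG_TIMING_ENABLED": True,
            "EXTENSIONS": {"scrapy.extensions.periodic_log.PeriodicLog": 0},
        }
    )
    process.crawl(QuotesSpider)
    process.start()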