Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor LogStats extension to log IPM and RPM to stats on spider_close #4111

Merged
merged 8 commits into from
Feb 16, 2024
44 changes: 34 additions & 10 deletions scrapy/extensions/logstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@


class LogStats:
"""Log basic scraping stats periodically"""
"""Log basic scraping stats periodically like:
* RPM - Requests per Minute
* IPM - Items per Minute
"""

def __init__(self, stats, interval=60.0):
self.stats = stats
Expand All @@ -35,24 +38,45 @@
self.task.start(self.interval)

def log(self, spider):
    """Log the current crawl rates (pages/min and items/min) for *spider*.

    Called periodically by the LoopingCall started in ``spider_opened``;
    delegates the counter/rate bookkeeping to :meth:`calculate_stats` and
    emits a single INFO line.
    """
    self.calculate_stats()

    # Lazy %-style args: formatting cost is only paid when INFO is enabled.
    msg = (
        "Crawled %(pages)d pages (at %(pagerate)d pages/min), "
        "scraped %(items)d items (at %(itemrate)d items/min)"
    )
    log_args = {
        "pages": self.pages,
        "pagerate": self.prate,
        "items": self.items,
        "itemrate": self.irate,
    }
    logger.info(msg, log_args, extra={"spider": spider})

def calculate_stats(self):
self.items = self.stats.get_value("item_scraped_count", 0)
self.pages = self.stats.get_value("response_received_count", 0)
self.irate = (self.items - self.itemsprev) * self.multiplier
self.prate = (self.pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = self.pages, self.items

def spider_closed(self, spider, reason):
    """Stop the periodic log task and record whole-crawl rate stats.

    Sets ``responses_per_minute`` and ``items_per_minute`` in the stats
    collector (both ``None`` when the elapsed time is unavailable).
    """
    if self.task and self.task.running:
        self.task.stop()

    rpm, ipm = self.calculate_final_stats(spider)
    self.stats.set_value("responses_per_minute", rpm)
    self.stats.set_value("items_per_minute", ipm)

def calculate_final_stats(self, spider):
start_time = self.stats.get_value("start_time")
finished_time = self.stats.get_value("finished_time")

if not start_time or not finished_time:
return None, None

mins_elapsed = (finished_time - start_time).seconds / 60

Check warning on line 77 in scrapy/extensions/logstats.py

View check run for this annotation

Codecov / codecov/patch

scrapy/extensions/logstats.py#L77

Added line #L77 was not covered by tests

items = self.stats.get_value("item_scraped_count", 0)
pages = self.stats.get_value("response_received_count", 0)

Check warning on line 80 in scrapy/extensions/logstats.py

View check run for this annotation

Codecov / codecov/patch

scrapy/extensions/logstats.py#L79-L80

Added lines #L79 - L80 were not covered by tests

return (pages / mins_elapsed), (items / mins_elapsed)

Check warning on line 82 in scrapy/extensions/logstats.py

View check run for this annotation

Codecov / codecov/patch

scrapy/extensions/logstats.py#L82

Added line #L82 was not covered by tests
62 changes: 62 additions & 0 deletions tests/test_logstats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import unittest
from datetime import datetime

from scrapy.extensions.logstats import LogStats
from scrapy.utils.test import get_crawler
from tests.spiders import SimpleSpider


class TestLogStats(unittest.TestCase):
    """Unit tests for the LogStats extension's rate bookkeeping."""

    def setUp(self):
        self.crawler = get_crawler(SimpleSpider)
        self.spider = self.crawler._create_spider("spidey")
        self.stats = self.crawler.stats

        # Seed counters as if a crawl has already been running for a while.
        self.stats.set_value("response_received_count", 4802)
        self.stats.set_value("item_scraped_count", 3201)

    def test_stats_calculations(self):
        logstats = LogStats.from_crawler(self.crawler)

        # Before spider_opened the previous-counter attributes must not
        # exist.  Each access needs its own context manager: a second
        # statement inside a single assertRaises block would never run,
        # because the first access already raises.
        with self.assertRaises(AttributeError):
            logstats.pagesprev
        with self.assertRaises(AttributeError):
            logstats.itemsprev

        logstats.spider_opened(self.spider)
        self.assertEqual(logstats.pagesprev, 4802)
        self.assertEqual(logstats.itemsprev, 3201)

        # First tick right after opening: totals match, rates are zero.
        logstats.calculate_stats()
        self.assertEqual(logstats.items, 3201)
        self.assertEqual(logstats.pages, 4802)
        self.assertEqual(logstats.irate, 0.0)
        self.assertEqual(logstats.prate, 0.0)
        self.assertEqual(logstats.pagesprev, 4802)
        self.assertEqual(logstats.itemsprev, 3201)

        # Simulate what happens after a minute
        self.stats.set_value("response_received_count", 5187)
        self.stats.set_value("item_scraped_count", 3492)
        logstats.calculate_stats()
        self.assertEqual(logstats.items, 3492)
        self.assertEqual(logstats.pages, 5187)
        self.assertEqual(logstats.irate, 291.0)
        self.assertEqual(logstats.prate, 385.0)
        self.assertEqual(logstats.pagesprev, 5187)
        self.assertEqual(logstats.itemsprev, 3492)

        # Simulate when spider closes after running for 30 mins
        self.stats.set_value("start_time", datetime.fromtimestamp(1655100172))
        self.stats.set_value("finished_time", datetime.fromtimestamp(1655101972))
        logstats.spider_closed(self.spider, "test reason")
        self.assertEqual(self.stats.get_value("responses_per_minute"), 172.9)
        self.assertEqual(self.stats.get_value("items_per_minute"), 116.4)

    def test_stats_calculations_no_time(self):
        """The stat values should be None since the start and finish time are
        not available.
        """
        logstats = LogStats.from_crawler(self.crawler)
        logstats.spider_closed(self.spider, "test reason")
        self.assertIsNone(self.stats.get_value("responses_per_minute"))
        self.assertIsNone(self.stats.get_value("items_per_minute"))
Loading