From bdf12f775062fda8aa8bf03f7b4faade4faac16d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?=
Date: Fri, 18 Jan 2019 11:38:59 -0200
Subject: [PATCH 1/2] Logging the request referer when DUPEFILTER_DEBUG is active

---
 scrapy/dupefilters.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scrapy/dupefilters.py b/scrapy/dupefilters.py
index 9d8966b9c6e..0bcdd349561 100644
--- a/scrapy/dupefilters.py
+++ b/scrapy/dupefilters.py
@@ -3,8 +3,7 @@
 import logging
 
 from scrapy.utils.job import job_dir
-from scrapy.utils.request import request_fingerprint
-
+from scrapy.utils.request import referer_str, request_fingerprint
 
 class BaseDupeFilter(object):
 
@@ -61,8 +60,9 @@ def close(self, reason):
 
     def log(self, request, spider):
         if self.debug:
-            msg = "Filtered duplicate request: %(request)s"
-            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
+            args = {'request': request, 'referer': referer_str(request) }
+            self.logger.debug(msg, args, extra={'spider': spider})
         elif self.logdupes:
             msg = ("Filtered duplicate request: %(request)s"
                    " - no more duplicates will be shown"

From 8eade7d8640e112faf8677f4666bbe3ab10c7234 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar=20Batista?=
Date: Fri, 18 Jan 2019 11:39:35 -0200
Subject: [PATCH 2/2] Testing stats and log messages from RFPDupeFilter

---
 tests/test_dupefilters.py | 57 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/tests/test_dupefilters.py b/tests/test_dupefilters.py
index db69597a296..d7eb98c97c0 100644
--- a/tests/test_dupefilters.py
+++ b/tests/test_dupefilters.py
@@ -2,6 +2,7 @@
 import tempfile
 import unittest
 import shutil
+from testfixtures import LogCapture
 
 from scrapy.dupefilters import RFPDupeFilter
 from scrapy.http import Request
@@ -9,7 +10,7 @@
 from scrapy.utils.python import to_bytes
 from scrapy.utils.job import job_dir
 from scrapy.utils.test import get_crawler
-
+from tests.spiders import SimpleSpider
 
 class FromCrawlerRFPDupeFilter(RFPDupeFilter):
 
@@ -126,3 +127,57 @@ def request_fingerprint(self, request):
         assert case_insensitive_dupefilter.request_seen(r2)
 
         case_insensitive_dupefilter.close('finished')
+
+    def test_log(self):
+        with LogCapture() as l:
+            settings = {'DUPEFILTER_DEBUG': False,
+                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
+            crawler = get_crawler(SimpleSpider, settings_dict=settings)
+            scheduler = Scheduler.from_crawler(crawler)
+            spider = SimpleSpider.from_crawler(crawler)
+
+            dupefilter = scheduler.df
+            dupefilter.open()
+
+            r1 = Request('http://scrapytest.org/index.html')
+            r2 = Request('http://scrapytest.org/index.html')
+
+            dupefilter.log(r1, spider)
+            dupefilter.log(r2, spider)
+
+            assert crawler.stats.get_value('dupefilter/filtered') == 2
+            l.check_present(('scrapy.dupefilters', 'DEBUG',
+                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
+                 ' - no more duplicates will be shown'
+                 ' (see DUPEFILTER_DEBUG to show all duplicates)')))
+
+            dupefilter.close('finished')
+
+    def test_log_debug(self):
+        with LogCapture() as l:
+            settings = {'DUPEFILTER_DEBUG': True,
+                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
+            crawler = get_crawler(SimpleSpider, settings_dict=settings)
+            scheduler = Scheduler.from_crawler(crawler)
+            spider = SimpleSpider.from_crawler(crawler)
+
+            dupefilter = scheduler.df
+            dupefilter.open()
+
+            r1 = Request('http://scrapytest.org/index.html')
+            r2 = Request('http://scrapytest.org/index.html',
+                headers={'Referer': 'http://scrapytest.org/INDEX.html'}
+            )
+
+            dupefilter.log(r1, spider)
+            dupefilter.log(r2, spider)
+
+            assert crawler.stats.get_value('dupefilter/filtered') == 2
+            l.check_present(('scrapy.dupefilters', 'DEBUG',
+                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
+                 ' (referer: None)')))
+            l.check_present(('scrapy.dupefilters', 'DEBUG',
+                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
+                 ' (referer: http://scrapytest.org/INDEX.html)')))
+
+            dupefilter.close('finished')
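Reviewer note (not part of the patch series): a minimal sketch of how the new output surfaces in a project. DUPEFILTER_DEBUG and DUPEFILTER_CLASS are existing Scrapy settings; the settings file and URLs below are illustrative, and the sample log line is built from the new message template added in PATCH 1/2.

    # settings.py of an illustrative Scrapy project
    # DUPEFILTER_DEBUG switches RFPDupeFilter.log() from the one-off
    # "no more duplicates will be shown" summary to one DEBUG line per
    # filtered request; with these patches that line also carries the referer.
    DUPEFILTER_DEBUG = True

    # The resulting DEBUG message (from the new template
    # "Filtered duplicate request: %(request)s (referer: %(referer)s)") looks like:
    #
    #   Filtered duplicate request: <GET http://scrapytest.org/index.html>
    #   (referer: http://scrapytest.org/INDEX.html)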