[MRG+1] Adding requests referer to RFPDupeFilter log messages #3588

Merged — 2 commits, merged on Jan 28, 2019
8 changes: 4 additions & 4 deletions scrapy/dupefilters.py
@@ -3,8 +3,7 @@
 import logging
 
 from scrapy.utils.job import job_dir
-from scrapy.utils.request import request_fingerprint
-
+from scrapy.utils.request import referer_str, request_fingerprint
 
 class BaseDupeFilter(object):

@@ -61,8 +60,9 @@ def close(self, reason):

     def log(self, request, spider):
         if self.debug:
-            msg = "Filtered duplicate request: %(request)s"
-            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
+            args = {'request': request, 'referer': referer_str(request)}
Member:
Hey! Sorry for the delay. No objections from me, a nice feature. I was just wondering if the Referer should be added to the next message as well, when debug is False.

Contributor (author):
Fair point. I just added it this way to keep the same behavior, but it makes sense to log the referer only in debug mode.

+            self.logger.debug(msg, args, extra={'spider': spider})
         elif self.logdupes:
             msg = ("Filtered duplicate request: %(request)s"
                    " - no more duplicates will be shown"
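For reference, here is a sketch of how the whole RFPDupeFilter.log() method reads once this patch is applied. Everything from the elif branch's message tail onward is reconstructed from the log lines asserted in the tests below and from the dupefilter/filtered stat they check, not copied from the merged file, so treat it as an illustration rather than the authoritative source:

    def log(self, request, spider):
        if self.debug:
            # With DUPEFILTER_DEBUG enabled, every filtered duplicate is logged,
            # now including the referer of the duplicate request.
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            # Without debug, only the first duplicate is logged and, as discussed
            # in the review thread above, the referer is deliberately left out.
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        # Reconstructed from the tests' stats assertion; wording of the stat key
        # matches what the new tests check.
        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)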
57 changes: 56 additions & 1 deletion tests/test_dupefilters.py
@@ -2,14 +2,15 @@
import tempfile
import unittest
import shutil
from testfixtures import LogCapture

from scrapy.dupefilters import RFPDupeFilter
from scrapy.http import Request
from scrapy.core.scheduler import Scheduler
from scrapy.utils.python import to_bytes
from scrapy.utils.job import job_dir
from scrapy.utils.test import get_crawler

from tests.spiders import SimpleSpider

class FromCrawlerRFPDupeFilter(RFPDupeFilter):

@@ -126,3 +127,57 @@ def request_fingerprint(self, request):
assert case_insensitive_dupefilter.request_seen(r2)

case_insensitive_dupefilter.close('finished')

def test_log(self):
with LogCapture() as l:
settings = {'DUPEFILTER_DEBUG': False,
'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
crawler = get_crawler(SimpleSpider, settings_dict=settings)
scheduler = Scheduler.from_crawler(crawler)
spider = SimpleSpider.from_crawler(crawler)

dupefilter = scheduler.df
dupefilter.open()

r1 = Request('http://scrapytest.org/index.html')
r2 = Request('http://scrapytest.org/index.html')

dupefilter.log(r1, spider)
dupefilter.log(r2, spider)

assert crawler.stats.get_value('dupefilter/filtered') == 2
l.check_present(('scrapy.dupefilters', 'DEBUG',
('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
' - no more duplicates will be shown'
' (see DUPEFILTER_DEBUG to show all duplicates)')))

dupefilter.close('finished')

def test_log_debug(self):
with LogCapture() as l:
settings = {'DUPEFILTER_DEBUG': True,
'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
crawler = get_crawler(SimpleSpider, settings_dict=settings)
scheduler = Scheduler.from_crawler(crawler)
spider = SimpleSpider.from_crawler(crawler)

dupefilter = scheduler.df
dupefilter.open()

r1 = Request('http://scrapytest.org/index.html')
r2 = Request('http://scrapytest.org/index.html',
headers={'Referer': 'http://scrapytest.org/INDEX.html'}
)

dupefilter.log(r1, spider)
dupefilter.log(r2, spider)

assert crawler.stats.get_value('dupefilter/filtered') == 2
l.check_present(('scrapy.dupefilters', 'DEBUG',
('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
' (referer: None)')))
l.check_present(('scrapy.dupefilters', 'DEBUG',
('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
' (referer: http://scrapytest.org/INDEX.html)')))

dupefilter.close('finished')
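
Usage note: the per-request referer detail only shows up when DUPEFILTER_DEBUG is enabled, which the tests above toggle through get_crawler's settings_dict. In a project this would typically be set in settings.py; an illustrative configuration (values assumed, not taken from this PR):

    # settings.py -- log every filtered duplicate individually, including its referer
    DUPEFILTER_DEBUG = True
    LOG_LEVEL = 'DEBUG'  # the dupefilter emits these messages at DEBUG level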