From 787b5af30e01040251efff091a19691d4f5d1f09 Mon Sep 17 00:00:00 2001
From: Mikhail Korobov
Date: Wed, 26 Aug 2015 01:58:33 +0500
Subject: [PATCH 1/3] add coverage files to gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 4db22f1d8a0..b116640b4f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@ venv
 build
 dist
 .idea
+htmlcov/
+.coverage
 
 # Windows
 Thumbs.db

From 7da769feb24e03629d7b7acb94a631459a4a333c Mon Sep 17 00:00:00 2001
From: Mikhail Korobov
Date: Wed, 26 Aug 2015 01:58:59 +0500
Subject: [PATCH 2/3] enable test_stats and test_utils_log tests in Python 3

---
 scrapy/statscollectors.py | 1 +
 tests/py3-ignores.txt     | 2 --
 tests/test_stats.py       | 4 +---
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/scrapy/statscollectors.py b/scrapy/statscollectors.py
index 62b037f3623..6da9ddcd27d 100644
--- a/scrapy/statscollectors.py
+++ b/scrapy/statscollectors.py
@@ -50,6 +50,7 @@ def close_spider(self, spider, reason):
     def _persist_stats(self, stats, spider):
         pass
 
+
 class MemoryStatsCollector(StatsCollector):
 
     def __init__(self, crawler):
diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
index 5a009db3692..4d7fdc4305c 100644
--- a/tests/py3-ignores.txt
+++ b/tests/py3-ignores.txt
@@ -30,9 +30,7 @@ tests/test_spidermiddleware_httperror.py
 tests/test_spidermiddleware_offsite.py
 tests/test_spidermiddleware_referer.py
 tests/test_spider.py
-tests/test_stats.py
 tests/test_utils_iterators.py
-tests/test_utils_log.py
 tests/test_utils_template.py
 tests/test_webclient.py
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 5c7c0e6bb16..9f950ebc91b 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -4,6 +4,7 @@
 from scrapy.statscollectors import StatsCollector, DummyStatsCollector
 from scrapy.utils.test import get_crawler
 
+
 class StatsCollectorTest(unittest.TestCase):
 
     def setUp(self):
@@ -50,6 +51,3 @@ def test_dummy_collector(self):
         stats.set_value('test', 'value', spider=self.spider)
         self.assertEqual(stats.get_stats(), {})
         self.assertEqual(stats.get_stats('a'), {})
-
-if __name__ == "__main__":
-    unittest.main()

From 642af00bb7a676470155a907080839ba410a9adc Mon Sep 17 00:00:00 2001
From: Mikhail Korobov
Date: Wed, 26 Aug 2015 02:19:33 +0500
Subject: [PATCH 3/3] fix Referer logging

---
 scrapy/core/scraper.py    |  4 ++--
 scrapy/logformatter.py    |  8 ++++++--
 scrapy/pipelines/files.py | 17 +++++------------
 scrapy/utils/request.py   |  8 +++++++-
 4 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/scrapy/core/scraper.py b/scrapy/core/scraper.py
index 244499be24a..67198179d0e 100644
--- a/scrapy/core/scraper.py
+++ b/scrapy/core/scraper.py
@@ -16,6 +16,7 @@
 from scrapy.http import Request, Response
 from scrapy.item import BaseItem
 from scrapy.core.spidermw import SpiderMiddlewareManager
+from scrapy.utils.request import referer_str
 
 logger = logging.getLogger(__name__)
 
@@ -150,10 +151,9 @@ def handle_spider_error(self, _failure, request, response, spider):
         if isinstance(exc, CloseSpider):
             self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
             return
-        referer = request.headers.get('Referer')
         logger.error(
             "Spider error processing %(request)s (referer: %(referer)s)",
-            {'request': request, 'referer': referer},
+            {'request': request, 'referer': referer_str(request)},
             exc_info=failure_to_exc_info(_failure),
             extra={'spider': spider}
         )
diff --git a/scrapy/logformatter.py b/scrapy/logformatter.py
index a0508e0b76c..2160d9ab0bb 100644
--- a/scrapy/logformatter.py
+++ b/scrapy/logformatter.py
@@ -3,6 +3,7 @@
 
 from twisted.python.failure import Failure
 
+from scrapy.utils.request import referer_str
 
 SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
 DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
@@ -38,13 +39,16 @@ def crawled(self, request, response, spider):
             'args': {
                 'status': response.status,
                 'request': request,
-                'referer': request.headers.get('Referer'),
+                'referer': referer_str(request),
                 'flags': flags,
             }
         }
 
     def scraped(self, item, response, spider):
-        src = response.getErrorMessage() if isinstance(response, Failure) else response
+        if isinstance(response, Failure):
+            src = response.getErrorMessage()
+        else:
+            src = response
         return {
             'level': logging.DEBUG,
             'msg': SCRAPEDMSG,
diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py
index db49aff6534..e4011d31dfb 100644
--- a/scrapy/pipelines/files.py
+++ b/scrapy/pipelines/files.py
@@ -26,7 +26,8 @@
 from scrapy.http import Request
 from scrapy.utils.misc import md5sum
 from scrapy.utils.log import failure_to_exc_info
-from scrapy.utils.python import to_bytes, to_native_str
+from scrapy.utils.python import to_bytes
+from scrapy.utils.request import referer_str
 
 logger = logging.getLogger(__name__)
 
@@ -199,7 +200,7 @@ def _onsuccess(result):
             if age_days > self.EXPIRES:
                 return  # returning None force download
 
-            referer = _get_referer(request)
+            referer = referer_str(request)
             logger.debug(
                 'File (uptodate): Downloaded %(medianame)s from %(request)s '
                 'referred in <%(referer)s>',
@@ -225,7 +226,7 @@ def _onsuccess(result):
 
     def media_failed(self, failure, request, info):
         if not isinstance(failure.value, IgnoreRequest):
-            referer = _get_referer(request)
+            referer = referer_str(request)
             logger.warning(
                 'File (unknown-error): Error downloading %(medianame)s from '
                 '%(request)s referred in <%(referer)s>: %(exception)s',
@@ -237,7 +238,7 @@ def media_failed(self, failure, request, info):
         raise FileException
 
     def media_downloaded(self, response, request, info):
-        referer = _get_referer(request)
+        referer = referer_str(request)
 
         if response.status != 200:
             logger.warning(
@@ -339,11 +340,3 @@ def _warn():
     def file_key(self, url):
         return self.file_path(url)
     file_key._base = True
-
-
-def _get_referer(request):
-    """ Return Referer HTTP header suitable for logging """
-    referrer = request.headers.get('Referer')
-    if referrer is None:
-        return referrer
-    return to_native_str(referrer, errors='replace')
diff --git a/scrapy/utils/request.py b/scrapy/utils/request.py
index 0487d1e1b23..e361b74332e 100644
--- a/scrapy/utils/request.py
+++ b/scrapy/utils/request.py
@@ -8,7 +8,6 @@
 import weakref
 from six.moves.urllib.parse import urlunparse
 
-from twisted.internet.defer import Deferred
 from w3lib.http import basic_auth_header
 
 from scrapy.utils.python import to_bytes, to_native_str
@@ -86,3 +85,10 @@ def request_httprepr(request):
         s += request.body
     return s
 
+
+def referer_str(request):
+    """ Return Referer HTTP header suitable for logging. """
+    referrer = request.headers.get('Referer')
+    if referrer is None:
+        return referrer
+    return to_native_str(referrer, errors='replace')