Skip to content

Commit

Permalink
Merge 642af00 into cfae62f
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Aug 25, 2015
2 parents cfae62f + 642af00 commit b05988e
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 22 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -10,6 +10,8 @@ venv
build
dist
.idea
htmlcov/
.coverage

# Windows
Thumbs.db
4 changes: 2 additions & 2 deletions scrapy/core/scraper.py
Expand Up @@ -16,6 +16,7 @@
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.utils.request import referer_str

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -150,10 +151,9 @@ def handle_spider_error(self, _failure, request, response, spider):
if isinstance(exc, CloseSpider):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
return
referer = request.headers.get('Referer')
logger.error(
"Spider error processing %(request)s (referer: %(referer)s)",
{'request': request, 'referer': referer},
{'request': request, 'referer': referer_str(request)},
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
)
Expand Down
8 changes: 6 additions & 2 deletions scrapy/logformatter.py
Expand Up @@ -3,6 +3,7 @@

from twisted.python.failure import Failure

from scrapy.utils.request import referer_str

SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
Expand Down Expand Up @@ -38,13 +39,16 @@ def crawled(self, request, response, spider):
'args': {
'status': response.status,
'request': request,
'referer': request.headers.get('Referer'),
'referer': referer_str(request),
'flags': flags,
}
}

def scraped(self, item, response, spider):
src = response.getErrorMessage() if isinstance(response, Failure) else response
if isinstance(response, Failure):
src = response.getErrorMessage()
else:
src = response
return {
'level': logging.DEBUG,
'msg': SCRAPEDMSG,
Expand Down
17 changes: 5 additions & 12 deletions scrapy/pipelines/files.py
Expand Up @@ -26,7 +26,8 @@
from scrapy.http import Request
from scrapy.utils.misc import md5sum
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.python import to_bytes, to_native_str
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -199,7 +200,7 @@ def _onsuccess(result):
if age_days > self.EXPIRES:
return # returning None force download

referer = _get_referer(request)
referer = referer_str(request)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
Expand All @@ -225,7 +226,7 @@ def _onsuccess(result):

def media_failed(self, failure, request, info):
if not isinstance(failure.value, IgnoreRequest):
referer = _get_referer(request)
referer = referer_str(request)
logger.warning(
'File (unknown-error): Error downloading %(medianame)s from '
'%(request)s referred in <%(referer)s>: %(exception)s',
Expand All @@ -237,7 +238,7 @@ def media_failed(self, failure, request, info):
raise FileException

def media_downloaded(self, response, request, info):
referer = _get_referer(request)
referer = referer_str(request)

if response.status != 200:
logger.warning(
Expand Down Expand Up @@ -339,11 +340,3 @@ def _warn():
def file_key(self, url):
return self.file_path(url)
file_key._base = True


def _get_referer(request):
    """Return the Referer HTTP header of *request* as a native string
    suitable for logging, or None when the header is absent.

    Undecodable bytes are replaced rather than raising, so the result is
    always safe to interpolate into a log message.
    """
    header = request.headers.get('Referer')
    return to_native_str(header, errors='replace') if header is not None else None
1 change: 1 addition & 0 deletions scrapy/statscollectors.py
Expand Up @@ -50,6 +50,7 @@ def close_spider(self, spider, reason):
def _persist_stats(self, stats, spider):
pass


class MemoryStatsCollector(StatsCollector):

def __init__(self, crawler):
Expand Down
8 changes: 7 additions & 1 deletion scrapy/utils/request.py
Expand Up @@ -8,7 +8,6 @@
import weakref
from six.moves.urllib.parse import urlunparse

from twisted.internet.defer import Deferred
from w3lib.http import basic_auth_header
from scrapy.utils.python import to_bytes, to_native_str

Expand Down Expand Up @@ -86,3 +85,10 @@ def request_httprepr(request):
s += request.body
return s


def referer_str(request):
    """Return the Referer HTTP header of *request* decoded for logging.

    Returns None when the request carries no Referer header; otherwise the
    header value is converted to a native string with undecodable bytes
    replaced, so it can be embedded in log messages safely.
    """
    raw = request.headers.get('Referer')
    if raw is None:
        return None
    return to_native_str(raw, errors='replace')
2 changes: 0 additions & 2 deletions tests/py3-ignores.txt
Expand Up @@ -30,9 +30,7 @@ tests/test_spidermiddleware_httperror.py
tests/test_spidermiddleware_offsite.py
tests/test_spidermiddleware_referer.py
tests/test_spider.py
tests/test_stats.py
tests/test_utils_iterators.py
tests/test_utils_log.py
tests/test_utils_template.py
tests/test_webclient.py

Expand Down
4 changes: 1 addition & 3 deletions tests/test_stats.py
Expand Up @@ -4,6 +4,7 @@
from scrapy.statscollectors import StatsCollector, DummyStatsCollector
from scrapy.utils.test import get_crawler


class StatsCollectorTest(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -50,6 +51,3 @@ def test_dummy_collector(self):
stats.set_value('test', 'value', spider=self.spider)
self.assertEqual(stats.get_stats(), {})
self.assertEqual(stats.get_stats('a'), {})

if __name__ == "__main__":
unittest.main()

0 comments on commit b05988e

Please sign in to comment.