[MRG] Do not set Referer by default when its value is None #475

Merged (1 commit) on Dec 24, 2013
4 changes: 3 additions & 1 deletion scrapy/http/headers.py
@@ -17,7 +17,9 @@ def normkey(self, key):
 
     def normvalue(self, value):
         """Headers must not be unicode"""
-        if not hasattr(value, '__iter__'):
+        if value is None:
+            value = []
+        elif not hasattr(value, '__iter__'):
             value = [value]
         return [x.encode(self.encoding) if isinstance(x, unicode) else x \
                 for x in value]
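
For context, a minimal usage sketch (not part of the patch) of the behaviour this change enables. Assumption: RefererMiddleware fills in the header via headers.setdefault(), as the test docstring further down suggests, so a key that already exists with an empty value list is left alone and never serialized.

    from scrapy.http import Request, Headers

    # Hedged sketch, Python 2 era Scrapy, matching the patched normvalue() above
    h = Headers()
    h['Referer'] = None              # normvalue(None) -> []; key exists, holds no values
    h.setdefault('Referer', 'http://example.com/prev')  # key present, not overwritten
    assert h.get('Referer') is None
    assert h.getlist('Referer') == []

    # A spider can therefore opt out of the automatically added Referer:
    req = Request('http://example.com/page', headers={'Referer': None})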
13 changes: 12 additions & 1 deletion scrapy/tests/mockserver.py
@@ -1,5 +1,5 @@
 from __future__ import print_function
-import sys, time, random, urllib, os
+import sys, time, random, urllib, os, json
 from subprocess import Popen, PIPE
 from twisted.web.server import Site, NOT_DONE_YET
 from twisted.web.resource import Resource
@@ -119,6 +119,16 @@ def _delayedRender(self, request):
         request.finish()
 
 
+class Echo(LeafResource):
+
+    def render_GET(self, request):
+        output = {
+            'headers': dict(request.requestHeaders.getAllRawHeaders()),
+            'body': request.content.read(),
+        }
+        return json.dumps(output)
+
+
 class Partial(LeafResource):
 
     def render_GET(self, request):
@@ -156,6 +166,7 @@ def __init__(self):
         self.putChild("partial", Partial())
         self.putChild("drop", Drop())
         self.putChild("raw", Raw())
+        self.putChild("echo", Echo())
 
     def getChild(self, name, request):
         return self
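
As an aside, a rough illustration (hypothetical, not part of the patch) of how a callback can read the new /echo resource: the JSON body maps each header name to a list of raw values, because Twisted's getAllRawHeaders() yields (name, [values]) pairs. That is why the assertions in the test below compare against one-element lists.

    import json

    # Hypothetical helper, assuming the mock server above runs on localhost:8998
    def read_echoed_referer(response):
        data = json.loads(response.body)
        return data['headers'].get('Referer')   # e.g. ['http://example.com'] or None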
19 changes: 19 additions & 0 deletions scrapy/tests/spiders.py
@@ -132,3 +132,22 @@ def parse(self, response):
         self.seedsseen.append(response.meta.get('seed'))
         for req in super(BrokenStartRequestsSpider, self).parse(response):
             yield req
+
+
+class SingleRequestSpider(MetaSpider):
+
+    seed = None
+
+    def start_requests(self):
+        if isinstance(self.seed, Request):
+            yield self.seed.replace(callback=self.parse, errback=self.on_error)
+        else:
+            yield Request(self.seed, callback=self.parse, errback=self.on_error)
+
+    def parse(self, response):
+        self.meta.setdefault('responses', []).append(response)
+        if 'next' in response.meta:
+            return response.meta['next']
+
+    def on_error(self, failure):
+        self.meta['failure'] = failure
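
For orientation, a short sketch (hypothetical usage, mirroring the test further down) of how this spider chains requests: parse() records every response in self.meta['responses'] and, when a response's meta carries a 'next' request, returns it so the engine schedules it as the follow-up.

    from scrapy.http import Request

    # Hypothetical chain of two requests against the mock server
    req_a = Request('http://localhost:8998/echo', dont_filter=True)
    req_b = Request('http://localhost:8998/echo', dont_filter=True)
    req_a.meta['next'] = req_b           # parse() of req_a's response returns req_b

    spider = SingleRequestSpider(seed=req_a)
    # after the crawl, spider.meta['responses'] holds both responses in order;
    # spider.meta['failure'] is set only if a request hit its errback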
32 changes: 31 additions & 1 deletion scrapy/tests/test_crawl.py
@@ -1,9 +1,11 @@
+import json
 from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 from scrapy.utils.test import get_crawler, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
-    BrokenStartRequestsSpider
+    BrokenStartRequestsSpider, SingleRequestSpider
 from scrapy.tests.mockserver import MockServer
+from scrapy.http import Request
 
 
 def docrawl(spider, settings=None):
@@ -158,3 +160,31 @@ def _assert_retried(self):
         log = get_testlog()
         self.assertEqual(log.count("Retrying"), 2)
         self.assertEqual(log.count("Gave up retrying"), 1)
+
+    @defer.inlineCallbacks
+    def test_referer_header(self):
+        """Referer header is set by RefererMiddleware unless it is already set"""
+        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
+        req1 = req0.replace()
+        req2 = req0.replace(headers={'Referer': None})
+        req3 = req0.replace(headers={'Referer': 'http://example.com'})
+        req0.meta['next'] = req1
+        req1.meta['next'] = req2
+        req2.meta['next'] = req3
+        spider = SingleRequestSpider(seed=req0)
+        yield docrawl(spider)
+        # basic asserts in case of weird communication errors
+        self.assertIn('responses', spider.meta)
+        self.assertNotIn('failure', spider.meta)
+        # requests from start_requests() don't get a Referer header
+        echo0 = json.loads(spider.meta['responses'][0].body)
+        self.assertNotIn('Referer', echo0['headers'])
+        # a follow-up request gets Referer set to the previous request's url
+        echo1 = json.loads(spider.meta['responses'][1].body)
+        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
+        # an explicit None suppresses the Referer header
+        echo2 = json.loads(spider.meta['responses'][2].body)
+        self.assertNotIn('Referer', echo2['headers'])
+        # an explicitly set Referer header is kept as-is
+        echo3 = json.loads(spider.meta['responses'][3].body)
+        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
8 changes: 8 additions & 0 deletions scrapy/tests/test_http_headers.py
@@ -119,3 +119,11 @@ def test_setlistdefault(self):
         h1.setlistdefault('header2', ['value2', 'value3'])
         self.assertEqual(h1.getlist('header1'), ['value1'])
         self.assertEqual(h1.getlist('header2'), ['value2', 'value3'])
+
+    def test_none_value(self):
+        h1 = Headers()
+        h1['foo'] = 'bar'
+        h1['foo'] = None
+        h1.setdefault('foo', 'bar')
+        self.assertEqual(h1.get('foo'), None)
+        self.assertEqual(h1.getlist('foo'), [])