Skip to content

Commit

Permalink
Split redirection into status and metarefresh middlewares, also changes httpcompression priority. Closes #78
Browse files Browse the repository at this point in the history
  • Loading branch information
dangra committed Jan 8, 2013
1 parent fe5d0ce commit 71db7f1
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 78 deletions.
80 changes: 48 additions & 32 deletions scrapy/contrib/downloadermiddleware/redirect.py
Expand Up @@ -6,49 +6,19 @@
from scrapy.exceptions import IgnoreRequest, NotConfigured


class RedirectMiddleware(object):
"""Handle redirection of requests based on response status and meta-refresh html tag"""
class BaseRedirectMiddleware(object):

def __init__(self, settings):
if not settings.getbool('REDIRECT_ENABLED'):
raise NotConfigured
self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')

self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_response(self, request, response, spider):
if 'dont_redirect' in request.meta:
return response
if request.method.upper() == 'HEAD':
if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
else:
return response

if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = self._redirect_request_using_get(request, redirected_url)
return self._redirect(redirected, request, spider, response.status)

if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)

if isinstance(response, HtmlResponse):
interval, url = get_meta_refresh(response)
if url and interval < self.max_metarefresh_delay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')

return response

def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
Expand Down Expand Up @@ -76,3 +46,49 @@ def _redirect_request_using_get(self, request, redirect_url):
return redirected


class RedirectMiddleware(BaseRedirectMiddleware):
    """Handle redirection of requests based on the response status code.

    Follows 301/302/303/307 responses that carry a ``Location`` header.
    For non-HEAD requests, 302/303 are followed with a GET (per common
    browser behavior); 301/307 preserve the original method and body.
    HEAD requests are always replayed as-is to the new location.
    Redirection can be disabled per-request via ``request.meta['dont_redirect']``.
    """

    def process_response(self, request, response, spider):
        if 'dont_redirect' in request.meta:
            return response

        # Nothing to follow without a Location header on a redirect status.
        if ('Location' not in response.headers
                or response.status not in [301, 302, 303, 307]):
            return response

        redirected_url = urljoin(request.url, response.headers['location'])

        if request.method == 'HEAD' or response.status in [301, 307]:
            # Preserve the original method (and body) for the redirected request.
            redirected = request.replace(url=redirected_url)
        else:
            # 302/303 on a non-HEAD request: redirect using GET, dropping the body.
            redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)


class MetaRefreshMiddleware(BaseRedirectMiddleware):
    """Handle redirection of requests based on the meta-refresh html tag.

    Only html responses to non-HEAD requests are considered, and only when
    the refresh interval is below ``REDIRECT_MAX_METAREFRESH_DELAY``.
    Redirection can be disabled per-request via ``request.meta['dont_redirect']``.
    """

    def __init__(self, settings):
        super(MetaRefreshMiddleware, self).__init__(settings)
        # Refresh tags with an interval at or above this are treated as
        # page auto-reloads, not redirects, and are ignored.
        self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')

    def process_response(self, request, response, spider):
        if 'dont_redirect' in request.meta or request.method == 'HEAD' or \
                not isinstance(response, HtmlResponse):
            return response

        # The guard above already ensures response is an HtmlResponse,
        # so no need to re-check before parsing the meta-refresh tag.
        interval, url = get_meta_refresh(response)
        if url and interval < self.max_metarefresh_delay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response
5 changes: 3 additions & 2 deletions scrapy/settings/default_settings.py
@@ -1,5 +1,5 @@
"""
This module contains the default values for all settings used by Scrapy.
This module contains the default values for all settings used by Scrapy.
For more information about these settings you can read the settings
documentation in docs/topics/settings.rst
Expand Down Expand Up @@ -74,10 +74,11 @@
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
Expand Down
49 changes: 29 additions & 20 deletions scrapy/tests/test_downloadermiddleware.py
Expand Up @@ -5,7 +5,6 @@
from scrapy.spider import BaseSpider
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
from scrapy.utils.test import get_crawler
from scrapy.stats import stats


class ManagerTestCase(TestCase):
Expand All @@ -18,22 +17,24 @@ def setUp(self):
self.spider.set_crawler(self.crawler)
self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
# some mw depends on stats collector
stats.open_spider(self.spider)
self.crawler.stats.open_spider(self.spider)
return self.mwman.open_spider(self.spider)

def tearDown(self):
stats.close_spider(self.spider, '')
self.crawler.stats.close_spider(self.spider, '')
return self.mwman.close_spider(self.spider)

def _download(self,request, response=None):
def _download(self, request, response=None):
"""Executes downloader mw manager's download method and returns
the result (Request or Response) or raise exception in case of
failure.
"""
if not response:
response = Response(request.url, request=request)
response = Response(request.url)

def download_func(**kwargs):
return response

dfd = self.mwman.download(download_func, request, self.spider)
# catch deferred result and return the value
results = []
Expand All @@ -50,28 +51,25 @@ class DefaultsTest(ManagerTestCase):

def test_request_response(self):
req = Request('http://example.com/index.html')
resp = Response(req.url, status=200, request=req)
resp = Response(req.url, status=200)
ret = self._download(req, resp)
self.assertTrue(isinstance(resp, Response), "Non-response returned")


class GzippedRedirectionTest(ManagerTestCase):
"""Regression test for a failure when redirecting a compressed
request.
self.assertTrue(isinstance(ret, Response), "Non-response returned")

This happens when httpcompression middleware is executed before redirect
middleware and attempts to decompress a non-compressed body.
In particular when some website returns a 30x response with header
'Content-Encoding: gzip' giving as result the error below:
def test_3xx_and_invalid_gzipped_body_must_redirect(self):
"""Regression test for a failure when redirecting a compressed
request.
exceptions.IOError: Not a gzipped file
This happens when httpcompression middleware is executed before redirect
middleware and attempts to decompress a non-compressed body.
In particular when some website returns a 30x response with header
'Content-Encoding: gzip' giving as result the error below:
"""
exceptions.IOError: Not a gzipped file
def test_gzipped_redirection(self):
"""
req = Request('http://example.com')
body = '<p>You are being redirected</p>'
resp = Response(req.url, status=302, body=body, request=req, headers={
resp = Response(req.url, status=302, body=body, headers={
'Content-Length': len(body),
'Content-Type': 'text/html',
'Content-Encoding': 'gzip',
Expand All @@ -82,3 +80,14 @@ def test_gzipped_redirection(self):
"Not redirected: {0!r}".format(ret))
self.assertEqual(ret.url, resp.headers['Location'],
"Not redirected to location header")

def test_200_and_invalid_gzipped_body_must_fail(self):
    # A 200 response claiming 'Content-Encoding: gzip' with a plain-text
    # body must make httpcompression raise IOError (no redirect middleware
    # intercepts it first, unlike the 30x case).
    body = '<p>You are being redirected</p>'
    req = Request('http://example.com')
    headers = {
        'Content-Length': len(body),
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
        'Location': 'http://example.com/login',
    }
    resp = Response(req.url, status=200, body=body, headers=headers)
    self.assertRaises(IOError, self._download, request=req, response=resp)
89 changes: 65 additions & 24 deletions scrapy/tests/test_downloadermiddleware_redirect.py
@@ -1,11 +1,12 @@
import unittest

from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware, MetaRefreshMiddleware
from scrapy.spider import BaseSpider
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response, HtmlResponse, Headers
from scrapy.http import Request, Response, HtmlResponse
from scrapy.utils.test import get_crawler


class RedirectMiddlewareTest(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -52,7 +53,7 @@ def test_dont_redirect(self):
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = Response(url, headers={'Location': url2}, status=302)

Expand Down Expand Up @@ -86,35 +87,74 @@ def test_redirect_302_head(self):
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp


def test_max_redirect_times(self):
    # With the limit set to 1, the first 302 is followed and the second
    # attempt exceeds the limit and raises IgnoreRequest.
    self.mw.max_redirect_times = 1
    url = 'http://scrapytest.org/302'
    req = Request(url)
    rsp = Response(url, headers={'Location': '/redirected'}, status=302)

    req = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req, Request)
    assert 'redirect_times' in req.meta
    self.assertEqual(req.meta['redirect_times'], 1)
    self.assertRaises(IgnoreRequest, self.mw.process_response,
                      req, rsp, self.spider)

def test_ttl(self):
    # A per-request 'redirect_ttl' caps redirects even when the global
    # max_redirect_times limit is much higher.
    self.mw.max_redirect_times = 100
    req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
    rsp = Response('http://www.scrapytest.org/302',
                   headers={'Location': '/redirected'}, status=302)

    redirected = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(redirected, Request)
    self.assertRaises(IgnoreRequest, self.mw.process_response,
                      redirected, rsp, self.spider)

def test_redirect_urls(self):
    # request.meta['redirect_urls'] must accumulate every URL visited
    # along the redirect chain.
    req1 = Request('http://scrapytest.org/first')
    rsp1 = Response(req1.url, headers={'Location': '/redirected'}, status=302)
    req2 = self.mw.process_response(req1, rsp1, self.spider)

    rsp2 = Response('http://scrapytest.org/redirected',
                    headers={'Location': '/redirected2'}, status=302)
    req3 = self.mw.process_response(req2, rsp2, self.spider)

    self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
    self.assertEqual(req2.meta['redirect_urls'],
                     ['http://scrapytest.org/first'])
    self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
    self.assertEqual(req3.meta['redirect_urls'],
                     ['http://scrapytest.org/first',
                      'http://scrapytest.org/redirected'])

class MetaRefreshMiddlewareTest(unittest.TestCase):

def setUp(self):
    # Build a fresh spider and middleware instance for every test.
    self.spider = BaseSpider('foo')
    self.mw = MetaRefreshMiddleware.from_crawler(get_crawler())

def _body(self, interval=5, url='http://example.org/newpage'):
return """<html><head><meta http-equiv="refresh" content="{0};url={1}"/></head></html>"""\
.format(interval, url)

def test_priority_adjust(self):
    # A meta-refresh redirect must produce a request with boosted priority.
    original = Request('http://a.com')
    page = HtmlResponse(original.url, body=self._body())
    redirected = self.mw.process_response(original, page, self.spider)
    assert redirected.priority > original.priority

def test_meta_refresh(self):
    # A meta-refresh tag within the allowed delay must trigger a redirect.
    # (Removed leftover dead code: an unused `body` literal and a first
    # `rsp` assignment that was immediately overwritten.)
    req = Request(url='http://example.org')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)

    assert isinstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')

def test_meta_refresh_with_high_interval(self):
    # meta-refresh with high intervals don't trigger redirects:
    # the response must be passed through untouched.
    # (Removed leftover dead code: an unused `body` literal and a first
    # `rsp` assignment that was immediately overwritten.)
    req = Request(url='http://example.org')
    rsp = HtmlResponse(url='http://example.org', body=self._body(interval=1000))
    rsp2 = self.mw.process_response(req, rsp, self.spider)

    assert rsp is rsp2

def test_meta_refresh_trough_posted_request(self):
body = """<html>
<head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
</html>"""
req = Request(url='http://example.org', method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(url='http://example.org', body=body)
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)

assert isinstance(req2, Request)
Expand All @@ -129,8 +169,8 @@ def test_meta_refresh_trough_posted_request(self):

def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = Request('http://scrapytest.org/max')
rsp = HtmlResponse(req.url, body=self._body())

req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
Expand All @@ -141,19 +181,20 @@ def test_max_redirect_times(self):
def test_ttl(self):
    # A per-request 'redirect_ttl' caps meta-refresh redirects even when
    # the global max_redirect_times limit is much higher.
    # (Removed leftover dead code: a first `rsp = Response(...)` that was
    # immediately overwritten by the HtmlResponse below.)
    self.mw.max_redirect_times = 100
    req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
    rsp = HtmlResponse(req.url, body=self._body())

    req = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req, Request)
    self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)

def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected'))
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
assert isinstance(req2, Request), req2
rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2'))
req3 = self.mw.process_response(req2, rsp2, self.spider)

assert isinstance(req3, Request), req3
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
Expand Down

0 comments on commit 71db7f1

Please sign in to comment.