From 71db7f1b25dc6c147273fff5c2fc2adc43bfd7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gra=C3=B1a?= Date: Tue, 8 Jan 2013 09:55:16 -0200 Subject: [PATCH] Split redirection into status and metarefresh middlewares, also changes httpcompression priority. closes #78 --- .../contrib/downloadermiddleware/redirect.py | 80 ++++++++++------- scrapy/settings/default_settings.py | 5 +- scrapy/tests/test_downloadermiddleware.py | 49 +++++----- .../test_downloadermiddleware_redirect.py | 89 ++++++++++++++----- 4 files changed, 145 insertions(+), 78 deletions(-) diff --git a/scrapy/contrib/downloadermiddleware/redirect.py b/scrapy/contrib/downloadermiddleware/redirect.py index 1247c018d1d..69c772ea9ac 100644 --- a/scrapy/contrib/downloadermiddleware/redirect.py +++ b/scrapy/contrib/downloadermiddleware/redirect.py @@ -6,13 +6,12 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured -class RedirectMiddleware(object): - """Handle redirection of requests based on response status and meta-refresh html tag""" +class BaseRedirectMiddleware(object): def __init__(self, settings): if not settings.getbool('REDIRECT_ENABLED'): raise NotConfigured - self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY') + self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES') self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST') @@ -20,35 +19,6 @@ def __init__(self, settings): def from_crawler(cls, crawler): return cls(crawler.settings) - def process_response(self, request, response, spider): - if 'dont_redirect' in request.meta: - return response - if request.method.upper() == 'HEAD': - if response.status in [301, 302, 303, 307] and 'Location' in response.headers: - redirected_url = urljoin(request.url, response.headers['location']) - redirected = request.replace(url=redirected_url) - return self._redirect(redirected, request, spider, response.status) - else: - return response - - if response.status in [302, 303] and 'Location' in response.headers: - redirected_url = urljoin(request.url, response.headers['location']) - redirected = self._redirect_request_using_get(request, redirected_url) - return self._redirect(redirected, request, spider, response.status) - - if response.status in [301, 307] and 'Location' in response.headers: - redirected_url = urljoin(request.url, response.headers['location']) - redirected = request.replace(url=redirected_url) - return self._redirect(redirected, request, spider, response.status) - - if isinstance(response, HtmlResponse): - interval, url = get_meta_refresh(response) - if url and interval < self.max_metarefresh_delay: - redirected = self._redirect_request_using_get(request, url) - return self._redirect(redirected, request, spider, 'meta refresh') - - return response - def _redirect(self, redirected, request, spider, reason): ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times) redirects = request.meta.get('redirect_times', 0) + 1 @@ -76,3 +46,49 @@ def _redirect_request_using_get(self, request, redirect_url): return redirected +class RedirectMiddleware(BaseRedirectMiddleware): + """Handle redirection of requests based on response status and meta-refresh html tag""" + + def process_response(self, request, response, spider): + if 'dont_redirect' in request.meta: + return response + + if request.method == 'HEAD': + if response.status in [301, 302, 303, 307] and 'Location' in response.headers: + redirected_url = urljoin(request.url, response.headers['location']) + redirected = request.replace(url=redirected_url) + return self._redirect(redirected, request, spider, response.status) + else: + return response + + if response.status in [302, 303] and 'Location' in response.headers: + redirected_url = urljoin(request.url, response.headers['location']) + redirected = self._redirect_request_using_get(request, redirected_url) + return self._redirect(redirected, request, spider, response.status) + + if response.status in [301, 307] and 'Location' in response.headers: + redirected_url = urljoin(request.url, response.headers['location']) + redirected = request.replace(url=redirected_url) + return self._redirect(redirected, request, spider, response.status) + + return response + + +class MetaRefreshMiddleware(BaseRedirectMiddleware): + + def __init__(self, settings): + super(MetaRefreshMiddleware, self).__init__(settings) + self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY') + + def process_response(self, request, response, spider): + if 'dont_redirect' in request.meta or request.method == 'HEAD' or \ + not isinstance(response, HtmlResponse): + return response + + if isinstance(response, HtmlResponse): + interval, url = get_meta_refresh(response) + if url and interval < self.max_metarefresh_delay: + redirected = self._redirect_request_using_get(request, url) + return self._redirect(redirected, request, spider, 'meta refresh') + + return response diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index b70b2cadcc9..b588ce7ee15 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -1,5 +1,5 @@ """ -This module contains the default values for all settings used by Scrapy. +This module contains the default values for all settings used by Scrapy. For more information about these settings you can read the settings documentation in docs/topics/settings.rst @@ -74,10 +74,11 @@ 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400, 'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500, 'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550, + 'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580, + 'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590, 'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600, 'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700, 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750, - 'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800, 'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830, 'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850, 'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900, diff --git a/scrapy/tests/test_downloadermiddleware.py b/scrapy/tests/test_downloadermiddleware.py index bdbde8e433a..acae944801f 100644 --- a/scrapy/tests/test_downloadermiddleware.py +++ b/scrapy/tests/test_downloadermiddleware.py @@ -5,7 +5,6 @@ from scrapy.spider import BaseSpider from scrapy.core.downloader.middleware import DownloaderMiddlewareManager from scrapy.utils.test import get_crawler -from scrapy.stats import stats class ManagerTestCase(TestCase): @@ -18,22 +17,24 @@ def setUp(self): self.spider.set_crawler(self.crawler) self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler) # some mw depends on stats collector - stats.open_spider(self.spider) + self.crawler.stats.open_spider(self.spider) return self.mwman.open_spider(self.spider) def tearDown(self): - stats.close_spider(self.spider, '') + self.crawler.stats.close_spider(self.spider, '') return self.mwman.close_spider(self.spider) - def _download(self,request, response=None): + def _download(self, request, response=None): """Executes downloader mw manager's download method and returns the result (Request or Response) or raise exception in case of failure. """ if not response: - response = Response(request.url, request=request) + response = Response(request.url) + def download_func(**kwargs): return response + dfd = self.mwman.download(download_func, request, self.spider) # catch deferred result and return the value results = [] @@ -50,28 +51,25 @@ class DefaultsTest(ManagerTestCase): def test_request_response(self): req = Request('http://example.com/index.html') - resp = Response(req.url, status=200, request=req) + resp = Response(req.url, status=200) ret = self._download(req, resp) - self.assertTrue(isinstance(resp, Response), "Non-response returned") - - -class GzippedRedirectionTest(ManagerTestCase): - """Regression test for a failure when redirecting a compressed - request. + self.assertTrue(isinstance(ret, Response), "Non-response returned") - This happens when httpcompression middleware is executed before redirect - middleware and attempts to decompress a non-compressed body. - In particular when some website returns a 30x response with header - 'Content-Encoding: gzip' giving as result the error below: + def test_3xx_and_invalid_gzipped_body_must_redirect(self): + """Regression test for a failure when redirecting a compressed + request. - exceptions.IOError: Not a gzipped file + This happens when httpcompression middleware is executed before redirect + middleware and attempts to decompress a non-compressed body. + In particular when some website returns a 30x response with header + 'Content-Encoding: gzip' giving as result the error below: - """ + exceptions.IOError: Not a gzipped file - def test_gzipped_redirection(self): + """ req = Request('http://example.com') body = '

You are being redirected

' - resp = Response(req.url, status=302, body=body, request=req, headers={ + resp = Response(req.url, status=302, body=body, headers={ 'Content-Length': len(body), 'Content-Type': 'text/html', 'Content-Encoding': 'gzip', @@ -82,3 +80,14 @@ def test_gzipped_redirection(self): "Not redirected: {0!r}".format(ret)) self.assertEqual(ret.url, resp.headers['Location'], "Not redirected to location header") + + def test_200_and_invalid_gzipped_body_must_fail(self): + req = Request('http://example.com') + body = '

You are being redirected

' + resp = Response(req.url, status=200, body=body, headers={ + 'Content-Length': len(body), + 'Content-Type': 'text/html', + 'Content-Encoding': 'gzip', + 'Location': 'http://example.com/login', + }) + self.assertRaises(IOError, self._download, request=req, response=resp) diff --git a/scrapy/tests/test_downloadermiddleware_redirect.py b/scrapy/tests/test_downloadermiddleware_redirect.py index 0df82e02a80..bb74522ee72 100644 --- a/scrapy/tests/test_downloadermiddleware_redirect.py +++ b/scrapy/tests/test_downloadermiddleware_redirect.py @@ -1,11 +1,12 @@ import unittest -from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware +from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware, MetaRefreshMiddleware from scrapy.spider import BaseSpider from scrapy.exceptions import IgnoreRequest -from scrapy.http import Request, Response, HtmlResponse, Headers +from scrapy.http import Request, Response, HtmlResponse from scrapy.utils.test import get_crawler + class RedirectMiddlewareTest(unittest.TestCase): def setUp(self): @@ -52,7 +53,7 @@ def test_dont_redirect(self): def test_redirect_302(self): url = 'http://www.example.com/302' url2 = 'http://www.example.com/redirected2' - req = Request(url, method='POST', body='test', + req = Request(url, method='POST', body='test', headers={'Content-Type': 'text/plain', 'Content-length': '4'}) rsp = Response(url, headers={'Location': url2}, status=302) @@ -86,35 +87,74 @@ def test_redirect_302_head(self): del rsp.headers['Location'] assert self.mw.process_response(req, rsp, self.spider) is rsp + + def test_max_redirect_times(self): + self.mw.max_redirect_times = 1 + req = Request('http://scrapytest.org/302') + rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302) + + req = self.mw.process_response(req, rsp, self.spider) + assert isinstance(req, Request) + assert 'redirect_times' in req.meta + self.assertEqual(req.meta['redirect_times'], 1) + self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider) + + def test_ttl(self): + self.mw.max_redirect_times = 100 + req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1}) + rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302) + + req = self.mw.process_response(req, rsp, self.spider) + assert isinstance(req, Request) + self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider) + + def test_redirect_urls(self): + req1 = Request('http://scrapytest.org/first') + rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302) + req2 = self.mw.process_response(req1, rsp1, self.spider) + rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302) + req3 = self.mw.process_response(req2, rsp2, self.spider) + + self.assertEqual(req2.url, 'http://scrapytest.org/redirected') + self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first']) + self.assertEqual(req3.url, 'http://scrapytest.org/redirected2') + self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected']) + +class MetaRefreshMiddlewareTest(unittest.TestCase): + + def setUp(self): + crawler = get_crawler() + self.spider = BaseSpider('foo') + self.mw = MetaRefreshMiddleware.from_crawler(crawler) + + def _body(self, interval=5, url='http://example.org/newpage'): + return """"""\ + .format(interval, url) + + def test_priority_adjust(self): + req = Request('http://a.com') + rsp = HtmlResponse(req.url, body=self._body()) + req2 = self.mw.process_response(req, rsp, self.spider) + assert req2.priority > req.priority + def test_meta_refresh(self): - body = """ - - """ req = Request(url='http://example.org') - rsp = HtmlResponse(url='http://example.org', body=body) + rsp = HtmlResponse(req.url, body=self._body()) req2 = self.mw.process_response(req, rsp, self.spider) - assert isinstance(req2, Request) self.assertEqual(req2.url, 'http://example.org/newpage') def test_meta_refresh_with_high_interval(self): # meta-refresh with high intervals don't trigger redirects - body = """ - - """ req = Request(url='http://example.org') - rsp = HtmlResponse(url='http://example.org', body=body) + rsp = HtmlResponse(url='http://example.org', body=self._body(interval=1000)) rsp2 = self.mw.process_response(req, rsp, self.spider) - assert rsp is rsp2 def test_meta_refresh_trough_posted_request(self): - body = """ - - """ req = Request(url='http://example.org', method='POST', body='test', - headers={'Content-Type': 'text/plain', 'Content-length': '4'}) - rsp = HtmlResponse(url='http://example.org', body=body) + headers={'Content-Type': 'text/plain', 'Content-length': '4'}) + rsp = HtmlResponse(req.url, body=self._body()) req2 = self.mw.process_response(req, rsp, self.spider) assert isinstance(req2, Request) @@ -129,8 +169,8 @@ def test_meta_refresh_trough_posted_request(self): def test_max_redirect_times(self): self.mw.max_redirect_times = 1 - req = Request('http://scrapytest.org/302') - rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302) + req = Request('http://scrapytest.org/max') + rsp = HtmlResponse(req.url, body=self._body()) req = self.mw.process_response(req, rsp, self.spider) assert isinstance(req, Request) @@ -141,7 +181,7 @@ def test_max_redirect_times(self): def test_ttl(self): self.mw.max_redirect_times = 100 req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1}) - rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302) + rsp = HtmlResponse(req.url, body=self._body()) req = self.mw.process_response(req, rsp, self.spider) assert isinstance(req, Request) @@ -149,11 +189,12 @@ def test_ttl(self): def test_redirect_urls(self): req1 = Request('http://scrapytest.org/first') - rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302) + rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected')) req2 = self.mw.process_response(req1, rsp1, self.spider) - rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302) + assert isinstance(req2, Request), req2 + rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2')) req3 = self.mw.process_response(req2, rsp2, self.spider) - + assert isinstance(req3, Request), req3 self.assertEqual(req2.url, 'http://scrapytest.org/redirected') self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first']) self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')