Skip to content

Commit

Permalink
Split redirection into status and metarefresh middlewares, also changes httpcompression priority. Closes #78
Browse files Browse the repository at this point in the history
  • Loading branch information
dangra committed Jan 8, 2013
1 parent fe5d0ce commit 71db7f1
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 78 deletions.
80 changes: 48 additions & 32 deletions scrapy/contrib/downloadermiddleware/redirect.py
Expand Up @@ -6,49 +6,19 @@
from scrapy.exceptions import IgnoreRequest, NotConfigured


class RedirectMiddleware(object):
"""Handle redirection of requests based on response status and meta-refresh html tag"""
class BaseRedirectMiddleware(object):

def __init__(self, settings):
if not settings.getbool('REDIRECT_ENABLED'):
raise NotConfigured
self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')

self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_response(self, request, response, spider):
if 'dont_redirect' in request.meta:
return response
if request.method.upper() == 'HEAD':
if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
else:
return response

if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = self._redirect_request_using_get(request, redirected_url)
return self._redirect(redirected, request, spider, response.status)

if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)

if isinstance(response, HtmlResponse):
interval, url = get_meta_refresh(response)
if url and interval < self.max_metarefresh_delay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')

return response

def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
Expand Down Expand Up @@ -76,3 +46,49 @@ def _redirect_request_using_get(self, request, redirect_url):
return redirected


class RedirectMiddleware(BaseRedirectMiddleware):
    """Handle redirection of requests based on the response status code.

    Follows 301/302/303/307 responses that carry a ``Location`` header.
    For non-HEAD requests, 302/303 are followed with a GET (per common
    browser behavior); 301/307 preserve the original method and body.
    HEAD requests are always replayed as-is to the new location.
    Redirection can be disabled per-request via ``request.meta['dont_redirect']``.
    """

    def process_response(self, request, response, spider):
        if 'dont_redirect' in request.meta:
            return response

        # Nothing to follow without a Location header on a redirect status.
        if ('Location' not in response.headers
                or response.status not in [301, 302, 303, 307]):
            return response

        redirected_url = urljoin(request.url, response.headers['location'])

        if request.method == 'HEAD' or response.status in [301, 307]:
            # Preserve the original method (and body) for the redirected request.
            redirected = request.replace(url=redirected_url)
        else:
            # 302/303 on a non-HEAD request: redirect using GET, dropping the body.
            redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)


class MetaRefreshMiddleware(BaseRedirectMiddleware):
    """Handle redirection of requests based on the meta-refresh html tag.

    Only html responses to non-HEAD requests are considered, and only when
    the refresh interval is below ``REDIRECT_MAX_METAREFRESH_DELAY``.
    Redirection can be disabled per-request via ``request.meta['dont_redirect']``.
    """

    def __init__(self, settings):
        super(MetaRefreshMiddleware, self).__init__(settings)
        # Refresh tags with an interval at or above this are treated as
        # page auto-reloads, not redirects, and are ignored.
        self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')

    def process_response(self, request, response, spider):
        if 'dont_redirect' in request.meta or request.method == 'HEAD' or \
                not isinstance(response, HtmlResponse):
            return response

        # The guard above already ensures response is an HtmlResponse,
        # so no need to re-check before parsing the meta-refresh tag.
        interval, url = get_meta_refresh(response)
        if url and interval < self.max_metarefresh_delay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response
5 changes: 3 additions & 2 deletions scrapy/settings/default_settings.py
@@ -1,5 +1,5 @@
"""
This module contains the default values for all settings used by Scrapy.
This module contains the default values for all settings used by Scrapy.
For more information about these settings you can read the settings
documentation in docs/topics/settings.rst
Expand Down Expand Up @@ -74,10 +74,11 @@
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
Expand Down
49 changes: 29 additions & 20 deletions scrapy/tests/test_downloadermiddleware.py
Expand Up @@ -5,7 +5,6 @@
from scrapy.spider import BaseSpider
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
from scrapy.utils.test import get_crawler
from scrapy.stats import stats


class ManagerTestCase(TestCase):
Expand All @@ -18,22 +17,24 @@ def setUp(self):
self.spider.set_crawler(self.crawler)
self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
# some mw depends on stats collector
stats.open_spider(self.spider)
self.crawler.stats.open_spider(self.spider)
return self.mwman.open_spider(self.spider)

def tearDown(self):
stats.close_spider(self.spider, '')
self.crawler.stats.close_spider(self.spider, '')
return self.mwman.close_spider(self.spider)

def _download(self,request, response=None):
def _download(self, request, response=None):
"""Executes downloader mw manager's download method and returns
the result (Request or Response) or raise exception in case of
failure.
"""
if not response:
response = Response(request.url, request=request)
response = Response(request.url)

def download_func(**kwargs):
return response

dfd = self.mwman.download(download_func, request, self.spider)
# catch deferred result and return the value
results = []
Expand All @@ -50,28 +51,25 @@ class DefaultsTest(ManagerTestCase):

def test_request_response(self):
req = Request('http://example.com/index.html')
resp = Response(req.url, status=200, request=req)
resp = Response(req.url, status=200)
ret = self._download(req, resp)
self.assertTrue(isinstance(resp, Response), "Non-response returned")


class GzippedRedirectionTest(ManagerTestCase):
"""Regression test for a failure when redirecting a compressed
request.
self.assertTrue(isinstance(ret, Response), "Non-response returned")

This happens when httpcompression middleware is executed before redirect
middleware and attempts to decompress a non-compressed body.
In particular when some website returns a 30x response with header
'Content-Encoding: gzip' giving as result the error below:
def test_3xx_and_invalid_gzipped_body_must_redirect(self):
"""Regression test for a failure when redirecting a compressed
request.
exceptions.IOError: Not a gzipped file
This happens when httpcompression middleware is executed before redirect
middleware and attempts to decompress a non-compressed body.
In particular when some website returns a 30x response with header
'Content-Encoding: gzip' giving as result the error below:
"""
exceptions.IOError: Not a gzipped file
def test_gzipped_redirection(self):
"""
req = Request('http://example.com')
body = '<p>You are being redirected</p>'
resp = Response(req.url, status=302, body=body, request=req, headers={
resp = Response(req.url, status=302, body=body, headers={
'Content-Length': len(body),
'Content-Type': 'text/html',
'Content-Encoding': 'gzip',
Expand All @@ -82,3 +80,14 @@ def test_gzipped_redirection(self):
"Not redirected: {0!r}".format(ret))
self.assertEqual(ret.url, resp.headers['Location'],
"Not redirected to location header")

def test_200_and_invalid_gzipped_body_must_fail(self):
    # A 200 response claiming 'Content-Encoding: gzip' with a plain-text
    # body must make httpcompression raise IOError (no redirect middleware
    # intercepts it first, unlike the 30x case).
    body = '<p>You are being redirected</p>'
    req = Request('http://example.com')
    headers = {
        'Content-Length': len(body),
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
        'Location': 'http://example.com/login',
    }
    resp = Response(req.url, status=200, body=body, headers=headers)
    self.assertRaises(IOError, self._download, request=req, response=resp)
89 changes: 65 additions & 24 deletions scrapy/tests/test_downloadermiddleware_redirect.py
@@ -1,11 +1,12 @@
import unittest

from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware, MetaRefreshMiddleware
from scrapy.spider import BaseSpider
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response, HtmlResponse, Headers
from scrapy.http import Request, Response, HtmlResponse
from scrapy.utils.test import get_crawler


class RedirectMiddlewareTest(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -52,7 +53,7 @@ def test_dont_redirect(self):
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = Response(url, headers={'Location': url2}, status=302)

Expand Down Expand Up @@ -86,35 +87,74 @@ def test_redirect_302_head(self):
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp


def test_max_redirect_times(self):
    # With the limit set to 1, the first 302 is followed and the second
    # attempt exceeds the limit and raises IgnoreRequest.
    self.mw.max_redirect_times = 1
    url = 'http://scrapytest.org/302'
    req = Request(url)
    rsp = Response(url, headers={'Location': '/redirected'}, status=302)

    req = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req, Request)
    assert 'redirect_times' in req.meta
    self.assertEqual(req.meta['redirect_times'], 1)
    self.assertRaises(IgnoreRequest, self.mw.process_response,
                      req, rsp, self.spider)

def test_ttl(self):
    # A per-request 'redirect_ttl' caps redirects even when the global
    # max_redirect_times limit is much higher.
    self.mw.max_redirect_times = 100
    req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
    rsp = Response('http://www.scrapytest.org/302',
                   headers={'Location': '/redirected'}, status=302)

    redirected = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(redirected, Request)
    self.assertRaises(IgnoreRequest, self.mw.process_response,
                      redirected, rsp, self.spider)

def test_redirect_urls(self):
    # request.meta['redirect_urls'] must accumulate every URL visited
    # along the redirect chain.
    req1 = Request('http://scrapytest.org/first')
    rsp1 = Response(req1.url, headers={'Location': '/redirected'}, status=302)
    req2 = self.mw.process_response(req1, rsp1, self.spider)

    rsp2 = Response('http://scrapytest.org/redirected',
                    headers={'Location': '/redirected2'}, status=302)
    req3 = self.mw.process_response(req2, rsp2, self.spider)

    self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
    self.assertEqual(req2.meta['redirect_urls'],
                     ['http://scrapytest.org/first'])
    self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
    self.assertEqual(req3.meta['redirect_urls'],
                     ['http://scrapytest.org/first',
                      'http://scrapytest.org/redirected'])

class MetaRefreshMiddlewareTest(unittest.TestCase):

def setUp(self):
    # Build a fresh spider and middleware instance for every test.
    self.spider = BaseSpider('foo')
    self.mw = MetaRefreshMiddleware.from_crawler(get_crawler())

def _body(self, interval=5, url='http://example.org/newpage'):
return """<html><head><meta http-equiv="refresh" content="{0};url={1}"/></head></html>"""\
.format(interval, url)

def test_priority_adjust(self):
    # A meta-refresh redirect must produce a request with boosted priority.
    original = Request('http://a.com')
    page = HtmlResponse(original.url, body=self._body())
    redirected = self.mw.process_response(original, page, self.spider)
    assert redirected.priority > original.priority

def test_meta_refresh(self):
    # A meta-refresh tag within the allowed delay must trigger a redirect.
    # (Removed leftover dead code: an unused `body` literal and a first
    # `rsp` assignment that was immediately overwritten.)
    req = Request(url='http://example.org')
    rsp = HtmlResponse(req.url, body=self._body())
    req2 = self.mw.process_response(req, rsp, self.spider)

    assert isinstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')

def test_meta_refresh_with_high_interval(self):
    # meta-refresh with high intervals don't trigger redirects:
    # the response must be passed through untouched.
    # (Removed leftover dead code: an unused `body` literal and a first
    # `rsp` assignment that was immediately overwritten.)
    req = Request(url='http://example.org')
    rsp = HtmlResponse(url='http://example.org', body=self._body(interval=1000))
    rsp2 = self.mw.process_response(req, rsp, self.spider)

    assert rsp is rsp2

def test_meta_refresh_trough_posted_request(self):
body = """<html>
<head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
</html>"""
req = Request(url='http://example.org', method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(url='http://example.org', body=body)
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)

assert isinstance(req2, Request)
Expand All @@ -129,8 +169,8 @@ def test_meta_refresh_trough_posted_request(self):

def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = Request('http://scrapytest.org/max')
rsp = HtmlResponse(req.url, body=self._body())

req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
Expand All @@ -141,19 +181,20 @@ def test_max_redirect_times(self):
def test_ttl(self):
    # A per-request 'redirect_ttl' caps meta-refresh redirects even when
    # the global max_redirect_times limit is much higher.
    # (Removed leftover dead code: a first `rsp = Response(...)` that was
    # immediately overwritten by the HtmlResponse below.)
    self.mw.max_redirect_times = 100
    req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
    rsp = HtmlResponse(req.url, body=self._body())

    req = self.mw.process_response(req, rsp, self.spider)
    assert isinstance(req, Request)
    self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)

def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected'))
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
assert isinstance(req2, Request), req2
rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2'))
req3 = self.mw.process_response(req2, rsp2, self.spider)

assert isinstance(req3, Request), req3
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
Expand Down

0 comments on commit 71db7f1

Please sign in to comment.