diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index f2f3ef46657..fa65f66ed41 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following settings (see the settings documentation for more info): * :setting:`METAREFRESH_ENABLED` +* :setting:`METAREFRESH_IGNORE_TAGS` * :setting:`METAREFRESH_MAXDELAY` This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`, @@ -826,6 +827,15 @@ Default: ``True`` Whether the Meta Refresh middleware will be enabled. +.. setting:: METAREFRESH_IGNORE_TAGS + +METAREFRESH_IGNORE_TAGS +^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``['script', 'noscript']`` + +Meta tags within these tags are ignored. + .. setting:: METAREFRESH_MAXDELAY METAREFRESH_MAXDELAY diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py index cb59d3fd2bd..49468a2e486 100644 --- a/scrapy/downloadermiddlewares/redirect.py +++ b/scrapy/downloadermiddlewares/redirect.py @@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware): def __init__(self, settings): super(MetaRefreshMiddleware, self).__init__(settings) + self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS') self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY', settings.getint('METAREFRESH_MAXDELAY')) @@ -96,7 +97,8 @@ def process_response(self, request, response, spider): not isinstance(response, HtmlResponse): return response - interval, url = get_meta_refresh(response) + interval, url = get_meta_refresh(response, + ignore_tags=self._ignore_tags) if url and interval < self._maxdelay: redirected = self._redirect_request_using_get(request, url) return self._redirect(redirected, request, spider, 'meta refresh') diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9986827d82e..1ce1516e56b 100644 --- 
a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -221,6 +221,7 @@ MEMUSAGE_WARNING_MB = 0 METAREFRESH_ENABLED = True +METAREFRESH_IGNORE_TAGS = ['script', 'noscript'] METAREFRESH_MAXDELAY = 100 NEWSPIDER_MODULE = '' diff --git a/scrapy/utils/response.py b/scrapy/utils/response.py index bf276b5caa9..122af28b001 100644 --- a/scrapy/utils/response.py +++ b/scrapy/utils/response.py @@ -31,12 +31,12 @@ def get_base_url(response): _metaref_cache = weakref.WeakKeyDictionary() -def get_meta_refresh(response): +def get_meta_refresh(response, ignore_tags=('script', 'noscript')): """Parse the http-equiv refrsh parameter from the given response""" if response not in _metaref_cache: text = response.text[0:4096] _metaref_cache[response] = html.get_meta_refresh(text, response.url, - response.encoding, ignore_tags=('script', 'noscript')) + response.encoding, ignore_tags=ignore_tags) return _metaref_cache[response] diff --git a/tests/test_downloadermiddleware_redirect.py b/tests/test_downloadermiddleware_redirect.py index 6c81c94ca31..0e841489d9a 100644 --- a/tests/test_downloadermiddleware_redirect.py +++ b/tests/test_downloadermiddleware_redirect.py @@ -279,5 +279,24 @@ def test_redirect_reasons(self): self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh']) self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh']) + def test_ignore_tags_default(self): + req = Request(url='http://example.org') + body = ('''<noscript><meta http-equiv="refresh" ''' + '''content="0;URL='http://example.org/newpage'"></noscript>''') + rsp = HtmlResponse(req.url, body=body.encode()) + response = self.mw.process_response(req, rsp, self.spider) + assert isinstance(response, Response) + + def test_ignore_tags_empty_list(self): + crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []}) + mw = MetaRefreshMiddleware.from_crawler(crawler) + req = Request(url='http://example.org') + body = ('''<noscript><meta http-equiv="refresh" ''' + '''content="0;URL='http://example.org/newpage'"></noscript>''') + rsp = HtmlResponse(req.url, body=body.encode()) + req2 = mw.process_response(req, rsp, self.spider) + assert isinstance(req2, Request) + 
self.assertEqual(req2.url, 'http://example.org/newpage') + if __name__ == "__main__": unittest.main()