diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py
index 4e7feeeafac..b6bbfc1dfcd 100644
--- a/scrapy/downloadermiddlewares/httpcompression.py
+++ b/scrapy/downloadermiddlewares/httpcompression.py
@@ -3,7 +3,7 @@
 import zlib
 
 from scrapy.exceptions import NotConfigured
-from scrapy.http import Response, TextResponse
+from scrapy.http import TextResponse
 from scrapy.responsetypes import responsetypes
 from scrapy.utils.deprecate import ScrapyDeprecationWarning
 from scrapy.utils.gz import gunzip
@@ -27,24 +27,43 @@ class HttpCompressionMiddleware:
     """This middleware allows compressed (gzip, deflate) traffic to be
     sent/received from web sites"""
 
-    def __init__(self, stats=None):
+    def __init__(self, stats=None, settings=None):
         self.stats = stats
+        if not stats:
+            warnings.warn(
+                "HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.",
+                ScrapyDeprecationWarning,
+            )
+        if settings:
+            self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER')
+            if not self.keep_encoding_header:
+                warnings.warn(
+                    "Setting COMPRESSION_KEEP_ENCODING_HEADER=False is deprecated",
+                    ScrapyDeprecationWarning,
+                )
+        else:
+            self.keep_encoding_header = False
+            warnings.warn(
+                "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified.",
+                ScrapyDeprecationWarning,
+            )
 
     @classmethod
     def from_crawler(cls, crawler):
         if not crawler.settings.getbool('COMPRESSION_ENABLED'):
             raise NotConfigured
         try:
-            return cls(stats=crawler.stats)
+            return cls(stats=crawler.stats, settings=crawler.settings)
         except TypeError:
             warnings.warn(
                 "HttpCompressionMiddleware subclasses must either modify "
-                "their '__init__' method to support a 'stats' parameter or "
+                "their '__init__' method to support 'stats' and 'settings' parameters or "
                 "reimplement the 'from_crawler' method.",
                 ScrapyDeprecationWarning,
             )
             result = cls()
             result.stats = crawler.stats
+            result.keep_encoding_header = False
             return result
 
     def process_request(self, request, spider):
@@ -52,28 +71,34 @@ def process_request(self, request, spider):
                                    b", ".join(ACCEPTED_ENCODINGS))
 
     def process_response(self, request, response, spider):
-
         if request.method == 'HEAD':
             return response
-        if isinstance(response, Response):
-            content_encoding = response.headers.getlist('Content-Encoding')
-            if content_encoding:
-                encoding = content_encoding.pop()
-                decoded_body = self._decode(response.body, encoding.lower())
-                if self.stats:
-                    self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
-                    self.stats.inc_value('httpcompression/response_count', spider=spider)
-                respcls = responsetypes.from_args(
-                    headers=response.headers, url=response.url, body=decoded_body
-                )
-                kwargs = dict(cls=respcls, body=decoded_body)
-                if issubclass(respcls, TextResponse):
-                    # force recalculating the encoding until we make sure the
-                    # responsetypes guessing is reliable
-                    kwargs['encoding'] = None
-                response = response.replace(**kwargs)
-                if not content_encoding:
-                    del response.headers['Content-Encoding']
+
+        if b'decoded' in response.flags:
+            return response
+
+        content_encoding = response.headers.getlist('Content-Encoding')
+        if not content_encoding:
+            return response
+
+        encoding = content_encoding[0]
+        decoded_body = self._decode(response.body, encoding.lower())
+        if self.stats:
+            self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
+            self.stats.inc_value('httpcompression/response_count', spider=spider)
+        respcls = responsetypes.from_args(
+            headers=response.headers, url=response.url, body=decoded_body
+        )
+        kwargs = dict(cls=respcls, body=decoded_body)
+        if issubclass(respcls, TextResponse):
+            # force recalculating the encoding until we make sure the
+            # responsetypes guessing is reliable
+            kwargs['encoding'] = None
+
+        kwargs['flags'] = response.flags + [b'decoded']
+        response = response.replace(**kwargs)
+        if not self.keep_encoding_header:
+            del response.headers['Content-Encoding']
 
         return response
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index 8389a70cb4b..02dd20b2190 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -37,6 +37,7 @@
 COMMANDS_MODULE = ''
 
 COMPRESSION_ENABLED = True
+COMPRESSION_KEEP_ENCODING_HEADER = True
 
 CONCURRENT_ITEMS = 100
 
diff --git a/scrapy/templates/project/module/settings.py.tmpl b/scrapy/templates/project/module/settings.py.tmpl
index a414b5fde5b..1b4a9d7516d 100644
--- a/scrapy/templates/project/module/settings.py.tmpl
+++ b/scrapy/templates/project/module/settings.py.tmpl
@@ -86,3 +86,6 @@ ROBOTSTXT_OBEY = True
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+# Keep the original Content-Encoding header
+COMPRESSION_KEEP_ENCODING_HEADER = True
diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py
index 40e9f3a9644..6c7b6ae33f1 100644
--- a/tests/test_downloadermiddleware_httpcompression.py
+++ b/tests/test_downloadermiddleware_httpcompression.py
@@ -4,6 +4,7 @@
 from unittest import TestCase, SkipTest
 from warnings import catch_warnings
 
+from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.http import Response, Request, HtmlResponse
 from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware, ACCEPTED_ENCODINGS
@@ -14,7 +15,6 @@
 from tests import tests_datadir
 from w3lib.encoding import resolve_encoding
 
-
 SAMPLEDIR = join(tests_datadir, 'compressed')
 
 FORMAT = {
@@ -102,23 +102,25 @@ def test_process_response_gzip(self):
         self.assertEqual(response.headers['Content-Encoding'], b'gzip')
 
         newresponse = self.mw.process_response(request, response, self.spider)
-        assert newresponse is not response
-        assert newresponse.body.startswith(b'