From 5f068dc9863d158557c2aa68c05395620f124b3f Mon Sep 17 00:00:00 2001 From: VorBoto Date: Fri, 13 Sep 2019 22:15:56 -0400 Subject: [PATCH 01/36] Add HEADERS_KEEP variable to settings and to HttpCompressionMiddleware class (#1988) --- scrapy/downloadermiddlewares/httpcompression.py | 9 +++++++-- scrapy/settings/default_settings.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 203dee42dde..8a46d6ed828 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -18,6 +18,10 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" + + HEADERS_KEEP = setting.getbool('HEADERS_KEEP') + + @classmethod def from_crawler(cls, crawler): if not crawler.settings.getbool('COMPRESSION_ENABLED'): @@ -45,8 +49,9 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None response = response.replace(**kwargs) - if not content_encoding: - del response.headers['Content-Encoding'] + if not HEADERS_KEEP: + if not content_encoding: + del response.headers['Content-Encoding'] return response diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 05ab4b62850..f76af9095c4 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -170,6 +170,8 @@ FTP_PASSWORD = 'guest' FTP_PASSIVE_MODE = True +HEADERS_KEEP = False + HTTPCACHE_ENABLED = False HTTPCACHE_DIR = 'httpcache' HTTPCACHE_IGNORE_MISSING = False From 6a725ca46cdfa4907300769bf76d585d47683d5d Mon Sep 17 00:00:00 2001 From: VorBoto Date: Sat, 14 Sep 2019 21:34:10 -0400 Subject: [PATCH 02/36] fixed type --- scrapy/downloadermiddlewares/httpcompression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 8a46d6ed828..2e06b2963b9 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -19,9 +19,8 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" - HEADERS_KEEP = setting.getbool('HEADERS_KEEP') + HEADERS_KEEP = settings.getbool('HEADERS_KEEP') - @classmethod def from_crawler(cls, crawler): if not crawler.settings.getbool('COMPRESSION_ENABLED'): From 2b6fb1befa9b1d7c931d10e6ca8cec5f265ac298 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 17:49:18 -0400 Subject: [PATCH 03/36] Added init to HttpCompMiddleware #1988 --- scrapy/downloadermiddlewares/httpcompression.py | 7 +++++-- scrapy/settings/default_settings.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 2e06b2963b9..911d9832e07 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -19,7 +19,10 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" - HEADERS_KEEP = settings.getbool('HEADERS_KEEP') + def __int__(self, settings): + #setting to decide to keep or discard encoding header + #stored in default_settings.py + self.keep_encoding_header = setting.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod def from_crawler(cls, crawler): @@ -48,7 +51,7 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None response = response.replace(**kwargs) - if not HEADERS_KEEP: + if not self.keep_encoding_header: if not content_encoding: del response.headers['Content-Encoding'] diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index f76af9095c4..5a094556f40 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -170,8 +170,6 @@ FTP_PASSWORD = 'guest' FTP_PASSIVE_MODE = True -HEADERS_KEEP = False - HTTPCACHE_ENABLED = False HTTPCACHE_DIR = 'httpcache' HTTPCACHE_IGNORE_MISSING = False @@ -185,6 +183,8 @@ HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False +HTTPCOMPRESSION_HEADERS_KEEP = False + HTTPPROXY_ENABLED = True HTTPPROXY_AUTH_ENCODING = 'latin-1' From 96a8dfcc411638bfff45ad071015992e66aac196 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 18:05:48 -0400 Subject: [PATCH 04/36] Added init to HttpCompMiddleware #1988 Typeo --- scrapy/downloadermiddlewares/httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 911d9832e07..c275837c8ca 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -22,7 +22,7 @@ class HttpCompressionMiddleware(object): def __int__(self, settings): #setting to decide to keep or discard encoding header #stored in default_settings.py - self.keep_encoding_header = setting.getbool('HTTPCOMPRESSION_HEADERS_KEEP') + self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod def from_crawler(cls, crawler): From 9010d9ffe25cb1f7768c040d4efc733edf82a876 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 18:36:59 -0400 Subject: [PATCH 05/36] Trying with passing init crawler #1988 --- scrapy/downloadermiddlewares/httpcompression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index c275837c8ca..c28d9c8f4a6 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -19,10 +19,10 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" - def __int__(self, settings): + def __int__(self, crawler): #setting to decide to keep or discard encoding header #stored in default_settings.py - self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') + self.keep_encoding_header = crawler.settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod def from_crawler(cls, crawler): From 608b3422a18ebcf3f0e911d8bb92a1717babef77 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 19:19:04 -0400 Subject: [PATCH 06/36] Adding settings to HttpCompressionMiddleware instantiation in test --- scrapy/downloadermiddlewares/httpcompression.py | 4 ++-- tests/test_downloadermiddleware_httpcompression.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index c28d9c8f4a6..c275837c8ca 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -19,10 +19,10 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" - def __int__(self, crawler): + def __int__(self, settings): #setting to decide to keep or discard encoding header #stored in default_settings.py - self.keep_encoding_header = crawler.settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') + self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod def from_crawler(cls, crawler): diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 0745c8dd3be..f3dbdbacdcb 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -28,7 +28,7 @@ class HttpCompressionTest(TestCase): def setUp(self): self.spider = Spider('foo') - self.mw = HttpCompressionMiddleware() + self.mw = HttpCompressionMiddleware(settings) def _getresponse(self, coding): if coding not in FORMAT: From 3350d0fb3f6f4c3f8af09b14d6f398e3195aff46 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 20:21:20 -0400 Subject: [PATCH 07/36] Slowed down and tried to take things slower --- scrapy/downloadermiddlewares/httpcompression.py | 7 ++++--- tests/test_downloadermiddleware_httpcompression.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index c275837c8ca..1d95f55b274 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -20,15 +20,13 @@ class HttpCompressionMiddleware(object): sent/received from web sites""" def __int__(self, settings): - #setting to decide to keep or discard encoding header - #stored in default_settings.py self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod def from_crawler(cls, crawler): if not crawler.settings.getbool('COMPRESSION_ENABLED'): raise NotConfigured - return cls() + return cls(crawler.settings) def process_request(self, request, spider): request.headers.setdefault('Accept-Encoding', @@ -54,6 +52,9 @@ def process_response(self, request, response, spider): if not self.keep_encoding_header: if not content_encoding: del response.headers['Content-Encoding'] + #else: + #flags_plus_one = response.flags.append(#encoding-value#) + #response.replace(flags=glags_plus_one) return response diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index f3dbdbacdcb..20e36de53e1 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -28,7 +28,7 @@ class HttpCompressionTest(TestCase): def setUp(self): self.spider = Spider('foo') - self.mw = HttpCompressionMiddleware(settings) + self.mw = HttpCompressionMiddleware.from_crawler(crawler) def _getresponse(self, coding): if coding not in FORMAT: From cb09ef57ddade03b1014f7038e1bd630c0947f74 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 21:29:37 -0400 Subject: [PATCH 08/36] I mispelled init -_- --- scrapy/downloadermiddlewares/httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 1d95f55b274..2d246d8ec1a 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -19,7 +19,7 @@ class HttpCompressionMiddleware(object): """This middleware allows compressed (gzip, deflate) traffic to be sent/received from web sites""" - def __int__(self, settings): + def __init__(self, settings): self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') @classmethod From f949126da9a32d196d6109652bd485ed3c84b561 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 21:59:28 -0400 Subject: [PATCH 09/36] changed the how its intialized in test --- tests/test_downloadermiddleware_httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 20e36de53e1..2e5f33ef0da 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -28,7 +28,7 @@ class HttpCompressionTest(TestCase): def setUp(self): self.spider = Spider('foo') - self.mw = HttpCompressionMiddleware.from_crawler(crawler) + self.mw = HttpCompressionMiddleware(self.spider.settings) def _getresponse(self, coding): if coding not in FORMAT: From c94f35b2d37cfd13853b3d3aa7799af117cebb7c Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 22:18:56 -0400 Subject: [PATCH 10/36] Added a crawler to HttpCompressionTest using the Spider it created --- tests/test_downloadermiddleware_httpcompression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 2e5f33ef0da..7859f95b04c 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -28,7 +28,8 @@ class HttpCompressionTest(TestCase): def setUp(self): self.spider = Spider('foo') - self.mw = HttpCompressionMiddleware(self.spider.settings) + crawler = get_crawler(Spider, {'HTTPCOMPRESSION_HEADERS_KEEP': False}) + self.mw = HttpCompressionMiddleware.from_crawler(crawler.settings) def _getresponse(self, coding): if coding not in FORMAT: From 490fb3c6db5beed4a3140a704be203bf83a5555f Mon Sep 17 00:00:00 2001 From: VorBoto Date: Mon, 16 Sep 2019 22:29:49 -0400 Subject: [PATCH 11/36] Added from scrapy.utils.test import get_crawler --- tests/test_downloadermiddleware_httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 7859f95b04c..0d7673e052b 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -11,7 +11,7 @@ from scrapy.utils.gz import gunzip from tests import tests_datadir from w3lib.encoding import resolve_encoding - +from scrapy.utils.test import get_crawler SAMPLEDIR = join(tests_datadir, 'compressed') From e22c8511ae6eed6b48f27b041b4398d2f59409a6 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Tue, 17 Sep 2019 18:43:21 -0400 Subject: [PATCH 12/36] Passed tox but still not right --- tests/test_downloadermiddleware_httpcompression.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 0d7673e052b..fdd3288572a 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -27,9 +27,11 @@ class HttpCompressionTest(TestCase): def setUp(self): - self.spider = Spider('foo') - crawler = get_crawler(Spider, {'HTTPCOMPRESSION_HEADERS_KEEP': False}) - self.mw = HttpCompressionMiddleware.from_crawler(crawler.settings) + #need to find way to access settings here + crawler = get_crawler(Spider,{'HTTPCOMPRESSION_HEADERS_KEEP':False, + 'COMPRESSION_ENABLED':True}) + self.spider = crawler._create_spider('foo') + self.mw = HttpCompressionMiddleware.from_crawler(crawler) def _getresponse(self, coding): if coding not in FORMAT: From e479f88a1e2ccee1fe9f1f9b92defa20467b6895 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Wed, 18 Sep 2019 07:02:55 -0400 Subject: [PATCH 13/36] Trying to add flag for encoding --- scrapy/downloadermiddlewares/httpcompression.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 2d246d8ec1a..5de3755236d 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -48,14 +48,16 @@ def process_response(self, request, response, spider): # force recalculating the encoding until we make sure the # responsetypes guessing is reliable kwargs['encoding'] = None + + if self.keep_encoding_header: + #check encoding of outgoing response + out_encoding = TextResponse(response)._body_inferred_encoding() + kwargs['flags'] = out_encoding + response = response.replace(**kwargs) if not self.keep_encoding_header: if not content_encoding: del response.headers['Content-Encoding'] - #else: - #flags_plus_one = response.flags.append(#encoding-value#) - #response.replace(flags=glags_plus_one) - return response def _decode(self, body, encoding): From 25d476080b07d260baf285abb1545fd506c3e64b Mon Sep 17 00:00:00 2001 From: VorBoto Date: Wed, 18 Sep 2019 07:06:27 -0400 Subject: [PATCH 14/36] Trying to add flag for encoding --- scrapy/downloadermiddlewares/httpcompression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 5de3755236d..c9593c31ccf 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -51,7 +51,8 @@ def process_response(self, request, response, spider): if self.keep_encoding_header: #check encoding of outgoing response - out_encoding = TextResponse(response)._body_inferred_encoding() + #dont know if i can do this with a RespinceTypes object + out_encoding = TextResponse(respcls)._body_inferred_encoding() kwargs['flags'] = out_encoding response = response.replace(**kwargs) From ccbb52453cf33b32da4e9cf66c95919fbb35d37d Mon Sep 17 00:00:00 2001 From: VorBoto Date: Wed, 18 Sep 2019 19:46:32 -0400 Subject: [PATCH 15/36] Place encodings in response's flags issue #1988 --- scrapy/downloadermiddlewares/httpcompression.py | 12 +++--------- tests/test_downloadermiddleware_httpcompression.py | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index c9593c31ccf..0087f79db48 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -48,17 +48,11 @@ def process_response(self, request, response, spider): # force recalculating the encoding until we make sure the # responsetypes guessing is reliable kwargs['encoding'] = None - if self.keep_encoding_header: - #check encoding of outgoing response - #dont know if i can do this with a RespinceTypes object - out_encoding = TextResponse(respcls)._body_inferred_encoding() - kwargs['flags'] = out_encoding - + kwargs['flags'] = [encoding] response = response.replace(**kwargs) - if not self.keep_encoding_header: - if not content_encoding: - del response.headers['Content-Encoding'] + if not content_encoding: + del response.headers['Content-Encoding'] return response def _decode(self, body, encoding): diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index fdd3288572a..fc5b57380d0 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -28,7 +28,7 @@ class HttpCompressionTest(TestCase): def setUp(self): #need to find way to access settings here - crawler = get_crawler(Spider,{'HTTPCOMPRESSION_HEADERS_KEEP':False, + crawler = get_crawler(Spider,{'HTTPCOMPRESSION_HEADERS_KEEP':True, 'COMPRESSION_ENABLED':True}) self.spider = crawler._create_spider('foo') self.mw = HttpCompressionMiddleware.from_crawler(crawler) From 881426563a61f6515d1b4c25b25b0c6136155291 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Sat, 21 Sep 2019 19:09:45 -0400 Subject: [PATCH 16/36] Did some recomended changes --- .../downloadermiddlewares/httpcompression.py | 3 ++- scrapy/settings/default_settings.py | 3 +-- ...est_downloadermiddleware_httpcompression.py | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 0087f79db48..9ccb0eeb89d 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -20,7 +20,8 @@ class HttpCompressionMiddleware(object): sent/received from web sites""" def __init__(self, settings): - self.keep_encoding_header = settings.getbool('HTTPCOMPRESSION_HEADERS_KEEP') + self.keep_encoding_header = settings.getbool( + 'COMPRESSION_KEEP_ENCODING_HEADERS') @classmethod def from_crawler(cls, crawler): diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 5a094556f40..89e8262bb8d 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -37,6 +37,7 @@ COMMANDS_MODULE = '' COMPRESSION_ENABLED = True +COMPRESSION_KEEP_ENCODING_HEADERS = False CONCURRENT_ITEMS = 100 @@ -183,8 +184,6 @@ HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False -HTTPCOMPRESSION_HEADERS_KEEP = False - HTTPPROXY_ENABLED = True HTTPPROXY_AUTH_ENCODING = 'latin-1' diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index fc5b57380d0..0acf73905c7 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -26,12 +26,20 @@ class HttpCompressionTest(TestCase): - def setUp(self): + def create_spider_mw(self ,compression_enabled, compression_headers): + crawler = get_crawler(Spider, + {'COMPRESSION_ENABLED': + compression_enabled, + 'COMPRESSION_KEEP_ENCODING_HEADERS': + compression_header}) + spider = crawler._create_spider('foo') + mw = HttpCompressionMiddleware.from_crawler(crawler) + return spider, mw + + def setUp(self, spider, mw): #need to find way to access settings here - crawler = get_crawler(Spider,{'HTTPCOMPRESSION_HEADERS_KEEP':True, - 'COMPRESSION_ENABLED':True}) - self.spider = crawler._create_spider('foo') - self.mw = HttpCompressionMiddleware.from_crawler(crawler) + self.spider = spider + self.mw = mw def _getresponse(self, coding): if coding not in FORMAT: From 4588368416b487b6f0f3f3f432369536ef14b1cd Mon Sep 17 00:00:00 2001 From: VorBoto Date: Tue, 24 Sep 2019 22:23:13 -0400 Subject: [PATCH 17/36] updated tests to function with no paramters sent to setUp() --- scrapy/downloadermiddlewares/httpcompression.py | 13 +++++++++---- tests/test_downloadermiddleware_httpcompression.py | 10 ++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 9ccb0eeb89d..34fa1746b42 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -34,7 +34,6 @@ def process_request(self, request, spider): b",".join(ACCEPTED_ENCODINGS)) def process_response(self, request, response, spider): - if request.method == 'HEAD': return response if isinstance(response, Response): @@ -50,10 +49,16 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None if self.keep_encoding_header: - kwargs['flags'] = [encoding] + if isinstance(kwargs['flags'], list): + kwargs['flags'].append(b'decoded') + elif isinstance(kwargs['flags'], bytes): + kwargs['flags'] = [kwargs['flags'], b'decoded'] + else: + kwargs['flags'] = [b'decoded'] response = response.replace(**kwargs) - if not content_encoding: - del response.headers['Content-Encoding'] + if not self.keep_encoding_header: + if not content_encoding: + del response.headers['Content-Encoding'] return response def _decode(self, body, encoding): diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 0acf73905c7..a035f191b99 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -26,7 +26,9 @@ class HttpCompressionTest(TestCase): - def create_spider_mw(self ,compression_enabled, compression_headers): + def create_spider_mw(self + ,compression_enabled=True + ,compression_header=False): crawler = get_crawler(Spider, {'COMPRESSION_ENABLED': compression_enabled, @@ -36,11 +38,11 @@ def create_spider_mw(self ,compression_enabled, compression_headers): mw = HttpCompressionMiddleware.from_crawler(crawler) return spider, mw - def setUp(self, spider, mw): + def setUp(self): #need to find way to access settings here - self.spider = spider - self.mw = mw + self.spider, self.mw = self.create_spider_mw() + def _getresponse(self, coding): if coding not in FORMAT: raise ValueError() From 0a0b40d6e912e6d7fe41e13a83eda6343bbdd668 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Wed, 25 Sep 2019 19:25:09 -0400 Subject: [PATCH 18/36] Switched dict['k'] to a dict.get('k') so no KeyError --- scrapy/downloadermiddlewares/httpcompression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 34fa1746b42..d54df1969d3 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -49,9 +49,9 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None if self.keep_encoding_header: - if isinstance(kwargs['flags'], list): + if isinstance(kwargs.get('flags'), list): kwargs['flags'].append(b'decoded') - elif isinstance(kwargs['flags'], bytes): + elif isinstance(kwargs.get('flags'), bytes): kwargs['flags'] = [kwargs['flags'], b'decoded'] else: kwargs['flags'] = [b'decoded'] From 30ead531692a89ce2b38fd9478d48d9418245d2b Mon Sep 17 00:00:00 2001 From: VorBoto Date: Thu, 26 Sep 2019 19:59:33 -0400 Subject: [PATCH 19/36] Actually pull the flags before trying to add decoded --- scrapy/downloadermiddlewares/httpcompression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index d54df1969d3..81d9903f4fb 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -48,6 +48,7 @@ def process_response(self, request, response, spider): # force recalculating the encoding until we make sure the # responsetypes guessing is reliable kwargs['encoding'] = None + kwargs['flags'] = response.flags if self.keep_encoding_header: if isinstance(kwargs.get('flags'), list): kwargs['flags'].append(b'decoded') From fd11144d4defd2fccd32e51a2772c1cf22e3f6d2 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Fri, 27 Sep 2019 19:33:50 -0400 Subject: [PATCH 20/36] Removed unecessary checks for flag entry of kwargs --- scrapy/downloadermiddlewares/httpcompression.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 81d9903f4fb..6ec2f3bddfb 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -48,14 +48,9 @@ def process_response(self, request, response, spider): # force recalculating the encoding until we make sure the # responsetypes guessing is reliable kwargs['encoding'] = None - kwargs['flags'] = response.flags if self.keep_encoding_header: - if isinstance(kwargs.get('flags'), list): - kwargs['flags'].append(b'decoded') - elif isinstance(kwargs.get('flags'), bytes): - kwargs['flags'] = [kwargs['flags'], b'decoded'] - else: - kwargs['flags'] = [b'decoded'] + kwargs['flags'] = response.flags + kwargs['flags'].append(b'decoded') response = response.replace(**kwargs) if not self.keep_encoding_header: if not content_encoding: From 091a8a4ecfc5140144af517ed3293b2acf549d46 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Fri, 4 Oct 2019 22:29:22 -0400 Subject: [PATCH 21/36] Added/copied a few test as a way to show some continued functionality --- ...st_downloadermiddleware_httpcompression.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index a035f191b99..a6562e0769a 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -262,3 +262,34 @@ def test_process_response_head_request_no_decode_required(self): newresponse = self.mw.process_response(request, response, self.spider) self.assertIs(newresponse, response) self.assertEqual(response.body, b'') + + def test_process_response_gzip_keep_headers(self): + self.spider, self.mw = self.create_spider_mw( + compression_enabled=True, + compression_header=True) + response = self._getresponse('gzip') + request = response.request + + self.assertEqual(response.headers['Content-Encoding'], b'gzip') + newresponse = self.mw.process_response(request, response, self.spider) + assert newresponse is not response + assert newresponse.body.startswith(b' Date: Fri, 4 Oct 2019 22:57:12 -0400 Subject: [PATCH 22/36] Fix an oversight --- tests/test_downloadermiddleware_httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index a6562e0769a..0e2967cba4d 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -275,7 +275,7 @@ def test_process_response_gzip_keep_headers(self): assert newresponse is not response assert newresponse.body.startswith(b' Date: Fri, 4 Oct 2019 23:12:55 -0400 Subject: [PATCH 23/36] Fix an oversight on casting a str to binary --- tests/test_downloadermiddleware_httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 0e2967cba4d..419cec40a5b 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -290,6 +290,6 @@ def test_process_response_gzip_binary_octetstream_contenttype(self): self.assertIsNot(newresponse, response) self.assertTrue(newresponse.body.startswith(b' Date: Fri, 4 Oct 2019 23:28:32 -0400 Subject: [PATCH 24/36] Fix an oversight on casting a str to binary again --- tests/test_downloadermiddleware_httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 419cec40a5b..4f22e827235 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -275,7 +275,7 @@ def test_process_response_gzip_keep_headers(self): assert newresponse is not response assert newresponse.body.startswith(b' Date: Wed, 16 Oct 2019 20:15:16 -0400 Subject: [PATCH 25/36] fix some sytanx and formating as well as variable usage --- .../downloadermiddlewares/httpcompression.py | 8 +++---- ...st_downloadermiddleware_httpcompression.py | 24 +++++++------------ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 6ec2f3bddfb..9d81ecd50cc 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -49,12 +49,10 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None if self.keep_encoding_header: - kwargs['flags'] = response.flags - kwargs['flags'].append(b'decoded') + kwargs['flags'] = response.flags +[b'decoded'] response = response.replace(**kwargs) - if not self.keep_encoding_header: - if not content_encoding: - del response.headers['Content-Encoding'] + if not (self.keep_encoding_header and not content_encoding: + del response.headers['Content-Encoding'] return response def _decode(self, body, encoding): diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 4f22e827235..f9817fe9e01 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -23,26 +23,19 @@ 'br': ('html-br.bin', 'br') } - class HttpCompressionTest(TestCase): - def create_spider_mw(self - ,compression_enabled=True - ,compression_header=False): - crawler = get_crawler(Spider, - {'COMPRESSION_ENABLED': - compression_enabled, - 'COMPRESSION_KEEP_ENCODING_HEADERS': - compression_header}) + def create_spider_mw(self, compression_enabled=True, compression_header=False): + settings = {'COMPRESSION_ENABLED': compression_enabled, + 'COMPRESSION_KEEP_ENCODING_HEADERS': compression_header} + crawler = get_crawler(Spider, settings) spider = crawler._create_spider('foo') mw = HttpCompressionMiddleware.from_crawler(crawler) return spider, mw def setUp(self): - #need to find way to access settings here self.spider, self.mw = self.create_spider_mw() - def _getresponse(self, coding): if coding not in FORMAT: raise ValueError() @@ -264,29 +257,28 @@ def test_process_response_head_request_no_decode_required(self): self.assertEqual(response.body, b'') def test_process_response_gzip_keep_headers(self): - self.spider, self.mw = self.create_spider_mw( + test_spider, test_mw = self.create_spider_mw( compression_enabled=True, compression_header=True) response = self._getresponse('gzip') request = response.request self.assertEqual(response.headers['Content-Encoding'], b'gzip') - newresponse = self.mw.process_response(request, response, self.spider) + newresponse = test_mw.process_response(request, response, test_spider) assert newresponse is not response assert newresponse.body.startswith(b' Date: Wed, 16 Oct 2019 20:51:44 -0400 Subject: [PATCH 26/36] Ther was a ( on the loose --- scrapy/downloadermiddlewares/httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 9d81ecd50cc..d9d58d2e39f 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -51,7 +51,7 @@ def process_response(self, request, response, spider): if self.keep_encoding_header: kwargs['flags'] = response.flags +[b'decoded'] response = response.replace(**kwargs) - if not (self.keep_encoding_header and not content_encoding: + if not self.keep_encoding_header and not content_encoding: del response.headers['Content-Encoding'] return response From 0a03b459a33a167ea669305169675799af16bdd9 Mon Sep 17 00:00:00 2001 From: VorBoto Date: Fri, 18 Oct 2019 07:03:43 -0400 Subject: [PATCH 27/36] Syntax clean up --- scrapy/downloadermiddlewares/httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index d9d58d2e39f..46e48fa59d0 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -49,7 +49,7 @@ def process_response(self, request, response, spider): # responsetypes guessing is reliable kwargs['encoding'] = None if self.keep_encoding_header: - kwargs['flags'] = response.flags +[b'decoded'] + kwargs['flags'] = response.flags + [b'decoded'] response = response.replace(**kwargs) if not self.keep_encoding_header and not content_encoding: del response.headers['Content-Encoding'] From 52b435adc783eb69cf703fea8a35421d396bba99 Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 23 Oct 2021 13:19:22 +0100 Subject: [PATCH 28/36] fix post merge tests --- .../downloadermiddlewares/httpcompression.py | 2 +- ...st_downloadermiddleware_httpcompression.py | 75 +++++++++---------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index f372d3f84f3..d5c88f8280b 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -45,8 +45,8 @@ def from_crawler(cls, crawler): ScrapyDeprecationWarning, ) result = cls() - result.settings = crawler.settings result.stats = crawler.stats + result.settings = crawler.settings return result def process_request(self, request, spider): diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index 72493992e1e..df40beee637 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -13,7 +13,6 @@ from scrapy.utils.test import get_crawler from tests import tests_datadir from w3lib.encoding import resolve_encoding -from scrapy.utils.test import get_crawler SAMPLEDIR = join(tests_datadir, 'compressed') @@ -34,16 +33,10 @@ class HttpCompressionTest(TestCase): - def create_spider_mw(self, compression_enabled=True, compression_header=False): - settings = {'COMPRESSION_ENABLED': compression_enabled, - 'COMPRESSION_KEEP_ENCODING_HEADERS': compression_header} - crawler = get_crawler(Spider, settings) - spider = crawler._create_spider('foo') - mw = HttpCompressionMiddleware.from_crawler(crawler) - return spider, mw - def setUp(self): - self.crawler = get_crawler(Spider) + settings = {'COMPRESSION_ENABLED': True, + 'COMPRESSION_KEEP_ENCODING_HEADERS': False} + self.crawler = get_crawler(Spider, settings) self.spider = self.crawler._create_spider('scrapytest.org') self.mw = HttpCompressionMiddleware.from_crawler(self.crawler) self.crawler.stats.open_spider(self.spider) @@ -358,6 +351,38 @@ def test_process_response_head_request_no_decode_required(self): self.assertStatsEqual('httpcompression/response_count', None) self.assertStatsEqual('httpcompression/response_bytes', None) + def test_process_response_gzip_keep_headers(self): + settings = {'COMPRESSION_ENABLED': True, + 'COMPRESSION_KEEP_ENCODING_HEADERS': True} + crawler = get_crawler(Spider, settings) + spider = crawler._create_spider('scrapytest.org') + mw = HttpCompressionMiddleware.from_crawler(crawler) + response = self._getresponse('gzip') + request = response.request + + self.assertEqual(response.headers['Content-Encoding'], b'gzip') + newresponse = mw.process_response(request, response, spider) + self.assertIsNot(newresponse, response) + self.assertTrue(newresponse.body.startswith(b' Date: Sat, 23 Oct 2021 13:19:49 +0100 Subject: [PATCH 29/36] conftest 'tests/ignores.txt' to rely on native paths --- conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 117087790d2..224db435fb0 100644 --- a/conftest.py +++ b/conftest.py @@ -21,7 +21,8 @@ def _py_files(folder): *_py_files("tests/CrawlerRunner"), ] -with open('tests/ignores.txt') as reader: +ignore_tests_file = Path("tests/ignores.txt") +with open(ignore_tests_file) as reader: for line in reader: file_path = line.strip() if file_path and file_path[0] != '#': From fa417f5d88cc9a161b38a1827356258e4cbca967 Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 23 Oct 2021 18:03:14 +0100 Subject: [PATCH 30/36] revert conftest.py --- conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index 224db435fb0..a93764f6f28 100644 --- a/conftest.py +++ b/conftest.py @@ -21,8 +21,7 @@ def _py_files(folder): *_py_files("tests/CrawlerRunner"), ] -ignore_tests_file = Path("tests/ignores.txt") -with open(ignore_tests_file) as reader: +with open("tests/ignores.txt") as reader: for line in reader: file_path = line.strip() if file_path and file_path[0] != '#': From 0d095c4cf8eadd4be2b7321dfd4492a8f8b35858 Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 23 Oct 2021 18:39:35 +0100 Subject: [PATCH 31/36] HttpCompressionMiddleware COMPRESSION_KEEP_ENCODING_HEADERS updates - HttpCompressionMiddleware constructor deprecation warning - refactor process_response - put COMPRESSION_KEEP_ENCODING_HEADER in settings.py.tmpl - test HttpCompressionMiddleware init - test COMPRESSION_KEEP_ENCODING_HEADERS presence - ensure b'decoded' flag is set if decoded --- .../downloadermiddlewares/httpcompression.py | 69 ++++++++++++------- scrapy/settings/default_settings.py | 2 +- .../templates/project/module/settings.py.tmpl | 3 + ...st_downloadermiddleware_httpcompression.py | 64 ++++++++++++----- 4 files changed, 95 insertions(+), 43 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index d5c88f8280b..30188bdf57a 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -29,7 +29,25 @@ class HttpCompressionMiddleware: sent/received from web sites""" def __init__(self, stats=None, settings=None): self.stats = stats - self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER') + if not stats: + warnings.warn( + "HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.", + ScrapyDeprecationWarning, + ) + if settings: + self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER') + if not self.keep_encoding_header: + warnings.warn( + "COMPRESSION_KEEP_ENCODING_HEADER should be set to True in settings.", + ScrapyDeprecationWarning, + ) + else: + self.keep_encoding_header = False + warnings.warn( + "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified." + "Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}", + ScrapyDeprecationWarning, + ) @classmethod def from_crawler(cls, crawler): @@ -46,7 +64,7 @@ def from_crawler(cls, crawler): ) result = cls() result.stats = crawler.stats - result.settings = crawler.settings + result.keep_encoding_header = False return result def process_request(self, request, spider): @@ -56,27 +74,32 @@ def process_request(self, request, spider): def process_response(self, request, response, spider): if request.method == 'HEAD': return response - if isinstance(response, Response): - content_encoding = response.headers.getlist('Content-Encoding') - if content_encoding: - encoding = content_encoding.pop() - decoded_body = self._decode(response.body, encoding.lower()) - if self.stats: - self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider) - self.stats.inc_value('httpcompression/response_count', spider=spider) - respcls = responsetypes.from_args( - headers=response.headers, url=response.url, body=decoded_body - ) - kwargs = dict(cls=respcls, body=decoded_body) - if issubclass(respcls, TextResponse): - # force recalculating the encoding until we make sure the - # responsetypes guessing is reliable - kwargs['encoding'] = None - if self.keep_encoding_header: - kwargs['flags'] = response.flags + [b'decoded'] - response = response.replace(**kwargs) - if not self.keep_encoding_header and not content_encoding: - del response.headers['Content-Encoding'] + + if b'decoded' in response.flags: + return response + + content_encoding = response.headers.getlist('Content-Encoding') + if not content_encoding: + return response + + encoding = content_encoding.pop() + decoded_body = self._decode(response.body, encoding.lower()) + if self.stats: + self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider) + self.stats.inc_value('httpcompression/response_count', spider=spider) + respcls = responsetypes.from_args( + headers=response.headers, url=response.url, body=decoded_body + ) + kwargs = dict(cls=respcls, body=decoded_body) + if issubclass(respcls, TextResponse): + # force recalculating the encoding until we make sure the + # responsetypes guessing is reliable + kwargs['encoding'] = None + + kwargs['flags'] = response.flags + [b'decoded'] + response = response.replace(**kwargs) + if not self.keep_encoding_header and not content_encoding: + del response.headers['Content-Encoding'] return response diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index a0939899e48..11e9678896b 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -37,7 +37,7 @@ COMMANDS_MODULE = '' COMPRESSION_ENABLED = True -COMPRESSION_KEEP_ENCODING_HEADERS = False +COMPRESSION_KEEP_ENCODING_HEADER = False CONCURRENT_ITEMS = 100 diff --git a/scrapy/templates/project/module/settings.py.tmpl b/scrapy/templates/project/module/settings.py.tmpl index a414b5fde5b..1b4a9d7516d 100644 --- a/scrapy/templates/project/module/settings.py.tmpl +++ b/scrapy/templates/project/module/settings.py.tmpl @@ -86,3 +86,6 @@ ROBOTSTXT_OBEY = True #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +# Keep the original Content-Encoding header +COMPRESSION_KEEP_ENCODING_HEADER = True diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index df40beee637..c55b37c63f3 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -34,9 +34,7 @@ class HttpCompressionTest(TestCase): def setUp(self): - settings = {'COMPRESSION_ENABLED': True, - 'COMPRESSION_KEEP_ENCODING_HEADERS': False} - self.crawler = get_crawler(Spider, settings) + self.crawler = get_crawler(Spider) self.spider = self.crawler._create_spider('scrapytest.org') self.mw = HttpCompressionMiddleware.from_crawler(self.crawler) self.crawler.stats.open_spider(self.spider) @@ -168,6 +166,7 @@ def test_process_response_rawdeflate(self): assert 'Content-Encoding' not in newresponse.headers self.assertStatsEqual('httpcompression/response_count', 1) self.assertStatsEqual('httpcompression/response_bytes', 74840) + self.assertIn(b'decoded', newresponse.flags) def test_process_response_zlibdelate(self): response = self._getresponse('zlibdeflate') @@ -180,6 +179,7 @@ def test_process_response_zlibdelate(self): assert 'Content-Encoding' not in newresponse.headers self.assertStatsEqual('httpcompression/response_count', 1) self.assertStatsEqual('httpcompression/response_bytes', 74840) + self.assertIn(b'decoded', newresponse.flags) def test_process_response_plain(self): response = Response('http://scrapytest.org', body=b' Date: Mon, 25 Oct 2021 20:14:37 +0100 Subject: [PATCH 32/36] improve wording & deprecation warning test --- .../downloadermiddlewares/httpcompression.py | 7 +++-- ...st_downloadermiddleware_httpcompression.py | 27 +++++++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 30188bdf57a..7bae69418c8 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -38,14 +38,13 @@ def __init__(self, stats=None, settings=None): self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER') if not self.keep_encoding_header: warnings.warn( - "COMPRESSION_KEEP_ENCODING_HEADER should be set to True in settings.", + "Setting COMPRESSION_KEEP_ENCODING_HEADER=False is deprecated", ScrapyDeprecationWarning, ) else: self.keep_encoding_header = False warnings.warn( - "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified." - "Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}", + "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified.", ScrapyDeprecationWarning, ) @@ -58,7 +57,7 @@ def from_crawler(cls, crawler): except TypeError: warnings.warn( "HttpCompressionMiddleware subclasses must either modify " - "their '__init__' method to support a 'stats' and 'settings' parameters or " + "their '__init__' method to support 'stats' and 'settings' parameters or " "reimplement the 'from_crawler' method.", ScrapyDeprecationWarning, ) diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index c55b37c63f3..beec5b1fabd 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -411,11 +411,10 @@ def __init__(self): messages, ( "HttpCompressionMiddleware subclasses must either modify " - "their '__init__' method to support a 'stats' and 'settings' parameters " + "their '__init__' method to support 'stats' and 'settings' parameters " "or reimplement the 'from_crawler' method.", "HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.", - "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified." - "Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}", + "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified.", ), ) @@ -431,7 +430,25 @@ def test_init_missing_args(self): messages, ( "HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.", - "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified." - "Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}", + "HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified.", + ), + ) + + def test_init_keep_encoding_header_deprecation_warning(self): + from scrapy.settings import Settings + settings = Settings({'COMPRESSION_KEEP_ENCODING_HEADER': False}) + + with catch_warnings(record=True) as caught_warnings: + mw = HttpCompressionMiddleware(stats={'foo': 'bar'}, settings=settings) + self.assertIsNotNone(mw) + messages = tuple( + str(warning.message) for warning in caught_warnings + if warning.category is ScrapyDeprecationWarning + ) + + self.assertEqual( + messages, + ( + "Setting COMPRESSION_KEEP_ENCODING_HEADER=False is deprecated", ), ) From 332099374fd5419544f70d731101045a2b020672 Mon Sep 17 00:00:00 2001 From: T0shik Date: Wed, 27 Oct 2021 19:21:14 +0100 Subject: [PATCH 33/36] remove un-used import --- scrapy/downloadermiddlewares/httpcompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 7bae69418c8..ac7036f9e2f 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -3,7 +3,7 @@ import zlib from scrapy.exceptions import NotConfigured -from scrapy.http import Response, TextResponse +from scrapy.http import TextResponse from scrapy.responsetypes import responsetypes from scrapy.utils.deprecate import ScrapyDeprecationWarning from scrapy.utils.gz import gunzip From b811b81ccad97a793615f5a6a04f8c2b19ad36f3 Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 30 Oct 2021 13:56:06 +0100 Subject: [PATCH 34/36] actually revert conftest.py --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index a93764f6f28..117087790d2 100644 --- a/conftest.py +++ b/conftest.py @@ -21,7 +21,7 @@ def _py_files(folder): *_py_files("tests/CrawlerRunner"), ] -with open("tests/ignores.txt") as reader: +with open('tests/ignores.txt') as reader: for line in reader: file_path = line.strip() if file_path and file_path[0] != '#': From 35b48abc8981f81e46b2d24ce9b2046945c78258 Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 30 Oct 2021 14:08:12 +0100 Subject: [PATCH 35/36] set COMPRESSION_KEEP_ENCODING_HEADER to True as default and revert changes in test_engine.py --- scrapy/settings/default_settings.py | 2 +- tests/test_engine.py | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 11e9678896b..02dd20b2190 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -37,7 +37,7 @@ COMMANDS_MODULE = '' COMPRESSION_ENABLED = True -COMPRESSION_KEEP_ENCODING_HEADER = False +COMPRESSION_KEEP_ENCODING_HEADER = True CONCURRENT_ITEMS = 100 diff --git a/tests/test_engine.py b/tests/test_engine.py index 4823f1f3e44..fa7d0c8d45f 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -415,8 +415,7 @@ def test_start_already_running_exception(self): @defer.inlineCallbacks def test_close_spiders_downloader(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) yield e.open_spider(TestSpider(), []) self.assertEqual(len(e.open_spiders), 1) yield e.close() @@ -430,8 +429,7 @@ def test_close_spiders_downloader(self): @defer.inlineCallbacks def test_close_engine_spiders_downloader(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) yield e.open_spider(TestSpider(), []) e.start() self.assertTrue(e.running) @@ -447,8 +445,7 @@ def test_close_engine_spiders_downloader(self): @defer.inlineCallbacks def test_crawl_deprecated_spider_arg(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) spider = TestSpider() yield e.open_spider(spider, []) e.start() @@ -463,8 +460,7 @@ def test_crawl_deprecated_spider_arg(self): @defer.inlineCallbacks def test_download_deprecated_spider_arg(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) spider = TestSpider() yield e.open_spider(spider, []) e.start() @@ -479,8 +475,7 @@ def test_download_deprecated_spider_arg(self): @defer.inlineCallbacks def test_deprecated_schedule(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) spider = TestSpider() yield e.open_spider(spider, []) e.start() @@ -496,8 +491,7 @@ def test_deprecated_schedule(self): @defer.inlineCallbacks def test_deprecated_has_capacity(self): with warnings.catch_warnings(record=True) as warning_list: - settings = {'COMPRESSION_KEEP_ENCODING_HEADER': True} - e = ExecutionEngine(get_crawler(TestSpider, settings_dict=settings), lambda _: None) + e = ExecutionEngine(get_crawler(TestSpider), lambda _: None) self.assertTrue(e.has_capacity()) spider = TestSpider() yield e.open_spider(spider, []) From 87998afd037fd457f4aaac9efa89e2d8a6eb150d Mon Sep 17 00:00:00 2001 From: T0shik Date: Sat, 30 Oct 2021 14:58:45 +0100 Subject: [PATCH 36/36] do NOT mutate original response content_encoding list. - amend tests & logic to follow the new default of COMPRESSION_KEEP_ENCODING_HEADER=True - remove redundant condition --- .../downloadermiddlewares/httpcompression.py | 4 +- ...st_downloadermiddleware_httpcompression.py | 84 ++++++++++++------- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index ac7036f9e2f..b6bbfc1dfcd 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -81,7 +81,7 @@ def process_response(self, request, response, spider): if not content_encoding: return response - encoding = content_encoding.pop() + encoding = content_encoding[0] decoded_body = self._decode(response.body, encoding.lower()) if self.stats: self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider) @@ -97,7 +97,7 @@ def process_response(self, request, response, spider): kwargs['flags'] = response.flags + [b'decoded'] response = response.replace(**kwargs) - if not self.keep_encoding_header and not content_encoding: + if not self.keep_encoding_header: del response.headers['Content-Encoding'] return response diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index beec5b1fabd..6c7b6ae33f1 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -4,6 +4,7 @@ from unittest import TestCase, SkipTest from warnings import catch_warnings +from scrapy.settings import Settings from scrapy.spiders import Spider from scrapy.http import Response, Request, HtmlResponse from scrapy.downloadermiddlewares.httpcompression import HttpCompressionMiddleware, ACCEPTED_ENCODINGS @@ -101,23 +102,25 @@ def test_process_response_gzip(self): self.assertEqual(response.headers['Content-Encoding'], b'gzip') newresponse = self.mw.process_response(request, response, self.spider) - assert newresponse is not response - assert newresponse.body.startswith(b'