Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Placing the Content-Encoding header into the response's flags is desired #5290

Open
wants to merge 42 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
5f068dc
Add HEADERS_KEEP variable to settings and to HttpCompressionMiddlewar…
cwen13 Sep 14, 2019
6a725ca
fixed type
cwen13 Sep 15, 2019
2b6fb1b
Added init to HttpCompMiddleware #1988
cwen13 Sep 16, 2019
96a8dfc
Added init to HttpCompMiddleware #1988 Typo
cwen13 Sep 16, 2019
9010d9f
Trying with passing init crawler #1988
cwen13 Sep 16, 2019
608b342
Adding settings to HttpCompressionMiddleware instantiation in test
cwen13 Sep 16, 2019
3350d0f
Slowed down and tried to take things slower
cwen13 Sep 17, 2019
cb09ef5
I misspelled init -_-
VorBoto Sep 17, 2019
f949126
changed how it's initialized in test
VorBoto Sep 17, 2019
c94f35b
Added a crawler to HttpCompressionTest using the Spider it created
VorBoto Sep 17, 2019
490fb3c
Added from scrapy.utils.test import get_crawler
VorBoto Sep 17, 2019
e22c851
Passed tox but still not right
VorBoto Sep 17, 2019
e479f88
Trying to add flag for encoding
VorBoto Sep 18, 2019
25d4760
Trying to add flag for encoding
VorBoto Sep 18, 2019
ccbb524
Place encodings in response's flags issue #1988
VorBoto Sep 18, 2019
8814265
Did some recommended changes
VorBoto Sep 21, 2019
4588368
updated tests to function with no parameters sent to setUp()
VorBoto Sep 25, 2019
0a0b40d
Switched dict['k'] to a dict.get('k') so no KeyError
VorBoto Sep 25, 2019
30ead53
Actually pull the flags before trying to add decoded
VorBoto Sep 26, 2019
fd11144
Removed unnecessary checks for flag entry of kwargs
VorBoto Sep 27, 2019
091a8a4
Added/copied a few test as a way to show some continued functionality
VorBoto Oct 5, 2019
d2fbd10
Fix an oversight
VorBoto Oct 5, 2019
a944117
Fix an oversight on casting a str to binary
VorBoto Oct 5, 2019
8164e31
Fix an oversight on casting a str to binary again
VorBoto Oct 5, 2019
11af9bb
fix some syntax and formatting as well as variable usage
VorBoto Oct 17, 2019
64597cd
There was a ( on the loose
VorBoto Oct 17, 2019
0a03b45
Syntax clean up
VorBoto Oct 18, 2019
ed06e4c
Merge branch 'master' into vorboto_local_master
T0shik Oct 23, 2021
52b435a
fix post merge tests
T0shik Oct 23, 2021
624a1ff
conftest 'tests/ignores.txt' to rely on native paths
T0shik Oct 23, 2021
fa417f5
revert conftest.py
T0shik Oct 23, 2021
0d095c4
HttpCompressionMiddleware COMPRESSION_KEEP_ENCODING_HEADERS updates
T0shik Oct 23, 2021
6556495
improve wording & deprecation warning test
T0shik Oct 25, 2021
38e23ff
add keep_encoding_header setting to test_engine.py
T0shik Oct 25, 2021
3320993
remove un-used import
T0shik Oct 27, 2021
b811b81
actually revert conftest.py
T0shik Oct 30, 2021
35b48ab
set COMPRESSION_KEEP_ENCODING_HEADER to True as default and revert ch…
T0shik Oct 30, 2021
87998af
do NOT mutate original response content_encoding list.
T0shik Oct 30, 2021
533f69a
Merge remote-tracking branch 'upstream/master'
T0shik Nov 6, 2021
3aea4a8
Merge branch 'scrapy:master' into master
T0shik Nov 8, 2021
cc491e7
Merge branch 'scrapy:master' into master
T0shik Nov 18, 2021
ed157d5
Merge branch 'scrapy:master' into master
T0shik Jan 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _py_files(folder):
*_py_files("tests/CrawlerRunner"),
]

with open('tests/ignores.txt') as reader:
with open("tests/ignores.txt") as reader:
for line in reader:
file_path = line.strip()
if file_path and file_path[0] != '#':
Expand Down
72 changes: 49 additions & 23 deletions scrapy/downloadermiddlewares/httpcompression.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,53 +27,79 @@
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
def __init__(self, stats=None):
def __init__(self, stats=None, settings=None):
self.stats = stats
if not stats:
warnings.warn(
"HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.",
ScrapyDeprecationWarning,
)
if settings:
self.keep_encoding_header = settings.getbool('COMPRESSION_KEEP_ENCODING_HEADER')
if not self.keep_encoding_header:
warnings.warn(
"COMPRESSION_KEEP_ENCODING_HEADER should be set to True in settings.",
T0shik marked this conversation as resolved.
Show resolved Hide resolved
ScrapyDeprecationWarning,
)
else:
self.keep_encoding_header = False
warnings.warn(
"HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified."
"Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}",
T0shik marked this conversation as resolved.
Show resolved Hide resolved
ScrapyDeprecationWarning,
)

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
raise NotConfigured
try:
return cls(stats=crawler.stats)
return cls(stats=crawler.stats, settings=crawler.settings)
except TypeError:
warnings.warn(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter or "
"their '__init__' method to support a 'stats' and 'settings' parameters or "
T0shik marked this conversation as resolved.
Show resolved Hide resolved
"reimplement the 'from_crawler' method.",
ScrapyDeprecationWarning,
)
result = cls()
result.stats = crawler.stats
result.keep_encoding_header = False
return result

def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding',
b", ".join(ACCEPTED_ENCODINGS))

def process_response(self, request, response, spider):

if request.method == 'HEAD':
return response
if isinstance(response, Response):
content_encoding = response.headers.getlist('Content-Encoding')
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs = dict(cls=respcls, body=decoded_body)
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None
response = response.replace(**kwargs)
if not content_encoding:
del response.headers['Content-Encoding']

if b'decoded' in response.flags:
return response

content_encoding = response.headers.getlist('Content-Encoding')
if not content_encoding:
return response

encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs = dict(cls=respcls, body=decoded_body)
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None

kwargs['flags'] = response.flags + [b'decoded']
response = response.replace(**kwargs)
if not self.keep_encoding_header and not content_encoding:
del response.headers['Content-Encoding']

return response

Expand Down
1 change: 1 addition & 0 deletions scrapy/settings/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
COMMANDS_MODULE = ''

COMPRESSION_ENABLED = True
COMPRESSION_KEEP_ENCODING_HEADER = False

CONCURRENT_ITEMS = 100

Expand Down
3 changes: 3 additions & 0 deletions scrapy/templates/project/module/settings.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,6 @@ ROBOTSTXT_OBEY = True
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Keep the original Content-Encoding header
COMPRESSION_KEEP_ENCODING_HEADER = True
75 changes: 67 additions & 8 deletions tests/test_downloadermiddleware_httpcompression.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from tests import tests_datadir
from w3lib.encoding import resolve_encoding


SAMPLEDIR = join(tests_datadir, 'compressed')

FORMAT = {
Expand Down Expand Up @@ -167,6 +166,7 @@ def test_process_response_rawdeflate(self):
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_zlibdelate(self):
response = self._getresponse('zlibdeflate')
Expand All @@ -179,6 +179,7 @@ def test_process_response_zlibdelate(self):
assert 'Content-Encoding' not in newresponse.headers
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74840)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_plain(self):
response = Response('http://scrapytest.org', body=b'<!DOCTYPE...')
Expand All @@ -198,6 +199,7 @@ def test_multipleencodings(self):
newresponse = self.mw.process_response(request, response, self.spider)
assert newresponse is not response
self.assertEqual(newresponse.headers.getlist('Content-Encoding'), [b'uuencode'])
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_encoding_inside_body(self):
headers = {
Expand All @@ -219,6 +221,7 @@ def test_process_response_encoding_inside_body(self):
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_force_recalculate_encoding(self):
headers = {
Expand All @@ -240,6 +243,7 @@ def test_process_response_force_recalculate_encoding(self):
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_no_content_type_header(self):
headers = {
Expand All @@ -257,6 +261,7 @@ def test_process_response_no_content_type_header(self):
self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 104)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_gzipped_contenttype(self):
response = self._getresponse('gzip')
Expand All @@ -269,6 +274,7 @@ def test_process_response_gzipped_contenttype(self):
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_gzip_app_octetstream_contenttype(self):
response = self._getresponse('gzip')
Expand All @@ -281,6 +287,7 @@ def test_process_response_gzip_app_octetstream_contenttype(self):
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_gzip_binary_octetstream_contenttype(self):
response = self._getresponse('x-gzip')
Expand All @@ -293,6 +300,7 @@ def test_process_response_gzip_binary_octetstream_contenttype(self):
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 74837)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_gzipped_gzip_file(self):
"""Test that a gzip Content-Encoded .gz file is gunzipped
Expand Down Expand Up @@ -337,6 +345,7 @@ def test_process_response_gzipped_gzip_file(self):
self.assertEqual(gunzip(newresponse.body), plainbody)
self.assertStatsEqual('httpcompression/response_count', 1)
self.assertStatsEqual('httpcompression/response_bytes', 230)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_head_request_no_decode_required(self):
response = self._getresponse('gzip')
Expand All @@ -350,10 +359,42 @@ def test_process_response_head_request_no_decode_required(self):
self.assertStatsEqual('httpcompression/response_count', None)
self.assertStatsEqual('httpcompression/response_bytes', None)

def test_process_response_keeps_content_encoding_header(self):
settings = {'COMPRESSION_ENABLED': True,
'COMPRESSION_KEEP_ENCODING_HEADER': True}
crawler = get_crawler(Spider, settings)
spider = crawler._create_spider('example.com')
mw = HttpCompressionMiddleware.from_crawler(crawler)
response = self._getresponse('gzip')
request = response.request

self.assertEqual(response.headers['Content-Encoding'], b'gzip')
newresponse = mw.process_response(request, response, spider)
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertIn('Content-Encoding', newresponse.headers)
self.assertIn(b'decoded', newresponse.flags)

def test_process_response_doesnt_keep_content_encoding_header(self):
settings = {'COMPRESSION_ENABLED': True,
'COMPRESSION_KEEP_ENCODING_HEADER': False}
crawler = get_crawler(Spider, settings)
spider = crawler._create_spider('example.com')
mw = HttpCompressionMiddleware.from_crawler(crawler)
response = self._getresponse('gzip')
request = response.request

self.assertEqual(response.headers['Content-Encoding'], b'gzip')
newresponse = mw.process_response(request, response, spider)
self.assertIsNot(newresponse, response)
self.assertTrue(newresponse.body.startswith(b'<!DOCTYPE'))
self.assertNotIn('Content-Encoding', newresponse.headers)
self.assertIn(b'decoded', newresponse.flags)


class HttpCompressionSubclassTest(TestCase):

def test_init_missing_stats(self):
def test_from_crawler_missing_args(self):
class HttpCompressionMiddlewareSubclass(HttpCompressionMiddleware):

def __init__(self):
Expand All @@ -369,10 +410,28 @@ def __init__(self):
self.assertEqual(
messages,
(
(
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' parameter "
"or reimplement the 'from_crawler' method."
),
)
"HttpCompressionMiddleware subclasses must either modify "
"their '__init__' method to support a 'stats' and 'settings' parameters "
"or reimplement the 'from_crawler' method.",
"HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.",
"HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified."
"Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}",
),
)

def test_init_missing_args(self):
with catch_warnings(record=True) as caught_warnings:
self.assertIsNotNone(HttpCompressionMiddleware(stats=None, settings=None))
messages = tuple(
str(warning.message) for warning in caught_warnings
if warning.category is ScrapyDeprecationWarning
)

self.assertEqual(
messages,
(
"HttpCompressionMiddleware now accepts a 'stats' parameter which should be specified.",
"HttpCompressionMiddleware now accepts a 'settings' parameter which should be specified."
"Example: {'COMPRESSION_KEEP_ENCODING_HEADER': True}",
),
)