Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for zstd in HttpCompressionMiddleware #4831

Merged
merged 7 commits into from Oct 8, 2020
7 changes: 5 additions & 2 deletions docs/topics/downloader-middleware.rst
Expand Up @@ -684,11 +684,14 @@ HttpCompressionMiddleware
This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites.

This middleware also supports decoding `brotli-compressed`_ responses,
provided `brotlipy`_ is installed.
This middleware also supports decoding `brotli-compressed`_ as well as
`zstd-compressed`_ responses, provided that `brotlipy`_ or `zstandard`_ is
installed, respectively.

.. _brotli-compressed: https://www.ietf.org/rfc/rfc7932.txt
.. _brotlipy: https://pypi.org/project/brotlipy/
.. _zstd-compressed: https://www.ietf.org/rfc/rfc8478.txt
.. _zstandard: https://pypi.org/project/zstandard/

HttpCompressionMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
12 changes: 12 additions & 0 deletions scrapy/downloadermiddlewares/httpcompression.py
@@ -1,4 +1,5 @@
import zlib
import io
starrify marked this conversation as resolved.
Show resolved Hide resolved

from scrapy.utils.gz import gunzip
from scrapy.http import Response, TextResponse
Expand All @@ -14,6 +15,12 @@
except ImportError:
pass

# zstd decoding is optional: advertise the encoding only when the
# ``zstandard`` package can actually be imported.
try:
    import zstandard
except ImportError:
    pass
else:
    ACCEPTED_ENCODINGS.append(b'zstd')


class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
Expand Down Expand Up @@ -67,4 +74,9 @@ def _decode(self, body, encoding):
body = zlib.decompress(body, -15)
if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
body = brotli.decompress(body)
if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
# Using its streaming API since its simple API could handle only cases
# where there is content size data embedded in the frame
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
Copy link
Member

@kmike kmike Oct 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hey! I wonder if it makes sense to use a context manager, to close the reader after it is used. This is very minor, as it is not doing much (https://github.com/indygreg/python-zstandard/blob/53b71dc3f96961564c9c140bf88b0aa118589247/zstandard/cffi.py#L1937), but it still may clean up some references earlier.

body = reader.read()
return body
3 changes: 2 additions & 1 deletion tests/requirements-py3.txt
Expand Up @@ -16,6 +16,7 @@ uvloop; platform_system != "Windows"

# optional for shell wrapper tests
bpython
brotlipy
brotlipy # optional for HTTP compress downloader middleware tests
zstandard # optional for HTTP compress downloader middleware tests
ipython
pywin32; sys_platform == "win32"
Binary file not shown.
Binary file not shown.
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_downloadermiddleware_httpcompression.py
Expand Up @@ -20,6 +20,12 @@
'rawdeflate': ('html-rawdeflate.bin', 'deflate'),
'zlibdeflate': ('html-zlibdeflate.bin', 'deflate'),
'br': ('html-br.bin', 'br'),
# $ zstd raw.html --content-size -o html-zstd-static-content-size.bin
'zstd-static-content-size': ('html-zstd-static-content-size.bin', 'zstd'),
# $ zstd raw.html --no-content-size -o html-zstd-static-no-content-size.bin
'zstd-static-no-content-size': ('html-zstd-static-no-content-size.bin', 'zstd'),
# $ cat raw.html | zstd -o html-zstd-streaming-no-content-size.bin
'zstd-streaming-no-content-size': ('html-zstd-streaming-no-content-size.bin', 'zstd'),
}


Expand Down Expand Up @@ -80,6 +86,27 @@ def test_process_response_br(self):
assert newresponse.body.startswith(b"<!DOCTYPE")
assert 'Content-Encoding' not in newresponse.headers

def test_process_response_zstd(self):
    """Decode every ``zstd-*`` fixture and check the results agree.

    For each ``FORMAT`` key starting with ``'zstd-'`` the response is run
    through the middleware; the decoded body must match the body of the
    first zstd fixture processed, start with ``<!DOCTYPE`` and no longer
    carry a ``Content-Encoding`` header.  The test is skipped when the
    optional ``zstandard`` package cannot be imported.
    """
    try:
        import zstandard  # noqa: F401
    except ImportError:
        raise SkipTest("no zstd support (zstandard)")

    zstd_keys = [key for key in FORMAT if key.startswith('zstd-')]
    reference_body = None
    for key in zstd_keys:
        compressed = self._getresponse(key)
        self.assertEqual(compressed.headers['Content-Encoding'], b'zstd')
        decoded = self.mw.process_response(
            compressed.request, compressed, self.spider
        )
        if reference_body is not None:
            # Every later fixture must decode to the same document as the
            # first one.
            assert reference_body == decoded.body
        else:
            reference_body = decoded.body
        assert decoded is not compressed
        assert decoded.body.startswith(b"<!DOCTYPE")
        assert 'Content-Encoding' not in decoded.headers

def test_process_response_rawdeflate(self):
response = self._getresponse('rawdeflate')
request = response.request
Expand Down