gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant of 128 KiB. Other programs that read in 128 KiB chunks include pigz and cat, so this seems to be established practice among fast tools. It is also measurably faster than 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. This is `_bz2.BZ2Decompressor` ported to zlib (a sketch of the resulting interface follows below). Since the existing `zlib.Decompress` object is better suited to in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library now implements on top of it, so there is no need to expose it to users.
+ `_ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; that is now handled by `_ZlibDecompressor` itself, which keeps an internal input buffer. `_add_read_data` has been inlined, as it consisted of just two calls.

EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization to the `python -m gzip` application: it previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but now uses `READ_BUFFER_SIZE` chunks.
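A minimal sketch of driving the new decompressor directly, assuming the ported interface matches `bz2.BZ2Decompressor` as described above (`needs_input`, `eof`, `unused_data`, and a `max_length` cap on `decompress()`); since `_ZlibDecompressor` is private, treat this as illustrative rather than a supported API:

import zlib

# Illustrative payload: zlib.compress() emits a zlib-wrapped stream, which
# _ZlibDecompressor() accepts with its default wbits.
data = zlib.compress(b"example payload " * 4096)

decomp = zlib._ZlibDecompressor()  # private API, subject to change
out = []
pos = 0
CHUNK = 8 * 1024
while not decomp.eof:
    if decomp.needs_input:
        chunk = data[pos:pos + CHUNK]
        pos += len(chunk)
        if not chunk:
            raise EOFError("stream ended before end-of-stream marker")
    else:
        chunk = b""  # decompressor still holds buffered input; drain it
    out.append(decomp.decompress(chunk, 1024))  # cap output at 1 KiB

assert b"".join(out) == b"example payload " * 4096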
rhpvorderman committed Oct 17, 2022
1 parent bb38b39 commit eae7dad
Showing 5 changed files with 850 additions and 80 deletions.
24 changes: 12 additions & 12 deletions Lib/gzip.py
@@ -21,6 +21,8 @@
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
+READ_BUFFER_SIZE = 128 * 1024
+
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
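As a rough way to sanity-check the chunk-size claim: `read()` looks up this module-level constant at call time, so a quick, unscientific benchmark can override it. A sketch under the assumption that monkeypatching `gzip.READ_BUFFER_SIZE` is acceptable in a throwaway script; `big.gz` is a hypothetical multi-megabyte test file and timings are machine-dependent:

import gzip
import time

def time_read(chunk_size):
    gzip.READ_BUFFER_SIZE = chunk_size  # override the module-level constant
    start = time.perf_counter()
    with gzip.open("big.gz", "rb") as f:
        while f.read(1024 * 1024):  # pull 1 MiB at a time from the reader
            pass
    return time.perf_counter() - start

for size in (8 * 1024, 128 * 1024):
    print(f"{size // 1024:>3} KiB chunks: {time_read(size):.3f}s")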
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
 
 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
@@ -494,12 +496,13 @@ def read(self, size=-1):
             self._new_member = False
 
             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)
 
-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
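The `unused_data` check is how the reader notices that another gzip member starts after the current stream's end-of-stream marker. The same mechanism can be shown with the public `zlib.decompressobj`; a small self-contained sketch:

import gzip
import zlib

def read_all_members(blob):
    # Each decompressobj stops at its member's end-of-stream marker;
    # bytes past that point land in .unused_data and seed the next member.
    out = []
    while blob:
        d = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)  # gzip framing
        out.append(d.decompress(blob))
        blob = d.unused_data
    return b"".join(out)

blob = gzip.compress(b"first member ") + gzip.compress(b"second member")
assert read_all_members(blob) == b"first member second member"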
@@ -510,14 +513,11 @@ def read(self, size=-1):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")
 
-        self._add_read_data( uncompress )
+        self._crc = zlib.crc32(uncompress, self._crc)
+        self._stream_size += len(uncompress)
         self._pos += len(uncompress)
         return uncompress
 
-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
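The inlined bookkeeping relies on `zlib.crc32` accepting a running checksum as its second argument, so chunk-by-chunk updates match a single full-buffer call:

import zlib

crc = zlib.crc32(b"")  # 0, the initial CRC value
for chunk in (b"hello ", b"world"):
    crc = zlib.crc32(chunk, crc)  # update the running checksum
assert crc == zlib.crc32(b"hello world")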
@@ -647,7 +647,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
             while True:
-                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+                chunk = f.read(READ_BUFFER_SIZE)
                 if not chunk:
                     break
                 g.write(chunk)
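For reference, this code path belongs to the module's small command-line interface, e.g. `python -m gzip somefile` to compress to `somefile.gz` and `python -m gzip -d somefile.gz` to decompress.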
167 changes: 167 additions & 0 deletions Lib/test/test_zlib.py
@@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):
"""


class ZlibDecompressorTest(unittest.TestCase):
    # Test adopted from test_bz2.py
    TEXT = HAMLET_SCENE
    DATA = zlib.compress(HAMLET_SCENE)
    BAD_DATA = b"Not a valid deflate block"
    # Larger payloads needed by the chunked-decompression tests below;
    # sized so BIG_DATA comfortably exceeds the 100-byte max_length windows.
    BIG_TEXT = HAMLET_SCENE * 64
    BIG_DATA = zlib.compress(BIG_TEXT)

    def test_Constructor(self):
self.assertRaises(TypeError, zlib._ZlibDecompressor, 42)

def testDecompress(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(TypeError, zlibd.decompress)
text = zlibd.decompress(self.DATA)
self.assertEqual(text, self.TEXT)

def testDecompressChunks10(self):
zlibd = zlib._ZlibDecompressor()
text = b''
n = 0
while True:
str = self.DATA[n*10:(n+1)*10]
if not str:
break
text += zlibd.decompress(str)
n += 1
self.assertEqual(text, self.TEXT)

def testDecompressUnusedData(self):
zlibd = zlib._ZlibDecompressor()
unused_data = b"this is unused data"
text = zlibd.decompress(self.DATA+unused_data)
self.assertEqual(text, self.TEXT)
self.assertEqual(zlibd.unused_data, unused_data)

def testEOFError(self):
zlibd = zlib._ZlibDecompressor()
text = zlibd.decompress(self.DATA)
self.assertRaises(EOFError, zlibd.decompress, b"anything")
self.assertRaises(EOFError, zlibd.decompress, b"")

@support.skip_if_pgo_task
@bigmemtest(size=_4G + 100, memuse=3.3)
def testDecompress4G(self, size):
# "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
blocksize = 10 * 1024 * 1024
block = random.randbytes(blocksize)
try:
data = block * (size // blocksize + 1)
compressed = zlib.compress(data)
zlibd = zlib._ZlibDecompressor()
decompressed = zlibd.decompress(compressed)
self.assertTrue(decompressed == data)
finally:
data = None
compressed = None
decompressed = None

def testPickle(self):
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
with self.assertRaises(TypeError):
pickle.dumps(zlib._ZlibDecompressor(), proto)

def testDecompressorChunksMaxsize(self):
zlibd = zlib._ZlibDecompressor()
max_length = 100
out = []

# Feed some input
len_ = len(self.BIG_DATA) - 64
out.append(zlibd.decompress(self.BIG_DATA[:len_],
max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)

# Retrieve more data without providing more input
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)

# Retrieve more data while providing more input
out.append(zlibd.decompress(self.BIG_DATA[len_:],
max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)

# Retrieve remaining uncompressed data
while not zlibd.eof:
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)

out = b"".join(out)
self.assertEqual(out, self.BIG_TEXT)
self.assertEqual(zlibd.unused_data, b"")

def test_decompressor_inputbuf_1(self):
# Test reusing input buffer after moving existing
# contents to beginning
zlibd = zlib._ZlibDecompressor()
out = []

# Create input buffer and fill it
self.assertEqual(zlibd.decompress(self.DATA[:100],
max_length=0), b'')

# Retrieve some results, freeing capacity at beginning
# of input buffer
out.append(zlibd.decompress(b'', 2))

# Add more data that fits into input buffer after
# moving existing data to beginning
out.append(zlibd.decompress(self.DATA[100:105], 15))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[105:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_decompressor_inputbuf_2(self):
# Test reusing input buffer by appending data at the
# end right away
zlibd = zlib._ZlibDecompressor()
out = []

# Create input buffer and empty it
self.assertEqual(zlibd.decompress(self.DATA[:200],
max_length=0), b'')
out.append(zlibd.decompress(b''))

# Fill buffer with new data
out.append(zlibd.decompress(self.DATA[200:280], 2))

# Append some more data, not enough to require resize
out.append(zlibd.decompress(self.DATA[280:300], 2))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_decompressor_inputbuf_3(self):
# Test reusing input buffer after extending it

zlibd = zlib._ZlibDecompressor()
out = []

# Create almost full input buffer
out.append(zlibd.decompress(self.DATA[:200], 5))

# Add even more data to it, requiring resize
out.append(zlibd.decompress(self.DATA[200:300], 5))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_failure(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
# Previously, a second call could crash due to internal inconsistency
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)

@support.refcount_test
def test_refleaks_in___init__(self):
gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount')
zlibd = zlib._ZlibDecompressor()
refs_before = gettotalrefcount()
for i in range(100):
zlibd.__init__()
self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10)


class CustomInt:
def __index__(self):
return 100
1 change: 1 addition & 0 deletions Misc/NEWS.d/next/Library/….rst
@@ -0,0 +1 @@
+:meth:`gzip.GzipFile.read` reads 10% faster.
100 changes: 99 additions & 1 deletion Modules/clinic/zlibmodule.c.h

Some generated files are not rendered by default.
