gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant of 128 KiB. Other programs that read in 128 KiB chunks include pigz and cat, so this seems to be established practice among fast tools. It is also measurably faster than 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. This is `_bz2.BZ2Decompressor` ported to zlib (a sketch of the resulting interface follows below). Since the existing `zlib.Decompress` object is better suited to in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library now implements on top of it, so there is no need to expose it to users.
+ `_ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; that is now handled by `_ZlibDecompressor` itself, which keeps an internal input buffer. `_add_read_data` has been inlined, as it consisted of just two calls.

EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization to the `python -m gzip` application: it previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but now uses `READ_BUFFER_SIZE` chunks.
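A minimal sketch of driving the new decompressor directly, assuming the ported interface matches `bz2.BZ2Decompressor` as described above (`needs_input`, `eof`, `unused_data`, and a `max_length` cap on `decompress()`); since `_ZlibDecompressor` is private, treat this as illustrative rather than a supported API:

import zlib

# Illustrative payload: zlib.compress() emits a zlib-wrapped stream, which
# _ZlibDecompressor() accepts with its default wbits.
data = zlib.compress(b"example payload " * 4096)

decomp = zlib._ZlibDecompressor()  # private API, subject to change
out = []
pos = 0
CHUNK = 8 * 1024
while not decomp.eof:
    if decomp.needs_input:
        chunk = data[pos:pos + CHUNK]
        pos += len(chunk)
        if not chunk:
            raise EOFError("stream ended before end-of-stream marker")
    else:
        chunk = b""  # decompressor still holds buffered input; drain it
    out.append(decomp.decompress(chunk, 1024))  # cap output at 1 KiB

assert b"".join(out) == b"example payload " * 4096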
rhpvorderman committed Oct 17, 2022
1 parent bb38b39 commit eae7dad
Showing 5 changed files with 850 additions and 80 deletions.
24 changes: 12 additions & 12 deletions Lib/gzip.py
@@ -21,6 +21,8 @@
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
+READ_BUFFER_SIZE = 128 * 1024
+
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
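As a rough way to sanity-check the chunk-size claim: `read()` looks up this module-level constant at call time, so a quick, unscientific benchmark can override it. A sketch under the assumption that monkeypatching `gzip.READ_BUFFER_SIZE` is acceptable in a throwaway script; `big.gz` is a hypothetical multi-megabyte test file and timings are machine-dependent:

import gzip
import time

def time_read(chunk_size):
    gzip.READ_BUFFER_SIZE = chunk_size  # override the module-level constant
    start = time.perf_counter()
    with gzip.open("big.gz", "rb") as f:
        while f.read(1024 * 1024):  # pull 1 MiB at a time from the reader
            pass
    return time.perf_counter() - start

for size in (8 * 1024, 128 * 1024):
    print(f"{size // 1024:>3} KiB chunks: {time_read(size):.3f}s")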
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
 
 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
@@ -494,12 +496,13 @@ def read(self, size=-1):
             self._new_member = False
 
             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)
 
-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
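The `unused_data` check is how the reader notices that another gzip member starts after the current stream's end-of-stream marker. The same mechanism can be shown with the public `zlib.decompressobj`; a small self-contained sketch:

import gzip
import zlib

def read_all_members(blob):
    # Each decompressobj stops at its member's end-of-stream marker;
    # bytes past that point land in .unused_data and seed the next member.
    out = []
    while blob:
        d = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)  # gzip framing
        out.append(d.decompress(blob))
        blob = d.unused_data
    return b"".join(out)

blob = gzip.compress(b"first member ") + gzip.compress(b"second member")
assert read_all_members(blob) == b"first member second member"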
@@ -510,14 +513,11 @@ def read(self, size=-1):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")
 
-        self._add_read_data( uncompress )
+        self._crc = zlib.crc32(uncompress, self._crc)
+        self._stream_size += len(uncompress)
         self._pos += len(uncompress)
         return uncompress
 
-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
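The inlined bookkeeping relies on `zlib.crc32` accepting a running checksum as its second argument, so chunk-by-chunk updates match a single full-buffer call:

import zlib

crc = zlib.crc32(b"")  # 0, the initial CRC value
for chunk in (b"hello ", b"world"):
    crc = zlib.crc32(chunk, crc)  # update the running checksum
assert crc == zlib.crc32(b"hello world")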
@@ -647,7 +647,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
             while True:
-                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+                chunk = f.read(READ_BUFFER_SIZE)
                 if not chunk:
                     break
                 g.write(chunk)
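For reference, this code path belongs to the module's small command-line interface, e.g. `python -m gzip somefile` to compress to `somefile.gz` and `python -m gzip -d somefile.gz` to decompress.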
167 changes: 167 additions & 0 deletions Lib/test/test_zlib.py
@@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):
"""


class ZlibDecompressorTest(unittest.TestCase):
    # Test adopted from test_bz2.py
    TEXT = HAMLET_SCENE
    DATA = zlib.compress(HAMLET_SCENE)
    BAD_DATA = b"Not a valid deflate block"
    # Larger payloads needed by the chunked-decompression tests below;
    # sized so BIG_DATA comfortably exceeds the 100-byte max_length windows.
    BIG_TEXT = HAMLET_SCENE * 64
    BIG_DATA = zlib.compress(BIG_TEXT)

    def test_Constructor(self):
self.assertRaises(TypeError, zlib._ZlibDecompressor, 42)

def testDecompress(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(TypeError, zlibd.decompress)
text = zlibd.decompress(self.DATA)
self.assertEqual(text, self.TEXT)

def testDecompressChunks10(self):
zlibd = zlib._ZlibDecompressor()
text = b''
n = 0
while True:
str = self.DATA[n*10:(n+1)*10]
if not str:
break
text += zlibd.decompress(str)
n += 1
self.assertEqual(text, self.TEXT)

def testDecompressUnusedData(self):
zlibd = zlib._ZlibDecompressor()
unused_data = b"this is unused data"
text = zlibd.decompress(self.DATA+unused_data)
self.assertEqual(text, self.TEXT)
self.assertEqual(zlibd.unused_data, unused_data)

def testEOFError(self):
zlibd = zlib._ZlibDecompressor()
text = zlibd.decompress(self.DATA)
self.assertRaises(EOFError, zlibd.decompress, b"anything")
self.assertRaises(EOFError, zlibd.decompress, b"")

@support.skip_if_pgo_task
@bigmemtest(size=_4G + 100, memuse=3.3)
def testDecompress4G(self, size):
# "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
blocksize = 10 * 1024 * 1024
block = random.randbytes(blocksize)
try:
data = block * (size // blocksize + 1)
compressed = zlib.compress(data)
zlibd = zlib._ZlibDecompressor()
decompressed = zlibd.decompress(compressed)
self.assertTrue(decompressed == data)
finally:
data = None
compressed = None
decompressed = None

def testPickle(self):
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
with self.assertRaises(TypeError):
pickle.dumps(zlib._ZlibDecompressor(), proto)

def testDecompressorChunksMaxsize(self):
zlibd = zlib._ZlibDecompressor()
max_length = 100
out = []

# Feed some input
len_ = len(self.BIG_DATA) - 64
out.append(zlibd.decompress(self.BIG_DATA[:len_],
max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)

# Retrieve more data without providing more input
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertFalse(zlibd.needs_input)
self.assertEqual(len(out[-1]), max_length)

# Retrieve more data while providing more input
out.append(zlibd.decompress(self.BIG_DATA[len_:],
max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)

# Retrieve remaining uncompressed data
while not zlibd.eof:
out.append(zlibd.decompress(b'', max_length=max_length))
self.assertLessEqual(len(out[-1]), max_length)

out = b"".join(out)
self.assertEqual(out, self.BIG_TEXT)
self.assertEqual(zlibd.unused_data, b"")

def test_decompressor_inputbuf_1(self):
# Test reusing input buffer after moving existing
# contents to beginning
zlibd = zlib._ZlibDecompressor()
out = []

# Create input buffer and fill it
self.assertEqual(zlibd.decompress(self.DATA[:100],
max_length=0), b'')

# Retrieve some results, freeing capacity at beginning
# of input buffer
out.append(zlibd.decompress(b'', 2))

# Add more data that fits into input buffer after
# moving existing data to beginning
out.append(zlibd.decompress(self.DATA[100:105], 15))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[105:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_decompressor_inputbuf_2(self):
# Test reusing input buffer by appending data at the
# end right away
zlibd = zlib._ZlibDecompressor()
out = []

# Create input buffer and empty it
self.assertEqual(zlibd.decompress(self.DATA[:200],
max_length=0), b'')
out.append(zlibd.decompress(b''))

# Fill buffer with new data
out.append(zlibd.decompress(self.DATA[200:280], 2))

# Append some more data, not enough to require resize
out.append(zlibd.decompress(self.DATA[280:300], 2))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_decompressor_inputbuf_3(self):
# Test reusing input buffer after extending it

zlibd = zlib._ZlibDecompressor()
out = []

# Create almost full input buffer
out.append(zlibd.decompress(self.DATA[:200], 5))

# Add even more data to it, requiring resize
out.append(zlibd.decompress(self.DATA[200:300], 5))

# Decompress rest of data
out.append(zlibd.decompress(self.DATA[300:]))
self.assertEqual(b''.join(out), self.TEXT)

def test_failure(self):
zlibd = zlib._ZlibDecompressor()
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
# Previously, a second call could crash due to internal inconsistency
self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)

@support.refcount_test
def test_refleaks_in___init__(self):
gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount')
zlibd = zlib._ZlibDecompressor()
refs_before = gettotalrefcount()
for i in range(100):
zlibd.__init__()
self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10)


class CustomInt:
def __index__(self):
return 100
1 change: 1 addition & 0 deletions Misc/NEWS.d/next/Library/….rst
@@ -0,0 +1 @@
+:meth:`gzip.GzipFile.read` reads 10% faster.
100 changes: 99 additions & 1 deletion Modules/clinic/zlibmodule.c.h

Some generated files are not rendered by default.
