Fix large file decompression bug #40

Merged 5 commits on Apr 16, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,10 @@ Changelog
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 0.5.0-dev
+-----------------
++ Fix a bug where files larger than 4GB could not be decompressed.
+
 version 0.4.2
 -----------------
 + Fix a reference counting error that happened on module initialization and
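Some background on the 4GB limit fixed above: the gzip format (RFC 1952) ends each stream with a 4-byte ISIZE field holding the uncompressed length modulo 2^32, so lengths of 4GB and above wrap around. A minimal sketch of this using only the standard library (not gzip_ng itself):

```python
import gzip
import struct

data = b"x" * 1000
blob = gzip.compress(data)
# The last 4 bytes of a gzip stream are ISIZE: the uncompressed length
# modulo 2**32, stored little-endian (RFC 1952, section 2.3.1).
isize = struct.unpack("<I", blob[-4:])[0]
assert isize == len(data) % 2**32
```

For any payload of 2**32 bytes or more, ISIZE no longer equals the true length, which is exactly the case the comparison below must account for.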
5 changes: 3 additions & 2 deletions src/zlib_ng/zlib_ngmodule.c
@@ -2590,8 +2590,9 @@ GzipReader_read_into_buffer(GzipReader *self, uint8_t *out_buffer, size_t out_bu
         return -1;
     }
     uint32_t length = load_u32_le(current_pos);
-    current_pos += 4;
-    if (length != self->zst.total_out) {
+    current_pos += 4;
+    // ISIZE is the length of the original data modulo 2^32
+    if (length != (0xFFFFFFFFUL & self->zst.total_out)) {
         Py_BLOCK_THREADS;
         PyErr_SetString(BadGzipFile, "Incorrect length of data produced");
         return -1;
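The effect of masking `total_out` before the comparison can be illustrated in Python with made-up values (the constants here are for demonstration only, not taken from the C code):

```python
# Illustrative: a stream whose true uncompressed length exceeds 2**32.
total_out = 2**32 + 123        # bytes the decompressor actually produced
isize = total_out % 2**32      # value the gzip trailer stores (wrapped)

# Unmasked comparison (the old behavior): the values differ, so a
# correct stream would be rejected with "Incorrect length of data produced".
assert isize != total_out

# Masked comparison (the fix): reduce total_out modulo 2**32 first,
# matching how ISIZE was written, so the stream is accepted.
assert isize == (0xFFFFFFFF & total_out)
```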
25 changes: 25 additions & 0 deletions tests/test_gzip_ng.py
@@ -292,6 +292,31 @@ def test_decompress_incorrect_length():
     error.match("Incorrect length of data produced")
 
 
+def test_decompress_on_long_input():
+    # Ensure that a compressed payload with length bigger than 2**32 (so
+    # ISIZE overflows) can be decompressed. To avoid holding the whole
+    # uncompressed payload in memory, the test writes the data in blocks.
+    # The payload consists almost exclusively of zeros to achieve an
+    # extremely efficient compression rate, so that the compressed data
+    # fits in memory.
+
+    buffered_stream = io.BytesIO()
+    n = 20
+    block_size = 2**n
+    iterations = 2**(32 - n)
+    zeros_block = bytes(block_size)
+
+    # Write the zeros in blocks so the 4 GiB payload never resides in memory.
+    with gzip_ng.open(buffered_stream, "wb") as gz:
+        for _ in range(iterations):
+            gz.write(zeros_block)
+        gz.write(b"\x01" * 123)
+    buffered_stream.seek(0)
+    with gzip_ng.open(buffered_stream, "rb") as gz:
+        for _ in range(iterations):
+            assert zeros_block == gz.read(block_size)
+        assert gz.read() == b"\x01" * 123
+
+
 def test_decompress_incorrect_checksum():
     # Create a wrong checksum by using a non-default seed.
     wrong_checksum = zlib.crc32(DATA, 50)
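The sizing arithmetic in the new test can be checked quickly: with n = 20 each block is 1 MiB, 2^(32-20) = 4096 iterations write exactly 2^32 bytes of zeros, and the 123 trailing bytes push the total past the ISIZE wrap point, which is what triggers the old bug:

```python
# Verify that the test's parameters produce a payload just past 2**32 bytes.
n = 20
block_size = 2**n              # 1 MiB per block
iterations = 2**(32 - n)       # 4096 blocks
total = block_size * iterations + 123
assert total == 2**32 + 123    # true uncompressed length, over 4 GiB
assert total % 2**32 == 123    # the wrapped value actually stored in ISIZE
```

With the unmasked comparison, the reader would see ISIZE = 123 against a `total_out` of 2^32 + 123 and raise `BadGzipFile`; the masked comparison accepts the stream.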