Skip to content

Commit

Permalink
gh-89550: Buffer GzipFile.write to reduce execution time by ~15% (#10…
Browse files Browse the repository at this point in the history
…1251)

Use `io.BufferedWriter` to buffer gzip writes.

---------

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
  • Loading branch information
3 people committed May 8, 2023
1 parent 405eacc commit 9af4854
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 5 deletions.
40 changes: 35 additions & 5 deletions Lib/gzip.py
Expand Up @@ -22,6 +22,7 @@
_COMPRESS_LEVEL_BEST = 9

READ_BUFFER_SIZE = 128 * 1024
_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
Expand Down Expand Up @@ -120,6 +121,21 @@ class BadGzipFile(OSError):
"""Exception raised in some cases for invalid gzip files."""


class _WriteBufferStream(io.RawIOBase):
"""Minimal object to pass WriteBuffer flushes into GzipFile"""
def __init__(self, gzip_file):
self.gzip_file = gzip_file

def write(self, data):
return self.gzip_file._write_raw(data)

def seekable(self):
return False

def writable(self):
return True


class GzipFile(_compression.BaseStream):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the truncate() method.
Expand Down Expand Up @@ -184,6 +200,7 @@ def __init__(self, filename=None, mode=None,
if mode is None:
mode = getattr(fileobj, 'mode', 'rb')


if mode.startswith('r'):
self.mode = READ
raw = _GzipReader(fileobj)
Expand All @@ -206,6 +223,9 @@ def __init__(self, filename=None, mode=None,
zlib.DEF_MEM_LEVEL,
0)
self._write_mtime = mtime
self._buffer_size = _WRITE_BUFFER_SIZE
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
buffer_size=self._buffer_size)
else:
raise ValueError("Invalid mode: {!r}".format(mode))

Expand All @@ -231,6 +251,11 @@ def _init_write(self, filename):
self.bufsize = 0
self.offset = 0 # Current file offset for seek(), tell(), etc

def tell(self):
self._check_not_closed()
self._buffer.flush()
return super().tell()

def _write_gzip_header(self, compresslevel):
self.fileobj.write(b'\037\213') # magic header
self.fileobj.write(b'\010') # compression method
Expand Down Expand Up @@ -272,6 +297,10 @@ def write(self,data):
if self.fileobj is None:
raise ValueError("write() on closed GzipFile object")

return self._buffer.write(data)

def _write_raw(self, data):
# Called by our self._buffer underlying WriteBufferStream.
if isinstance(data, (bytes, bytearray)):
length = len(data)
else:
Expand Down Expand Up @@ -322,16 +351,17 @@ def close(self):
fileobj = self.fileobj
if fileobj is None:
return
self.fileobj = None
try:
if self.mode == WRITE:
self._buffer.flush()
fileobj.write(self.compress.flush())
write32u(fileobj, self.crc)
# self.size may exceed 2 GiB, or even 4 GiB
write32u(fileobj, self.size & 0xffffffff)
elif self.mode == READ:
self._buffer.close()
finally:
self.fileobj = None
myfileobj = self.myfileobj
if myfileobj:
self.myfileobj = None
Expand All @@ -341,7 +371,7 @@ def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
self._check_not_closed()
if self.mode == WRITE:
# Ensure the compressor's buffer is flushed
self.fileobj.write(self.compress.flush(zlib_mode))
self._buffer.flush()
self.fileobj.flush()

def fileno(self):
Expand Down Expand Up @@ -378,10 +408,10 @@ def seek(self, offset, whence=io.SEEK_SET):
if offset < self.offset:
raise OSError('Negative seek in write mode')
count = offset - self.offset
chunk = b'\0' * 1024
for i in range(count // 1024):
chunk = b'\0' * self._buffer_size
for i in range(count // self._buffer_size):
self.write(chunk)
self.write(b'\0' * (count % 1024))
self.write(b'\0' * (count % self._buffer_size))
elif self.mode == READ:
self._check_not_closed()
return self._buffer.seek(offset, whence)
Expand Down
@@ -0,0 +1,2 @@
Decrease execution time of some :mod:`gzip` file writes by 15% by
adding more appropriate buffering.

0 comments on commit 9af4854

Please sign in to comment.