Skip to content

Commit

Permalink
Issue #1675951: Allow GzipFile to work with unseekable file objects.
Browse files Browse the repository at this point in the history
Patch by Florian Festi.
  • Loading branch information
pitrou committed Sep 23, 2010
1 parent dda7fdf commit 7b96984
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 18 deletions.
3 changes: 3 additions & 0 deletions Doc/library/gzip.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ The module defines the following items:
.. versionchanged:: 3.2
Support for zero-padded files was added.

.. versionchanged:: 3.2
Support for unseekable files was added.


.. function:: open(filename, mode='rb', compresslevel=9)

Expand Down
91 changes: 73 additions & 18 deletions Lib/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9):
"""
return GzipFile(filename, mode, compresslevel)

class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality.

    Internal state:
      _buffer -- bytes logically located just before the wrapped file's
                 current position (data that was "pushed back").
      _length -- len(_buffer).
      _read   -- index of the next unread byte in _buffer, or None once the
                 buffer is exhausted and reads go straight to the file.
    """

    def __init__(self, f, prepend=b''):
        """Wrap file object *f*, serving *prepend* before the file's data."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the buffer before the file."""
        if self._read is None:
            # Buffer already exhausted: plain pass-through.
            return self.file.read(size)
        start = self._read
        if start + size <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = start + size
            return self._buffer[start:self._read]
        # The request drains the buffer; the remainder comes from the file.
        self._read = None
        return self._buffer[start:] + self.file.read(size - self._length + start)

    def prepend(self, prepend=b'', readprevious=False):
        """Push *prepend* back so that it is returned by the next reads.

        If *readprevious* is true, the caller guarantees that *prepend* is
        the data most recently read through this object; when it still fits
        in the consumed part of the buffer the read index is simply rewound
        instead of copying.
        """
        if self._read is None:
            # Nothing buffered any more: the pushed-back data is the buffer.
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            # Cheap path: un-read the bytes by stepping the index back.
            self._read -= len(prepend)
            return
        else:
            # Keep the not-yet-consumed tail and append the new data.
            # (Previously referenced an undefined local ``read``.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the buffered bytes that have not been read yet."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek in the wrapped file, discarding the buffer when necessary.

        A relative seek (whence=1) that stays inside the buffer only moves
        the read index and returns None.  Any other seek drops the buffer
        and returns the result of the wrapped file's seek().
        """
        # The comment in the original code notes this is only ever called
        # with offset=whence=0; the relative case is kept consistent anyway.
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            # The wrapped file is positioned *after* the unread buffered
            # bytes, so a seek relative to our logical position must be
            # reduced by that amount before being applied to the file.
            offset -= self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        """Delegate every attribute not defined here to the wrapped file."""
        return getattr(self.file, name)


class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods.
Expand Down Expand Up @@ -119,6 +175,7 @@ def __init__(self, filename=None, mode=None,
self.name = filename
# Starts small, scales exponentially
self.min_readsize = 100
fileobj = _PaddedFile(fileobj)

elif mode[0:1] == 'w' or mode[0:1] == 'a':
self.mode = WRITE
Expand Down Expand Up @@ -188,6 +245,9 @@ def _init_read(self):

def _read_gzip_header(self):
magic = self.fileobj.read(2)
if magic == b'':
raise EOFError("Reached EOF")

if magic != b'\037\213':
raise IOError('Not a gzipped file')
method = ord( self.fileobj.read(1) )
Expand Down Expand Up @@ -219,6 +279,11 @@ def _read_gzip_header(self):
if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC

unused = self.fileobj.unused()
if unused:
uncompress = self.decompress.decompress(unused)
self._add_read_data(uncompress)

def write(self,data):
if self.mode != WRITE:
import errno
Expand Down Expand Up @@ -282,16 +347,6 @@ def _read(self, size=1024):
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
#
# First, check if we're at the end of the file;
# if so, it's time to stop; no more members to read.
pos = self.fileobj.tell() # Save current position
self.fileobj.seek(0, 2) # Seek to end of file
if pos == self.fileobj.tell():
raise EOFError("Reached EOF")
else:
self.fileobj.seek( pos ) # Return to original position

self._init_read()
self._read_gzip_header()
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
Expand All @@ -305,6 +360,9 @@ def _read(self, size=1024):

if buf == b"":
uncompress = self.decompress.flush()
# Prepend the already read bytes to the fileobj so they can be
# seen by _read_eof()
self.fileobj.prepend(self.decompress.unused_data, True)
self._read_eof()
self._add_read_data( uncompress )
raise EOFError('Reached EOF')
Expand All @@ -316,10 +374,9 @@ def _read(self, size=1024):
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
# (The number of bytes to seek back is the length of the unused
# data, minus 8 because _read_eof() will rewind a further 8 bytes)
self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

# Prepend the already read bytes to the fileobj so they can be
# seen by _read_eof() and _read_gzip_header()
self.fileobj.prepend(self.decompress.unused_data, True)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
self._read_eof()
Expand All @@ -334,12 +391,10 @@ def _add_read_data(self, data):
self.size = self.size + len(data)

def _read_eof(self):
# We've read to the end of the file, so we have to rewind in order
# to reread the 8 bytes containing the CRC and the file size.
# We've read to the end of the file
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj)
isize = read32(self.fileobj) # may exceed 2GB
if crc32 != self.crc:
Expand All @@ -355,7 +410,7 @@ def _read_eof(self):
while c == b"\x00":
c = self.fileobj.read(1)
if c:
self.fileobj.seek(-1, 1)
self.fileobj.prepend(c, True)

@property
def closed(self):
Expand Down
21 changes: 21 additions & 0 deletions Lib/test/test_gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@
"""


class UnseekableIO(io.BytesIO):
    """BytesIO variant that reports itself as unseekable and rejects
    every attempt to query or change the stream position."""

    def seekable(self):
        """Always answer that seeking is not supported."""
        return False

    def tell(self):
        """Refuse to report the current position."""
        raise io.UnsupportedOperation()

    def seek(self, *args):
        """Refuse to move the current position, whatever the arguments."""
        raise io.UnsupportedOperation()


class TestGzip(unittest.TestCase):
filename = support.TESTFN

Expand Down Expand Up @@ -265,6 +276,16 @@ def test_zero_padded_file(self):
d = f.read()
self.assertEqual(d, data1 * 50, "Incorrect data in file")

def test_non_seekable_file(self):
    """Round-trip data through GzipFile using file objects that can
    neither seek nor tell."""
    payload = data1 * 50

    # Compress into an unseekable sink.
    sink = UnseekableIO()
    with gzip.GzipFile(fileobj=sink, mode="wb") as writer:
        writer.write(payload)

    # Decompress from a fresh unseekable source holding the same bytes.
    source = UnseekableIO(sink.getvalue())
    with gzip.GzipFile(fileobj=source, mode="rb") as reader:
        self.assertEqual(reader.read(), payload)

# Testing compress/decompress shortcut functions

def test_compress(self):
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ Bill Fancher
Mark Favas
Niels Ferguson
Sebastian Fernandez
Florian Festi
Vincent Fiack
Tomer Filiba
Jeffrey Finkelstein
Expand Down
3 changes: 3 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ Core and Builtins
Library
-------

- Issue #1675951: Allow GzipFile to work with unseekable file objects.
Patch by Florian Festi.

- Logging: Added QueueListener class to facilitate logging usage for
performance-critical threads.

Expand Down

0 comments on commit 7b96984

Please sign in to comment.