Skip to content

Make decompression slightly faster for the CLI tool #78

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@ Changelog
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.

version 0.11.0-dev
------------------
In this release the relatively slow decompression rate of
``python -m isal.igzip`` has been improved. Previously it was 19% slower than
``igzip`` when used with the ``-d`` flag for decompressing; now it is just 8%
slower.

+ Fix a bug in the build system which caused some docstring and parameter
information on ``igzip_lib`` and ``isal_zlib`` to disappear from the
documentation and the REPL.
+ Increase the buffer size for ``python -m isal.igzip`` so it is now closer
to speeds reached with ``igzip``.
+ Add a ``READ_BUFFER_SIZE`` attribute to ``igzip`` which allows setting the
amount of raw data that is read at once.
+ Add an ``igzip_lib.IgzipDecompressor`` object which can decompress without
using an unconsumed_tail and is therefore more efficient.

version 0.10.0
------------------
+ Added an ``igzip_lib`` module which allows more direct access to ISA-L's
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ API-documentation: igzip
========================

.. automodule:: isal.igzip
:members: compress, decompress, open
:members: compress, decompress, open, BadGzipFile, GzipFile, READ_BUFFER_SIZE

.. autoclass:: IGzipFile
:members:
Expand Down
1 change: 1 addition & 0 deletions profile_igzipreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@


def main():
igzip.READ_BUFFER_SIZE = 32 * 1024
with igzip.open(sys.argv[1], mode="rb") as gzip_h:
while True:
block = gzip_h.read(32*1024)
Expand Down
34 changes: 13 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@
SYSTEM_IS_WINDOWS = sys.platform.startswith("win")


def default_compiler_directives():
    """Return the base Cython compiler directives as a fresh dict.

    A new dictionary is built on every call, so callers may update the
    result without affecting later calls.
    """
    directives = {
        "language_level": "3",
        "binding": True,
    }
    return directives


class IsalExtension(Extension):
    """Extension subclass that adds no behaviour of its own.

    It exists as a distinct type so that these extensions can be singled
    out for targeted modification.
    """
Expand Down Expand Up @@ -110,23 +105,20 @@ def build_extension(self, ext):
"include")]
# -fPIC needed for proper static linking
ext.extra_compile_args = ["-fPIC"]

# Import cython here so python setup.py can be used without
# installing cython.
from Cython.Build import cythonize
compiler_directives = default_compiler_directives()
line_tracing_enabled = os.getenv("CYTHON_COVERAGE") is not None
if line_tracing_enabled:
# Add cython directives for coverage support.
compiler_directives.update(linetrace=True)
cythonized_exts = cythonize(
ext, compiler_directives=compiler_directives)

for cython_ext in cythonized_exts:
if line_tracing_enabled:
if os.getenv("CYTHON_COVERAGE") is not None:
# Import cython here so python setup.py can be used without
# installing cython.
from Cython.Build import cythonize
# Add cython directives and macros for coverage support.
cythonized_exts = cythonize(ext, compiler_directives=dict(
linetrace=True
))
for cython_ext in cythonized_exts:
cython_ext.define_macros = [("CYTHON_TRACE_NOGIL", "1")]
cython_ext._needs_stub = False
super().build_extension(cython_ext)
cython_ext._needs_stub = False
super().build_extension(cython_ext)
return
super().build_extension(ext)


# Use a cache to prevent isa-l from being built twice. According to the
Expand Down
2 changes: 2 additions & 0 deletions src/isal/_isal.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# cython: language_level=3

from .version cimport ISAL_MAJOR_VERSION as C_ISAL_MAJOR_VERSION
from .version cimport ISAL_MINOR_VERSION as C_ISAL_MINOR_VERSION
from .version cimport ISAL_PATCH_VERSION as C_ISAL_PATCH_VERSION
Expand Down
2 changes: 2 additions & 0 deletions src/isal/crc.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# cython: language_level=3

cdef extern from "<isa-l/crc.h>":
cdef unsigned int crc32_gzip_refl(
unsigned int init_crc, #!< initial CRC value, 32 bits
Expand Down
66 changes: 62 additions & 4 deletions src/isal/igzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,17 @@

from . import igzip_lib, isal_zlib

__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile"]
__all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
"READ_BUFFER_SIZE"]

_COMPRESS_LEVEL_FAST = isal_zlib.ISAL_BEST_SPEED
_COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
_COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION

#: The amount of data that is read in at once when decompressing a file.
#: Increasing this value may increase performance.
READ_BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

try:
Expand Down Expand Up @@ -229,8 +234,8 @@ def __init__(self, fp):
# Call the init method of gzip._GzipReader's parent here.
# It is not very invasive and allows us to override _PaddedFile
_compression.DecompressReader.__init__(
self, _PaddedFile(fp), isal_zlib.decompressobj,
wbits=-isal_zlib.MAX_WBITS)
self, _PaddedFile(fp), igzip_lib.IgzipDecompressor,
hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE)
# Set flag indicating start of a new member
self._new_member = True
self._last_mtime = None
Expand All @@ -241,6 +246,57 @@ def _add_read_data(self, data):
self._crc = isal_zlib.crc32(data, self._crc)
self._stream_size += len(data)

def read(self, size=-1):
    """Read and return up to *size* bytes of decompressed data.

    A negative *size* delegates to ``readall()``. A *size* of zero returns
    ``b""`` immediately. Otherwise the decompressor is driven until it
    yields data, or until no further gzip member header can be read, in
    which case ``b""`` is returned.

    Returned data is passed to ``_add_read_data()`` and ``_pos`` is
    advanced by its length.

    Raises EOFError if the underlying file returns no more data before
    the end-of-stream marker was reached.
    """
    if size < 0:
        return self.readall()
    # size=0 is special because decompress(max_length=0) is not supported
    if not size:
        return b""

    # For certain input data, a single call to decompress() may not
    # return any data. In this case, retry until we get some data or
    # reach EOF.
    while True:
        if self._decompressor.eof:
            # Ending case: we've come to the end of a member in the file,
            # so finish up this member, and read a new gzip header.
            # Check the CRC and file size, and set the flag so we read
            # a new member
            self._read_eof()
            self._new_member = True
            self._decompressor = self._decomp_factory(
                **self._decomp_args)

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                self._size = self._pos
                return b""
            self._new_member = False

        # Read a chunk of data from the file
        if self._decompressor.needs_input:
            buf = self._fp.read(READ_BUFFER_SIZE)
            uncompress = self._decompressor.decompress(buf, size)
        else:
            # Nothing was read from the file in this iteration. Bind buf
            # explicitly so the end-of-file check below cannot hit an
            # unbound local when this branch runs first in a call and the
            # decompressor produces no output.
            buf = None
            uncompress = self._decompressor.decompress(b"", size)
        if self._decompressor.unused_data != b"":
            # Prepend the already read bytes to the fileobj so they can
            # be seen by _read_eof() and _read_gzip_header()
            self._fp.prepend(self._decompressor.unused_data)

        if uncompress != b"":
            break
        # Only an empty read from the file means the input is truncated.
        if buf == b"":
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")

    self._add_read_data(uncompress)
    self._pos += len(uncompress)
    return uncompress


# Aliases for improved compatibility with CPython gzip module.
GzipFile = IGzipFile
Expand Down Expand Up @@ -382,7 +438,7 @@ def _argument_parser():
# diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is about
# ~6% faster.
parser.add_argument("-b", "--buffer-size",
default=32 * 1024, type=int,
default=128 * 1024, type=int,
help=argparse.SUPPRESS)
return parser

Expand Down Expand Up @@ -418,6 +474,8 @@ def main():
elif not args.compress and args.file is not None:
out_file = io.open(base, "wb")

global READ_BUFFER_SIZE
READ_BUFFER_SIZE = args.buffer_size
try:
while True:
block = in_file.read(args.buffer_size)
Expand Down
21 changes: 12 additions & 9 deletions src/isal/igzip_lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# cython: language_level=3
# cython: binding=True

cdef extern from "<isa-l/igzip_lib.h>":
# Deflate compression standard defines
int ISAL_DEF_MAX_HDR_SIZE
Expand Down Expand Up @@ -490,16 +493,16 @@ cdef:

cdef int mem_level_to_bufsize(int compression_level, int mem_level, unsigned int *bufsize)

cpdef compress(data,
int level= ?,
int flag = ?,
int mem_level = ?,
int hist_bits = ?,
cdef _compress(data,
int level,
int flag,
int mem_level,
int hist_bits,
)

cpdef decompress(data,
int flag = ?,
int hist_bits= ?,
Py_ssize_t bufsize= ?)
cdef _decompress(data,
int flag,
int hist_bits,
Py_ssize_t bufsize)

cdef bytes view_bitbuffer(inflate_state * stream)
7 changes: 7 additions & 0 deletions src/isal/igzip_lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,10 @@ def compress(data, level: int = ISAL_DEFAULT_COMPRESSION,
def decompress(data, flag: int = DECOMP_DEFLATE,
hist_bits: int = MAX_HIST_BITS,
bufsize: int = DEF_BUF_SIZE) -> bytes: ...

# Type stub for the ``igzip_lib.IgzipDecompressor`` object.
# NOTE(review): the attribute notes below describe how the gzip reader in
# igzip.py uses this object; confirm against the implementation.
class IgzipDecompressor:
# Bytes handed to the decompressor but not consumed by it; the gzip reader
# prepends these back onto its file object. Presumably data located after
# the end of the compressed stream — TODO confirm.
unused_data: bytes
# When true, the gzip reader reads another chunk from the file before calling
# decompress(); otherwise it calls decompress(b"", ...).
needs_input: bool
# The gzip reader treats a true value as "end of the current member reached".
eof: bool

# Decompress ``data`` and return the result as bytes. ``max_length`` defaults
# to -1; the gzip reader passes the requested read size here and avoids 0.
# Presumably it caps the number of bytes returned — TODO confirm.
def decompress(self, data, max_length = -1) -> bytes: ...
Loading