ENH: io/matlab: avoid making copies of zlib compressed data when loading files #519

Merged: 8 commits (May 4, 2013)
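Summary, for context: when loading, the reader used to slurp each miCOMPRESSED variable into one buffer and then decompress it into a second full-size buffer, so peak memory was a multiple of the variable size. A condensed version of the removed pattern, taken from the mio5.py hunk below (the helper name here is ours; the real code is inline in read_var_header):

import zlib
from io import BytesIO

def read_compressed_var(mat_stream, byte_count):
    # Old approach: the compressed bytes and the fully decompressed
    # result are both alive at the same time.
    data = mat_stream.read(byte_count)         # copy 1: compressed
    dcor = zlib.decompressobj()
    stream = BytesIO(dcor.decompress(data))    # copy 2: decompressed
    if dcor.flush() != b'':
        raise ValueError("Something wrong with byte stream.")
    return stream

The PR replaces this with a ZlibInputStream that inflates on demand (sketched after the mio5.py diff below), and adds the memory benchmark that follows.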
152 changes: 152 additions & 0 deletions scipy/io/matlab/benchmarks/bench_memusage.py
@@ -0,0 +1,152 @@
# Posix-only benchmark
from __future__ import division, absolute_import, print_function

import os
import sys
import re
import subprocess
import time
import textwrap
import tempfile
import warnings

from numpy.testing import dec

import numpy as np
from scipy.io import savemat, loadmat

@dec.skipif(not sys.platform.startswith('linux'), "Memory benchmark works only on Linux")
def bench_run():
    mem_info = get_mem_info()
    set_mem_rlimit(int(mem_info['memtotal'] * 0.7))

    # Setup temp file, make it fit in memory
    f = tempfile.NamedTemporaryFile(suffix='.mat')
    os.unlink(f.name)

    max_size = int(mem_info['memtotal'] * 0.7) // 4
    sizes = [1e6, 10e6, 100e6, 300e6, 500e6, 1000e6]

    print_table_row(['** loadmat benchmark'])
    print_table_row(['size (MB)', 'compression', 'time (s)',
                     'peak memory (MB)', 'mem factor'])

    for size in sizes:
        for compressed in (False, True):
            if size > max_size:
                print_table_row(["%.1f" % (size/1e6,), compressed, "SKIP"])
                continue

            try:
                x = np.random.rand(int(size//8)).view(dtype=np.uint8)
                savemat(f.name, dict(x=x), do_compression=compressed, oned_as='row')
                del x
            except MemoryError:
                x = None
                print_table_row(["%.1f" % (size/1e6,), compressed, "FAIL"])
                continue

            code = """
            from scipy.io import loadmat
            loadmat('%s')
            """ % (f.name,)
            time, peak_mem = run_monitored(code)

            print_table_row(["%.1f" % (size/1e6,), compressed, time,
                             "%.1f" % (peak_mem/1e6,),
                             "%.2f x" % (peak_mem/size,)])

    print_table_row(['** savemat memory benchmark'])
    print_table_row(['size (MB)', 'compression', 'time (s)',
                     'peak memory (MB)', 'mem factor'])

    for size in sizes:
        for compressed in (False, True):
            if size > max_size:
                print_table_row(["%.1f" % (size/1e6,), compressed, "SKIP"])
                continue

            code = """
            import numpy as np
            from scipy.io import savemat
            x = np.random.rand(%d//8).view(dtype=np.uint8)
            savemat('%s', dict(x=x), do_compression=%r, oned_as='row')
            """ % (size, f.name, compressed)
            try:
                time, peak_mem = run_monitored(code)
            except AssertionError:
                print_table_row(["%.1f" % (size/1e6,), compressed, "FAIL"])
                continue

            print_table_row(["%.1f" % (size/1e6,), compressed, time,
                             "%.1f" % (peak_mem/1e6,),
                             "%.2f x" % (peak_mem/size,)])

def print_table_row(columns):
    print(" | ".join("%-20s" % x for x in columns))

def run_monitored(code):
    """
    Run code in a new Python process, and monitor peak memory usage.

    Returns
    -------
    duration : float
        Duration in seconds (including Python startup time)
    peak_memusage : float
        Peak memory usage (rough estimate only) in bytes

    """
    code = textwrap.dedent(code)
    process = subprocess.Popen([sys.executable, '-c', code])

    peak_memusage = -1

    start = time.time()
    while True:
        ret = process.poll()
        if ret is not None:
            break

        # Sample the child's resident set size from /proc (Linux-only)
        with open('/proc/%d/status' % process.pid, 'r') as f:
            procdata = f.read()

        m = re.search(r'VmRSS:\s*(\d+)\s*kB', procdata, re.S | re.I)
        if m is not None:
            memusage = float(m.group(1)) * 1e3
            peak_memusage = max(memusage, peak_memusage)

        time.sleep(0.01)

    process.wait()

    duration = time.time() - start

    if process.returncode != 0:
        raise AssertionError("Running failed:\n%s" % code)

    return duration, peak_memusage

def get_mem_info():
    """Get information about available memory"""
    info = {}
    with open('/proc/meminfo', 'r') as f:
        for line in f:
            p = line.split()
            info[p[0].strip(':').lower()] = float(p[1]) * 1e3
    return info

def set_mem_rlimit(max_mem):
    """
    Cap the address-space rlimit at ``max_mem`` bytes, to avoid a
    grinding halt caused by swapping.
    """
    import resource
    cur_limit = resource.getrlimit(resource.RLIMIT_AS)
    if cur_limit[0] > 0:
        max_mem = min(max_mem, cur_limit[0])

    resource.setrlimit(resource.RLIMIT_AS, (max_mem, cur_limit[1]))

if __name__ == "__main__":
    bench_run()
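A note on reuse: run_monitored above is self-contained apart from the /proc dependency, so it can time and memory-profile arbitrary snippets. A minimal sketch, assuming this file is importable as bench_memusage:

from bench_memusage import run_monitored

# Peak RSS is polled from /proc/<pid>/status every 10 ms, so very
# short-lived spikes can be missed; treat results as rough estimates.
duration, peak = run_monitored("""
import numpy as np
x = np.zeros(10**7)
""")
print("%.2f s, %.1f MB peak" % (duration, peak / 1e6))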
28 changes: 15 additions & 13 deletions scipy/io/matlab/benchmarks/bench_structarr.py
@@ -2,7 +2,7 @@
 
 from numpy.testing import *
 
-from io import StringIO
+from io import BytesIO
 
 import numpy as np
 import scipy.io as sio
@@ -19,25 +19,27 @@ def make_structarr(n_vars, n_fields, n_structs):
 
 
 def bench_run():
-    str_io = StringIO()
+    str_io = BytesIO()
     print()
     print('Read / writing matlab structs')
     print('='*60)
-    print(' write | read | vars | fields | structs ')
+    print(' write | read | vars | fields | structs | compressed')
     print('-'*60)
     print()
     for n_vars, n_fields, n_structs in (
-            (10, 10, 20),):
+            (10, 10, 20), (20, 20, 40), (30, 30, 50)):
         var_dict = make_structarr(n_vars, n_fields, n_structs)
-        str_io = StringIO()
-        write_time = measure('sio.savemat(str_io, var_dict)')
-        read_time = measure('sio.loadmat(str_io)')
-        print('%.5f | %.5f | %5d | %5d | %5d ' % (
-            write_time,
-            read_time,
-            n_vars,
-            n_fields,
-            n_structs))
+        for compression in (False, True):
+            str_io = BytesIO()
+            write_time = measure('sio.savemat(str_io, var_dict, do_compression=%r)' % compression)
+            read_time = measure('sio.loadmat(str_io)')
+            print('%.5f | %.5f | %5d | %5d | %5d | %r' % (
+                write_time,
+                read_time,
+                n_vars,
+                n_fields,
+                n_structs,
+                compression))
 
 
 if __name__ == '__main__' :
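The StringIO to BytesIO switch above is needed because savemat writes binary MAT-file data, which a text buffer rejects on Python 3. A standalone round-trip using the same pattern the benchmark times:

import numpy as np
from io import BytesIO
from scipy.io import savemat, loadmat

buf = BytesIO()
savemat(buf, {'x': np.arange(10.0)}, do_compression=True, oned_as='row')
buf.seek(0)  # rewind before reading back
out = loadmat(buf)
assert np.allclose(out['x'], np.arange(10.0))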
20 changes: 6 additions & 14 deletions scipy/io/matlab/mio5.py
@@ -104,6 +104,8 @@
     mxCELL_CLASS, mxSTRUCT_CLASS, mxOBJECT_CLASS, mxCHAR_CLASS, \
     mxSPARSE_CLASS, mxDOUBLE_CLASS, mclass_info, mclass_dtypes_template
 
+from .streams import ZlibInputStream
+
 
 class MatFile5Reader(MatFileReader):
     ''' Reader for Mat 5 mat files
@@ -215,19 +217,8 @@ def read_var_header(self):
             raise ValueError("Did not read any bytes")
         next_pos = self.mat_stream.tell() + byte_count
         if mdtype == miCOMPRESSED:
-            # make new stream from compressed data
-            data = self.mat_stream.read(byte_count)
-            # Some matlab files contain zlib streams without valid
-            # Z_STREAM_END termination. To get round this, we use the
-            # decompressobj object, that allows you to decode an
-            # incomplete stream. See discussion at
-            # http://bugs.python.org/issue8672
-            dcor = zlib.decompressobj()
-            stream = BytesIO(dcor.decompress(data))
-            # Check the stream is not so broken as to leave cruft behind
-            if not dcor.flush() == b'':
-                raise ValueError("Something wrong with byte stream.")
-            del data
+            # Make new stream from compressed data
+            stream = ZlibInputStream(self.mat_stream, byte_count)
             self._matrix_reader.set_stream(stream)
             mdtype, byte_count = self._matrix_reader.read_full_tag()
         else:
@@ -876,6 +867,7 @@ def put_variables(self, mdict, write_header=None):
                 tag = np.empty((), NDT_TAG_FULL)
                 tag['mdtype'] = miCOMPRESSED
                 tag['byte_count'] = len(out_str)
-                self.file_stream.write(tag.tostring() + out_str)
+                self.file_stream.write(tag.tostring())
+                self.file_stream.write(out_str)
             else:  # not compressing
                 self._matrix_writer.write_top(var, asbytes(name), is_global)
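ZlibInputStream itself lives in scipy/io/matlab/streams and is not shown in this diff. A rough pure-Python sketch of the idea, illustrative only and not the actual Cython implementation:

import zlib

class ZlibInputStreamSketch(object):
    # Read-only file-like object that inflates on demand, so memory use
    # is bounded by the caller's read sizes plus one input block, not by
    # the total decompressed size of the variable.
    def __init__(self, fileobj, max_length, blocksize=64 * 1024):
        self._fileobj = fileobj
        self._remaining = max_length    # compressed bytes left to consume
        self._decompressor = zlib.decompressobj()
        self._buffer = b''
        self._blocksize = blocksize

    def read(self, n):
        # Pull compressed chunks until n decompressed bytes are buffered
        # (or the compressed input is exhausted)
        while len(self._buffer) < n and self._remaining > 0:
            chunk = self._fileobj.read(min(self._blocksize, self._remaining))
            self._remaining -= len(chunk)
            self._buffer += self._decompressor.decompress(chunk)
        out, self._buffer = self._buffer[:n], self._buffer[n:]
        return out

The writer-side hunk is the same theme: writing the tag and the compressed payload in two calls avoids the temporary created by tag.tostring() + out_str, which briefly held a second copy of the whole compressed buffer.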