Permalink
Browse files

ENH: Integrate joblib 0.5.7

This brings in compression support for Memory
  • Loading branch information...
1 parent 235f274 commit 7719b37eb86ca188e015d4be2920b755968039db @GaelVaroquaux GaelVaroquaux committed Dec 28, 2011
@@ -96,7 +96,7 @@
"""
-__version__ = '0.5.7b'
+__version__ = '0.5.7'
from .memory import Memory
@@ -95,8 +95,10 @@ class MemorizedFunc(Logger):
mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
The memmapping mode used when loading from cache
numpy arrays. See numpy.load for the meaning of the
- arguments. Only used if save_npy was true when the
- cache was created.
+ arguments.
+ compress: boolean
+ Whether to zip the stored data on disk. Note that compressed
+ arrays cannot be read by memmapping.
verbose: int, optional
The verbosity flag, controls messages that are issued as
the function is re-evaluated.
@@ -105,8 +107,8 @@ class MemorizedFunc(Logger):
# Public interface
#-------------------------------------------------------------------------
- def __init__(self, func, cachedir, ignore=None, save_npy=True,
- mmap_mode=None, verbose=1, timestamp=None):
+ def __init__(self, func, cachedir, ignore=None, mmap_mode=None,
+ compress=False, verbose=1, timestamp=None):
"""
Parameters
----------
@@ -116,14 +118,10 @@ def __init__(self, func, cachedir, ignore=None, save_npy=True,
The path of the base directory to use as a data store
ignore: list or None
List of variable names to ignore.
- save_npy: boolean, optional
- If True, numpy arrays are saved outside of the pickle
- files in the cache, as npy files.
mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
The memmapping mode used when loading from cache
numpy arrays. See numpy.load for the meaning of the
- arguments. Only used if save_npy was true when the
- cache was created.
+ arguments.
verbose: int, optional
Verbosity flag, controls the debug messages that are issued
as functions are re-evaluated. The higher, the more verbose
@@ -135,8 +133,11 @@ def __init__(self, func, cachedir, ignore=None, save_npy=True,
self._verbose = verbose
self.cachedir = cachedir
self.func = func
- self.save_npy = save_npy
self.mmap_mode = mmap_mode
+ self.compress = compress
+ if compress and mmap_mode is not None:
+ warnings.warn('Compressed results cannot be memmapped',
+ stacklevel=2)
if timestamp is None:
timestamp = time.time()
self.timestamp = timestamp
@@ -189,7 +190,7 @@ def __reduce__(self):
In addition, when unpickling, we run the __init__
"""
return (self.__class__, (self.func, self.cachedir, self.ignore,
- self.save_npy, self.mmap_mode, self._verbose))
+ self.mmap_mode, self.compress, self._verbose))
#-------------------------------------------------------------------------
# Private interface
@@ -367,12 +368,7 @@ def _persist_output(self, output, dir):
try:
mkdirp(dir)
filename = os.path.join(dir, 'output.pkl')
-
- if 'numpy' in sys.modules and self.save_npy:
- numpy_pickle.dump(output, filename)
- else:
- with open(filename, 'w') as output_file:
- pickle.dump(output, output_file, protocol=2)
+ numpy_pickle.dump(output, filename, compress=self.compress)
except OSError:
" Race condition in the creation of the directory "
@@ -408,12 +404,8 @@ def load_output(self, output_dir):
self.format_signature(self.func)[0]
)
filename = os.path.join(output_dir, 'output.pkl')
- if self.save_npy:
- return numpy_pickle.load(filename,
- mmap_mode=self.mmap_mode)
- else:
- output_file = file(filename, 'r')
- return pickle.load(output_file)
+ return numpy_pickle.load(filename,
+ mmap_mode=self.mmap_mode)
# XXX: Need a method to check if results are available.
@@ -445,33 +437,34 @@ class Memory(Logger):
# Public interface
#-------------------------------------------------------------------------
- def __init__(self, cachedir, save_npy=True, mmap_mode=None,
- verbose=1):
+ def __init__(self, cachedir, mmap_mode=None, compress=False, verbose=1):
"""
Parameters
----------
cachedir: string or None
The path of the base directory to use as a data store
or None. If None is given, no caching is done and
the Memory object is completely transparent.
- save_npy: boolean, optional
- If True, numpy arrays are saved outside of the pickle
- files in the cache, as npy files.
mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
The memmapping mode used when loading from cache
numpy arrays. See numpy.load for the meaning of the
- arguments. Only used if save_npy was true when the
- cache was created.
+ arguments.
+ compress: boolean
+ Whether to zip the stored data on disk. Note that
+ compressed arrays cannot be read by memmapping.
verbose: int, optional
Verbosity flag, controls the debug messages that are issued
as functions are re-evaluated.
"""
# XXX: Bad explanation of the None value of cachedir
Logger.__init__(self)
self._verbose = verbose
- self.save_npy = save_npy
self.mmap_mode = mmap_mode
self.timestamp = time.time()
+ self.compress = compress
+ if compress and mmap_mode is not None:
+ warnings.warn('Compressed results cannot be memmapped',
+ stacklevel=2)
if cachedir is None:
self.cachedir = None
else:
@@ -518,9 +511,9 @@ def cache(self, func=None, ignore=None, verbose=None,
if isinstance(func, MemorizedFunc):
func = func.func
return MemorizedFunc(func, cachedir=self.cachedir,
- save_npy=self.save_npy,
mmap_mode=mmap_mode,
ignore=ignore,
+ compress=self.compress,
verbose=verbose,
timestamp=self.timestamp)
@@ -561,4 +554,4 @@ def __reduce__(self):
"""
# We need to remove 'joblib' from the end of cachedir
return (self.__class__, (self.cachedir[:-7],
- self.save_npy, self.mmap_mode, self._verbose))
+ self.mmap_mode, self.compress, self._verbose))
@@ -101,7 +101,10 @@ def __init__(self, filename, file_handle=None, mmap_mode=None):
file_handle = open(file_handle, 'rb')
self.file_handle = file_handle
Unpickler.__init__(self, self.file_handle)
- import numpy as np
+ try:
+ import numpy as np
+ except ImportError:
+ np = None
self.np = np
def _open_file(self, name):
@@ -118,6 +121,9 @@ def load_build(self):
"""
Unpickler.load_build(self)
if isinstance(self.stack[-1], NDArrayWrapper):
+ if self.np is None:
+ raise ImportError('Trying to unpickle an ndarray, '
+ "but numpy didn't import correctly")
nd_array_wrapper = self.stack.pop()
if self.np.__version__ >= '1.3':
array = self.np.load(
@@ -162,7 +168,7 @@ def _open_file(self, name):
###############################################################################
# Utility functions
-def dump(value, filename, zipped=False):
+def dump(value, filename, compress=False):
""" Persist an arbitrary Python object into a filename, with numpy arrays
saved as separate .npy files.
@@ -172,25 +178,25 @@ def dump(value, filename, zipped=False):
The object to store to disk
filename: string
The name of the file in which it is to be stored
- zipped: boolean, optional
+ compress: boolean, optional
Whether to compress the data on the disk or not
Returns
-------
filenames: list of strings
- The list of file names in which the data is stored. If zipped
- is false, each array is stored in a different file.
+ The list of file names in which the data is stored. If
+ compress is false, each array is stored in a different file.
See Also
--------
joblib.load : corresponding loader
Notes
-----
- zipped file take extra disk space during the dump, and extra
+ compressed files take extra disk space during the dump, and extra
memory during the loading.
"""
- if zipped:
+ if compress:
return _dump_zipped(value, filename)
else:
return _dump(value, filename)
@@ -243,8 +249,8 @@ def load(filename, mmap_mode=None):
The name of the file from which to load the object
mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
If not None, the arrays are memory-mapped from the disk. This
- mode has not effect for zipped files. Note that in this
- case the reconstructed object might not longer match exactly
+ mode has no effect for compressed files. Note that in this
+ case the reconstructed object might no longer match exactly
the originally pickled object.
Returns
@@ -100,24 +100,25 @@ def f(l):
yield test
# Now test clearing
- memory = Memory(cachedir=env['dir'], verbose=0)
- # First clear the cache directory, to check that our code can
- # handle that
- # NOTE: this line would raise an exception, as the database file is still
- # open; we ignore the error since we want to test what happens if the
- # directory disappears
- shutil.rmtree(env['dir'], ignore_errors=True)
- g = memory.cache(f)
- g(1)
- g.clear(warn=False)
- current_accumulator = len(accumulator)
- out = g(1)
- yield nose.tools.assert_equal, len(accumulator), \
- current_accumulator + 1
- # Also, check that Memory.eval works similarly
- yield nose.tools.assert_equal, memory.eval(f, 1), out
- yield nose.tools.assert_equal, len(accumulator), \
- current_accumulator + 1
+ for compress in (False, True):
+ memory = Memory(cachedir=env['dir'], verbose=0, compress=compress)
+ # First clear the cache directory, to check that our code can
+ # handle that
+ # NOTE: this line would raise an exception, as the database file is still
+ # open; we ignore the error since we want to test what happens if the
+ # directory disappears
+ shutil.rmtree(env['dir'], ignore_errors=True)
+ g = memory.cache(f)
+ g(1)
+ g.clear(warn=False)
+ current_accumulator = len(accumulator)
+ out = g(1)
+ yield nose.tools.assert_equal, len(accumulator), \
+ current_accumulator + 1
+ # Also, check that Memory.eval works similarly
+ yield nose.tools.assert_equal, memory.eval(f, 1), out
+ yield nose.tools.assert_equal, len(accumulator), \
+ current_accumulator + 1
def test_no_memory():
@@ -113,9 +113,9 @@ def test_standard_types():
#""" Test pickling and saving with standard types.
#"""
filename = env['filename']
- for zipped in [True, False]:
+ for compress in [True, False]:
for member in typelist:
- numpy_pickle.dump(member, filename, zipped=zipped)
+ numpy_pickle.dump(member, filename, compress=compress)
_member = numpy_pickle.load(filename)
# We compare the pickled instance to the reloaded one only if it
# can be compared to a copied one
@@ -127,10 +127,10 @@ def test_standard_types():
def test_numpy_persistence():
filename = env['filename']
a = np.random.random(10)
- for zipped in [True, False]:
+ for compress in [True, False]:
for obj in (a,), (a, a), [a, a, a]:
- filenames = numpy_pickle.dump(obj, filename, zipped=zipped)
- if not zipped:
+ filenames = numpy_pickle.dump(obj, filename, compress=compress)
+ if not compress:
# Check that one file was created per array
yield nose.tools.assert_equal, len(filenames), len(obj) + 1
# Check that these files do exist
@@ -151,7 +151,7 @@ def test_numpy_persistence():
# Now test with array subclasses
obj = np.matrix(np.zeros(10))
- filenames = numpy_pickle.dump(obj, filename, zipped=zipped)
+ filenames = numpy_pickle.dump(obj, filename, compress=compress)
obj_ = numpy_pickle.load(filename)
yield nose.tools.assert_true, isinstance(obj_, np.matrix)

0 comments on commit 7719b37

Please sign in to comment.