scikit-learn · jnothman · Aug 30, 2017 · Aug 18, 2017 · Aug 18, 2017 · Aug 19, 2017
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1380,6 +1380,7 @@ Low-level methods
    utils.validation.check_symmetric
    utils.validation.column_or_1d
    utils.validation.has_fit_parameter
+   utils.validation.check_memory
 
 Recently deprecated
 ===================

diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
@@ -15,7 +15,6 @@
 from scipy.sparse.csgraph import connected_components
 
 from ..base import BaseEstimator, ClusterMixin
-from ..externals.joblib import Memory
 from ..externals import six
 from ..metrics.pairwise import paired_distances, pairwise_distances
 from ..utils import check_array
@@ -26,6 +25,8 @@
 
 from ..externals.six.moves import xrange
 
+from sklearn.utils.validation import check_memory
+
 ###############################################################################
 # For non fully-connected graphs
 
@@ -196,7 +197,8 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False):
     else:
         if n_clusters > n_samples:
             raise ValueError('Cannot provide more clusters than samples. '
-                             '%i n_clusters was asked, and there are %i samples.'
+                             '%i n_clusters was asked, and there are'
+                             ' %i samples.'
                              % (n_clusters, n_samples))
         n_nodes = 2 * n_samples - n_clusters
 
@@ -609,7 +611,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin):
         "manhattan", "cosine", or 'precomputed'.
         If linkage is "ward", only "euclidean" is accepted.
 
-    memory : Instance of sklearn.externals.joblib.Memory or string, optional \
+    memory : Instance of joblib.Memory or string, optional \
             (default=None)
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
@@ -693,16 +695,7 @@ def fit(self, X, y=None):
         self
         """
         X = check_array(X, ensure_min_samples=2, estimator=self)
-        memory = self.memory
-        if memory is None:
-            memory = Memory(cachedir=None, verbose=0)
-        elif isinstance(memory, six.string_types):
-            memory = Memory(cachedir=memory, verbose=0)
-        elif not isinstance(memory, Memory):
-            raise ValueError("'memory' should either be a string or"
-                             " a sklearn.externals.joblib.Memory"
-                             " instance, got 'memory={!r}' instead.".format(
-                                 type(memory)))
+        memory = check_memory(self.memory)
 
         if self.n_clusters <= 0:
             raise ValueError("n_clusters should be an integer greater than 0."
@@ -779,7 +772,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
         "manhattan", "cosine", or 'precomputed'.
         If linkage is "ward", only "euclidean" is accepted.
 
-    memory : Instance of sklearn.externals.joblib.Memory or string, optional \
+    memory : Instance of joblib.Memory or string, optional \
             (default=None)
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the

diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
@@ -34,6 +34,7 @@
 from sklearn.utils.fast_dict import IntFloatDict
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_warns
+from sklearn.utils.tests.test_validation import DummyMemory, WrongDummyMemory
 
 
 def test_deprecation_of_n_components_in_linkage_tree():
@@ -50,6 +51,7 @@ def test_deprecation_of_n_components_in_linkage_tree():
     assert_equal(n_leaves, n_leaves_t)
     assert_equal(parent, parent_t)
 
+
 def test_linkage_misc():
     # Misc tests on linkage
     rng = np.random.RandomState(42)
@@ -140,6 +142,17 @@ def test_agglomerative_clustering_wrong_arg_memory():
     assert_raises(ValueError, clustering.fit, X)
 
 
+def test_agglomerative_clustering_with_cache_attribute():
+    # Any object exposing a ``cache`` attribute is accepted as memory.
+    X = np.random.RandomState(0).randn(100, 50)
+    valid = AgglomerativeClustering(memory=DummyMemory())
+    valid.fit(X)
+
+    # An object without ``cache`` is rejected when fit is called.
+    invalid = AgglomerativeClustering(memory=WrongDummyMemory())
+    assert_raises(ValueError, invalid.fit, X)
+
+
 def test_agglomerative_clustering():
     # Check that we obtain the correct number of clusters with
     # agglomerative clustering.

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
@@ -22,6 +22,7 @@
 from .utils import Bunch
 
 from .utils.metaestimators import _BaseComposition
+from sklearn.utils.validation import check_memory
 
 __all__ = ['Pipeline', 'FeatureUnion']
 
@@ -52,7 +53,7 @@ class Pipeline(_BaseComposition):
         chained, in the order in which they are chained, with the last object
         an estimator.
 
-    memory : Instance of sklearn.external.joblib.Memory or string, optional \
+    memory : Instance of joblib.Memory or string, optional \
             (default=None)
         Used to cache the fitted transformers of the pipeline. By default,
         no caching is performed. If a string is given, it is the path to
@@ -187,16 +188,7 @@ def _final_estimator(self):
     def _fit(self, X, y=None, **fit_params):
         self._validate_steps()
         # Setup the memory
-        memory = self.memory
-        if memory is None:
-            memory = Memory(cachedir=None, verbose=0)
-        elif isinstance(memory, six.string_types):
-            memory = Memory(cachedir=memory, verbose=0)
-        elif not isinstance(memory, Memory):
-            raise ValueError("'memory' should either be a string or"
-                             " a sklearn.externals.joblib.Memory"
-                             " instance, got 'memory={!r}' instead.".format(
-                                 type(memory)))
+        memory = check_memory(self.memory)
 
         fit_transform_one_cached = memory.cache(_fit_transform_one)
 
@@ -538,7 +530,7 @@ def make_pipeline(*steps, **kwargs):
     ----------
     *steps : list of estimators,
 
-    memory : Instance of sklearn.externals.joblib.Memory or string, optional \
+    memory : Instance of joblib.Memory or string, optional \
             (default=None)
         Used to cache the fitted transformers of the pipeline. By default,
         no caching is performed. If a string is given, it is the path to

diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
@@ -19,6 +19,7 @@
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_dict_equal
+from sklearn.utils.tests.test_validation import DummyMemory, WrongDummyMemory
 
 from sklearn.base import clone, BaseEstimator
 from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
@@ -266,6 +267,16 @@ def test_pipeline_sample_weight_unsupported():
     )
 
 
+def test_pipeline_with_cache_attribute():
+    # fit accepts a memory exposing ``cache`` and rejects one without it.
+    X = np.array([[1, 2]])
+    steps = [('transf', Transf()), ('clf', Mult())]
+    Pipeline(steps, memory=DummyMemory()).fit(X, y=None)
+
+    pipe = Pipeline(steps, memory=WrongDummyMemory())
+    assert_raises(ValueError, pipe.fit, X)
+
+
 def test_pipeline_raise_set_params_error():
     # Test pipeline raises set params error message for nested models.
     pipe = Pipeline([('cls', LinearRegression())])
@@ -852,9 +863,8 @@ def test_pipeline_wrong_memory():
     memory = 1
     cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                            memory=memory)
-    assert_raises_regex(ValueError, "'memory' should either be a string or a"
-                        " sklearn.externals.joblib.Memory instance, got",
-                        cached_pipe.fit, X, y)
+    assert_raises_regex(ValueError, "'memory' should either be a string or"
+                        " a joblib.Memory instance", cached_pipe.fit, X, y)
 
 
 def test_pipeline_memory():

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -31,6 +31,7 @@
     check_is_fitted,
     check_consistent_length,
     assert_all_finite,
+    check_memory
 )
 import sklearn
 
@@ -539,3 +540,23 @@ def test_suppress_validation():
     assert_all_finite(X)
     sklearn.set_config(assume_finite=False)
     assert_raises(ValueError, assert_all_finite, X)
+
+
+class DummyMemory(object):
+    """Minimal stand-in for joblib.Memory that performs no caching."""
+
+    cachedir = None
+
+    def cache(self, func):
+        # Return the function itself rather than a caching wrapper.
+        return func
+
+
+class WrongDummyMemory(object):
+    # Deliberately exposes no ``cache`` attribute.
+    pass
+
+
+def test_check_memory():
+    assert isinstance(check_memory(DummyMemory()), DummyMemory)
+    assert_raises(ValueError, check_memory, WrongDummyMemory())
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -20,6 +20,7 @@
 from ..exceptions import NonBLASDotWarning
 from ..exceptions import NotFittedError
 from ..exceptions import DataConversionWarning
+from ..externals.joblib import Memory
 
 
 FLOAT_DTYPES = (np.float64, np.float32, np.float16)
@@ -155,6 +156,32 @@ def _shape_repr(shape):
     return "(%s)" % joined
 
 
+def check_memory(memory):
+    """Check that ``memory`` is joblib.Memory-like.
+
+    Parameters
+    ----------
+    memory : None, string or object with a ``cache`` attribute
+        None or a string (used as ``cachedir``) is wrapped in a Memory.
+
+    Returns
+    -------
+    memory : a new joblib.Memory instance, or the input object unchanged
+
+    Raises
+    ------
+    ValueError
+        If ``memory`` is not None, not a string and has no ``cache``.
+    """
+    if memory is None or isinstance(memory, six.string_types):
+        memory = Memory(cachedir=memory, verbose=0)
+    elif not hasattr(memory, 'cache'):
+        raise ValueError("'memory' should either be a string or"
+                         " a joblib.Memory instance, got {!r} instead."
+                         .format(memory))
+    return memory
+
+
 def check_consistent_length(*arrays):
     """Check that all arrays have consistent first dimensions.