New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG+2] Implements ducktyping to allow for alternative Memory implementations #9584
Changes from 17 commits
160d9bc
c907dfb
28e6254
d4b5024
8d48223
8cdebae
d608fc9
8e4845a
e26703c
7df3e38
2d1b58c
2a063a4
b7f4c99
065b595
869c35e
9b68a22
88ce9b0
4b40c74
79b9982
ece5049
419ef3b
9145f42
2fba11b
bc7515b
b0afc2b
008e2d5
976646d
a0d0cc9
688ca8a
5cd86fc
3337c41
59f2d38
975dfa6
9e1d6b5
e0e3e6d
b86dc4c
b3f1dfa
4775fa2
1ad7814
b73f674
fcef75b
cf4d794
3e22d23
792245e
309c88a
0d12170
b123d8f
a4e9d66
14e11d1
967b0f1
8c1b826
b6cf291
bf31e1b
510a46b
d2c6c7e
2aea261
ce88f2d
06fcd71
648749d
6bcc0a8
f7b1f72
26f2927
f078fbf
2953c3e
ac53970
d1eb1a7
c1c2707
84ec245
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,6 @@ | |
from scipy.sparse.csgraph import connected_components | ||
|
||
from ..base import BaseEstimator, ClusterMixin | ||
from ..externals.joblib import Memory | ||
from ..externals import six | ||
from ..metrics.pairwise import paired_distances, pairwise_distances | ||
from ..utils import check_array | ||
|
@@ -26,6 +25,8 @@ | |
|
||
from ..externals.six.moves import xrange | ||
|
||
from sklearn.utils.validation import check_memory | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We usually use a relative import: `from ..utils.validation import check_memory` There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this to line 21 as well, under the other utils imports. |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unnecessary space |
||
############################################################################### | ||
# For non fully-connected graphs | ||
|
||
|
@@ -196,7 +197,8 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): | |
else: | ||
if n_clusters > n_samples: | ||
raise ValueError('Cannot provide more clusters than samples. ' | ||
'%i n_clusters was asked, and there are %i samples.' | ||
'%i n_clusters was asked, and there are' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to be sure: was it not PEP8-compliant previously? If the line was shorter than 80 characters, avoid making a diff for nothing. |
||
' %i samples.' | ||
% (n_clusters, n_samples)) | ||
n_nodes = 2 * n_samples - n_clusters | ||
|
||
|
@@ -609,7 +611,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): | |
"manhattan", "cosine", or 'precomputed'. | ||
If linkage is "ward", only "euclidean" is accepted. | ||
|
||
memory : Instance of sklearn.externals.joblib.Memory or string, optional \ | ||
memory : Instance of joblib.Memory or string, optional \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it fitting on a single line? |
||
(default=None) | ||
Used to cache the output of the computation of the tree. | ||
By default, no caching is done. If a string is given, it is the | ||
|
@@ -693,16 +695,7 @@ def fit(self, X, y=None): | |
self | ||
""" | ||
X = check_array(X, ensure_min_samples=2, estimator=self) | ||
memory = self.memory | ||
if memory is None: | ||
memory = Memory(cachedir=None, verbose=0) | ||
elif isinstance(memory, six.string_types): | ||
memory = Memory(cachedir=memory, verbose=0) | ||
elif not isinstance(memory, Memory): | ||
raise ValueError("'memory' should either be a string or" | ||
" a sklearn.externals.joblib.Memory" | ||
" instance, got 'memory={!r}' instead.".format( | ||
type(memory))) | ||
memory = check_memory(self.memory) | ||
|
||
if self.n_clusters <= 0: | ||
raise ValueError("n_clusters should be an integer greater than 0." | ||
|
@@ -779,7 +772,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): | |
"manhattan", "cosine", or 'precomputed'. | ||
If linkage is "ward", only "euclidean" is accepted. | ||
|
||
memory : Instance of sklearn.externals.joblib.Memory or string, optional \ | ||
memory : Instance of joblib.Memory or string, optional \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it fitting on a single line |
||
(default=None) | ||
Used to cache the output of the computation of the tree. | ||
By default, no caching is done. If a string is given, it is the | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,7 @@ | |
from sklearn.utils.fast_dict import IntFloatDict | ||
from sklearn.utils.testing import assert_array_equal | ||
from sklearn.utils.testing import assert_warns | ||
from sklearn.utils.tests.test_validation import DummyMemory, WrongDummyMemory | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would need @jnothman to tell me if doing this is fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH, once we're using |
||
|
||
|
||
def test_deprecation_of_n_components_in_linkage_tree(): | ||
|
@@ -50,6 +51,7 @@ def test_deprecation_of_n_components_in_linkage_tree(): | |
assert_equal(n_leaves, n_leaves_t) | ||
assert_equal(parent, parent_t) | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you revert this change? Going in the same direction as @jnothman's remark above, adding unrelated changes like this one in a PR adds noise to the diff, makes reviewing harder, and makes it less likely to get good feedback on your PR. That means you need to look carefully at your diff before and after pushing a commit into a PR. Maybe your editor is doing it automatically, in which case you need to find out how to disable this feature. |
||
def test_linkage_misc(): | ||
# Misc tests on linkage | ||
rng = np.random.RandomState(42) | ||
|
@@ -140,6 +142,17 @@ def test_agglomerative_clustering_wrong_arg_memory(): | |
assert_raises(ValueError, clustering.fit, X) | ||
|
||
|
||
def test_agglomerative_clustering_with_cache_attribute(): | ||
rng = np.random.RandomState(0) | ||
n_samples = 100 | ||
X = rng.randn(n_samples, 50) | ||
clustering = AgglomerativeClustering(memory=DummyMemory()) | ||
clustering.fit(X) | ||
|
||
clustering = AgglomerativeClustering(memory=WrongDummyMemory()) | ||
assert_raises(ValueError, clustering.fit, X) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use assert_raises_message or assert_raises_regex to check the message which is raised. |
||
|
||
|
||
def test_agglomerative_clustering(): | ||
# Check that we obtain the correct number of clusters with | ||
# agglomerative clustering. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
from .utils import Bunch | ||
|
||
from .utils.metaestimators import _BaseComposition | ||
from sklearn.utils.validation import check_memory | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make relative import as previously stated |
||
|
||
__all__ = ['Pipeline', 'FeatureUnion'] | ||
|
||
|
@@ -52,7 +53,7 @@ class Pipeline(_BaseComposition): | |
chained, in the order in which they are chained, with the last object | ||
an estimator. | ||
|
||
memory : Instance of sklearn.external.joblib.Memory or string, optional \ | ||
memory : Instance of joblib.Memory or string, optional \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it fitting on one line |
||
(default=None) | ||
Used to cache the fitted transformers of the pipeline. By default, | ||
no caching is performed. If a string is given, it is the path to | ||
|
@@ -187,16 +188,7 @@ def _final_estimator(self): | |
def _fit(self, X, y=None, **fit_params): | ||
self._validate_steps() | ||
# Setup the memory | ||
memory = self.memory | ||
if memory is None: | ||
memory = Memory(cachedir=None, verbose=0) | ||
elif isinstance(memory, six.string_types): | ||
memory = Memory(cachedir=memory, verbose=0) | ||
elif not isinstance(memory, Memory): | ||
raise ValueError("'memory' should either be a string or" | ||
" a sklearn.externals.joblib.Memory" | ||
" instance, got 'memory={!r}' instead.".format( | ||
type(memory))) | ||
memory = check_memory(self.memory) | ||
|
||
fit_transform_one_cached = memory.cache(_fit_transform_one) | ||
|
||
|
@@ -538,7 +530,7 @@ def make_pipeline(*steps, **kwargs): | |
---------- | ||
*steps : list of estimators, | ||
|
||
memory : Instance of sklearn.externals.joblib.Memory or string, optional \ | ||
memory : Instance of joblib.Memory or string, optional \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. one line |
||
(default=None) | ||
Used to cache the fitted transformers of the pipeline. By default, | ||
no caching is performed. If a string is given, it is the path to | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
from sklearn.utils.testing import assert_array_equal | ||
from sklearn.utils.testing import assert_array_almost_equal | ||
from sklearn.utils.testing import assert_dict_equal | ||
from sklearn.utils.tests.test_validation import DummyMemory, WrongDummyMemory | ||
|
||
from sklearn.base import clone, BaseEstimator | ||
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union | ||
|
@@ -266,6 +267,16 @@ def test_pipeline_sample_weight_unsupported(): | |
) | ||
|
||
|
||
def test_pipeline_with_cache_attribute(): | ||
X = np.array([[1, 2]]) | ||
pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=DummyMemory()) | ||
pipe.fit(X, y=None) | ||
|
||
pipe = Pipeline([('transf', Transf()), ('clf', Mult())], | ||
memory=WrongDummyMemory()) | ||
assert_raises(ValueError, pipe.fit, X) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use assert_raises_message or assert_raises_regex to check the message which is raised. |
||
|
||
|
||
def test_pipeline_raise_set_params_error(): | ||
# Test pipeline raises set params error message for nested models. | ||
pipe = Pipeline([('cls', LinearRegression())]) | ||
|
@@ -852,9 +863,8 @@ def test_pipeline_wrong_memory(): | |
memory = 1 | ||
cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], | ||
memory=memory) | ||
assert_raises_regex(ValueError, "'memory' should either be a string or a" | ||
" sklearn.externals.joblib.Memory instance, got", | ||
cached_pipe.fit, X, y) | ||
assert_raises_regex(ValueError, "'memory' should either be a string or" | ||
" a joblib.Memory instance", cached_pipe.fit, X, y) | ||
|
||
|
||
def test_pipeline_memory(): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
check_is_fitted, | ||
check_consistent_length, | ||
assert_all_finite, | ||
check_memory | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just as a note for your future PRs: |
||
) | ||
import sklearn | ||
|
||
|
@@ -539,3 +540,23 @@ def test_suppress_validation(): | |
assert_all_finite(X) | ||
sklearn.set_config(assume_finite=False) | ||
assert_raises(ValueError, assert_all_finite, X) | ||
|
||
|
||
class DummyMemory(object): | ||
def __init__(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need init as well There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. |
||
pass | ||
|
||
def cache(self, func): | ||
return func | ||
|
||
cachedir = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put this in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't want to require cachedir, I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It could be useful for later checking maybe. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure what you mean. An example? What I mean is that we might as well accept (and therefore test) objects that have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh ok. I only wanted to make test something like memory = check_memory(DummyMemory())
assert_equal(memory.cachedir, 'something') but it can be cachedir or any other attribute There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not just:
?? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes much better indeed |
||
|
||
|
||
class WrongDummyMemory(object): | ||
def __init__(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need init as well |
||
pass | ||
|
||
|
||
def test_check_memory(): | ||
memory = check_memory(DummyMemory()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we also need to test what happens when a string or None is passed in. |
||
assert_raises(ValueError, check_memory, WrongDummyMemory()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use assert_raises_regex to check the message raised |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
from ..exceptions import NonBLASDotWarning | ||
from ..exceptions import NotFittedError | ||
from ..exceptions import DataConversionWarning | ||
from ..externals.joblib import Memory | ||
|
||
|
||
FLOAT_DTYPES = (np.float64, np.float32, np.float16) | ||
|
@@ -155,6 +156,32 @@ def _shape_repr(shape): | |
return "(%s)" % joined | ||
|
||
|
||
def check_memory(memory): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to test this function in |
||
"""Check that the memory is an instance of joblib.Memory. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to come up with a better wording here and everywhere else in this PR. I would use "memory-like" (maybe Memory-like, i.e. with a capital M), similarly to array-like in numpy. The docstring would read something like """Check that ``memory`` is memory-like.
memory-like means that ``memory`` can be converted into a
sklearn.externals.joblib.Memory instance (typically a str denoting the
``cachedir``) or has the same interface.
Parameters
----------
memory : memory-like or None
Returns
-------
memory : object with the sklearn.externals.joblib.Memory interface
Raises
------
ValueError
if ``memory`` is not memory-like
""" And then use memory-like as the type info in all the docstrings, possibly with a rst link to check_memory for more details. |
||
|
||
Raises a ValueError if the passed object does not have a | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need a blank line between the short description and the long description. |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no blank line here, please |
||
cache attribute. | ||
|
||
Parameters | ||
---------- | ||
memory: input object. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need space before |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @lesteve should we add a description to follow numpydoc? (It's true that there is not much to say apart from "memory instance to be validated" and "Validated memory instance".) |
||
Returns | ||
------- | ||
memory: the input memory if it is valid. A valueError if invalid memory instance. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need space before |
||
""" | ||
|
||
if memory is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to add a docstring There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can actually tackle this case in the same way as a string: if memory is None or isinstance(memory, six.string_types):
    memory = Memory(cachedir=memory, verbose=0)
elif not hasattr(memory, 'cache'):
    # rest of the code is the same |
||
memory = Memory(cachedir=None, verbose=0) | ||
elif isinstance(memory, six.string_types): | ||
memory = Memory(cachedir=memory, verbose=0) | ||
elif not hasattr(memory, 'cache'): | ||
raise ValueError("'memory' should either be a string or" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't find the error message meaningful enough.
|
||
" a joblib.Memory instance") | ||
return memory | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that the raise statement should come here. |
||
|
||
|
||
def check_consistent_length(*arrays): | ||
"""Check that all arrays have consistent first dimensions. | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please put in alphabetical order.