From 0492836be3acb7b787aad13fd21c36eb4ea45e7b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 11 Dec 2019 11:22:18 -0800 Subject: [PATCH 01/34] support a scalar fit param --- sklearn/model_selection/_search.py | 10 ++++++++-- sklearn/model_selection/tests/test_search.py | 21 +++++++++++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e6a8493ef6250..2890a784ed5b3 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -649,8 +649,14 @@ def fit(self, X, y=None, groups=None, **fit_params): X, y, groups = indexable(X, y, groups) # make sure fit_params are sliceable - fit_params_values = indexable(*fit_params.values()) - fit_params = dict(zip(fit_params.keys(), fit_params_values)) + # TODO: remove in 0.24 + try: + fit_params_values = indexable(*fit_params.values()) + fit_params = dict(zip(fit_params.keys(), fit_params_values)) + except TypeError: + warnings.warn("Support for scaler fit params is deprecated. " + "Passing scalar values as a fit parameter will " + "raise an error in v0.24", FutureWarning) n_splits = cv.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 056927bee75d0..429215fe09df0 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -27,7 +27,7 @@ from scipy.stats import bernoulli, expon, uniform -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification @@ -1846,3 +1846,22 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): attr_message = "GridSearchCV not identical with precomputed metric" assert (preds_original == preds_precomputed).all(), attr_message + + +def test_scalar_fit_param(): + # test that a scalar fit param is supported with a warning. + # TODO: it should raise an error from v0.24. Issue #15805 + class TestEstimator(BaseEstimator, ClassifierMixin): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r): + assert r == 42 + + def predict(self, X): + return np.zeros(shape=(len(X))) + + cv = GridSearchCV(TestEstimator(), param_grid = {'a': [1, 2]}) + X, y = make_classification() + with pytest.warns(FutureWarning, match="Support for scaler fit params"): + cv.fit(X, y, r=42) From 1ae9d033d7f109fe67cb5e1fa8107de81470dd6f Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 11 Dec 2019 11:25:14 -0800 Subject: [PATCH 02/34] pep8 --- sklearn/model_selection/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 429215fe09df0..c34490a848ba7 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1861,7 +1861,7 @@ def fit(self, X, y, r): def predict(self, X): return np.zeros(shape=(len(X))) - cv = GridSearchCV(TestEstimator(), param_grid = {'a': [1, 2]}) + cv = GridSearchCV(TestEstimator(), param_grid={'a': [1, 2]}) X, y = make_classification() with pytest.warns(FutureWarning, match="Support for scaler fit params"): cv.fit(X, y, r=42) From 341f8e0b081000e3569087ce2192d62d981642bc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:05:10 +0100 Subject: [PATCH 03/34] TST add test for desired behavior --- sklearn/model_selection/tests/test_search.py | 43 +++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c34490a848ba7..cf766ed15327a 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -36,6 +36,7 @@ from sklearn.model_selection import fit_grid_point from sklearn.model_selection import cross_val_score +from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedShuffleSplit @@ -1849,8 +1850,9 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): def test_scalar_fit_param(): - # test that a scalar fit param is supported with a warning. - # TODO: it should raise an error from v0.24. Issue #15805 + # check general support for scalar in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 class TestEstimator(BaseEstimator, ClassifierMixin): def __init__(self, a=None): self.a = a @@ -1861,7 +1863,36 @@ def fit(self, X, y, r): def predict(self, X): return np.zeros(shape=(len(X))) - cv = GridSearchCV(TestEstimator(), param_grid={'a': [1, 2]}) - X, y = make_classification() - with pytest.warns(FutureWarning, match="Support for scaler fit params"): - cv.fit(X, y, r=42) + model = GridSearchCV(TestEstimator(), param_grid={'a': [1, 2]}) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + + +def _custom_lgbm_metric(y_test, y_pred): + # y_pred are probablities which need to be thresholded + y_pred = (y_pred > 0.5).astype(int) + acc = accuracy_score(y_test, y_pred) + # required output of format: (eval_name, eval_result, is_higher_better) + return ('accuracy', acc, True) + + +@pytest.mark.parametrize("metric", ['auc', _custom_lgbm_metric]) +def test_scalar_fit_param_lgbm(metric): + # check support for scalar in fit_params in LightGBM + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + lgbm = pytest.importorskip("lightgbm") + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + model = GridSearchCV( + lgbm.LGBMClassifier(n_estimators=5), + param_grid={'learning_rate': [0.1, 0.01]} + ) + fit_params = { + 'eval_set': [(X_valid, y_valid)], + 'eval_metric': metric, + 'early_stopping_rounds': 5, + 'verbose': False + } + model.fit(X_train, y_train, **fit_params) From 53d7a91c05f396e8d8ab5c867fc58d0cb72ceb51 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:06:03 +0100 Subject: [PATCH 04/34] FIX introduce _check_fit_params to validate parameters --- sklearn/model_selection/_search.py | 12 +----- sklearn/utils/validation.py | 63 ++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 2890a784ed5b3..31a6b607d3591 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -33,7 +33,7 @@ from ..utils import check_random_state from ..utils.fixes import MaskedArray from ..utils.random import sample_without_replacement -from ..utils.validation import indexable, check_is_fitted +from ..utils.validation import indexable, check_is_fitted, _check_fit_params from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -648,15 +648,7 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = 'score' X, y, groups = indexable(X, y, groups) - # make sure fit_params are sliceable - # TODO: remove in 0.24 - try: - fit_params_values = indexable(*fit_params.values()) - fit_params = dict(zip(fit_params.keys(), fit_params_values)) - except TypeError: - warnings.warn("Support for scaler fit params is deprecated. " - "Passing scalar values as a fit parameter will " - "raise an error in v0.24", FutureWarning) + fit_params = _check_fit_params(fit_params) n_splits = cv.get_n_splits(X, y, groups) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5502fdd534965..655a6b819952f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -16,7 +16,8 @@ import numpy as np import scipy.sparse as sp from distutils.version import LooseVersion -from inspect import signature, isclass, Parameter +from inspect import signature, Parameter +from inspect import isclass, isfunction, ismethod, ismodule from numpy.core.numeric import ComplexWarning import joblib @@ -212,6 +213,26 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) +def _convert_iterable(iterable): + """Helper convert iterable to arrays of sparse matrices. + + Convert sparse matrices to csr and non-interable objects to arrays. + Let passes `None`. + + Parameters + ---------- + iterable : {list, dataframe, array, sparse} or None + Object to be converted to a sliceable iterable. + """ + if sp.issparse(iterable): + return iterable.tocsr() + elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"): + return iterable + elif iterable is None: + return iterable + return np.array(iterable) + + def indexable(*iterables): """Make arrays indexable for cross-validation. @@ -224,16 +245,7 @@ def indexable(*iterables): *iterables : lists, dataframes, arrays, sparse matrices List of objects to ensure sliceability. """ - result = [] - for X in iterables: - if sp.issparse(X): - result.append(X.tocsr()) - elif hasattr(X, "__getitem__") or hasattr(X, "iloc"): - result.append(X) - elif X is None: - result.append(X) - else: - result.append(np.array(X)) + result = [_convert_iterable(X) for X in iterables] check_consistent_length(*result) return result @@ -1257,3 +1269,32 @@ def inner_f(*args, **kwargs): kwargs.update({k: arg for k, arg in zip(all_args, args)}) return f(**kwargs) return inner_f + + +def _check_fit_params(fit_params): + """Check and validate the parameters passed during `fit`. + + Parameters + ---------- + fit_params : dict + Dictionary containing the parameters passed at fit. + + Returns + ------- + fit_params_validated : dict + Validated parameters. We ensure that the values are iterable. + """ + fit_params_validated = {} + for param_key, param_value in fit_params.items(): + is_scalar = [ + check(param_value) + for check in [np.isscalar, ismodule, isclass, ismethod, isfunction] + ] + if any(is_scalar): + # keep scalar as is for backward-compatibility + # https://github.com/scikit-learn/scikit-learn/issues/15805 + fit_params_validated[param_key] = param_value + else: + # ensure iterable will be sliceable + fit_params_validated[param_key] = _convert_iterable(param_value) + return fit_params_validated \ No newline at end of file From ef64f0bb0afe12e32b299f0c664d47392e58e476 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:06:12 +0100 Subject: [PATCH 05/34] DOC update whats new --- doc/whats_new/v0.22.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index af08b832e9f6f..d7572334b2113 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -21,6 +21,15 @@ Changelog - |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with boolean columns to floats. :pr:`15797` by `Thomas Fan`_. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` will accept scalar provided in + `fit_params`. Change in 0.22 was breaking backward compatibility. + :pr:`15863` by :user:`Adrin Jalali ` and + :user:`Guillaume Lemaitre `. + .. _changes_0_22: Version 0.22.0 From 9567b4431d5427986515314de43ea7fc7f04b6b9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:14:32 +0100 Subject: [PATCH 06/34] TST tests both grid-search and randomize-search --- sklearn/model_selection/tests/test_search.py | 21 ++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index cf766ed15327a..6b06c9f4c7fa3 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1849,7 +1849,12 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): assert (preds_original == preds_precomputed).all(), attr_message -def test_scalar_fit_param(): +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'a': [0.1, 0.01]}), + (RandomizedSearchCV, {'a': np.random.randint(1, 3, size=2)})] +) +def test_scalar_fit_param(SearchCV, param_search): # check general support for scalar in fit_params # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 @@ -1863,7 +1868,7 @@ def fit(self, X, y, r): def predict(self, X): return np.zeros(shape=(len(X))) - model = GridSearchCV(TestEstimator(), param_grid={'a': [1, 2]}) + model = SearchCV(TestEstimator(), param_search) X, y = make_classification(random_state=42) model.fit(X, y, r=42) @@ -1877,7 +1882,12 @@ def _custom_lgbm_metric(y_test, y_pred): @pytest.mark.parametrize("metric", ['auc', _custom_lgbm_metric]) -def test_scalar_fit_param_lgbm(metric): +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {'learning_rate': [0.1, 0.01]}), + (RandomizedSearchCV, {'learning_rate': uniform(0.01, 0.1)})] +) +def test_scalar_fit_param_lgbm(metric, SearchCV, param_search): # check support for scalar in fit_params in LightGBM # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 @@ -1885,9 +1895,8 @@ def test_scalar_fit_param_lgbm(metric): X_train, X_valid, y_train, y_valid = train_test_split( *make_classification(random_state=42), random_state=42 ) - model = GridSearchCV( - lgbm.LGBMClassifier(n_estimators=5), - param_grid={'learning_rate': [0.1, 0.01]} + model = SearchCV( + lgbm.LGBMClassifier(n_estimators=5), param_search ) fit_params = { 'eval_set': [(X_valid, y_valid)], From f340ab6f13a7fdd2a20129ad806f608c407448c6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:15:08 +0100 Subject: [PATCH 07/34] PEP8 --- sklearn/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1213790c6eba7..e0205b46060c3 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1299,4 +1299,5 @@ def _check_fit_params(fit_params): else: # ensure iterable will be sliceable fit_params_validated[param_key] = _convert_iterable(param_value) - return fit_params_validated \ No newline at end of file + + return fit_params_validated From ffb7ce569a641f878c7834d7d91de36587943f46 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:35:46 +0100 Subject: [PATCH 08/34] DOC revert unecessary change --- doc/whats_new/v0.22.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 3efc455be0ca4..21ff085d33346 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -63,6 +63,13 @@ Changelog :pr:`15863` by :user:`Adrin Jalali ` and :user:`Guillaume Lemaitre `. +:mod:`sklearn.inspection` +......................... + +- |Fix| :func:`inspection.plot_partial_dependence` and + :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks + the number of axes passed in. :pr:`15760` by `Thomas Fan`_. + :mod:`sklearn.utils` .................... From 2b5b1db0205291ac69abe3d6c90e3057b83eabcd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:35:58 +0100 Subject: [PATCH 09/34] TST add test for _check_fit_params --- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/utils/tests/test_validation.py | 28 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6b06c9f4c7fa3..c082e4ef6096e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1862,7 +1862,7 @@ class TestEstimator(BaseEstimator, ClassifierMixin): def __init__(self, a=None): self.a = a - def fit(self, X, y, r): + def fit(self, X, y, r=None): assert r == 42 def predict(self, X): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f121f11658051..4cc1db29ce012 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -4,6 +4,7 @@ import os from tempfile import NamedTemporaryFile +from inspect import isclass, isfunction from itertools import product import pytest @@ -28,6 +29,7 @@ from sklearn.utils.estimator_checks import _NotAnArray from sklearn.random_projection import _sparse_random_matrix from sklearn.linear_model import ARDRegression +from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR @@ -46,6 +48,7 @@ _check_sample_weight, _allclose_dense_sparse, FLOAT_DTYPES) +from sklearn.utils.validation import _check_fit_params import sklearn @@ -1053,3 +1056,28 @@ def __init__(self, a=1, b=1, *, c=1, d=1): with pytest.warns(FutureWarning, match=r"Pass c=3, d=4 as keyword args"): A2(1, 2, 3, 4) + + +def test_check_fit_params(): + fit_params = { + 'list': [1, 2, 3, 4], + 'tuple': (1, 2, 3, 4), + 'array': np.array([1, 2, 3, 4]), + 'sparse': sp.csc_matrix([1, 2, 3, 4]), + 'scalar-func': accuracy_score, + 'scalar-class': KNeighborsClassifier, + 'scalar-int': 1, + 'scalar-str': 'xxx', + 'None': None, + } + result = _check_fit_params(fit_params) + + assert isinstance(fit_params['list'], list) + assert isinstance(fit_params['tuple'], tuple) + assert isinstance(fit_params['array'], np.ndarray) + assert isinstance(fit_params['sparse'], sp.csc_matrix) + assert isfunction(fit_params['scalar-func']) + assert isclass(fit_params['scalar-class']) + assert fit_params['scalar-int'] == 1 + assert fit_params['scalar-str'] == 'xxx' + assert fit_params['None'] is None From ffbac6f1cc0553589aac500b2a2c13c6d18ad96f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:36:57 +0100 Subject: [PATCH 10/34] olivier comments --- doc/whats_new/v0.22.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 21ff085d33346..75057c244318b 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -58,7 +58,7 @@ Changelog .............................. - |Fix| :class:`model_selection.GridSearchCV` and - :class:`model_selection.RandomizedSearchCV` will accept scalar provided in + :class:`model_selection.RandomizedSearchCV` accept scalar values provided in `fit_params`. Change in 0.22 was breaking backward compatibility. :pr:`15863` by :user:`Adrin Jalali ` and :user:`Guillaume Lemaitre `. From c0216dc1e849b3496741a72b55cf684594b7669f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:38:26 +0100 Subject: [PATCH 11/34] TST fixes --- sklearn/utils/tests/test_validation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4cc1db29ce012..706e642e29c05 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1072,12 +1072,12 @@ def test_check_fit_params(): } result = _check_fit_params(fit_params) - assert isinstance(fit_params['list'], list) - assert isinstance(fit_params['tuple'], tuple) - assert isinstance(fit_params['array'], np.ndarray) - assert isinstance(fit_params['sparse'], sp.csc_matrix) - assert isfunction(fit_params['scalar-func']) - assert isclass(fit_params['scalar-class']) - assert fit_params['scalar-int'] == 1 - assert fit_params['scalar-str'] == 'xxx' - assert fit_params['None'] is None + assert isinstance(result['list'], list) + assert isinstance(result['tuple'], tuple) + assert isinstance(result['array'], np.ndarray) + assert isinstance(result['sparse'], sp.csr_matrix) + assert isfunction(result['scalar-func']) + assert isclass(result['scalar-class']) + assert result['scalar-int'] == 1 + assert result['scalar-str'] == 'xxx' + assert result['None'] is None From 46937298c89d5741f5b946b3b5d270fcb9311e94 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:40:44 +0100 Subject: [PATCH 12/34] DOC whats new --- doc/whats_new/v0.22.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 75057c244318b..ced7e9936e89c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -63,6 +63,12 @@ Changelog :pr:`15863` by :user:`Adrin Jalali ` and :user:`Guillaume Lemaitre `. +:mod:`sklearn.utils` +.................... + +- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with + boolean columns to floats. :pr:`15797` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -70,12 +76,6 @@ Changelog :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks the number of axes passed in. :pr:`15760` by `Thomas Fan`_. -:mod:`sklearn.utils` -.................... - -- |Fix| :func:`utils.check_array` now correctly converts pandas DataFrame with - boolean columns to floats. :pr:`15797` by `Thomas Fan`_. - .. _changes_0_22: Version 0.22.0 From d7a2c198fc15e8c178e3a714a8a7e946b51a2008 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 14:41:07 +0100 Subject: [PATCH 13/34] DOC whats new --- doc/whats_new/v0.22.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ced7e9936e89c..44c03adc8d649 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -35,10 +35,6 @@ Changelog Follow-up of :pr:`15898` by :user:`Shivam Gargsya `. :pr:`15933` by :user:`Guillaume Lemaitre ` and `Olivier Grisel`_. -- |Fix| :func:`inspection.plot_partial_dependence` and - :meth:`inspection.PartialDependenceDisplay.plot` now consistently checks - the number of axes passed in. :pr:`15760` by `Thomas Fan`_. - :mod:`sklearn.metrics` ...................... From 52ecee46fc83cef35ae5fcc3e10aec075c720e0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 15:33:08 +0100 Subject: [PATCH 14/34] TST revert type of error --- sklearn/model_selection/tests/test_search.py | 32 ++++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c082e4ef6096e..011cb227028f9 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -219,33 +219,25 @@ def test_grid_search_pipeline_steps(): assert not hasattr(param_grid['regressor'][1], 'coef_') -def check_hyperparameter_searcher_with_fit_params(klass, **klass_kwargs): +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): X = np.arange(100).reshape(10, 10) y = np.array([0] * 5 + [1] * 5) clf = CheckingClassifier(expected_fit_params=['spam', 'eggs']) - searcher = klass(clf, {'foo_param': [1, 2, 3]}, cv=2, **klass_kwargs) + searcher = SearchCV( + clf, {'foo_param': [1, 2, 3]}, cv=2, error_score="raise" + ) # The CheckingClassifier generates an assertion error if # a parameter is missing or has length != len(X). - assert_raise_message(AssertionError, - "Expected fit parameter(s) ['eggs'] not seen.", - searcher.fit, X, y, spam=np.ones(10)) - assert_raise_message( - ValueError, - "Found input variables with inconsistent numbers of samples: [", - searcher.fit, X, y, spam=np.ones(1), - eggs=np.zeros(10)) - searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) - -def test_grid_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(GridSearchCV, - error_score='raise') - - -def test_random_search_with_fit_params(): - check_hyperparameter_searcher_with_fit_params(RandomizedSearchCV, n_iter=1, - error_score='raise') + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) @ignore_warnings From be69ce046c3e4895b74e0b9ac73859abf316151c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 15:38:51 +0100 Subject: [PATCH 15/34] add olivier suggestions --- sklearn/utils/validation.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e0205b46060c3..a5d1528574803 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -213,16 +213,16 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) -def _convert_iterable(iterable): - """Helper convert iterable to arrays of sparse matrices. +def _make_indexable(iterable): + """Ensure iterable supports indexing or convert to an indexable variant. - Convert sparse matrices to csr and non-interable objects to arrays. - Let passes `None`. + Convert sparse matrices to csr and other non-indexable iterable to arrays. + Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged. Parameters ---------- iterable : {list, dataframe, array, sparse} or None - Object to be converted to a sliceable iterable. + Object to be converted to an indexable iterable. """ if sp.issparse(iterable): return iterable.tocsr() @@ -245,7 +245,7 @@ def indexable(*iterables): *iterables : lists, dataframes, arrays, sparse matrices List of objects to ensure sliceability. """ - result = [_convert_iterable(X) for X in iterables] + result = [_make_indexable(X) for X in iterables] check_consistent_length(*result) return result @@ -1284,7 +1284,7 @@ def _check_fit_params(fit_params): Returns ------- fit_params_validated : dict - Validated parameters. We ensure that the values are iterable. + Validated parameters. We ensure that the values support indexing. """ fit_params_validated = {} for param_key, param_value in fit_params.items(): @@ -1297,7 +1297,8 @@ def _check_fit_params(fit_params): # https://github.com/scikit-learn/scikit-learn/issues/15805 fit_params_validated[param_key] = param_value else: - # ensure iterable will be sliceable - fit_params_validated[param_key] = _convert_iterable(param_value) + # Any other fit_params should support indexing + # (e.g. for cross-validation). + fit_params_validated[param_key] = _make_indexable(param_value) return fit_params_validated From 9b71a9c5e05ee39e1c5161650cda45435908278b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 16:54:38 +0100 Subject: [PATCH 16/34] address olivier comments --- sklearn/model_selection/tests/test_search.py | 10 ++++++++-- sklearn/utils/validation.py | 11 +++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 011cb227028f9..1970318808424 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1847,7 +1847,7 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): (RandomizedSearchCV, {'a': np.random.randint(1, 3, size=2)})] ) def test_scalar_fit_param(SearchCV, param_search): - # check general support for scalar in fit_params + # unofficially sanctioned tolerance for scalar values in fit_params # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 class TestEstimator(BaseEstimator, ClassifierMixin): @@ -1880,7 +1880,7 @@ def _custom_lgbm_metric(y_test, y_pred): (RandomizedSearchCV, {'learning_rate': uniform(0.01, 0.1)})] ) def test_scalar_fit_param_lgbm(metric, SearchCV, param_search): - # check support for scalar in fit_params in LightGBM + # check support for scalar values in fit_params in LightGBM # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 lgbm = pytest.importorskip("lightgbm") @@ -1890,6 +1890,12 @@ def test_scalar_fit_param_lgbm(metric, SearchCV, param_search): model = SearchCV( lgbm.LGBMClassifier(n_estimators=5), param_search ) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. fit_params = { 'eval_set': [(X_valid, y_valid)], 'eval_metric': metric, diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a5d1528574803..abde1b66f01bc 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -16,8 +16,7 @@ import numpy as np import scipy.sparse as sp from distutils.version import LooseVersion -from inspect import signature, Parameter -from inspect import isclass, isfunction, ismethod, ismodule +from inspect import signature, isclass, Parameter from numpy.core.numeric import ComplexWarning import joblib @@ -1288,12 +1287,8 @@ def _check_fit_params(fit_params): """ fit_params_validated = {} for param_key, param_value in fit_params.items(): - is_scalar = [ - check(param_value) - for check in [np.isscalar, ismodule, isclass, ismethod, isfunction] - ] - if any(is_scalar): - # keep scalar as is for backward-compatibility + if not _is_arraylike(param_value): + # Non-indexable pass-through (for now for backward-compatibility). # https://github.com/scikit-learn/scikit-learn/issues/15805 fit_params_validated[param_key] = param_value else: From 46c4b9f162842856e029f8489977b85b004a660b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 17:12:23 +0100 Subject: [PATCH 17/34] address thomas comments --- sklearn/model_selection/tests/test_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 1970318808424..8bd79ace67e52 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1855,7 +1855,7 @@ def __init__(self, a=None): self.a = a def fit(self, X, y, r=None): - assert r == 42 + self.r_ = r def predict(self, X): return np.zeros(shape=(len(X))) @@ -1863,7 +1863,7 @@ def predict(self, X): model = SearchCV(TestEstimator(), param_search) X, y = make_classification(random_state=42) model.fit(X, y, r=42) - + assert model.best_estimator_.r_ == 42 def _custom_lgbm_metric(y_test, y_pred): # y_pred are probablities which need to be thresholded From 71fab3f90a2f3874d59aa80b40c098d561953031 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 17:13:52 +0100 Subject: [PATCH 18/34] PEP8 --- sklearn/model_selection/tests/test_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 8bd79ace67e52..c9fd2035da328 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1865,6 +1865,7 @@ def predict(self, X): model.fit(X, y, r=42) assert model.best_estimator_.r_ == 42 + def _custom_lgbm_metric(y_test, y_pred): # y_pred are probablities which need to be thresholded y_pred = (y_pred > 0.5).astype(int) From 9a85162f712b85815b9cc86ef3dbc6f626c8591d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 17:55:07 +0100 Subject: [PATCH 19/34] comments olivier --- sklearn/model_selection/_search.py | 2 +- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/utils/validation.py | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 31a6b607d3591..a70bdd7a2f9dc 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -648,7 +648,7 @@ def fit(self, X, y=None, groups=None, **fit_params): refit_metric = 'score' X, y, groups = indexable(X, y, groups) - fit_params = _check_fit_params(fit_params) + fit_params = _check_fit_params(X, fit_params) n_splits = cv.get_n_splits(X, y, groups) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c9fd2035da328..44b44ba8788ab 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1880,7 +1880,7 @@ def _custom_lgbm_metric(y_test, y_pred): [(GridSearchCV, {'learning_rate': [0.1, 0.01]}), (RandomizedSearchCV, {'learning_rate': uniform(0.01, 0.1)})] ) -def test_scalar_fit_param_lgbm(metric, SearchCV, param_search): +def test_scalar_fit_param_lightgbm(metric, SearchCV, param_search): # check support for scalar values in fit_params in LightGBM # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index abde1b66f01bc..d15a7d7d3da12 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1272,11 +1272,14 @@ def inner_f(*args, **kwargs): return inner_f -def _check_fit_params(fit_params): +def _check_fit_params(X, fit_params): """Check and validate the parameters passed during `fit`. Parameters ---------- + X : array-like of shape (n_samples, n_features) + Data array. + fit_params : dict Dictionary containing the parameters passed at fit. @@ -1287,7 +1290,8 @@ def _check_fit_params(fit_params): """ fit_params_validated = {} for param_key, param_value in fit_params.items(): - if not _is_arraylike(param_value): + if (not _is_arraylike(param_value) or + _num_samples(param_value) != _num_samples(X)): # Non-indexable pass-through (for now for backward-compatibility). # https://github.com/scikit-learn/scikit-learn/issues/15805 fit_params_validated[param_key] = param_value From f41c808f4f3c5603d005c8d8e2da1c0055b63827 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 18:47:58 +0100 Subject: [PATCH 20/34] TST fix test by passing X --- sklearn/utils/tests/test_validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 706e642e29c05..445390a6e0a32 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1059,6 +1059,7 @@ def __init__(self, a=1, b=1, *, c=1, d=1): def test_check_fit_params(): + X = np.random.randn(4, 2) fit_params = { 'list': [1, 2, 3, 4], 'tuple': (1, 2, 3, 4), @@ -1070,12 +1071,12 @@ def test_check_fit_params(): 'scalar-str': 'xxx', 'None': None, } - result = _check_fit_params(fit_params) + result = _check_fit_params(X, fit_params) assert isinstance(result['list'], list) assert isinstance(result['tuple'], tuple) assert isinstance(result['array'], np.ndarray) - assert isinstance(result['sparse'], sp.csr_matrix) + assert result['sparse'].format == 'csc' assert isfunction(result['scalar-func']) assert isclass(result['scalar-class']) assert result['scalar-int'] == 1 From c989c7075cd1d43d6fe35071fd6603ed32607744 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 20:10:45 +0100 Subject: [PATCH 21/34] avoid to call twice tocsr --- sklearn/model_selection/_validation.py | 2 -- sklearn/utils/tests/test_validation.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 67a30c6416031..f778f0ee4583e 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -942,8 +942,6 @@ def _index_param_value(X, v, indices): if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): # pass through: skip indexing return v - if sp.issparse(v): - v = v.tocsr() return _safe_indexing(v, indices) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 445390a6e0a32..5de4b0cbf4e67 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1064,7 +1064,7 @@ def test_check_fit_params(): 'list': [1, 2, 3, 4], 'tuple': (1, 2, 3, 4), 'array': np.array([1, 2, 3, 4]), - 'sparse': sp.csc_matrix([1, 2, 3, 4]), + 'sparse': sp.csc_matrix([1, 2, 3, 4]).T, 'scalar-func': accuracy_score, 'scalar-class': KNeighborsClassifier, 'scalar-int': 1, @@ -1076,7 +1076,7 @@ def test_check_fit_params(): assert isinstance(result['list'], list) assert isinstance(result['tuple'], tuple) assert isinstance(result['array'], np.ndarray) - assert result['sparse'].format == 'csc' + assert result['sparse'].format == 'csr' assert isfunction(result['scalar-func']) assert isclass(result['scalar-class']) assert result['scalar-int'] == 1 From 570dfa895779fe1dbaad79d85c7da7112ac57da5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 20:13:00 +0100 Subject: [PATCH 22/34] add case column/row sparse in check_fit_param --- sklearn/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5de4b0cbf4e67..054eb0e79c31c 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1064,7 +1064,8 @@ def test_check_fit_params(): 'list': [1, 2, 3, 4], 'tuple': (1, 2, 3, 4), 'array': np.array([1, 2, 3, 4]), - 'sparse': sp.csc_matrix([1, 2, 3, 4]).T, + 'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T, + 'sparse-row': sp.csc_matrix([1, 2, 3, 4]), 'scalar-func': accuracy_score, 'scalar-class': KNeighborsClassifier, 'scalar-int': 1, @@ -1076,7 +1077,8 @@ def test_check_fit_params(): assert isinstance(result['list'], list) assert isinstance(result['tuple'], tuple) assert isinstance(result['array'], np.ndarray) - assert result['sparse'].format == 'csr' + assert result['sparse-col'].format == 'csr' + assert result['sparse-row'].format == 'csc' assert isfunction(result['scalar-func']) assert isclass(result['scalar-class']) assert result['scalar-int'] == 1 From 444c9472d693a9ebfe4da50ceaa60f2a3fbb6f1a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 22:34:50 +0100 Subject: [PATCH 23/34] provide optional indices --- sklearn/model_selection/_validation.py | 15 +++------------ sklearn/utils/validation.py | 10 +++++++++- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index f778f0ee4583e..ea009a8d636fb 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -23,6 +23,7 @@ from ..base import is_classifier, clone from ..utils import (indexable, check_random_state, _safe_indexing, _message_with_time) +from ..utils.validation import _check_fit_params from ..utils.validation import _is_arraylike, _num_samples from ..utils.metaestimators import _safe_split from ..metrics import check_scoring @@ -489,8 +490,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, fit_params, train) train_scores = {} if parameters is not None: @@ -830,8 +830,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = {k: _index_param_value(X, v, train) - for k, v in fit_params.items()} + fit_params = _check_fit_params(X, _check_fit_params, train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) @@ -937,14 +936,6 @@ def _check_is_permutation(indices, n_samples): return True -def _index_param_value(X, v, indices): - """Private helper function for parameter value indexing.""" - if not _is_arraylike(v) or _num_samples(v) != _num_samples(X): - # pass through: skip indexing - return v - return _safe_indexing(v, indices) - - def permutation_test_score(estimator, X, y, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d15a7d7d3da12..693a74f984aea 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1272,7 +1272,7 @@ def inner_f(*args, **kwargs): return inner_f -def _check_fit_params(X, fit_params): +def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. Parameters @@ -1283,11 +1283,15 @@ def _check_fit_params(X, fit_params): fit_params : dict Dictionary containing the parameters passed at fit. + indices : array-like of shape (n_samples,), default=None + Indices to be selected if the parameter has the same size than `X`. + Returns ------- fit_params_validated : dict Validated parameters. We ensure that the values support indexing. """ + from . import _safe_indexing fit_params_validated = {} for param_key, param_value in fit_params.items(): if (not _is_arraylike(param_value) or @@ -1299,5 +1303,9 @@ def _check_fit_params(X, fit_params): # Any other fit_params should support indexing # (e.g. for cross-validation). fit_params_validated[param_key] = _make_indexable(param_value) + if indices is not None: + fit_params_validated[param_key] = _safe_indexing( + fit_params_validated[param_key], indices + ) return fit_params_validated From 9f47b585cf619b6073514d5548c9d57d1d755746 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 22:56:25 +0100 Subject: [PATCH 24/34] TST check content when indexing params --- sklearn/utils/tests/test_validation.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 054eb0e79c31c..5df2047df9ceb 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -4,7 +4,6 @@ import os from tempfile import NamedTemporaryFile -from inspect import isclass, isfunction from itertools import product import pytest @@ -34,6 +33,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.datasets import make_blobs +from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, check_is_fitted, @@ -1058,29 +1058,37 @@ def __init__(self, a=1, b=1, *, c=1, d=1): A2(1, 2, 3, 4) -def test_check_fit_params(): +@pytest.mark.parametrize("indices", [None, [1, 3]]) +def test_check_fit_params(indices): X = np.random.randn(4, 2) fit_params = { 'list': [1, 2, 3, 4], - 'tuple': (1, 2, 3, 4), 'array': np.array([1, 2, 3, 4]), 'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T, 'sparse-row': sp.csc_matrix([1, 2, 3, 4]), - 'scalar-func': accuracy_score, - 'scalar-class': KNeighborsClassifier, 'scalar-int': 1, 'scalar-str': 'xxx', 'None': None, } - result = _check_fit_params(X, fit_params) + result = _check_fit_params(X, fit_params, indices) + indices_ = indices if indices is not None else list(range(X.shape[0])) assert isinstance(result['list'], list) - assert isinstance(result['tuple'], tuple) assert isinstance(result['array'], np.ndarray) assert result['sparse-col'].format == 'csr' assert result['sparse-row'].format == 'csc' - assert isfunction(result['scalar-func']) - assert isclass(result['scalar-class']) assert result['scalar-int'] == 1 assert result['scalar-str'] == 'xxx' assert result['None'] is None + + assert result['list'] == _safe_indexing(fit_params['list'], indices_) + assert_array_equal( + result['array'], _safe_indexing(fit_params['array'], indices_) + ) + assert_allclose_dense_sparse( + result['sparse-col'], + _safe_indexing(fit_params['sparse-col'], indices_) + ) + assert_allclose_dense_sparse( + result['sparse-row'], fit_params['sparse-row'] + ) From 75bd0a949d2015ab0fe4010252ad49a4c9070878 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 23:17:47 +0100 Subject: [PATCH 25/34] PEP8 --- sklearn/utils/tests/test_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5df2047df9ceb..adb44f2571863 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -28,7 +28,6 @@ from sklearn.utils.estimator_checks import _NotAnArray from sklearn.random_projection import _sparse_random_matrix from sklearn.linear_model import ARDRegression -from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR From c24f39d4e82ca1f172328a522bfcb100831da35a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 23:31:19 +0100 Subject: [PATCH 26/34] TST update tests to check identity --- sklearn/utils/tests/test_validation.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index adb44f2571863..a3bd7b566fbbf 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1072,13 +1072,8 @@ def test_check_fit_params(indices): result = _check_fit_params(X, fit_params, indices) indices_ = indices if indices is not None else list(range(X.shape[0])) - assert isinstance(result['list'], list) - assert isinstance(result['array'], np.ndarray) - assert result['sparse-col'].format == 'csr' - assert result['sparse-row'].format == 'csc' - assert result['scalar-int'] == 1 - assert result['scalar-str'] == 'xxx' - assert result['None'] is None + for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']: + assert result[key] is fit_params[key] assert result['list'] == _safe_indexing(fit_params['list'], indices_) assert_array_equal( @@ -1088,6 +1083,3 @@ def test_check_fit_params(indices): result['sparse-col'], _safe_indexing(fit_params['sparse-col'], indices_) ) - assert_allclose_dense_sparse( - result['sparse-row'], fit_params['sparse-row'] - ) From 63679fdc9f35171bfd34147c40f80b67038a6cf2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Dec 2019 23:52:13 +0100 Subject: [PATCH 27/34] stupid fix --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ea009a8d636fb..1e48a5a610e3c 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -830,7 +830,7 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, """ # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} - fit_params = _check_fit_params(X, _check_fit_params, train) + fit_params = _check_fit_params(X, fit_params, train) X_train, y_train = _safe_split(estimator, X, y, train) X_test, _ = _safe_split(estimator, X, y, test, train) From 849615bec7784ba1567aadbb2c7de607a3601172 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 09:15:07 +0100 Subject: [PATCH 28/34] use a distribution in RandomizedSearchCV --- sklearn/model_selection/tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 44b44ba8788ab..6231652ffe446 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1844,7 +1844,7 @@ def test_search_cv__pairwise_property_equivalence_of_precomputed(): @pytest.mark.parametrize( "SearchCV, param_search", [(GridSearchCV, {'a': [0.1, 0.01]}), - (RandomizedSearchCV, {'a': np.random.randint(1, 3, size=2)})] + (RandomizedSearchCV, {'a': uniform(1, 3)})] ) def test_scalar_fit_param(SearchCV, param_search): # unofficially sanctioned tolerance for scalar values in fit_params From 7837cdf751036ddc1e8400aff73760e12d01c3bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 10:00:03 +0100 Subject: [PATCH 29/34] MNT add lightgbm to one of the CI build --- azure-pipelines.yml | 6 +++--- build_tools/azure/install.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e2ff71802ce72..bd44777e11186 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -71,11 +71,11 @@ jobs: JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # Linux environment to test the latest available dependencies and MKL. - # It runs tests requiring pandas and PyAMG. + # It runs tests requiring lightgbm, pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # FIXME: pinned until SciPy wheels are available for Python 3.8 - PYTHON_VERSION: '3.8' + # Pin the python version to 3.7 since lightgbm is not available for 3.8 + PYTHON_VERSION: '3.7' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index aa2707bb03837..2dac02165d52f 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -91,7 +91,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - python -m pip install pandas matplotlib pyamg + python -m pip install pandas matplotlib pyamg lightgbm fi if [[ "$COVERAGE" == "true" ]]; then From b98e194280dc1e19d50305499a3de9d1c476101f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 10:24:13 +0100 Subject: [PATCH 30/34] move to another build --- azure-pipelines.yml | 6 +++--- build_tools/azure/install.sh | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bd44777e11186..e3052cd9ab3d6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -36,6 +36,7 @@ jobs: NUMPY_VERSION: '*' SCIPY_VERSION: '*' CYTHON_VERSION: '*' + LIGHTGBM_VERSION: '*' PILLOW_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' @@ -71,11 +72,10 @@ jobs: JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # Linux environment to test the latest available dependencies and MKL. - # It runs tests requiring lightgbm, pandas and PyAMG. + # It runs tests requiring pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' - # Pin the python version to 3.7 since lightgbm is not available for 3.8 - PYTHON_VERSION: '3.7' + PYTHON_VERSION: '3.8' PYTEST_VERSION: '4.6.2' COVERAGE: 'true' CHECK_PYTEST_SOFT_DEPENDENCY: 'true' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 2dac02165d52f..245662cd0603a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -41,6 +41,10 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL matplotlib=$MATPLOTLIB_VERSION" fi + if [[ -n "$LIGHTGBM_VERSION" ]]; then + TO_INSTALL="$TO_INSTALL lightgbm=$LIGHTGBM_VERSION" + fi + if [[ "$UNAMESTR" == "Darwin" ]]; then if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then # on macOS, install an OpenMP-enabled clang/llvm from conda-forge. @@ -91,7 +95,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - python -m pip install pandas matplotlib pyamg lightgbm + python -m pip install pandas matplotlib pyamg fi if [[ "$COVERAGE" == "true" ]]; then From 3127d2bf20f4a31bf9fd17b0f2bc737cd7826f52 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 11:32:20 +0100 Subject: [PATCH 31/34] do not install dependencies lightgbm --- build_tools/azure/install.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 245662cd0603a..d4c3f5d8fbd44 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -41,10 +41,6 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL matplotlib=$MATPLOTLIB_VERSION" fi - if [[ -n "$LIGHTGBM_VERSION" ]]; then - TO_INSTALL="$TO_INSTALL lightgbm=$LIGHTGBM_VERSION" - fi - if [[ "$UNAMESTR" == "Darwin" ]]; then if [[ "$SKLEARN_TEST_NO_OPENMP" != "true" ]]; then # on macOS, install an OpenMP-enabled clang/llvm from conda-forge. @@ -96,6 +92,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist python -m pip install pandas matplotlib pyamg + python -m pip install lightgbm --no-deps fi if [[ "$COVERAGE" == "true" ]]; then From a096a7d600d2bbd97f311c63b862d235aece92bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 24 Dec 2019 11:55:57 +0100 Subject: [PATCH 32/34] MNT comments on the CI setup --- azure-pipelines.yml | 3 +-- build_tools/azure/install.sh | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e3052cd9ab3d6..b029a2fd18574 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -36,7 +36,6 @@ jobs: NUMPY_VERSION: '*' SCIPY_VERSION: '*' CYTHON_VERSION: '*' - LIGHTGBM_VERSION: '*' PILLOW_VERSION: '*' PYTEST_VERSION: '*' JOBLIB_VERSION: '*' @@ -72,7 +71,7 @@ jobs: JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' # Linux environment to test the latest available dependencies and MKL. - # It runs tests requiring pandas and PyAMG. + # It runs tests requiring lightgbm, pandas and PyAMG. pylatest_pip_openblas_pandas: DISTRIB: 'conda-pip-latest' PYTHON_VERSION: '3.8' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d4c3f5d8fbd44..1ef981b5dd6e8 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -92,6 +92,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install numpy scipy cython joblib python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist python -m pip install pandas matplotlib pyamg + # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps fi From 18b1207441999841b97d32a91e6d5ae32ad2bdba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 27 Dec 2019 09:36:11 +0100 Subject: [PATCH 33/34] address some comments --- sklearn/utils/validation.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 693a74f984aea..810ae44a5d60e 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1284,7 +1284,7 @@ def _check_fit_params(X, fit_params, indices=None): Dictionary containing the parameters passed at fit. indices : array-like of shape (n_samples,), default=None - Indices to be selected if the parameter has the same size than `X`. + Indices to be selected if the parameter has the same size as `X`. Returns ------- @@ -1303,9 +1303,8 @@ def _check_fit_params(X, fit_params, indices=None): # Any other fit_params should support indexing # (e.g. for cross-validation). fit_params_validated[param_key] = _make_indexable(param_value) - if indices is not None: - fit_params_validated[param_key] = _safe_indexing( - fit_params_validated[param_key], indices - ) + fit_params_validated[param_key] = _safe_indexing( + fit_params_validated[param_key], indices + ) return fit_params_validated From 74d70e724a76e69640b5ca77d32d0b564bb5cf52 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 31 Dec 2019 12:02:49 +0100 Subject: [PATCH 34/34] Test fit_params compat without dependency on lightgbm --- sklearn/model_selection/tests/test_search.py | 50 ++++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 6231652ffe446..bacfc20eb1fc1 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1866,30 +1866,41 @@ def predict(self, X): assert model.best_estimator_.r_ == 42 -def _custom_lgbm_metric(y_test, y_pred): - # y_pred are probablities which need to be thresholded - y_pred = (y_pred > 0.5).astype(int) - acc = accuracy_score(y_test, y_pred) - # required output of format: (eval_name, eval_result, is_higher_better) - return ('accuracy', acc, True) - - -@pytest.mark.parametrize("metric", ['auc', _custom_lgbm_metric]) @pytest.mark.parametrize( "SearchCV, param_search", - [(GridSearchCV, {'learning_rate': [0.1, 0.01]}), - (RandomizedSearchCV, {'learning_rate': uniform(0.01, 0.1)})] + [(GridSearchCV, {'alpha': [0.1, 0.01]}), + (RandomizedSearchCV, {'alpha': uniform(0.01, 0.1)})] ) -def test_scalar_fit_param_lightgbm(metric, SearchCV, param_search): - # check support for scalar values in fit_params in LightGBM - # non-regression test for: +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15805 - lgbm = pytest.importorskip("lightgbm") X_train, X_valid, y_train, y_valid = train_test_split( *make_classification(random_state=42), random_state=42 ) + + class _FitParamClassifier(SGDClassifier): + + def fit(self, X, y, sample_weight=None, tuple_of_arrays=None, + scalar_param=None, callable_param=None): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + model = SearchCV( - lgbm.LGBMClassifier(n_estimators=5), param_search + _FitParamClassifier(), param_search ) # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which @@ -1898,9 +1909,8 @@ def test_scalar_fit_param_lightgbm(metric, SearchCV, param_search): # now and be careful not to break support for those without following # proper deprecation cycle. fit_params = { - 'eval_set': [(X_valid, y_valid)], - 'eval_metric': metric, - 'early_stopping_rounds': 5, - 'verbose': False + 'tuple_of_arrays': (X_valid, y_valid), + 'callable_param': _fit_param_callable, + 'scalar_param': 42, } model.fit(X_train, y_train, **fit_params)