ENH do not allocate local arrays in Ridge*CV if store_cv_values is False #15652

Merged

Changes from 5 commits
13 changes: 13 additions & 0 deletions doc/whats_new/v0.23.rst
@@ -47,10 +47,23 @@ Changelog
 :mod:`sklearn.cluster`
 ......................
 
+- |Fix| example fix in model XXX. :pr:`xxxx` or :issue:`xxxx` by
+  :user:`name <user id>`
+
+:mod:`sklearn.linear_model`
+...........................
+
 - |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and
   more memory efficient implementation of single linkage clustering.
   :pr:`11514` by :user:`Leland McInnes <lmcinnes>`.
 
+- |Efficiency| :class:`linear_model.RidgeCV` and
+  :class:`linear_model.RidgeClassifierCV` now do not allocate a
+  potentially large array to store dual coefficients for all hyperparameters
+  during their `fit`, nor an array to store all LOO predictions unless
+  `store_cv_values` is `True`.
+  :pr:`15652` by :user:`Jérôme Dockès <jeromedockes>`.
+
 :mod:`sklearn.preprocessing`
 ............................
 
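As a usage-level illustration of the changelog entry above (public API only; a minimal sketch, not part of the diff):

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

X, y = make_regression(n_samples=100, n_features=5, random_state=0)

# Default: store_cv_values=False -> no per-alpha LOO arrays are kept around.
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X, y)
assert not hasattr(ridge, "cv_values_")

# Opt in explicitly to keep the (n_samples, n_alphas) array of LOO values.
ridge = RidgeCV(alphas=[0.1, 1.0, 10.0], store_cv_values=True).fit(X, y)
print(ridge.cv_values_.shape)  # (100, 3)
```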
63 changes: 37 additions & 26 deletions sklearn/linear_model/_ridge.py
@@ -1054,6 +1054,16 @@ def _matmat(self, v):
         return res
 
 
+class _IdentityEstimator:
+    """Hack to call a scorer when we already have the predictions."""
+
+    def decision_function(self, y_predict):
+        return y_predict
+
+    def predict(self, y_predict):
+        return y_predict
+
+
 class _RidgeGCV(LinearModel):
     """Ridge regression with built-in Generalized Cross-Validation
 
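To make the intent of this helper concrete, here is a minimal sketch (not part of the diff) of how a scorer with the `(estimator, X, y)` signature can be applied to predictions that were already computed, with `_IdentityEstimator` as defined above; `neg_mean_squared_error` is just an example scorer:

```python
import numpy as np
from sklearn.metrics import get_scorer

scorer = get_scorer('neg_mean_squared_error')

y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])

# The scorer calls estimator.predict(X); passing the precomputed predictions
# as "X" to an identity estimator evaluates the metric directly on them.
score = scorer(_IdentityEstimator(), y_pred, y_true)
print(score)  # -0.02, i.e. the negated mean squared error
```

The `decision_function` method exists for the same reason on the classifier path, where scorers call `decision_function` instead of `predict`.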
@@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel):
 
     looe = y - loov = c / diag(G^-1)
 
+    The best score (negative mean squared error or user-provided scoring) is
+    stored in the `best_score_` attribute, and the selected hyperparameter in
+    `alpha_`.
+
     References
     ----------
     http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
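The `looe` identity in the docstring can be checked numerically against brute-force leave-one-out refits; a self-contained sketch, assuming no intercept and unit sample weights (the plain setting of the formula):

```python
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = rng.randn(20)
alpha = 1.0

# Shortcut from the docstring: with G = X X^T + alpha * Id and c = G^-1 y,
# the leave-one-out errors are c / diag(G^-1).
G_inv = np.linalg.inv(X @ X.T + alpha * np.eye(X.shape[0]))
c = G_inv @ y
looe_shortcut = c / np.diag(G_inv)

# Brute force: refit the ridge problem without sample i, then predict it.
looe_brute = np.empty_like(y)
for i in range(X.shape[0]):
    mask = np.arange(X.shape[0]) != i
    Xi, yi = X[mask], y[mask]
    w = np.linalg.solve(Xi.T @ Xi + alpha * np.eye(X.shape[1]), Xi.T @ yi)
    looe_brute[i] = y[i] - X[i] @ w

assert np.allclose(looe_shortcut, looe_brute)
```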
@@ -1462,43 +1476,40 @@ def fit(self, X, y, sample_weight=None):
         else:
             sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
 
-        X_mean, *decomposition = decompose(X, y, sqrt_sw)
-
         scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
         error = scorer is None
 
         n_y = 1 if len(y.shape) == 1 else y.shape[1]
-        cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
-                             dtype=X.dtype)
-        C = []
+
+        X_mean, *decomposition = decompose(X, y, sqrt_sw)
+
+        if self.store_cv_values:
+            self.cv_values_ = np.empty(
+                (n_samples * n_y, len(self.alphas)), dtype=X.dtype)
+
+        best_coef, best_score, best_alpha = None, None, None
+
         for i, alpha in enumerate(self.alphas):
             G_inverse_diag, c = solve(
                 float(alpha), y, sqrt_sw, X_mean, *decomposition)
             if error:
                 squared_errors = (c / G_inverse_diag) ** 2
-                cv_values[:, i] = squared_errors.ravel()
+                alpha_score = -squared_errors.mean()
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = squared_errors.ravel()
             else:
                 predictions = y - (c / G_inverse_diag)
-                cv_values[:, i] = predictions.ravel()
-            C.append(c)
-
-        if error:
-            best = cv_values.mean(axis=0).argmin()
-        else:
-            # The scorer want an object that will make the predictions but
-            # they are already computed efficiently by _RidgeGCV. This
-            # identity_estimator will just return them
-            def identity_estimator():
-                pass
-            identity_estimator.decision_function = lambda y_predict: y_predict
-            identity_estimator.predict = lambda y_predict: y_predict
-
-            # signature of scorer is (estimator, X, y)
-            out = [scorer(identity_estimator, cv_values[:, i], y.ravel())
-                   for i in range(len(self.alphas))]
-            best = np.argmax(out)
+                alpha_score = scorer(
+                    _IdentityEstimator(), predictions.ravel(), y.ravel())
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = predictions.ravel()
+
+            if (best_score is None) or (alpha_score > best_score):
+                best_coef, best_score, best_alpha = c, alpha_score, alpha
 
-        self.alpha_ = self.alphas[best]
-        self.dual_coef_ = C[best]
+        self.alpha_ = best_alpha
+        self.best_score_ = best_score
+        self.dual_coef_ = best_coef
         self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)
 
         X_offset += X_mean * X_scale
@@ -1509,7 +1520,7 @@ def identity_estimator():
                 cv_values_shape = n_samples, len(self.alphas)
             else:
                 cv_values_shape = n_samples, n_y, len(self.alphas)
-            self.cv_values_ = cv_values.reshape(cv_values_shape)
+            self.cv_values_ = self.cv_values_.reshape(cv_values_shape)
 
         return self
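The restructured loop keeps only a running best `(score, alpha)` pair instead of an `(n_samples * n_y, n_alphas)` accumulator, so when `store_cv_values` is False only one vector of LOO values is alive at a time. A standalone sketch of the pattern, where `loo_errors_for` is a hypothetical stand-in for the closed-form `solve` step:

```python
import numpy as np

def best_alpha_streaming(alphas, loo_errors_for):
    """Select alpha by LOO error while storing only the running best."""
    best_score, best_alpha = None, None
    for alpha in alphas:
        # One vector of LOO errors at a time; nothing is accumulated.
        score = -np.mean(loo_errors_for(alpha) ** 2)
        if best_score is None or score > best_score:
            best_score, best_alpha = score, alpha
    return best_alpha, best_score
```

For scale: with 100 candidate alphas and 10^6 samples, the old float64 accumulator alone would be about 800 MB, which is exactly what this change avoids allocating.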

21 changes: 18 additions & 3 deletions sklearn/linear_model/tests/test_ridge.py
@@ -34,6 +34,7 @@
 from sklearn.linear_model._ridge import _check_gcv_mode
 from sklearn.linear_model._ridge import _X_CenterStackOp
 from sklearn.datasets import make_regression
+from sklearn.datasets import make_classification
 
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
@@ -661,6 +662,19 @@ def _test_ridge_cv(filter_):
     assert type(ridge_cv.intercept_) == np.float64
 
 
+@pytest.mark.parametrize(
+    "ridge, make_dataset",
+    [(RidgeCV(), make_regression),
+     (RidgeClassifierCV(), make_classification)]
+)
+def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset):
+    # Check that `cv_values_` is not stored when store_cv_values is False
+    X, y = make_dataset(n_samples=6, random_state=42)
+    ridge.set_params(store_cv_values=False)
+    ridge.fit(X, y)
+    assert not hasattr(ridge, "cv_values_")
+
+
 def _test_ridge_diabetes(filter_):
     ridge = Ridge(fit_intercept=False)
     ridge.fit(filter_(X_diabetes), y_diabetes)

Review thread on `test_ridge_gcv_cv_values_not_stored`:

Contributor: Note that the previous implementation would also have passed this test, because the cv values were stored in a local variable during `fit`.

Member (author): True, and we don't really have a good test to detect the enhancement, actually.
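Following up on the review thread above, one way such an allocation check could be sketched (not part of this PR) is with the stdlib `tracemalloc`, asserting that fitting with many alphas does not allocate proportionally more memory when `store_cv_values` is False. Measurements like this are noisy, so a real test would need generous margins:

```python
import numpy as np
import tracemalloc
from sklearn.linear_model import RidgeCV

def peak_fit_memory(n_alphas):
    X = np.random.RandomState(0).randn(200, 10)
    y = X @ np.ones(10)
    tracemalloc.start()
    RidgeCV(alphas=np.logspace(-3, 3, n_alphas)).fit(X, y)
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak

# With the fix, peak memory should be roughly flat in the number of alphas.
print(peak_fit_memory(2), peak_fit_memory(200))
```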
@@ -818,7 +832,8 @@ def test_class_weights_cv():
     assert_array_equal(reg.predict([[-.2, 2]]), np.array([-1]))
 
 
-def test_ridgecv_store_cv_values():
+@pytest.mark.parametrize("scoring", [None, 'neg_mean_squared_error'])
+def test_ridgecv_store_cv_values(scoring):
     rng = np.random.RandomState(42)
 
     n_samples = 8
@@ -827,7 +842,7 @@
     alphas = [1e-1, 1e0, 1e1]
     n_alphas = len(alphas)
 
-    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)
+    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True, scoring=scoring)
 
     # with len(y.shape) == 1
     y = rng.randn(n_samples)
@@ -840,7 +855,7 @@
     r.fit(x, y)
     assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)
 
-    r = RidgeCV(cv=3, store_cv_values=True)
+    r = RidgeCV(cv=3, store_cv_values=True, scoring=scoring)
     assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                         r.fit, x, y)
 
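One detail the `scoring` parametrization exercises: as the `fit` diff above shows, with `scoring=None` the stored `cv_values_` columns contain squared LOO errors, whereas with a scorer they contain LOO predictions; the shape contract is identical either way. A quick illustrative check:

```python
import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(42)
X, y = rng.randn(8, 5), rng.randn(8)
alphas = [1e-1, 1e0, 1e1]

for scoring in [None, 'neg_mean_squared_error']:
    r = RidgeCV(alphas=alphas, store_cv_values=True, scoring=scoring).fit(X, y)
    # Same shape in both cases; the contents differ (errors vs predictions).
    assert r.cv_values_.shape == (8, len(alphas))
```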