Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: make LinearRegression perfectly consistent across sparse or dense #13279

Merged
Merged
Changes from 1 commit
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.
+34 −3
Diff settings

Always

Just for now

Copy path View file
@@ -459,21 +459,32 @@ def fit(self, X, y, sample_weight=None):

X, y, X_offset, y_offset, X_scale = self._preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
copy=self.copy_X, sample_weight=sample_weight)
copy=self.copy_X, sample_weight=sample_weight,
return_mean=True)

if sample_weight is not None:
# Sample weight can be implemented via a simple rescaling.
X, y = _rescale_data(X, y, sample_weight)

if sp.issparse(X):
X_offset_scale = X_offset / X_scale
def matvec(b):
return X.dot(b) - b.dot(X_offset_scale)
def rmatvec(b):
This conversation was marked as resolved by GaelVaroquaux

This comment has been minimized.

Copy link
@jnothman

jnothman Feb 26, 2019

Member

I think PEP8 wants blank lines before these functions

This comment has been minimized.

Copy link
@ogrisel

ogrisel Feb 27, 2019

Member

I think that's only true for top-level blocks (0-indented functions/classes).

return X.T.dot(b) - (X_offset_scale) * np.sum(b)

This comment has been minimized.

Copy link
@jnothman

jnothman Feb 26, 2019

Member

redundant parentheses


X_centered = sparse.linalg.LinearOperator(shape=X.shape,
matvec=matvec,
rmatvec=rmatvec)

This comment has been minimized.

Copy link
@GaelVaroquaux

GaelVaroquaux Feb 27, 2019

Member

Very elegant!


if y.ndim < 2:
out = sparse_lsqr(X, y)
out = sparse_lsqr(X_centered, y)
self.coef_ = out[0]
self._residues = out[3]
else:
# sparse_lsqr cannot handle y with shape (M, K)
outs = Parallel(n_jobs=n_jobs_)(
delayed(sparse_lsqr)(X, y[:, j].ravel())
delayed(sparse_lsqr)(X_centered, y[:, j].ravel())
for j in range(y.shape[1]))
self.coef_ = np.vstack([out[0] for out in outs])
self._residues = np.vstack([out[3] for out in outs])
@@ -150,6 +150,26 @@ def test_linear_regression_sparse(random_state=0):
assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)


@pytest.mark.parametrize('normalize', [True, False])
@pytest.mark.parametrize('fit_intercept', [True, False])
def test_linear_regression_sparse_equal_dense(normalize, fit_intercept):
# Test that linear regression agrees between sparse and dense
rng = check_random_state(0)
n_samples = 200
n_features = 2
X = rng.randn(n_samples, n_features)
X[X < 0.1] = 0.
Xcsr = sparse.csr_matrix(X)
y = rng.rand(n_samples)
params = dict(normalize=normalize, fit_intercept=fit_intercept)
clf_dense = LinearRegression(**params)
clf_sparse = LinearRegression(**params)
clf_dense.fit(X, y)
clf_sparse.fit(Xcsr, y)
assert_almost_equal(clf_dense.intercept_, clf_sparse.intercept_)

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
assert_almost_equal(clf_dense.intercept_, clf_sparse.intercept_)
assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
assert_array_almost_equal(clf_dense.coef_, clf_sparse.coef_)

This comment has been minimized.

Copy link
@glemaitre

glemaitre Feb 26, 2019

Contributor
Suggested change
assert_array_almost_equal(clf_dense.coef_, clf_sparse.coef_)
assert_allclose(clf_dense.coef_, clf_sparse.coef_)


def test_linear_regression_multiple_outcome(random_state=0):
# Test multiple-outcome linear regressions
X, y = make_regression(random_state=random_state)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.