scikit-learn · TomDLT · Dec 20, 2016 · Dec 16, 2016 · Dec 16, 2016 · Dec 20, 2016
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -99,6 +99,9 @@ Enhancements
      A ``TypeError`` will be raised for any other kwargs. :issue:`8028`
      by :user:`Alexander Booth <alexandercbooth>`.
 
+   - Added ability to use sparse matrices in :func:`feature_selection.f_regression`
+     with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune <acadiansith>`.
+
 Bug fixes
 .........
 

diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
@@ -92,6 +92,12 @@ def test_f_regression():
     assert_true((pv[:5] < 0.05).all())
     assert_true((pv[5:] > 1.e-4).all())
 
+    # with centering, compare with sparse
+    F, pv = f_regression(X, y, center=True)
+    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
+    assert_array_almost_equal(F_sparse, F)
+    assert_array_almost_equal(pv_sparse, pv)
+
     # again without centering, compare with sparse
     F, pv = f_regression(X, y, center=False)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)

diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
@@ -266,17 +266,27 @@ def f_regression(X, y, center=True):
     f_classif: ANOVA F-value between label/feature for classification tasks.
     chi2: Chi-squared stats of non-negative features for classification tasks.
     """
-    if issparse(X) and center:
-        raise ValueError("center=True only allowed for dense data")
     X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64)
+    n_samples = X.shape[0]
+
+    # compute centered values
+    # note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
+    # need not center X
     if center:
         y = y - np.mean(y)
-        X = X.copy('F')  # faster in fortran
-        X -= X.mean(axis=0)
+        if issparse(X):
+            X_means = X.mean(axis=0).getA1()
+        else:
+            X_means = X.mean(axis=0)
+        # compute the scaled standard deviations via moments
+        X_norms = np.sqrt(row_norms(X.T, squared=True) -
+                          n_samples * X_means ** 2)
+    else:
+        X_norms = row_norms(X.T)
 
     # compute the correlation
     corr = safe_sparse_dot(y, X)
-    corr /= row_norms(X.T)
+    corr /= X_norms
     corr /= norm(y)
 
     # convert to p-value