
Commit fc8122a

ENH add gap safe screening rules to enet_coordinate_descent_gram (#31987)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent: e714369

12 files changed (+264, -117 lines)
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+- :class:`sklearn.covariance.GraphicalLasso`,
+  :class:`sklearn.covariance.GraphicalLassoCV` and
+  :func:`sklearn.covariance.graphical_lasso` with `mode="cd"` profit from the
+  fit time performance improvement of :class:`sklearn.linear_model.Lasso` by means of
+  gap safe screening rules.
+  By :user:`Christian Lorentzen <lorentzenchr>`.
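
For context, the estimators listed in this changelog entry call the coordinate descent Lasso solver internally, so they inherit the speedup without any API change. A minimal usage sketch (synthetic data; parameter values are illustrative, not from this commit):

```python
import numpy as np
from sklearn.covariance import GraphicalLassoCV
from sklearn.datasets import make_sparse_spd_matrix

# Synthetic data from a sparse multivariate normal (illustrative only).
rng = np.random.RandomState(0)
prec = make_sparse_spd_matrix(20, alpha=0.95, random_state=rng)
X = rng.multivariate_normal(np.zeros(20), np.linalg.inv(prec), size=200)

# mode="cd" routes through enet_coordinate_descent_gram, which now applies
# gap safe screening rules; the regularization path fitted by the CV variant
# is where the speedup is most visible.
model = GraphicalLassoCV(mode="cd").fit(X)
print(model.alpha_)
```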
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+- Fixed uncontrollable randomness in :class:`sklearn.covariance.GraphicalLasso`,
+  :class:`sklearn.covariance.GraphicalLassoCV` and
+  :func:`sklearn.covariance.graphical_lasso`. For `mode="cd"`, they now use cyclic
+  coordinate descent. Before, it was random coordinate descent with uncontrollable
+  random number seeding.
+  By :user:`Christian Lorentzen <lorentzenchr>`.
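
To illustrate what changed (a schematic sketch, not the Cython solver): cyclic coordinate descent visits the coordinates in a fixed order, so repeated fits are reproducible, while random coordinate descent draws each coordinate index from an RNG whose seed the user previously could not control here.

```python
import numpy as np

# Illustration only: coordinate visit order in cyclic vs. random CD.
n_features, n_sweeps = 5, 2
rng = np.random.RandomState(0)  # the old code path seeded this uncontrollably

cyclic_order = [j for _ in range(n_sweeps) for j in range(n_features)]
random_order = [rng.randint(n_features) for _ in range(n_sweeps * n_features)]

print("cyclic:", cyclic_order)  # always [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
print("random:", random_order)  # depends on the RNG state
```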
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+- :class:`sklearn.decomposition.DictionaryLearning` and
+  :class:`sklearn.decomposition.MiniBatchDictionaryLearning` with `fit_algorithm="cd"`,
+  :class:`sklearn.decomposition.SparseCoder` with `transform_algorithm="lasso_cd"`,
+  :class:`sklearn.decomposition.MiniBatchSparsePCA`,
+  :class:`sklearn.decomposition.SparsePCA`,
+  :func:`sklearn.decomposition.dict_learning` and
+  :func:`sklearn.decomposition.dict_learning_online` with `method="cd"`,
+  :func:`sklearn.decomposition.sparse_encode` with `algorithm="lasso_cd"`
+  all profit from the fit time performance improvement of
+  :class:`sklearn.linear_model.Lasso` by means of gap safe screening rules.
+  By :user:`Christian Lorentzen <lorentzenchr>`.
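
These code paths all funnel into the same coordinate descent Lasso solver. A minimal, illustrative call through one of them (shapes and `alpha` chosen for demonstration only):

```python
import numpy as np
from sklearn.decomposition import sparse_encode

rng = np.random.RandomState(0)
dictionary = rng.randn(8, 20)  # (n_components, n_features)
X = rng.randn(5, 20)           # (n_samples, n_features)

# algorithm="lasso_cd" dispatches to the coordinate descent Lasso solver,
# which now uses gap safe screening rules.
codes = sparse_encode(X, dictionary, algorithm="lasso_cd", alpha=0.1)
print(codes.shape)  # (5, 8)
```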

doc/whats_new/upcoming_changes/sklearn.linear_model/32014.efficiency.rst

Lines changed: 3 additions & 4 deletions
@@ -3,12 +3,11 @@
   :class:`linear_model.MultiTaskElasticNetCV`, :class:`linear_model.MultiTaskLassoCV`
   as well as
   :func:`linear_model.lasso_path` and :func:`linear_model.enet_path` now implement
-  gap safe screening rules in the coordinate descent solver for dense `X` (with
-  `precompute=False` or `"auto"` with `n_samples < n_features`) and sparse `X`
-  (always).
+  gap safe screening rules in the coordinate descent solver for dense and sparse `X`.
   The speedup of fitting time is particularly pronounced (10-times is possible) when
   computing regularization paths like the \*CV-variants of the above estimators do.
   There is now an additional check of the stopping criterion before entering the main
   loop of descent steps. As the stopping criterion requires the computation of the dual
   gap, the screening happens whenever the dual gap is computed.
-  By :user:`Christian Lorentzen <lorentzenchr>` :pr:`31882`, :pr:`31986` and
+  By :user:`Christian Lorentzen <lorentzenchr>` :pr:`31882`, :pr:`31986`,
+  :pr:`31987` and
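
To make the rule concrete, here is a schematic NumPy sketch of a gap safe screening test for the Lasso in the `1/2 * ||y - X w||^2 + alpha * ||w||_1` parametrization, following Ndiaye et al. (2017). It only illustrates the idea; scikit-learn's Cython implementation uses the `1/(2 * n_samples)` scaling and operates in place.

```python
import numpy as np

def gap_safe_screen(X, y, w, alpha):
    """Return a boolean mask of features that can provably be discarded."""
    residual = y - X @ w
    # A dual feasible point: rescale the residual so that the dual
    # constraint max_j |x_j^T theta| <= 1 holds.
    theta = residual / max(alpha, np.max(np.abs(X.T @ residual)))
    # Duality gap = primal objective minus dual objective.
    primal = 0.5 * residual @ residual + alpha * np.abs(w).sum()
    dual = 0.5 * y @ y - 0.5 * np.sum((y - alpha * theta) ** 2)
    gap = max(primal - dual, 0.0)
    # The dual optimum lies in a ball of this radius around theta; a feature
    # whose worst-case correlation over that ball stays below 1 has a zero
    # coefficient at the optimum and can be removed from the problem.
    radius = np.sqrt(2.0 * gap) / alpha
    return np.abs(X.T @ theta) + radius * np.linalg.norm(X, axis=0) < 1.0
```

Because the test reuses the duality gap, running it exactly where the solver already computes the gap for its stopping criterion adds little overhead, which is the coupling the entry above describes.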

sklearn/covariance/_graph_lasso.py

Lines changed: 17 additions & 10 deletions
@@ -138,16 +138,23 @@ def _graphical_lasso(
                     / (precision_[idx, idx] + 1000 * eps)
                 )
                 coefs, _, _, _ = cd_fast.enet_coordinate_descent_gram(
-                    coefs,
-                    alpha,
-                    0,
-                    sub_covariance,
-                    row,
-                    row,
-                    max_iter,
-                    enet_tol,
-                    check_random_state(None),
-                    False,
+                    w=coefs,
+                    alpha=alpha,
+                    beta=0,
+                    Q=sub_covariance,
+                    q=row,
+                    y=row,
+                    # TODO: It is not ideal that the max_iter of the outer
+                    # solver (graphical lasso) is coupled with the max_iter of
+                    # the inner solver (CD). Ideally, CD has its own parameter
+                    # enet_max_iter (like enet_tol). A minimum of 20 is rather
+                    # arbitrary, but not unreasonable.
+                    max_iter=max(20, max_iter),
+                    tol=enet_tol,
+                    rng=check_random_state(None),
+                    random=False,
+                    positive=False,
+                    do_screening=True,
                 )
             else:  # mode == "lars"
                 _, _, coefs = lars_path_gram(
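
At the user level, the parameters touched here map onto `graphical_lasso` as follows: `tol` and `max_iter` govern the outer graphical lasso loop, while `enet_tol` (and now an inner iteration floor of 20) governs the inner coordinate descent solver. A small illustrative call (data and values made up):

```python
import numpy as np
from sklearn.covariance import empirical_covariance, graphical_lasso

rng = np.random.RandomState(0)
X = rng.randn(60, 5)
emp_cov = empirical_covariance(X)

# mode="cd" is the path that calls enet_coordinate_descent_gram shown above.
cov, prec = graphical_lasso(
    emp_cov, alpha=0.2, mode="cd", tol=1e-6, enet_tol=1e-8, max_iter=100
)
print(np.round(prec, 2))
```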

sklearn/covariance/tests/test_graphical_lasso.py

Lines changed: 35 additions & 23 deletions
@@ -25,16 +25,12 @@
 )


-def test_graphical_lassos(random_state=1):
-    """Test the graphical lasso solvers.
-
-    This checks is unstable for some random seeds where the covariance found with "cd"
-    and "lars" solvers are different (4 cases / 100 tries).
-    """
+def test_graphical_lassos(global_random_seed):
+    """Test the graphical lasso solvers."""
     # Sample data from a sparse multivariate normal
-    dim = 20
+    dim = 10
     n_samples = 100
-    random_state = check_random_state(random_state)
+    random_state = check_random_state(global_random_seed)
     prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=random_state)
     cov = linalg.inv(prec)
     X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
@@ -45,24 +41,29 @@ def test_graphical_lassos(random_state=1):
         icovs = dict()
         for method in ("cd", "lars"):
             cov_, icov_, costs = graphical_lasso(
-                emp_cov, return_costs=True, alpha=alpha, mode=method
+                emp_cov,
+                return_costs=True,
+                alpha=alpha,
+                mode=method,
+                tol=1e-7,
+                enet_tol=1e-11,
+                max_iter=100,
             )
             covs[method] = cov_
             icovs[method] = icov_
             costs, dual_gap = np.array(costs).T
             # Check that the costs always decrease (doesn't hold if alpha == 0)
             if not alpha == 0:
-                # use 1e-12 since the cost can be exactly 0
-                assert_array_less(np.diff(costs), 1e-12)
+                # use 1e-10 since the cost can be exactly 0
+                assert_array_less(np.diff(costs), 1e-10)
         # Check that the 2 approaches give similar results
-        assert_allclose(covs["cd"], covs["lars"], atol=5e-4)
-        assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4)
+        assert_allclose(covs["cd"], covs["lars"], atol=1e-3)
+        assert_allclose(icovs["cd"], icovs["lars"], atol=1e-3)

     # Smoke test the estimator
-    model = GraphicalLasso(alpha=0.25).fit(X)
+    model = GraphicalLasso(alpha=0.25, tol=1e-7, enet_tol=1e-11, max_iter=100).fit(X)
     model.score(X)
-    assert_array_almost_equal(model.covariance_, covs["cd"], decimal=4)
-    assert_array_almost_equal(model.covariance_, covs["lars"], decimal=4)
+    assert_allclose(model.covariance_, covs["cd"], rtol=1e-6)

     # For a centered matrix, assume_centered could be chosen True or False
     # Check that this returns indeed the same result for centered data
@@ -87,6 +88,7 @@ def test_graphical_lasso_when_alpha_equals_0(global_random_seed):


 @pytest.mark.parametrize("mode", ["cd", "lars"])
+@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
 def test_graphical_lasso_n_iter(mode):
     X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0)
     emp_cov = empirical_covariance(X)
@@ -138,12 +140,25 @@ def test_graph_lasso_2D():
     assert_array_almost_equal(icov, icov_skggm)


-def test_graphical_lasso_iris_singular():
+@pytest.mark.parametrize("method", ["cd", "lars"])
+def test_graphical_lasso_iris_singular(method):
     # Small subset of rows to test the rank-deficient case
     # Need to choose samples such that none of the variances are zero
     indices = np.arange(10, 13)

     # Hard-coded solution from R glasso package for alpha=0.01
+    # library(glasso)
+    # X = t(array(c(
+    #     5.4, 3.7, 1.5, 0.2,
+    #     4.8, 3.4, 1.6, 0.2,
+    #     4.8, 3. , 1.4, 0.1),
+    #     dim = c(4, 3)
+    # ))
+    # n = nrow(X)
+    # emp_cov = cov(X) * (n - 1)/n  # without Bessel correction
+    # sol = glasso(emp_cov, 0.01, penalize.diagonal = FALSE)
+    # # print cov_R
+    # print(noquote(format(sol$w, scientific=FALSE, digits = 10)))
     cov_R = np.array(
         [
             [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
@@ -162,12 +177,9 @@ def test_graphical_lasso_iris_singular():
     )
     X = datasets.load_iris().data[indices, :]
     emp_cov = empirical_covariance(X)
-    for method in ("cd", "lars"):
-        cov, icov = graphical_lasso(
-            emp_cov, alpha=0.01, return_costs=False, mode=method
-        )
-        assert_array_almost_equal(cov, cov_R, decimal=5)
-        assert_array_almost_equal(icov, icov_R, decimal=5)
+    cov, icov = graphical_lasso(emp_cov, alpha=0.01, return_costs=False, mode=method)
+    assert_allclose(cov, cov_R, atol=1e-6)
+    assert_allclose(icov, icov_R, atol=1e-5)


 def test_graphical_lasso_cv(global_random_seed):

sklearn/decomposition/_dict_learning.py

Lines changed: 1 addition & 0 deletions
@@ -146,6 +146,7 @@ def _sparse_encode_precomputed(
             alpha=alpha,
             fit_intercept=False,
             precompute=gram,
+            tol=1e-8,  # TODO: This parameter should be exposed.
             max_iter=max_iter,
             warm_start=True,
             positive=positive,
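
For reference, a simplified sketch of the Lasso configuration that `_sparse_encode_precomputed` sets up after this change (shapes and `alpha` are illustrative; details of the real code path, such as the alpha rescaling and `check_input` handling, are omitted here):

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
dictionary = rng.randn(8, 20)     # (n_components, n_features)
x = rng.randn(20)                 # one signal to encode
gram = dictionary @ dictionary.T  # precomputed Gram matrix

# Simplified stand-in for the inner solver configuration; tol=1e-8 is the
# hard-coded value from the diff (the TODO says it should become a
# parameter), everything else here is illustrative.
clf = Lasso(
    alpha=0.1,
    fit_intercept=False,
    precompute=gram,
    tol=1e-8,
    max_iter=1000,
    warm_start=True,
    positive=False,
)
clf.fit(dictionary.T, x)
code = clf.coef_  # sparse code of x w.r.t. the dictionary rows
print(np.count_nonzero(code))
```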

sklearn/decomposition/tests/test_dict_learning.py

Lines changed: 4 additions & 4 deletions
@@ -89,7 +89,7 @@ def ricker_matrix(width, resolution, n_components):
         return D

     transform_algorithm = "lasso_cd"
-    resolution = 1024
+    resolution = 256
     subsampling = 3  # subsampling factor
     n_components = resolution // subsampling

@@ -99,7 +99,7 @@ def ricker_matrix(width, resolution, n_components):
             ricker_matrix(
                 width=w, resolution=resolution, n_components=n_components // 5
             )
-            for w in (10, 50, 100, 500, 1000)
+            for w in (10, 50, 100, 500)
         )
     ]

@@ -120,7 +120,7 @@ def ricker_matrix(width, resolution, n_components):
     with warnings.catch_warnings():
         warnings.simplefilter("error", ConvergenceWarning)
         model = SparseCoder(
-            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000
+            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=500
        )
         model.fit_transform(X)

@@ -864,7 +864,7 @@ def test_dict_learning_dtype_match(data_type, expected_type, method):
 @pytest.mark.parametrize("method", ("lars", "cd"))
 def test_dict_learning_numerical_consistency(method):
     # verify numerically consistent among np.float32 and np.float64
-    rtol = 1e-6
+    rtol = 1e-4
     n_components = 4
     alpha = 2

sklearn/decomposition/tests/test_sparse_pca.py

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ def test_fit_transform(global_random_seed):
         n_components=3, method="cd", random_state=global_random_seed, alpha=alpha
     )
     spca_lasso.fit(Y)
-    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
+    assert_allclose(spca_lasso.components_, spca_lars.components_, rtol=5e-4)


 # TODO: remove mark once loky bug is fixed:
@@ -117,7 +117,7 @@ def test_fit_transform_tall(global_random_seed):
     U1 = spca_lars.fit_transform(Y)
     spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng)
     U2 = spca_lasso.fit(Y).transform(Y)
-    assert_array_almost_equal(U1, U2)
+    assert_allclose(U1, U2, rtol=1e-4, atol=1e-5)


 def test_initialization(global_random_seed):
