[READY] ENH - Add Gram Solver for single task Quadratic datafit #59

Merged — 27 commits, merged on Aug 26, 2022

Changes from 10 commits

Commits (27)
b8fd539  init commit — Badr-MOUFAD, Aug 24, 2022
c2aecba  gram solver && unit test — Badr-MOUFAD, Aug 24, 2022
507fc8a  fix bug gram solver && tighten test — Badr-MOUFAD, Aug 24, 2022
c9b64c2  add anderson acceleration — Badr-MOUFAD, Aug 24, 2022
20c1911  bug ``stop_criter`` && refactor — Badr-MOUFAD, Aug 24, 2022
f2e985d  refactoring of var names — Badr-MOUFAD, Aug 25, 2022
2dbc8e4  handle ``w_init`` — Badr-MOUFAD, Aug 25, 2022
8ca7a41  refactor ``_gram_cd_`` — Badr-MOUFAD, Aug 25, 2022
3453233  gram epoch greedy and cyclic strategy — Badr-MOUFAD, Aug 25, 2022
8d3dbc1  extend to sparse case && unitest — Badr-MOUFAD, Aug 25, 2022
cdd7e34  one implementation of _gram_cd && unittest — Badr-MOUFAD, Aug 25, 2022
f4bfeaf  greedy_cd arg instead of cd_strategy — Badr-MOUFAD, Aug 25, 2022
95cf1d4  Merge branch 'main' of https://github.com/scikit-learn-contrib/skglm … — Badr-MOUFAD, Aug 25, 2022
4c0acca  add docs — Badr-MOUFAD, Aug 25, 2022
dcab054  script fast gram, not faster than scipy — mathurinm, Aug 25, 2022
e8bc96e  fast gram timing — Badr-MOUFAD, Aug 25, 2022
61a67c4  keep grads instead — Badr-MOUFAD, Aug 25, 2022
1b6c169  refactor ``chosen_j`` — Badr-MOUFAD, Aug 25, 2022
c9c5575  script to profile — Badr-MOUFAD, Aug 25, 2022
68a0458  potential improvements, docstring — mathurinm, Aug 26, 2022
3788cc4  warnings.warn arguments in correct order — mathurinm, Aug 26, 2022
1ce391d  cleanups: ann files — Badr-MOUFAD, Aug 26, 2022
2476a34  fix ``p_obj`` computation — Badr-MOUFAD, Aug 26, 2022
0f766e9  Merge branch 'main' of https://github.com/scikit-learn-contrib/skglm … — Badr-MOUFAD, Aug 26, 2022
3208dfa  typos + less cases in test, smaller X in tests — mathurinm, Aug 26, 2022
16f6ee4  typo: ``XtXw`` --> ``grad`` — Badr-MOUFAD, Aug 26, 2022
e9b7224  Merge branch 'gram-solver' of https://github.com/Badr-MOUFAD/skglm in… — Badr-MOUFAD, Aug 26, 2022
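For readers landing on the diffs below: the solver's core trick is to precompute Q = X.T @ X / n_samples and q = X.T @ y / n_samples once, then keep the gradient Q @ w - q current with a single column update after each coordinate change, instead of recomputing X.T @ (X @ w - y). A minimal, self-contained sketch of that idea for a plain Lasso step follows; the variable names and the explicit soft-thresholding prox are illustrative, not the PR's code.

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features = 50, 20
X = rng.standard_normal((n_samples, n_features))
y = rng.standard_normal(n_samples)

# precompute once, as the solver does
Q = X.T @ X / n_samples    # scaled_gram in the PR
q = X.T @ y / n_samples    # scaled_Xty in the PR

alpha = 0.1                # illustrative L1 strength
w = np.zeros(n_features)
Qw = np.zeros(n_features)  # running value of Q @ w

for j in range(n_features):     # one cyclic epoch
    grad_j = Qw[j] - q[j]       # j-th gradient entry, no X @ w needed
    step = 1 / Q[j, j]
    z = w[j] - step * grad_j
    old_w_j, w[j] = w[j], np.sign(z) * max(abs(z) - alpha * step, 0.)
    if w[j] != old_w_j:
        # refresh Q @ w in O(n_features): only coordinate j changed
        Qw += (w[j] - old_w_j) * Q[:, j]

np.testing.assert_allclose(Qw, Q @ w)  # maintained gradient stays exact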
2 changes: 1 addition & 1 deletion skglm/estimators.py
@@ -1142,7 +1142,7 @@ def path(self, yXT, y, Cs, coef_init=None, return_n_iter=True, **params):
    Target vector relative to X.

    Cs : ndarray shape (n_Cs,)
-       Values of regularization strenghts for which solutions are
+       Values of regularization strengths for which solutions are
        computed.

    coef_init : array, shape (n_features,), optional
117 changes: 117 additions & 0 deletions skglm/solvers/gram_cd.py
@@ -0,0 +1,117 @@
import numpy as np
from numba import njit
from scipy.sparse import issparse
from skglm.utils import AndersonAcceleration


def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None,
use_acc=True, cd_type='greedy', tol=1e-4, verbose=False):
"""Run coordinate descent while keeping the gradients up-to-date with Gram updates.

Minimize::
1 / (2*n_samples) * norm(y - Xw)**2 + penalty(w)

Up to the constant term norm(y)**2 / (2*n_samples), this can be rewritten as::
w.T @ Q @ w / (2*n_samples) - q.T @ w / n_samples + penalty(w)

where::
Q = X.T @ X (Gram matrix)
q = X.T @ y
"""
n_samples, n_features = X.shape
scaled_gram = X.T @ X / n_samples
scaled_Xty = X.T @ y / n_samples
scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples)

if issparse(X):
scaled_gram = scaled_gram.toarray()

all_features = np.arange(n_features)
stop_crit = np.inf # prevent ref before assign
p_objs_out = []

w = np.zeros(n_features) if w_init is None else w_init
scaled_gram_w = np.zeros(n_features) if w_init is None else scaled_gram @ w_init
# initial gradient: scaled_gram_w - scaled_Xty (reduces to -scaled_Xty when w_init is None)
opt = penalty.subdiff_distance(w, scaled_gram_w - scaled_Xty, all_features)

if use_acc:
accelerator = AndersonAcceleration(K=5)
w_acc = np.zeros(n_features)
scaled_gram_w_acc = np.zeros(n_features)

for t in range(max_iter):
# check convergence
stop_crit = np.max(opt)
if verbose:
p_obj = (0.5 * w @ scaled_gram_w - scaled_Xty @ w +
scaled_y_norm2 + penalty.value(w))
print(
f"Iteration {t+1}: {p_obj:.10f}, "
f"stopping crit: {stop_crit:.2e}"
)

if stop_crit <= tol:
if verbose:
print(f"Stopping criterion max violation: {stop_crit:.2e}")
break

# inplace update of w and scaled_gram_w (one epoch of coordinate descent)
_gram_cd_epoch = _gram_cd_greedy if cd_type == 'greedy' else _gram_cd_cyclic
opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w,
penalty, all_features)

# perform Anderson extrapolation
if use_acc:
w_acc, scaled_gram_w_acc, is_extrapolated = accelerator.extrapolate(
w, scaled_gram_w)

if is_extrapolated:
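# scaled_y_norm2 is omitted from both objectives below: the constant cancels in the comparison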
p_obj_acc = (0.5 * w_acc @ scaled_gram_w_acc - scaled_Xty @ w_acc +
penalty.value(w_acc))
p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w)
if p_obj_acc < p_obj:
w[:] = w_acc
scaled_gram_w[:] = scaled_gram_w_acc

p_obj = (0.5 * w @ scaled_gram_w - scaled_Xty @ w + scaled_y_norm2
         + penalty.value(w))
p_objs_out.append(p_obj)
return w, np.array(p_objs_out), stop_crit


@njit
def _gram_cd_greedy(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws):
# inplace update of w and scaled_gram_w; returns the optimality violations
# greedy CD: at each update, pick the coordinate with the largest violation
for _ in range(len(w)):
grad = scaled_gram_w - scaled_Xty
opt = penalty.subdiff_distance(w, grad, ws)
j_max = np.argmax(opt)

old_w_j = w[j_max]
step = 1 / scaled_gram[j_max, j_max]  # 1 / Lipschitz constant of coordinate j_max
w[j_max] = penalty.prox_1d(old_w_j - step * grad[j_max], step, j_max)

# O(n_features) gradient refresh using column j_max of the Gram matrix
if w[j_max] != old_w_j:
scaled_gram_w += (w[j_max] - old_w_j) * scaled_gram[:, j_max]
return opt


@njit
def _gram_cd_cyclic(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws):
# inplace update of w and scaled_gram_w; returns the optimality violations
# cyclic CD: sweep over coordinates in order
for j in range(len(w)):
grad = scaled_gram_w - scaled_Xty

old_w_j = w[j]
step = 1 / scaled_gram[j, j]  # 1 / Lipschitz constant of coordinate j
w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j)

# O(n_features) gradient refresh using column j of the Gram matrix
if w[j] != old_w_j:
scaled_gram_w += (w[j] - old_w_j) * scaled_gram[:, j]

# optimality violations, computed once at the end of the epoch
grad = scaled_gram_w - scaled_Xty
return penalty.subdiff_distance(w, grad, ws)
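The tests below show the intended API; a standalone call mirroring them could look like this (the 10%-of-alpha_max regularization level is an illustrative choice, everything else is taken from the diff and the test file):

import numpy as np
from numpy.linalg import norm

from skglm.penalties import L1
from skglm.solvers.gram_cd import gram_cd_solver
from skglm.utils import compiled_clone, make_correlated_data

X, y, _ = make_correlated_data(100, 50, random_state=0)

# 10% of alpha_max; at alpha_max and above, the Lasso solution is all-zero
alpha = 0.1 * norm(X.T @ y, ord=np.inf) / len(y)

l1_penalty = compiled_clone(L1(alpha))
w, p_objs, stop_crit = gram_cd_solver(
    X, y, l1_penalty, max_iter=1000, tol=1e-9, verbose=True)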
45 changes: 45 additions & 0 deletions skglm/tests/test_gram_solver.py
@@ -0,0 +1,45 @@
import pytest
from itertools import product

import numpy as np
from numpy.linalg import norm
from sklearn.linear_model import Lasso

from skglm.penalties import L1
from skglm.solvers.gram_cd import gram_cd_solver
from skglm.utils import make_correlated_data, compiled_clone


@pytest.mark.parametrize("n_samples, n_features, X_density",
product([100, 200], [50, 90], [1., 0.6]))
def test_alpha_max(n_samples, n_features, X_density):
X, y, _ = make_correlated_data(n_samples, n_features,
random_state=0, X_density=X_density)
alpha_max = norm(X.T @ y, ord=np.inf) / n_samples

l1_penalty = compiled_clone(L1(alpha_max))
w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0)[0]

np.testing.assert_equal(w, 0)


@pytest.mark.parametrize("n_samples, n_features, rho, X_density",
product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8]))
def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density):
X, y, _ = make_correlated_data(n_samples, n_features,
random_state=0, X_density=X_density)
alpha_max = norm(X.T @ y, ord=np.inf) / n_samples
alpha = rho * alpha_max

sk_lasso = Lasso(alpha, fit_intercept=False, tol=1e-9)
sk_lasso.fit(X, y)

l1_penalty = compiled_clone(L1(alpha))
w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, max_iter=1000)[0]

np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-7, atol=1e-7)


if __name__ == '__main__':
    test_vs_lasso_sklearn(100, 10, 0.01, X_density=1.)
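As a quick sanity check of the objective rewrite stated in the gram_cd_solver docstring (a throwaway verification, not part of the PR):

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((30, 10))
y = rng.standard_normal(30)
w = rng.standard_normal(10)
n = X.shape[0]

# 1 / (2*n) * ||y - Xw||**2  ==  w @ Q @ w / (2*n) - q @ w / n + ||y||**2 / (2*n)
datafit = np.linalg.norm(y - X @ w) ** 2 / (2 * n)
rewritten = (w @ (X.T @ X) @ w / (2 * n) - (X.T @ y) @ w / n
             + np.linalg.norm(y) ** 2 / (2 * n))
np.testing.assert_allclose(datafit, rewritten)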