From b8fd539772b3bdd74af2db5ae7b593685bfb31bd Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Wed, 24 Aug 2022 10:21:16 +0200 Subject: [PATCH 01/24] init commit --- skglm/estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/estimators.py b/skglm/estimators.py index e65846222..6bdbfb4dd 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -1142,7 +1142,7 @@ def path(self, yXT, y, Cs, coef_init=None, return_n_iter=True, **params): Target vector relative to X. Cs : ndarray shape (n_Cs,) - Values of regularization strenghts for which solutions are + Values of regularization strengths for which solutions are computed. coef_init : array, shape (n_features,), optional From c2aecbae97c3773259923ebfb624d26cad6de726 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Wed, 24 Aug 2022 14:28:11 +0200 Subject: [PATCH 02/24] gram solver && unit test --- skglm/solvers/gram.py | 109 ++++++++++++++++++++++++++++++++ skglm/tests/test_gram_solver.py | 57 +++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 skglm/solvers/gram.py create mode 100644 skglm/tests/test_gram_solver.py diff --git a/skglm/solvers/gram.py b/skglm/solvers/gram.py new file mode 100644 index 000000000..6cabeb853 --- /dev/null +++ b/skglm/solvers/gram.py @@ -0,0 +1,109 @@ +import numpy as np +from numba import njit + + +def gram_solver(X, y, penalty, max_iter=20, max_epoch=1000, p0=10, tol=1e-4, + verbose=False): + """Run a Gram solver by reformulation the problem as below. + + Minimize:: + w.T @ Q @ w / (2*n_samples) - b.T @ w / n_samples + penalty(w) + + where:: + Q = X.T @ X + b = X.T @ y + """ + n_features = X.shape[1] + XtX = X.T @ X + Xty = X.T @ y + all_features = np.arange(n_features) + p_objs_out = [] + + w = np.zeros(n_features) + XtXw = np.zeros(n_features) + + for t in range(max_iter): + # compute scores + grad = _construct_grad(y, XtXw, Xty, all_features) + opt = penalty.subdiff_distance(w, grad, all_features) + + # check convergences + stop_crit = np.max(opt) + if verbose: + p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) + print( + f"Iteration {t+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) + + if stop_crit <= tol: + if verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break + + # build ws + gsupp_size = penalty.generalized_support(w).sum() + ws_size = max(min(p0, n_features), + min(n_features, 2 * gsupp_size)) + # similar to np.argsort()[-ws_size:] but without sorting + ws = np.argpartition(opt, -ws_size)[-ws_size:] + tol_in = 0.3 * stop_crit + + for epoch in range(max_epoch): + # inplace update of w, XtXw + _gram_cd_epoch(y, XtX, Xty, w, XtXw, penalty, ws) + + if epoch % 10 == 0: + grad = _construct_grad(y, XtXw, Xty, ws) + opt_in = penalty.subdiff_distance(w, grad, ws) + + stop_crit_in = np.max(opt_in) + if max(verbose-1, 0): + p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) + print( + f"Epoch {epoch+1}: {p_obj:.10f}, " + f"stopping crit in: {stop_crit_in:.2e}" + ) + + if stop_crit_in <= tol_in: + if max(verbose-1, 0): + print("Early exit") + break + + p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) + p_objs_out.append(p_obj) + return w, p_objs_out, stop_crit + + +@njit +def _gram_cd_epoch(y, XtX, Xty, w, XtXw, penalty, ws): + # inplace update of w, XtXw + for j in ws: + # skip for X[:, j] == 0 + if XtX[j, j] == 0: + continue + + old_w_j = w[j] + grad_j = (XtXw[j] - Xty[j]) / len(y) + step = 1 / XtX[j, j] # 1 / lipchitz_j + + w[j] = penalty.prox_1d(old_w_j - step * grad_j, step, j) + + # Gram matrix update + if w[j] != old_w_j: + XtXw += (w[j] - old_w_j) * XtX[:, j] + + +@njit +def _construct_grad(y, XtXw, Xty, ws): + n_samples = len(y) + grad = np.zeros(len(ws)) + for idx, j in enumerate(ws): + grad[idx] = (XtXw[j] - Xty[j]) / n_samples + return grad + + +@njit +def _quadratic_value(X, w, XtXw, Xty): + n_samples = X.shape[0] + return w @ XtXw / (2*n_samples) - Xty @ w / n_samples diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py new file mode 100644 index 000000000..3a6467f6a --- /dev/null +++ b/skglm/tests/test_gram_solver.py @@ -0,0 +1,57 @@ +import pytest +from itertools import product + +import numpy as np +from numpy.linalg import norm +from sklearn.linear_model import Lasso + +from skglm.penalties import L1 +from skglm.solvers.gram import gram_solver +from skglm.utils import make_correlated_data, compiled_clone + + +@pytest.mark.parametrize("n_samples, n_features", + product([100, 200], [50, 90])) +def test_alpha_max(n_samples, n_features): + X, y, _ = make_correlated_data(n_samples, n_features, random_state=0) + alpha_max = norm(X.T @ y, ord=np.inf) / n_samples + + l1_penalty = compiled_clone(L1(alpha_max)) + w = gram_solver(X, y, l1_penalty, tol=1e-9, verbose=2)[0] + + np.testing.assert_equal(w, 0) + + +@pytest.mark.parametrize("n_samples, n_features, rho", + product([50, 100], [20, 80], [1e-1, 1e-2])) +def test_vs_lasso_sklearn(n_samples, n_features, rho): + X, y, _ = make_correlated_data(n_samples, n_features, random_state=0) + alpha_max = norm(X.T @ y, ord=np.inf) / n_samples + alpha = rho * alpha_max + + sk_lasso = Lasso(alpha, fit_intercept=False, tol=1e-9) + sk_lasso.fit(X, y) + + l1_penalty = compiled_clone(L1(alpha)) + w = gram_solver(X, y, l1_penalty, tol=1e-9, verbose=0, p0=10)[0] + + print( + f"skglm: {compute_obj(X, y, alpha, w)}\n" + f"sklearn: {compute_obj(X, y, alpha, sk_lasso.coef_.flatten())}" + ) + + # np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-5, atol=1e-5) + + +def compute_obj(X, y, alpha, coef): + return norm(y - X @ coef) ** 2 / (2 * len(y)) + alpha * norm(coef, ord=1) + + +if __name__ == '__main__': + test_vs_lasso_sklearn(50, 80, 0.01) + + # print( + # f"skglm: {compute_obj(X, y, alpha, w)}\n" + # f"sklearn: {compute_obj(X, y, alpha, sk_lasso.coef_.flatten())}" + # ) + pass From 507fc8a92b2943703de2b654c778111416e8e87f Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Wed, 24 Aug 2022 15:50:44 +0200 Subject: [PATCH 03/24] fix bug gram solver && tighten test --- skglm/solvers/gram.py | 109 -------------------------------- skglm/solvers/gram_cd.py | 64 +++++++++++++++++++ skglm/tests/test_gram_solver.py | 21 ++---- 3 files changed, 69 insertions(+), 125 deletions(-) delete mode 100644 skglm/solvers/gram.py create mode 100644 skglm/solvers/gram_cd.py diff --git a/skglm/solvers/gram.py b/skglm/solvers/gram.py deleted file mode 100644 index 6cabeb853..000000000 --- a/skglm/solvers/gram.py +++ /dev/null @@ -1,109 +0,0 @@ -import numpy as np -from numba import njit - - -def gram_solver(X, y, penalty, max_iter=20, max_epoch=1000, p0=10, tol=1e-4, - verbose=False): - """Run a Gram solver by reformulation the problem as below. - - Minimize:: - w.T @ Q @ w / (2*n_samples) - b.T @ w / n_samples + penalty(w) - - where:: - Q = X.T @ X - b = X.T @ y - """ - n_features = X.shape[1] - XtX = X.T @ X - Xty = X.T @ y - all_features = np.arange(n_features) - p_objs_out = [] - - w = np.zeros(n_features) - XtXw = np.zeros(n_features) - - for t in range(max_iter): - # compute scores - grad = _construct_grad(y, XtXw, Xty, all_features) - opt = penalty.subdiff_distance(w, grad, all_features) - - # check convergences - stop_crit = np.max(opt) - if verbose: - p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) - print( - f"Iteration {t+1}: {p_obj:.10f}, " - f"stopping crit: {stop_crit:.2e}" - ) - - if stop_crit <= tol: - if verbose: - print(f"Stopping criterion max violation: {stop_crit:.2e}") - break - - # build ws - gsupp_size = penalty.generalized_support(w).sum() - ws_size = max(min(p0, n_features), - min(n_features, 2 * gsupp_size)) - # similar to np.argsort()[-ws_size:] but without sorting - ws = np.argpartition(opt, -ws_size)[-ws_size:] - tol_in = 0.3 * stop_crit - - for epoch in range(max_epoch): - # inplace update of w, XtXw - _gram_cd_epoch(y, XtX, Xty, w, XtXw, penalty, ws) - - if epoch % 10 == 0: - grad = _construct_grad(y, XtXw, Xty, ws) - opt_in = penalty.subdiff_distance(w, grad, ws) - - stop_crit_in = np.max(opt_in) - if max(verbose-1, 0): - p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) - print( - f"Epoch {epoch+1}: {p_obj:.10f}, " - f"stopping crit in: {stop_crit_in:.2e}" - ) - - if stop_crit_in <= tol_in: - if max(verbose-1, 0): - print("Early exit") - break - - p_obj = _quadratic_value(X, w, XtXw, Xty) + penalty.value(w) - p_objs_out.append(p_obj) - return w, p_objs_out, stop_crit - - -@njit -def _gram_cd_epoch(y, XtX, Xty, w, XtXw, penalty, ws): - # inplace update of w, XtXw - for j in ws: - # skip for X[:, j] == 0 - if XtX[j, j] == 0: - continue - - old_w_j = w[j] - grad_j = (XtXw[j] - Xty[j]) / len(y) - step = 1 / XtX[j, j] # 1 / lipchitz_j - - w[j] = penalty.prox_1d(old_w_j - step * grad_j, step, j) - - # Gram matrix update - if w[j] != old_w_j: - XtXw += (w[j] - old_w_j) * XtX[:, j] - - -@njit -def _construct_grad(y, XtXw, Xty, ws): - n_samples = len(y) - grad = np.zeros(len(ws)) - for idx, j in enumerate(ws): - grad[idx] = (XtXw[j] - Xty[j]) / n_samples - return grad - - -@njit -def _quadratic_value(X, w, XtXw, Xty): - n_samples = X.shape[0] - return w @ XtXw / (2*n_samples) - Xty @ w / n_samples diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py new file mode 100644 index 000000000..0f69785c3 --- /dev/null +++ b/skglm/solvers/gram_cd.py @@ -0,0 +1,64 @@ +import numpy as np +from numba import njit + + +def gram_cd_solver(X, y, penalty, max_iter=20, tol=1e-4, verbose=False): + """Run a Gram solver by reformulation the problem as below. + + Minimize:: + w.T @ Q @ w / (2*n_samples) - b.T @ w / n_samples + penalty(w) + + where:: + Q = X.T @ X + b = X.T @ y + """ + n_samples, n_features = X.shape + XtX = X.T @ X / n_samples + Xty = X.T @ y / n_samples + all_features = np.arange(n_features) + p_objs_out = [] + + w = np.zeros(n_features) + XtXw = np.zeros(n_features) + # initial: grad = -Xty + opt = penalty.subdiff_distance(w, -Xty, all_features) + + for t in range(max_iter): + # check convergences + stop_crit = np.max(opt) + if verbose: + p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + print( + f"Iteration {t+1}: {p_obj:.10f}, " + f"stopping crit: {stop_crit:.2e}" + ) + + if stop_crit <= tol: + if verbose: + print(f"Stopping criterion max violation: {stop_crit:.2e}") + break + + # inplace update of w, XtXw, opt + _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, + all_features, n_updates=n_features) + + p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + p_objs_out.append(p_obj) + return w, p_objs_out, stop_crit + + +@njit +def _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, ws, n_updates): + # inplace update of w, XtXw, opt + for _ in range(n_updates): + grad = XtXw - Xty + opt[:] = penalty.subdiff_distance(w, grad, ws) + j_max = np.argmax(opt) + + old_w_j = w[j_max] + step = 1 / XtX[j_max, j_max] # 1 / lipchitz_j + w[j_max] = penalty.prox_1d(old_w_j - step * grad[j_max], step, j_max) + + # Gram matrix update + if w[j_max] != old_w_j: + XtXw += (w[j_max] - old_w_j) * XtX[:, j_max] diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 3a6467f6a..5bb0e19e4 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -6,7 +6,7 @@ from sklearn.linear_model import Lasso from skglm.penalties import L1 -from skglm.solvers.gram import gram_solver +from skglm.solvers.gram_cd import gram_cd_solver from skglm.utils import make_correlated_data, compiled_clone @@ -17,13 +17,13 @@ def test_alpha_max(n_samples, n_features): alpha_max = norm(X.T @ y, ord=np.inf) / n_samples l1_penalty = compiled_clone(L1(alpha_max)) - w = gram_solver(X, y, l1_penalty, tol=1e-9, verbose=2)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=2)[0] np.testing.assert_equal(w, 0) @pytest.mark.parametrize("n_samples, n_features, rho", - product([50, 100], [20, 80], [1e-1, 1e-2])) + product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3])) def test_vs_lasso_sklearn(n_samples, n_features, rho): X, y, _ = make_correlated_data(n_samples, n_features, random_state=0) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples @@ -33,14 +33,9 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho): sk_lasso.fit(X, y) l1_penalty = compiled_clone(L1(alpha)) - w = gram_solver(X, y, l1_penalty, tol=1e-9, verbose=0, p0=10)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, max_iter=6000)[0] - print( - f"skglm: {compute_obj(X, y, alpha, w)}\n" - f"sklearn: {compute_obj(X, y, alpha, sk_lasso.coef_.flatten())}" - ) - - # np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-5, atol=1e-5) def compute_obj(X, y, alpha, coef): @@ -48,10 +43,4 @@ def compute_obj(X, y, alpha, coef): if __name__ == '__main__': - test_vs_lasso_sklearn(50, 80, 0.01) - - # print( - # f"skglm: {compute_obj(X, y, alpha, w)}\n" - # f"sklearn: {compute_obj(X, y, alpha, sk_lasso.coef_.flatten())}" - # ) pass From c9b64c27abe5f4c4e979c6afb2cbfd53a99a3c5b Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Wed, 24 Aug 2022 16:10:36 +0200 Subject: [PATCH 04/24] add anderson acceleration --- skglm/solvers/gram_cd.py | 19 ++++++++++++++++++- skglm/tests/test_gram_solver.py | 8 ++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 0f69785c3..71414253b 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -1,8 +1,9 @@ import numpy as np from numba import njit +from skglm.utils import AndersonAcceleration -def gram_cd_solver(X, y, penalty, max_iter=20, tol=1e-4, verbose=False): +def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=False): """Run a Gram solver by reformulation the problem as below. Minimize:: @@ -22,6 +23,10 @@ def gram_cd_solver(X, y, penalty, max_iter=20, tol=1e-4, verbose=False): XtXw = np.zeros(n_features) # initial: grad = -Xty opt = penalty.subdiff_distance(w, -Xty, all_features) + if use_acc: + accelerator = AndersonAcceleration(K=5) + w_acc = np.zeros(n_features) + XtXw_acc = np.zeros(n_features) for t in range(max_iter): # check convergences @@ -42,6 +47,17 @@ def gram_cd_solver(X, y, penalty, max_iter=20, tol=1e-4, verbose=False): _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, all_features, n_updates=n_features) + # perform anderson extrapolation + if use_acc: + w_acc, XtXw_acc, is_extrapolated = accelerator.extrapolate(w, XtXw) + + if is_extrapolated: + p_obj_acc = 0.5 * w_acc @ XtXw_acc - Xty @ w_acc + penalty.value(w_acc) + p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + if p_obj_acc < p_obj: + w[:] = w_acc + XtXw[:] = XtXw_acc + p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) p_objs_out.append(p_obj) return w, p_objs_out, stop_crit @@ -50,6 +66,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, tol=1e-4, verbose=False): @njit def _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, ws, n_updates): # inplace update of w, XtXw, opt + # perform greedy cd updates for _ in range(n_updates): grad = XtXw - Xty opt[:] = penalty.subdiff_distance(w, grad, ws) diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 5bb0e19e4..4bee031d6 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -33,13 +33,9 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho): sk_lasso.fit(X, y) l1_penalty = compiled_clone(L1(alpha)) - w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, max_iter=6000)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, max_iter=1000)[0] - np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-5, atol=1e-5) - - -def compute_obj(X, y, alpha, coef): - return norm(y - X @ coef) ** 2 / (2 * len(y)) + alpha * norm(coef, ord=1) + np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-7, atol=1e-7) if __name__ == '__main__': From 20c191179aaeaa1e6090a64b984f8e3ac9f32745 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Wed, 24 Aug 2022 17:23:51 +0200 Subject: [PATCH 05/24] bug ``stop_criter`` && refactor --- skglm/solvers/gram_cd.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 71414253b..c220d8106 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -7,22 +7,22 @@ def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=F """Run a Gram solver by reformulation the problem as below. Minimize:: - w.T @ Q @ w / (2*n_samples) - b.T @ w / n_samples + penalty(w) + w.T @ Q @ w / (2*n_samples) - q.T @ w / n_samples + penalty(w) where:: Q = X.T @ X - b = X.T @ y + q = X.T @ y """ n_samples, n_features = X.shape XtX = X.T @ X / n_samples Xty = X.T @ y / n_samples all_features = np.arange(n_features) + stop_crit = np.inf p_objs_out = [] w = np.zeros(n_features) XtXw = np.zeros(n_features) - # initial: grad = -Xty - opt = penalty.subdiff_distance(w, -Xty, all_features) + opt = penalty.subdiff_distance(w, -Xty, all_features) # initial: grad = -Xty if use_acc: accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) @@ -60,7 +60,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=F p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) p_objs_out.append(p_obj) - return w, p_objs_out, stop_crit + return w, np.array(p_objs_out), stop_crit @njit From f2e985d3b7c4c08626ab5e672414ff4591c1c7e4 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 09:49:40 +0200 Subject: [PATCH 06/24] refactoring of var names --- skglm/solvers/gram_cd.py | 52 +++++++++++++++++++-------------- skglm/tests/test_gram_solver.py | 2 +- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index c220d8106..92ce81819 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -4,35 +4,40 @@ def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=False): - """Run a Gram solver by reformulation the problem as below. + """Run coordinate descent while keeping the gradients up-to-date with Gram updates. Minimize:: + 1 / (2*n_samples) * norm(y - Xw)**2 + penalty(w) + + Which can be rewritten as:: w.T @ Q @ w / (2*n_samples) - q.T @ w / n_samples + penalty(w) where:: - Q = X.T @ X + Q = X.T @ X (gram matrix) q = X.T @ y """ n_samples, n_features = X.shape - XtX = X.T @ X / n_samples - Xty = X.T @ y / n_samples + scaled_gram = X.T @ X / n_samples + scaled_Xty = X.T @ y / n_samples + scaled_y_norm2 = np.linalg.norm(y) ** 2 / (2 * n_samples) all_features = np.arange(n_features) - stop_crit = np.inf + stop_crit = np.inf # prevent ref before assign p_objs_out = [] w = np.zeros(n_features) - XtXw = np.zeros(n_features) - opt = penalty.subdiff_distance(w, -Xty, all_features) # initial: grad = -Xty + scaled_gram_w = np.zeros(n_features) + opt = penalty.subdiff_distance(w, -scaled_Xty, all_features) # initial: grad = -Xty if use_acc: accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) - XtXw_acc = np.zeros(n_features) + scaled_gram_w_acc = np.zeros(n_features) for t in range(max_iter): # check convergences stop_crit = np.max(opt) if verbose: - p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + p_obj = (0.5 * w @ scaled_gram_w - scaled_Xty @ w + + scaled_y_norm2 + penalty.value(w)) print( f"Iteration {t+1}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" @@ -43,39 +48,42 @@ def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=F print(f"Stopping criterion max violation: {stop_crit:.2e}") break - # inplace update of w, XtXw, opt - _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, - all_features, n_updates=n_features) + # inplace update of w, XtXw + opt = _gram_cd_iter(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, + all_features, n_updates=n_features) # perform anderson extrapolation if use_acc: - w_acc, XtXw_acc, is_extrapolated = accelerator.extrapolate(w, XtXw) + w_acc, scaled_gram_w_acc, is_extrapolated = accelerator.extrapolate( + w, scaled_gram_w) if is_extrapolated: - p_obj_acc = 0.5 * w_acc @ XtXw_acc - Xty @ w_acc + penalty.value(w_acc) - p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + p_obj_acc = (0.5 * w_acc @ scaled_gram_w_acc - scaled_Xty @ w_acc + + penalty.value(w_acc)) + p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w) if p_obj_acc < p_obj: w[:] = w_acc - XtXw[:] = XtXw_acc + scaled_gram_w[:] = scaled_gram_w_acc - p_obj = 0.5 * w @ XtXw - Xty @ w + penalty.value(w) + p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w) p_objs_out.append(p_obj) return w, np.array(p_objs_out), stop_crit @njit -def _gram_cd_iter(XtX, Xty, w, XtXw, penalty, opt, ws, n_updates): +def _gram_cd_iter(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws, n_updates): # inplace update of w, XtXw, opt # perform greedy cd updates for _ in range(n_updates): - grad = XtXw - Xty - opt[:] = penalty.subdiff_distance(w, grad, ws) + grad = scaled_gram_w - scaled_Xty + opt = penalty.subdiff_distance(w, grad, ws) j_max = np.argmax(opt) old_w_j = w[j_max] - step = 1 / XtX[j_max, j_max] # 1 / lipchitz_j + step = 1 / scaled_gram[j_max, j_max] # 1 / lipchitz_j w[j_max] = penalty.prox_1d(old_w_j - step * grad[j_max], step, j_max) # Gram matrix update if w[j_max] != old_w_j: - XtXw += (w[j_max] - old_w_j) * XtX[:, j_max] + scaled_gram_w += (w[j_max] - old_w_j) * scaled_gram[:, j_max] + return opt diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 4bee031d6..1365317e9 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -17,7 +17,7 @@ def test_alpha_max(n_samples, n_features): alpha_max = norm(X.T @ y, ord=np.inf) / n_samples l1_penalty = compiled_clone(L1(alpha_max)) - w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=2)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0)[0] np.testing.assert_equal(w, 0) From 2dbc8e440c0f4dae1899576024286044159f6749 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 09:53:16 +0200 Subject: [PATCH 07/24] handle ``w_init`` --- skglm/solvers/gram_cd.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 92ce81819..e1908c42e 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -3,7 +3,8 @@ from skglm.utils import AndersonAcceleration -def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=False): +def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, + use_acc=True, tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. Minimize:: @@ -19,13 +20,13 @@ def gram_cd_solver(X, y, penalty, max_iter=20, use_acc=True, tol=1e-4, verbose=F n_samples, n_features = X.shape scaled_gram = X.T @ X / n_samples scaled_Xty = X.T @ y / n_samples - scaled_y_norm2 = np.linalg.norm(y) ** 2 / (2 * n_samples) + scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples) all_features = np.arange(n_features) stop_crit = np.inf # prevent ref before assign p_objs_out = [] - w = np.zeros(n_features) - scaled_gram_w = np.zeros(n_features) + w = np.zeros(n_features) if w_init is None else w_init + scaled_gram_w = np.zeros(n_features) if w_init is None else scaled_gram @ w_init opt = penalty.subdiff_distance(w, -scaled_Xty, all_features) # initial: grad = -Xty if use_acc: accelerator = AndersonAcceleration(K=5) From 8ca7a41a0b6f3965a30289535741443a71c60e86 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 09:54:58 +0200 Subject: [PATCH 08/24] refactor ``_gram_cd_`` --- skglm/solvers/gram_cd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index e1908c42e..f6f86253f 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -50,8 +50,8 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, break # inplace update of w, XtXw - opt = _gram_cd_iter(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, - all_features, n_updates=n_features) + opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, + all_features) # perform anderson extrapolation if use_acc: @@ -72,10 +72,10 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, @njit -def _gram_cd_iter(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws, n_updates): +def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): # inplace update of w, XtXw, opt # perform greedy cd updates - for _ in range(n_updates): + for _ in range(len(w)): grad = scaled_gram_w - scaled_Xty opt = penalty.subdiff_distance(w, grad, ws) j_max = np.argmax(opt) From 34532334a91c2169c313fb43dc5a687eec73eae6 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 10:05:08 +0200 Subject: [PATCH 09/24] gram epoch greedy and cyclic strategy --- skglm/solvers/gram_cd.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index f6f86253f..62fd23501 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -4,7 +4,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, - use_acc=True, tol=1e-4, verbose=False): + use_acc=True, cd_type='greedy', tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. Minimize:: @@ -50,8 +50,9 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, break # inplace update of w, XtXw - opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, - all_features) + _gram_cd_epoch = _gram_cd_greedy if cd_type == 'greedy' else _gram_cd_cyclic + opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, + penalty, all_features) # perform anderson extrapolation if use_acc: @@ -72,7 +73,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, @njit -def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): +def _gram_cd_greedy(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): # inplace update of w, XtXw, opt # perform greedy cd updates for _ in range(len(w)): @@ -88,3 +89,23 @@ def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): if w[j_max] != old_w_j: scaled_gram_w += (w[j_max] - old_w_j) * scaled_gram[:, j_max] return opt + + +@njit +def _gram_cd_cyclic(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): + # inplace update of w, XtXw, opt + # perform greedy cd updates + for j in range(len(w)): + grad = scaled_gram_w - scaled_Xty + + old_w_j = w[j] + step = 1 / scaled_gram[j, j] # 1 / lipchitz_j + w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j) + + # Gram matrix update + if w[j] != old_w_j: + scaled_gram_w += (w[j] - old_w_j) * scaled_gram[:, j] + + # opt + grad = scaled_gram_w - scaled_Xty + return penalty.subdiff_distance(w, grad, ws) From 8d3dbc16461d9870a5a59b34dc0810bc62567efe Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 10:19:46 +0200 Subject: [PATCH 10/24] extend to sparse case && unitest --- skglm/solvers/gram_cd.py | 6 ++++++ skglm/tests/test_gram_solver.py | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 62fd23501..3dc29d851 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -1,5 +1,6 @@ import numpy as np from numba import njit +from scipy.sparse import issparse from skglm.utils import AndersonAcceleration @@ -21,6 +22,10 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, scaled_gram = X.T @ X / n_samples scaled_Xty = X.T @ y / n_samples scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples) + + if issparse(X): + scaled_gram = scaled_gram.toarray() + all_features = np.arange(n_features) stop_crit = np.inf # prevent ref before assign p_objs_out = [] @@ -28,6 +33,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, w = np.zeros(n_features) if w_init is None else w_init scaled_gram_w = np.zeros(n_features) if w_init is None else scaled_gram @ w_init opt = penalty.subdiff_distance(w, -scaled_Xty, all_features) # initial: grad = -Xty + if use_acc: accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 1365317e9..d4f904b2c 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -10,10 +10,11 @@ from skglm.utils import make_correlated_data, compiled_clone -@pytest.mark.parametrize("n_samples, n_features", - product([100, 200], [50, 90])) -def test_alpha_max(n_samples, n_features): - X, y, _ = make_correlated_data(n_samples, n_features, random_state=0) +@pytest.mark.parametrize("n_samples, n_features, X_density", + product([100, 200], [50, 90], [1., 0.6])) +def test_alpha_max(n_samples, n_features, X_density): + X, y, _ = make_correlated_data(n_samples, n_features, + random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples l1_penalty = compiled_clone(L1(alpha_max)) @@ -22,10 +23,11 @@ def test_alpha_max(n_samples, n_features): np.testing.assert_equal(w, 0) -@pytest.mark.parametrize("n_samples, n_features, rho", - product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3])) -def test_vs_lasso_sklearn(n_samples, n_features, rho): - X, y, _ = make_correlated_data(n_samples, n_features, random_state=0) +@pytest.mark.parametrize("n_samples, n_features, rho, X_density", + product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8])) +def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density): + X, y, _ = make_correlated_data(n_samples, n_features, + random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples alpha = rho * alpha_max @@ -39,4 +41,5 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho): if __name__ == '__main__': + test_vs_lasso_sklearn(100, 10, 0.01) pass From cdd7e348f6cb396e3a34c5f2972435061694797e Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 10:41:17 +0200 Subject: [PATCH 11/24] one implementation of _gram_cd && unittest --- skglm/solvers/gram_cd.py | 59 ++++++++++++++------------------- skglm/tests/test_gram_solver.py | 22 +++++++----- 2 files changed, 37 insertions(+), 44 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 3dc29d851..704d98e53 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -5,7 +5,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, - use_acc=True, cd_type='greedy', tol=1e-4, verbose=False): + use_acc=True, cd_strategy='greedy', tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. Minimize:: @@ -15,8 +15,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, w.T @ Q @ w / (2*n_samples) - q.T @ w / n_samples + penalty(w) where:: - Q = X.T @ X (gram matrix) - q = X.T @ y + Q = X.T @ X (gram matrix), and q = X.T @ y """ n_samples, n_features = X.shape scaled_gram = X.T @ X / n_samples @@ -32,7 +31,8 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, w = np.zeros(n_features) if w_init is None else w_init scaled_gram_w = np.zeros(n_features) if w_init is None else scaled_gram @ w_init - opt = penalty.subdiff_distance(w, -scaled_Xty, all_features) # initial: grad = -Xty + grad = scaled_gram_w - scaled_Xty + opt = penalty.subdiff_distance(w, grad, all_features) if use_acc: accelerator = AndersonAcceleration(K=5) @@ -56,11 +56,10 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, break # inplace update of w, XtXw - _gram_cd_epoch = _gram_cd_greedy if cd_type == 'greedy' else _gram_cd_cyclic opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, - penalty, all_features) + penalty, cd_strategy) - # perform anderson extrapolation + # perform Anderson extrapolation if use_acc: w_acc, scaled_gram_w_acc, is_extrapolated = accelerator.extrapolate( w, scaled_gram_w) @@ -73,45 +72,35 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, w[:] = w_acc scaled_gram_w[:] = scaled_gram_w_acc + # store p_obj p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w) p_objs_out.append(p_obj) return w, np.array(p_objs_out), stop_crit @njit -def _gram_cd_greedy(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): - # inplace update of w, XtXw, opt - # perform greedy cd updates - for _ in range(len(w)): +def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, cd_strategy): + all_features = np.arange(len(w)) + for j in all_features: + # compute grad grad = scaled_gram_w - scaled_Xty - opt = penalty.subdiff_distance(w, grad, ws) - j_max = np.argmax(opt) - - old_w_j = w[j_max] - step = 1 / scaled_gram[j_max, j_max] # 1 / lipchitz_j - w[j_max] = penalty.prox_1d(old_w_j - step * grad[j_max], step, j_max) - - # Gram matrix update - if w[j_max] != old_w_j: - scaled_gram_w += (w[j_max] - old_w_j) * scaled_gram[:, j_max] - return opt + # select feature j + if cd_strategy == 'greedy': + opt = penalty.subdiff_distance(w, grad, all_features) + chosen_j = np.argmax(opt) + else: # cyclic + chosen_j = j -@njit -def _gram_cd_cyclic(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, ws): - # inplace update of w, XtXw, opt - # perform greedy cd updates - for j in range(len(w)): - grad = scaled_gram_w - scaled_Xty - - old_w_j = w[j] - step = 1 / scaled_gram[j, j] # 1 / lipchitz_j - w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j) + # update w_j + old_w_j = w[chosen_j] + step = 1 / scaled_gram[chosen_j, chosen_j] # 1 / lipchitz_j + w[chosen_j] = penalty.prox_1d(old_w_j - step * grad[chosen_j], step, chosen_j) # Gram matrix update - if w[j] != old_w_j: - scaled_gram_w += (w[j] - old_w_j) * scaled_gram[:, j] + if w[chosen_j] != old_w_j: + scaled_gram_w += (w[chosen_j] - old_w_j) * scaled_gram[:, chosen_j] # opt grad = scaled_gram_w - scaled_Xty - return penalty.subdiff_distance(w, grad, ws) + return penalty.subdiff_distance(w, grad, all_features) diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index d4f904b2c..7c168ff2b 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -10,22 +10,25 @@ from skglm.utils import make_correlated_data, compiled_clone -@pytest.mark.parametrize("n_samples, n_features, X_density", - product([100, 200], [50, 90], [1., 0.6])) -def test_alpha_max(n_samples, n_features, X_density): +@pytest.mark.parametrize("n_samples, n_features, X_density, cd_strategy", + product([100, 200], [50, 90], [1., 0.6], + ['greedy', 'cyclic'])) +def test_alpha_max(n_samples, n_features, X_density, cd_strategy): X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples l1_penalty = compiled_clone(L1(alpha_max)) - w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, + verbose=0, cd_strategy=cd_strategy)[0] np.testing.assert_equal(w, 0) -@pytest.mark.parametrize("n_samples, n_features, rho, X_density", - product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8])) -def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density): +@pytest.mark.parametrize("n_samples, n_features, rho, X_density, cd_strategy", + product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8], + ['greedy', 'cyclic'])) +def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, cd_strategy): X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples @@ -35,11 +38,12 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density): sk_lasso.fit(X, y) l1_penalty = compiled_clone(L1(alpha)) - w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, max_iter=1000)[0] + w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, + max_iter=1000, cd_strategy=cd_strategy)[0] np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-7, atol=1e-7) if __name__ == '__main__': - test_vs_lasso_sklearn(100, 10, 0.01) + test_vs_lasso_sklearn(100, 10, 0.01, 1.) pass From f4bfeafc809a539095c60663a08786d40a669197 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 10:45:10 +0200 Subject: [PATCH 12/24] greedy_cd arg instead of cd_strategy --- skglm/solvers/gram_cd.py | 8 ++++---- skglm/tests/test_gram_solver.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 704d98e53..82e23d8c8 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -5,7 +5,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, - use_acc=True, cd_strategy='greedy', tol=1e-4, verbose=False): + use_acc=True, greedy_cd=True, tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. Minimize:: @@ -57,7 +57,7 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, # inplace update of w, XtXw opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, - penalty, cd_strategy) + penalty, greedy_cd) # perform Anderson extrapolation if use_acc: @@ -79,14 +79,14 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, @njit -def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, cd_strategy): +def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, greedy_cd): all_features = np.arange(len(w)) for j in all_features: # compute grad grad = scaled_gram_w - scaled_Xty # select feature j - if cd_strategy == 'greedy': + if greedy_cd: opt = penalty.subdiff_distance(w, grad, all_features) chosen_j = np.argmax(opt) else: # cyclic diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 7c168ff2b..e69659632 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -10,25 +10,25 @@ from skglm.utils import make_correlated_data, compiled_clone -@pytest.mark.parametrize("n_samples, n_features, X_density, cd_strategy", +@pytest.mark.parametrize("n_samples, n_features, X_density, greedy_cd", product([100, 200], [50, 90], [1., 0.6], - ['greedy', 'cyclic'])) -def test_alpha_max(n_samples, n_features, X_density, cd_strategy): + [True, False])) +def test_alpha_max(n_samples, n_features, X_density, greedy_cd): X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples l1_penalty = compiled_clone(L1(alpha_max)) w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, - verbose=0, cd_strategy=cd_strategy)[0] + verbose=0, greedy_cd=greedy_cd)[0] np.testing.assert_equal(w, 0) -@pytest.mark.parametrize("n_samples, n_features, rho, X_density, cd_strategy", +@pytest.mark.parametrize("n_samples, n_features, rho, X_density, greedy_cd", product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8], - ['greedy', 'cyclic'])) -def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, cd_strategy): + [True, False])) +def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, greedy_cd): X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, X_density=X_density) alpha_max = norm(X.T @ y, ord=np.inf) / n_samples @@ -39,7 +39,7 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, cd_strategy): l1_penalty = compiled_clone(L1(alpha)) w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, verbose=0, - max_iter=1000, cd_strategy=cd_strategy)[0] + max_iter=1000, greedy_cd=greedy_cd)[0] np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-7, atol=1e-7) From 4c0accad94be820567a36bf67a9067e17cf96a7f Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 10:55:27 +0200 Subject: [PATCH 13/24] add docs --- skglm/solvers/gram_cd.py | 44 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 82e23d8c8..5f9df9364 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -4,7 +4,7 @@ from skglm.utils import AndersonAcceleration -def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, +def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, use_acc=True, greedy_cd=True, tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. @@ -16,6 +16,48 @@ def gram_cd_solver(X, y, penalty, max_iter=20, w_init=None, where:: Q = X.T @ X (gram matrix), and q = X.T @ y + + Parameters + ---------- + X : array or sparse CSC matrix, shape (n_samples, n_features) + Design matrix. + + y : array, shape (n_samples,) + Target vector. + + penalty : instance of BasePenalty + Penalty object. + + max_iter : int, default 100 + Maximum number of iterations. + + w_init : array, shape (n_features,), default None + Initial value of coefficients. + If set to None, a zero vector is used instead. + + use_acc : bool, default True + Extrapolate the iterates based on the past 5 iterates if set to True. + + greedy_cd : bool, default True + Use a greedy strategy to select features to update in Gram CD epoch + if set to True. A cyclic strategy is used otherwise. + + tol : float, default 1e-4 + Tolerance for convergence. + + verbose : bool, default False + Amount of verbosity. 0/False is silent. + + Returns + ------- + w : array, shape (n_features,) + Solution that minimizes the problem defined by datafit and penalty. + + objs_out : array, shape (n_iter,) + The objective values at every outer iteration. + + stop_crit : float + The value of the stopping criterion when the solver stops. """ n_samples, n_features = X.shape scaled_gram = X.T @ X / n_samples From dcab05440e261885207ccd0ce81116226018092e Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 25 Aug 2022 13:42:35 +0200 Subject: [PATCH 14/24] script fast gram, not faster than scipy --- fast_gram.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 fast_gram.py diff --git a/fast_gram.py b/fast_gram.py new file mode 100644 index 000000000..4d5b111c6 --- /dev/null +++ b/fast_gram.py @@ -0,0 +1,46 @@ +import time +import numpy as np +from numba import njit + +from skglm.utils import make_correlated_data + + +@njit +def fast_gram_sparse(data, indices, indptr): + # this needs indices to be sorted (sort_indices()), nasty bug otherwise + n_features = len(indptr) - 1 + gram = np.zeros((n_features, n_features)) + + for i in range(n_features): + i_start, i_end = indptr[i], indptr[i + 1] + gram[i, i] = (data[i_start:i_end] ** 2).sum() + for j in range(i): + j_start, j_end = indptr[j], indptr[j + 1] + scal = 0 + i_idx = i_start + j_idx = j_start + while i_idx < i_end and j_idx < j_end: + if indices[i_idx] < indices[j_idx]: + i_idx += 1 + elif indices[i_idx] > indices[j_idx]: + j_idx += 1 + else: # they match + scal += data[i_idx] * data[j_idx] + i_idx += 1 + j_idx += 1 + gram[i, j] = scal + gram[j, i] = scal + return gram + + +X, _, _ = make_correlated_data(10000, 200, random_state=0, X_density=0.5) +t0 = time.time() +gram2 = fast_gram_sparse(X.data, X.indices, X.indptr) +print(f"us: {time.time() -t0:.3f} s") + +t0 = time.time() +gram1 = (X.T @ X).toarray() +print(f"sp: {time.time() -t0:.3f} s") + + +print(np.linalg.norm(gram1 - gram2)) From e8bc96ecca88655e0bd3ff6778078206e5ef86b6 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 14:47:03 +0200 Subject: [PATCH 15/24] fast gram timing --- fast_gram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fast_gram.py b/fast_gram.py index 4d5b111c6..1ac175035 100644 --- a/fast_gram.py +++ b/fast_gram.py @@ -33,7 +33,10 @@ def fast_gram_sparse(data, indices, indptr): return gram -X, _, _ = make_correlated_data(10000, 200, random_state=0, X_density=0.5) +X, _, _ = make_correlated_data(10, 5, random_state=0, X_density=0.5) +fast_gram_sparse(X.data, X.indices, X.indptr) + +X, _, _ = make_correlated_data(1000, 200, random_state=0, X_density=0.5) t0 = time.time() gram2 = fast_gram_sparse(X.data, X.indices, X.indptr) print(f"us: {time.time() -t0:.3f} s") From 61a67c43192763d9321c1efd9b76bb40d0cc252b Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 15:02:04 +0200 Subject: [PATCH 16/24] keep grads instead --- skglm/solvers/gram_cd.py | 27 ++++++++++----------------- skglm/tests/test_gram_solver.py | 1 - 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 5f9df9364..4a2917b16 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -79,13 +79,13 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, if use_acc: accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) - scaled_gram_w_acc = np.zeros(n_features) + grad_acc = np.zeros(n_features) for t in range(max_iter): # check convergences stop_crit = np.max(opt) if verbose: - p_obj = (0.5 * w @ scaled_gram_w - scaled_Xty @ w + + p_obj = (0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + scaled_y_norm2 + penalty.value(w)) print( f"Iteration {t+1}: {p_obj:.10f}, " @@ -98,35 +98,30 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, break # inplace update of w, XtXw - opt = _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, - penalty, greedy_cd) + opt = _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd) # perform Anderson extrapolation if use_acc: - w_acc, scaled_gram_w_acc, is_extrapolated = accelerator.extrapolate( - w, scaled_gram_w) + w_acc, grad_acc, is_extrapolated = accelerator.extrapolate(w, grad) if is_extrapolated: - p_obj_acc = (0.5 * w_acc @ scaled_gram_w_acc - scaled_Xty @ w_acc + + p_obj_acc = (0.5 * w_acc @ (scaled_gram @ w_acc) - scaled_Xty @ w_acc + penalty.value(w_acc)) - p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w) + p_obj = 0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + penalty.value(w) if p_obj_acc < p_obj: w[:] = w_acc - scaled_gram_w[:] = scaled_gram_w_acc + grad[:] = grad_acc # store p_obj - p_obj = 0.5 * w @ scaled_gram_w - scaled_Xty @ w + penalty.value(w) + p_obj = 0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + penalty.value(w) p_objs_out.append(p_obj) return w, np.array(p_objs_out), stop_crit @njit -def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, greedy_cd): +def _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd): all_features = np.arange(len(w)) for j in all_features: - # compute grad - grad = scaled_gram_w - scaled_Xty - # select feature j if greedy_cd: opt = penalty.subdiff_distance(w, grad, all_features) @@ -141,8 +136,6 @@ def _gram_cd_epoch(scaled_gram, scaled_Xty, w, scaled_gram_w, penalty, greedy_cd # Gram matrix update if w[chosen_j] != old_w_j: - scaled_gram_w += (w[chosen_j] - old_w_j) * scaled_gram[:, chosen_j] + grad += (w[chosen_j] - old_w_j) * scaled_gram[:, chosen_j] - # opt - grad = scaled_gram_w - scaled_Xty return penalty.subdiff_distance(w, grad, all_features) diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index e69659632..d6a6d7cc8 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -45,5 +45,4 @@ def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, greedy_cd): if __name__ == '__main__': - test_vs_lasso_sklearn(100, 10, 0.01, 1.) pass From 1b6c1697bc726b7bc5ca9d1e5e0205b99865df8f Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 15:04:54 +0200 Subject: [PATCH 17/24] refactor ``chosen_j`` --- skglm/solvers/gram_cd.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 4a2917b16..96c9c2db8 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -121,21 +121,21 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, @njit def _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd): all_features = np.arange(len(w)) - for j in all_features: + for cd_iter in all_features: # select feature j if greedy_cd: opt = penalty.subdiff_distance(w, grad, all_features) - chosen_j = np.argmax(opt) + j = np.argmax(opt) else: # cyclic - chosen_j = j + j = cd_iter # update w_j - old_w_j = w[chosen_j] - step = 1 / scaled_gram[chosen_j, chosen_j] # 1 / lipchitz_j - w[chosen_j] = penalty.prox_1d(old_w_j - step * grad[chosen_j], step, chosen_j) + old_w_j = w[j] + step = 1 / scaled_gram[j, j] # 1 / lipchitz_j + w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j) # Gram matrix update - if w[chosen_j] != old_w_j: - grad += (w[chosen_j] - old_w_j) * scaled_gram[:, chosen_j] + if w[j] != old_w_j: + grad += (w[j] - old_w_j) * scaled_gram[:, j] return penalty.subdiff_distance(w, grad, all_features) From c9c55752cf2286da46ae371ca0f81eac6038c674 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Thu, 25 Aug 2022 15:24:58 +0200 Subject: [PATCH 18/24] script to profile --- profile_gram.py | 37 +++++++++++++++++++++++++++++++++++++ skglm/solvers/gram_cd.py | 12 ++++++++---- 2 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 profile_gram.py diff --git a/profile_gram.py b/profile_gram.py new file mode 100644 index 000000000..c2290598b --- /dev/null +++ b/profile_gram.py @@ -0,0 +1,37 @@ +import numpy as np +from scipy.sparse import csc_matrix +from skglm.utils import make_correlated_data, compiled_clone +from skglm.solvers.gram_cd import (gram_cd_solver) +from skglm.datafits.single_task import Logistic +from skglm.penalties.separable import L1 + +import line_profiler + + +# params +n_samples, n_features = 10000, 200 +X_density = 1. +rho = 0.005 +case_sparse = False + + +X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, + X_density=X_density) +y = np.sign(y) +if case_sparse: + X = csc_matrix(X) + +alpha_max = np.linalg.norm(X.T @ y, ord=np.inf) / (2 * n_samples) +alpha = rho * alpha_max + + +l1_penalty = compiled_clone(L1(alpha)) +# cache numba jit compilation +gram_cd_solver(X, y, l1_penalty, max_iter=1000, tol=1e-9, use_acc=False) + +# profile code +profiler = line_profiler.LineProfiler() +profiler.add_function(gram_cd_solver) +profiler.enable_by_count() +gram_cd_solver(X, y, l1_penalty, max_iter=1000, tol=1e-9, use_acc=False) +profiler.print_stats() diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 96c9c2db8..a67964844 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -60,12 +60,16 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, The value of the stopping criterion when the solver stops. """ n_samples, n_features = X.shape - scaled_gram = X.T @ X / n_samples - scaled_Xty = X.T @ y / n_samples - scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples) if issparse(X): - scaled_gram = scaled_gram.toarray() + scaled_gram = X.T.dot(X) + scaled_gram = scaled_gram.toarray() / n_samples + scaled_Xty = X.T.dot(y) / n_samples + else: + scaled_gram = X.T @ X / n_samples + scaled_Xty = X.T @ y / n_samples + + scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples) all_features = np.arange(n_features) stop_crit = np.inf # prevent ref before assign From 68a0458474f35f6a04f92d3e8b693e4d9988d2c8 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 26 Aug 2022 09:21:40 +0200 Subject: [PATCH 19/24] potential improvements, docstring --- skglm/solvers/gram_cd.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index a67964844..346a2ee7f 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -1,6 +1,8 @@ +import warnings import numpy as np from numba import njit from scipy.sparse import issparse + from skglm.utils import AndersonAcceleration @@ -8,6 +10,9 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, use_acc=True, greedy_cd=True, tol=1e-4, verbose=False): """Run coordinate descent while keeping the gradients up-to-date with Gram updates. + This solver should be used when n_features < n_samples, and computes the + (n_features, n_features) Gram matrix which comes with an overhead. + Minimize:: 1 / (2*n_samples) * norm(y - Xw)**2 + penalty(w) @@ -39,7 +44,7 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, Extrapolate the iterates based on the past 5 iterates if set to True. greedy_cd : bool, default True - Use a greedy strategy to select features to update in Gram CD epoch + Use a greedy strategy to select features to update in coordinate descent epochs if set to True. A cyclic strategy is used otherwise. tol : float, default 1e-4 @@ -68,6 +73,7 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, else: scaled_gram = X.T @ X / n_samples scaled_Xty = X.T @ y / n_samples + # TODO potential improvement: allow to pass scaled_gram (e.g. for path computation) scaled_y_norm2 = np.linalg.norm(y)**2 / (2*n_samples) @@ -76,11 +82,14 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, p_objs_out = [] w = np.zeros(n_features) if w_init is None else w_init - scaled_gram_w = np.zeros(n_features) if w_init is None else scaled_gram @ w_init - grad = scaled_gram_w - scaled_Xty + grad = - scaled_Xty if w_init is None else scaled_gram @ w_init - scaled_Xty opt = penalty.subdiff_distance(w, grad, all_features) if use_acc: + if greedy_cd: + warnings.warn( + UserWarning, + "Anderson acceleration does not work with greedy_cd, set use_acc=False") accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) grad_acc = np.zeros(n_features) @@ -109,6 +118,7 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, w_acc, grad_acc, is_extrapolated = accelerator.extrapolate(w, grad) if is_extrapolated: + # omit constant term for comparison p_obj_acc = (0.5 * w_acc @ (scaled_gram @ w_acc) - scaled_Xty @ w_acc + penalty.value(w_acc)) p_obj = 0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + penalty.value(w) @@ -138,7 +148,7 @@ def _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd): step = 1 / scaled_gram[j, j] # 1 / lipchitz_j w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j) - # Gram matrix update + # gradient update with Gram matrix if w[j] != old_w_j: grad += (w[j] - old_w_j) * scaled_gram[:, j] From 3788cc4eb860c0ac46e44797767c7d9e93ce4883 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 26 Aug 2022 09:30:39 +0200 Subject: [PATCH 20/24] warnings.warn arguments in correct order --- skglm/solvers/gram_cd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 346a2ee7f..4312dcbb5 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -88,8 +88,8 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, if use_acc: if greedy_cd: warnings.warn( - UserWarning, - "Anderson acceleration does not work with greedy_cd, set use_acc=False") + "Anderson acceleration does not work with greedy_cd, set use_acc=False", + UserWarning) accelerator = AndersonAcceleration(K=5) w_acc = np.zeros(n_features) grad_acc = np.zeros(n_features) From 1ce391db896736d4c34ee51c73d192e2d7c39699 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 26 Aug 2022 13:27:29 +0200 Subject: [PATCH 21/24] cleanups: ann files --- fast_gram.py | 49 ---------------------------------------- profile_gram.py | 37 ------------------------------ skglm/solvers/gram_cd.py | 2 +- 3 files changed, 1 insertion(+), 87 deletions(-) delete mode 100644 fast_gram.py delete mode 100644 profile_gram.py diff --git a/fast_gram.py b/fast_gram.py deleted file mode 100644 index 1ac175035..000000000 --- a/fast_gram.py +++ /dev/null @@ -1,49 +0,0 @@ -import time -import numpy as np -from numba import njit - -from skglm.utils import make_correlated_data - - -@njit -def fast_gram_sparse(data, indices, indptr): - # this needs indices to be sorted (sort_indices()), nasty bug otherwise - n_features = len(indptr) - 1 - gram = np.zeros((n_features, n_features)) - - for i in range(n_features): - i_start, i_end = indptr[i], indptr[i + 1] - gram[i, i] = (data[i_start:i_end] ** 2).sum() - for j in range(i): - j_start, j_end = indptr[j], indptr[j + 1] - scal = 0 - i_idx = i_start - j_idx = j_start - while i_idx < i_end and j_idx < j_end: - if indices[i_idx] < indices[j_idx]: - i_idx += 1 - elif indices[i_idx] > indices[j_idx]: - j_idx += 1 - else: # they match - scal += data[i_idx] * data[j_idx] - i_idx += 1 - j_idx += 1 - gram[i, j] = scal - gram[j, i] = scal - return gram - - -X, _, _ = make_correlated_data(10, 5, random_state=0, X_density=0.5) -fast_gram_sparse(X.data, X.indices, X.indptr) - -X, _, _ = make_correlated_data(1000, 200, random_state=0, X_density=0.5) -t0 = time.time() -gram2 = fast_gram_sparse(X.data, X.indices, X.indptr) -print(f"us: {time.time() -t0:.3f} s") - -t0 = time.time() -gram1 = (X.T @ X).toarray() -print(f"sp: {time.time() -t0:.3f} s") - - -print(np.linalg.norm(gram1 - gram2)) diff --git a/profile_gram.py b/profile_gram.py deleted file mode 100644 index c2290598b..000000000 --- a/profile_gram.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -from scipy.sparse import csc_matrix -from skglm.utils import make_correlated_data, compiled_clone -from skglm.solvers.gram_cd import (gram_cd_solver) -from skglm.datafits.single_task import Logistic -from skglm.penalties.separable import L1 - -import line_profiler - - -# params -n_samples, n_features = 10000, 200 -X_density = 1. -rho = 0.005 -case_sparse = False - - -X, y, _ = make_correlated_data(n_samples, n_features, random_state=0, - X_density=X_density) -y = np.sign(y) -if case_sparse: - X = csc_matrix(X) - -alpha_max = np.linalg.norm(X.T @ y, ord=np.inf) / (2 * n_samples) -alpha = rho * alpha_max - - -l1_penalty = compiled_clone(L1(alpha)) -# cache numba jit compilation -gram_cd_solver(X, y, l1_penalty, max_iter=1000, tol=1e-9, use_acc=False) - -# profile code -profiler = line_profiler.LineProfiler() -profiler.add_function(gram_cd_solver) -profiler.enable_by_count() -gram_cd_solver(X, y, l1_penalty, max_iter=1000, tol=1e-9, use_acc=False) -profiler.print_stats() diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 4312dcbb5..f5530bc11 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -8,7 +8,7 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, use_acc=True, greedy_cd=True, tol=1e-4, verbose=False): - """Run coordinate descent while keeping the gradients up-to-date with Gram updates. + r"""Run coordinate descent while keeping the gradients up-to-date with Gram updates. This solver should be used when n_features < n_samples, and computes the (n_features, n_features) Gram matrix which comes with an overhead. From 2476a343a9a012edc897c97d883fd2c1478fdcd4 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 26 Aug 2022 13:29:15 +0200 Subject: [PATCH 22/24] fix ``p_obj`` computation --- skglm/solvers/gram_cd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index f5530bc11..4acb11712 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -127,7 +127,8 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, grad[:] = grad_acc # store p_obj - p_obj = 0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + penalty.value(w) + p_obj = (0.5 * w @ (scaled_gram @ w) - scaled_Xty @ w + scaled_y_norm2 + + penalty.value(w)) p_objs_out.append(p_obj) return w, np.array(p_objs_out), stop_crit From 3208dfa628bc254293b1227e9a93998fb3d76057 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 26 Aug 2022 13:56:43 +0200 Subject: [PATCH 23/24] typos + less cases in test, smaller X in tests --- skglm/solvers/gram_cd.py | 9 +++++---- skglm/tests/test_gram_solver.py | 28 ++++++---------------------- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 4acb11712..ae0e8297b 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -11,12 +11,13 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, r"""Run coordinate descent while keeping the gradients up-to-date with Gram updates. This solver should be used when n_features < n_samples, and computes the - (n_features, n_features) Gram matrix which comes with an overhead. + (n_features, n_features) Gram matrix which comes with an overhead. It is only + suited to Quadratic datafits. - Minimize:: + It minimizes:: 1 / (2*n_samples) * norm(y - Xw)**2 + penalty(w) - Which can be rewritten as:: + which can be rewritten as:: w.T @ Q @ w / (2*n_samples) - q.T @ w / n_samples + penalty(w) where:: @@ -146,7 +147,7 @@ def _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd): # update w_j old_w_j = w[j] - step = 1 / scaled_gram[j, j] # 1 / lipchitz_j + step = 1 / scaled_gram[j, j] # 1 / lipschitz_j w[j] = penalty.prox_1d(old_w_j - step * grad[j], step, j) # gradient update with Gram matrix diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index d6a6d7cc8..fd2329313 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -10,28 +10,12 @@ from skglm.utils import make_correlated_data, compiled_clone -@pytest.mark.parametrize("n_samples, n_features, X_density, greedy_cd", - product([100, 200], [50, 90], [1., 0.6], - [True, False])) -def test_alpha_max(n_samples, n_features, X_density, greedy_cd): - X, y, _ = make_correlated_data(n_samples, n_features, - random_state=0, X_density=X_density) - alpha_max = norm(X.T @ y, ord=np.inf) / n_samples - - l1_penalty = compiled_clone(L1(alpha_max)) - w = gram_cd_solver(X, y, l1_penalty, tol=1e-9, - verbose=0, greedy_cd=greedy_cd)[0] - - np.testing.assert_equal(w, 0) - - -@pytest.mark.parametrize("n_samples, n_features, rho, X_density, greedy_cd", - product([500, 100], [30, 80], [1e-1, 1e-2, 1e-3], [1., 0.8], - [True, False])) -def test_vs_lasso_sklearn(n_samples, n_features, rho, X_density, greedy_cd): - X, y, _ = make_correlated_data(n_samples, n_features, - random_state=0, X_density=X_density) - alpha_max = norm(X.T @ y, ord=np.inf) / n_samples +@pytest.mark.parametrize("rho, X_density, greedy_cd", + product([1e-1, 1e-3], [1., 0.8], [True, False])) +def test_vs_lasso_sklearn(rho, X_density, greedy_cd): + X, y, _ = make_correlated_data( + n_samples=18, n_features=8, random_state=0, X_density=X_density) + alpha_max = norm(X.T @ y, ord=np.inf) / len(y) alpha = rho * alpha_max sk_lasso = Lasso(alpha, fit_intercept=False, tol=1e-9) From 16f6ee44357710d839d22ff018380c66b8c5db2c Mon Sep 17 00:00:00 2001 From: Badr MOUFAD Date: Fri, 26 Aug 2022 14:05:36 +0200 Subject: [PATCH 24/24] typo: ``XtXw`` --> ``grad`` --- skglm/solvers/gram_cd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 4acb11712..51fe001b0 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -110,7 +110,7 @@ def gram_cd_solver(X, y, penalty, max_iter=100, w_init=None, print(f"Stopping criterion max violation: {stop_crit:.2e}") break - # inplace update of w, XtXw + # inplace update of w, grad opt = _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd) # perform Anderson extrapolation