-
Notifications
You must be signed in to change notification settings - Fork 29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH flexible gram solver with penalty and using datafit #16
Changes from 14 commits
a3c7a53
34db4fc
2110db1
247bb75
11742dc
4a44957
a85f5cf
6c9b146
30363ea
14d3800
59b8f6c
6bb4706
f0ed28a
5dd40fd
07714ac
8aee1b6
0b940ee
522cf69
b5b9d09
fc791d6
9e19e08
b9ddc34
7fdac87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,12 +2,16 @@ | |
from numba import njit | ||
from scipy import sparse | ||
from sklearn.utils import check_array | ||
from skglm.datafits.single_task import Quadratic, Quadratic_32 | ||
from skglm.solvers.cd_utils import ( | ||
dist_fix_point, construct_grad, construct_grad_sparse) | ||
from skglm.solvers.gram import cd_gram_quadratic, fista_gram_quadratic | ||
|
||
|
||
def cd_solver_path(X, y, datafit, penalty, alphas=None, | ||
coef_init=None, max_iter=20, max_epochs=50_000, | ||
p0=10, tol=1e-4, use_acc=True, return_n_iter=False, | ||
ws_strategy="subdiff", verbose=0): | ||
solver="cd_ws", ws_strategy="subdiff", verbose=0): | ||
r"""Compute optimization path with Anderson accelerated coordinate descent. | ||
|
||
The loss is customized by passing various choices of datafit and penalty: | ||
|
@@ -52,6 +56,9 @@ def cd_solver_path(X, y, datafit, penalty, alphas=None, | |
return_n_iter : bool, optional | ||
If True, number of iterations along the path are returned. | ||
|
||
solver : ('cd_ws'|'cd_gram'|'fista'), optional | ||
The solver used to solve the optimization problem. | ||
|
||
ws_strategy : ('subdiff'|'fixpoint'), optional | ||
The score used to build the working set. | ||
|
||
|
@@ -109,6 +116,7 @@ def cd_solver_path(X, y, datafit, penalty, alphas=None, | |
# else: | ||
# alphas = np.sort(alphas)[::-1] | ||
|
||
n_samples = len(y) | ||
n_alphas = len(alphas) | ||
|
||
coefs = np.zeros((n_features, n_alphas), order='F', dtype=X.dtype) | ||
|
@@ -144,10 +152,27 @@ def cd_solver_path(X, y, datafit, penalty, alphas=None, | |
w = np.zeros(n_features, dtype=X.dtype) | ||
Xw = np.zeros(X.shape[0], dtype=X.dtype) | ||
|
||
sol = cd_solver( | ||
X, y, datafit, penalty, w, Xw, | ||
max_iter=max_iter, max_epochs=max_epochs, p0=p0, tol=tol, | ||
use_acc=use_acc, verbose=verbose, ws_strategy=ws_strategy) | ||
if (isinstance(datafit, (Quadratic, Quadratic_32)) and n_samples > n_features | ||
and n_features < 10_000) or solver in ("cd_gram", "fista"): | ||
# Gram matrix must fit in memory hence the restriction n_features < 1e4 | ||
PABannier marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not isinstance(datafit, (Quadratic, Quadratic_32)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this bit is unreachable because the check is already performed L155 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've placed it because the first condition is " There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok I understood, thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to make it more obvious. WDYT? |
||
raise ValueError("`cd_gram` and `fista` solvers are only supported " + | ||
"for `Quadratic` datafits.") | ||
if (hasattr(penalty, "alpha_max") and penalty.alpha / | ||
penalty.alpha_max(datafit.Xty) < 1e-3) or solver == "fista": | ||
sol = fista_gram_quadratic( | ||
X, y, penalty, max_epochs=max_epochs, tol=tol, w_init=None, | ||
PABannier marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ws_strategy=ws_strategy, verbose=verbose) | ||
else: | ||
sol = cd_gram_quadratic( | ||
X, y, penalty, max_epochs=max_epochs, tol=tol, w_init=None, | ||
ws_strategy=ws_strategy, verbose=verbose) | ||
w = sol[0] | ||
else: | ||
sol = cd_solver( | ||
X, y, datafit, penalty, w, Xw, | ||
max_iter=max_iter, max_epochs=max_epochs, p0=p0, tol=tol, | ||
use_acc=use_acc, verbose=verbose, ws_strategy=ws_strategy) | ||
|
||
coefs[:, t] = w | ||
stop_crits[t] = sol[-1] | ||
|
@@ -214,11 +239,11 @@ def cd_solver( | |
|
||
Returns | ||
------- | ||
alphas : array, shape (n_alphas,) | ||
The alphas along the path where models are computed. | ||
w : array, shape (n_features,) | ||
Coefficients. | ||
|
||
coefs : array, shape (n_features, n_alphas) | ||
Coefficients along the path. | ||
obj_out : array, shape (n_iter,) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we really return this? or the optimality condition violation instead There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do return this. See L371. |
||
Objective value at every outer iteration. | ||
|
||
stop_crit : array, shape (n_alphas,) | ||
Value of stopping criterion at convergence along the path. | ||
|
@@ -347,118 +372,6 @@ def cd_solver( | |
return w, np.array(obj_out), stop_crit | ||
|
||
|
||
@njit
def dist_fix_point(w, grad, datafit, penalty, ws):
    """Compute the violation of the fixed point iterate scheme.

    Parameters
    ----------
    w : array, shape (n_features,)
        Coefficient vector.

    grad : array, shape (n_features,)
        Gradient.

    datafit : instance of BaseDatafit
        Datafit.

    penalty : instance of BasePenalty
        Penalty.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    dist_fix_point : array, shape (n_features,)
        Violation score for every feature.
    """
    violation = np.zeros(ws.shape[0])
    for pos, feat in enumerate(ws):
        lc = datafit.lipschitz[feat]
        # features with a zero Lipschitz constant contribute no violation
        if lc == 0:
            continue
        stepsize = 1. / lc
        violation[pos] = np.abs(
            w[feat] - penalty.prox_1d(w[feat] - grad[pos] / lc, stepsize, feat))
    return violation
|
||
|
||
@njit
def construct_grad(X, y, w, Xw, datafit, ws):
    """Compute the gradient of the datafit restricted to the working set.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Design matrix.

    y : array, shape (n_samples,)
        Target vector.

    w : array, shape (n_features,)
        Coefficient vector.

    Xw : array, shape (n_samples,)
        Model fit.

    datafit : Datafit
        Datafit.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    grad : array, shape (ws_size, n_tasks)
        The gradient restricted to the working set.
    """
    restricted_grad = np.zeros(ws.shape[0])
    # one scalar gradient per working-set feature
    for pos, feat in enumerate(ws):
        restricted_grad[pos] = datafit.gradient_scalar(X, y, w, Xw, feat)
    return restricted_grad
|
||
|
||
@njit
def construct_grad_sparse(data, indptr, indices, y, w, Xw, datafit, ws):
    """Compute the gradient of the datafit restricted to the working set.

    Parameters
    ----------
    data : array-like
        Data array of the matrix in CSC format.

    indptr : array-like
        CSC format index pointer array.

    indices : array-like
        CSC format index array.

    y : array, shape (n_samples,)
        Target vector.

    w : array, shape (n_features,)
        Coefficient vector.

    Xw : array, shape (n_samples,)
        Model fit.

    datafit : Datafit
        Datafit.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    grad : array, shape (ws_size, n_tasks)
        The gradient restricted to the working set.
    """
    restricted_grad = np.zeros(ws.shape[0])
    # one scalar gradient per working-set feature, using the CSC buffers
    for pos, feat in enumerate(ws):
        restricted_grad[pos] = datafit.gradient_scalar_sparse(
            data, indptr, indices, y, Xw, feat)
    return restricted_grad
|
||
|
||
@njit | ||
def _cd_epoch(X, y, w, Xw, datafit, penalty, feats): | ||
"""Run an epoch of coordinate descent in place. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import numpy as np | ||
from numba import njit | ||
|
||
|
||
@njit
def dist_fix_point(w, grad, datafit, penalty, ws):
    """Compute the violation of the fixed point iterate scheme.

    Parameters
    ----------
    w : array, shape (n_features,)
        Coefficient vector.

    grad : array, shape (n_features,)
        Gradient.

    datafit : instance of BaseDatafit
        Datafit.

    penalty : instance of BasePenalty
        Penalty.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    dist_fix_point : array, shape (n_features,)
        Violation score for every feature.
    """
    violation = np.zeros(ws.shape[0])
    for pos, feat in enumerate(ws):
        lc = datafit.lipschitz[feat]
        # features with a zero Lipschitz constant contribute no violation
        if lc == 0:
            continue
        stepsize = 1. / lc
        violation[pos] = np.abs(
            w[feat] - penalty.prox_1d(w[feat] - grad[pos] / lc, stepsize, feat))
    return violation
|
||
|
||
@njit
def construct_grad(X, y, w, Xw, datafit, ws):
    """Compute the gradient of the datafit restricted to the working set.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Design matrix.

    y : array, shape (n_samples,)
        Target vector.

    w : array, shape (n_features,)
        Coefficient vector.

    Xw : array, shape (n_samples,)
        Model fit.

    datafit : Datafit
        Datafit.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    grad : array, shape (ws_size, n_tasks)
        The gradient restricted to the working set.
    """
    restricted_grad = np.zeros(ws.shape[0])
    # one scalar gradient per working-set feature
    for pos, feat in enumerate(ws):
        restricted_grad[pos] = datafit.gradient_scalar(X, y, w, Xw, feat)
    return restricted_grad
|
||
|
||
@njit
def construct_grad_sparse(data, indptr, indices, y, w, Xw, datafit, ws):
    """Compute the gradient of the datafit restricted to the working set.

    Parameters
    ----------
    data : array-like
        Data array of the matrix in CSC format.

    indptr : array-like
        CSC format index pointer array.

    indices : array-like
        CSC format index array.

    y : array, shape (n_samples,)
        Target vector.

    w : array, shape (n_features,)
        Coefficient vector.

    Xw : array, shape (n_samples,)
        Model fit.

    datafit : Datafit
        Datafit.

    ws : array, shape (n_features,)
        The working set.

    Returns
    -------
    grad : array, shape (ws_size, n_tasks)
        The gradient restricted to the working set.
    """
    restricted_grad = np.zeros(ws.shape[0])
    # one scalar gradient per working-set feature, using the CSC buffers
    for pos, feat in enumerate(ws):
        restricted_grad[pos] = datafit.gradient_scalar_sparse(
            data, indptr, indices, y, Xw, feat)
    return restricted_grad
|
||
|
||
@njit
def prox_vec(penalty, z, stepsize, n_features):
    """Apply the proximal operator coordinate-wise to a vector of weights.

    Parameters
    ----------
    penalty : instance of Penalty
        Penalty.

    z : array, shape (n_features,)
        Coefficient vector.

    stepsize : float
        Step size.

    n_features : int
        Number of features.
        NOTE(review): redundant with ``z.shape[0]`` — kept for backward
        compatibility with existing callers.

    Returns
    -------
    w : array, shape (n_features,)
        Vector of weights after applying the proximal operator.
    """
    w = np.zeros(n_features, dtype=z.dtype)
    for j in range(n_features):
        w[j] = penalty.prox_1d(z[j], stepsize, j)
    return w
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FISTA is not a CD solver, it's confusing to expose it to the user like this.
@mathurinm WDYT?