[MRG+2] ENH : SAGA support for LogisticRegression (+ L1 support), Ridge #8446

Merged 1 commit on Mar 27, 2017
5 changes: 4 additions & 1 deletion benchmarks/bench_mnist.py
@@ -91,7 +91,10 @@ def load_data(dtype=np.float32, order='F'):
Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'SampledRBF-SVM': make_pipeline(
RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
'LinearRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4),
'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1,
C=1e4),
'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1,
C=1e4),
'MultilayerPerceptron': MLPClassifier(
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
solver='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1,
244 changes: 244 additions & 0 deletions benchmarks/bench_saga.py
@@ -0,0 +1,244 @@
"""Author: Arthur Mensch

Benchmarks of sklearn SAGA vs lightning SAGA vs Liblinear. Shows the gain
in learning time from using multinomial logistic regression.
"""
import json
import time
from os.path import expanduser

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_rcv1, load_iris, load_digits, \
fetch_20newsgroups_vectorized
from sklearn.externals.joblib import delayed, Parallel, Memory
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.utils.extmath import safe_sparse_dot, softmax


def fit_single(solver, X, y, penalty='l2', single_target=True, C=1,
max_iter=10, skip_slow=False):
if skip_slow and solver == 'lightning' and penalty == 'l1':
print('skipping l1 logistic regression with solver lightning.')
return

print('Solving %s logistic regression with penalty %s, solver %s.'
% ('binary' if single_target else 'multinomial',
penalty, solver))

if solver == 'lightning':
from lightning.classification import SAGAClassifier

if single_target or solver not in ['sag', 'saga']:
multi_class = 'ovr'
else:
multi_class = 'multinomial'

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
stratify=y)
n_samples = X_train.shape[0]
n_classes = np.unique(y_train).shape[0]
test_scores = [1]
train_scores = [1]
accuracies = [1 / n_classes]
times = [0]

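# Translate sklearn's C (inverse regularization strength applied to the
# summed loss) into the per-sample alpha (L2) / beta (L1) penalties used
# by lightning and by the objective computed below.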
if penalty == 'l2':
alpha = 1. / (C * n_samples)
beta = 0
lightning_penalty = None
else:
alpha = 0.
beta = 1. / (C * n_samples)
lightning_penalty = 'l1'

for this_max_iter in range(1, max_iter + 1, 2):
print('[%s, %s, %s] Max iter: %s' %
('binary' if single_target else 'multinomial',
penalty, solver, this_max_iter))
if solver == 'lightning':
lr = SAGAClassifier(loss='log', alpha=alpha, beta=beta,
penalty=lightning_penalty,
tol=-1, max_iter=this_max_iter)
else:
lr = LogisticRegression(solver=solver,
multi_class=multi_class,
C=C,
penalty=penalty,
fit_intercept=False, tol=1e-24,
max_iter=this_max_iter,
random_state=42,
)
t0 = time.clock()
lr.fit(X_train, y_train)
train_time = time.clock() - t0

scores = []
for (X, y) in [(X_train, y_train), (X_test, y_test)]:
try:
y_pred = lr.predict_proba(X)
except NotImplementedError:
# Lightning predict_proba is not implemented for n_classes > 2
y_pred = _predict_proba(lr, X)
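# Penalized objective: summed log-loss divided by the number of training
# samples, plus the L2 / L1 regularization terms.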
score = log_loss(y, y_pred, normalize=False) / n_samples
score += (0.5 * alpha * np.sum(lr.coef_ ** 2) +
beta * np.sum(np.abs(lr.coef_)))
scores.append(score)
train_score, test_score = tuple(scores)

y_pred = lr.predict(X_test)
accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
test_scores.append(test_score)
train_scores.append(train_score)
accuracies.append(accuracy)
times.append(train_time)
return lr, times, train_scores, test_scores, accuracies


def _predict_proba(lr, X):
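# Softmax of the decision function, used as a fallback when lightning's
# SAGAClassifier does not implement predict_proba (multiclass case).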
pred = safe_sparse_dot(X, lr.coef_.T)
if hasattr(lr, "intercept_"):
pred += lr.intercept_
return softmax(pred)


def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
dataset='rcv1', n_jobs=1, skip_slow=False):
mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

if dataset == 'rcv1':
rcv1 = fetch_rcv1()

lbin = LabelBinarizer()
lbin.fit(rcv1.target_names)

X = rcv1.data
y = rcv1.target
y = lbin.inverse_transform(y)
le = LabelEncoder()
y = le.fit_transform(y)
if single_target:
y_n = y.copy()
y_n[y > 16] = 1
y_n[y <= 16] = 0
y = y_n

elif dataset == 'digits':
digits = load_digits()
X, y = digits.data, digits.target
if single_target:
y_n = y.copy()
y_n[y < 5] = 1
y_n[y >= 5] = 0
y = y_n
elif dataset == 'iris':
iris = load_iris()
X, y = iris.data, iris.target
elif dataset == '20newspaper':
ng = fetch_20newsgroups_vectorized()
X = ng.data
y = ng.target
if single_target:
y_n = y.copy()
y_n[y > 4] = 1
y_n[y <= 4] = 0
y = y_n

X = X[:n_samples]
y = y[:n_samples]

cached_fit = mem.cache(fit_single)
out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
delayed(cached_fit)(solver, X, y,
penalty=penalty, single_target=single_target,
C=1, max_iter=max_iter, skip_slow=skip_slow)
for solver in solvers
for penalty in penalties)

res = []
idx = 0
for solver in solvers:
for penalty in penalties:
if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
lr, times, train_scores, test_scores, accuracies = out[idx]
this_res = dict(solver=solver, penalty=penalty,
single_target=single_target,
times=times, train_scores=train_scores,
test_scores=test_scores,
accuracies=accuracies)
res.append(this_res)
idx += 1

with open('bench_saga.json', 'w+') as f:
json.dump(res, f)


def plot():
import pandas as pd
with open('bench_saga.json', 'r') as f:
f = json.load(f)
res = pd.DataFrame(f)
res.set_index(['single_target', 'penalty'], inplace=True)

grouped = res.groupby(level=['single_target', 'penalty'])

colors = {'saga': 'blue', 'liblinear': 'orange', 'lightning': 'green'}

for idx, group in grouped:
single_target, penalty = idx
fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(131)

train_scores = group['train_scores'].values
ref = np.min(np.concatenate(train_scores)) * 0.999

for scores, times, solver in zip(group['train_scores'], group['times'],
group['solver']):
scores = scores / ref - 1
ax.plot(times, scores, label=solver, color=colors[solver])
ax.set_xlabel('Time (s)')
ax.set_ylabel('Training objective (relative to min)')
ax.set_yscale('log')

ax = fig.add_subplot(132)

test_scores = group['test_scores'].values
ref = np.min(np.concatenate(test_scores)) * 0.999

for scores, times, solver in zip(group['test_scores'], group['times'],
group['solver']):
scores = scores / ref - 1
ax.plot(times, scores, label=solver, color=colors[solver])
ax.set_xlabel('Time (s)')
ax.set_ylabel('Test objective (relative to min)')
ax.set_yscale('log')

ax = fig.add_subplot(133)

for accuracy, times, solver in zip(group['accuracies'], group['times'],
group['solver']):
ax.plot(times, accuracy, label=solver, color=colors[solver])
ax.set_xlabel('Time (s)')
ax.set_ylabel('Test accuracy')
ax.legend()
name = 'single_target' if single_target else 'multi_target'
name += '_%s' % penalty
plt.suptitle(name)
name += '.png'
fig.tight_layout()
fig.subplots_adjust(top=0.9)
plt.savefig(name)
plt.close(fig)


if __name__ == '__main__':
solvers = ['saga', 'liblinear', 'lightning']
penalties = ['l1', 'l2']
single_target = True
exp(solvers, penalties, single_target, n_samples=None, n_jobs=1,
dataset='20newspaper', max_iter=20)
plot()
51 changes: 32 additions & 19 deletions doc/modules/linear_model.rst
@@ -721,7 +721,7 @@ optimization problem
.. math:: \underset{w, c}{min\,} \|w\|_1 + C \sum_{i=1}^n \log(\exp(- y_i (X_i^T w + c)) + 1) .

The solvers implemented in the class :class:`LogisticRegression`
are "liblinear", "newton-cg", "lbfgs" and "sag":
are "liblinear", "newton-cg", "lbfgs", "sag" and "saga":

The solver "liblinear" uses a coordinate descent (CD) algorithm, and relies
on the excellent C++ `LIBLINEAR library
@@ -739,25 +739,31 @@ The "lbfgs", "sag" and "newton-cg" solvers only support L2 penalization and
are found to converge faster for some high dimensional data. Setting
`multi_class` to "multinomial" with these solvers learns a true multinomial
logistic regression model [5]_, which means that its probability estimates
should be better calibrated than the default "one-vs-rest" setting. The
"lbfgs", "sag" and "newton-cg"" solvers cannot optimize L1-penalized models,
therefore the "multinomial" setting does not learn sparse models.
should be better calibrated than the default "one-vs-rest" setting.

The solver "sag" uses a Stochastic Average Gradient descent [6]_. It is faster
The "sag" solver uses a Stochastic Average Gradient descent [6]_. It is faster
than other solvers for large datasets, when both the number of samples and the
number of features are large.

The "saga" solver [7]_ is a variant of "sag" that also supports the
non-smooth `penalty="l1"` option. This is therefore the solver of choice
for sparse multinomial logistic regression.
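
For instance, a sparse multinomial model can be fit as follows (a minimal
sketch; the dataset and the regularization strength ``C`` are illustrative)::

    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression

    X, y = load_digits(return_X_y=True)
    clf = LogisticRegression(solver='saga', penalty='l1',
                             multi_class='multinomial', C=0.1,
                             max_iter=200).fit(X, y)
    # Fraction of coefficients driven exactly to zero by the L1 penalty.
    print("sparsity: %.1f%%" % (100 * (clf.coef_ == 0).mean()))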

In a nutshell, one may choose the solver with the following rules:

================================= =============================
================================= =====================================
Case Solver
================================= =============================
Small dataset or L1 penalty "liblinear"
Multinomial loss or large dataset "lbfgs", "sag" or "newton-cg"
Very Large dataset "sag"
================================= =============================
================================= =====================================
L1 penalty "liblinear" or "saga"
Multinomial loss "lbfgs", "sag", "saga" or "newton-cg"
Very Large dataset (`n_samples`) "sag" or "saga"
================================= =====================================

The "saga" solver is often the best choice. The "liblinear" solver is
used by default for historical reasons.

For large dataset, you may also consider using :class:`SGDClassifier` with 'log' loss.
For large datasets, you may also consider using :class:`SGDClassifier`
with 'log' loss.
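
A minimal sketch of this alternative (the small dataset used here is only
illustrative)::

    from sklearn.datasets import load_digits
    from sklearn.linear_model import SGDClassifier

    X, y = load_digits(return_X_y=True)
    clf = SGDClassifier(loss='log', penalty='l2').fit(X, y)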

.. topic:: Examples:

@@ -767,6 +773,10 @@ For large dataset, you may also consider using :class:`SGDClassifier` with 'log'

* :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py`

* :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_20newsgroups.py`

* :ref:`sphx_glr_auto_examples_linear_model_plot_sparse_logistic_regression_mnist.py`

.. _liblinear_differences:

.. topic:: Differences from liblinear:
@@ -788,20 +798,23 @@ For large dataset, you may also consider using :class:`SGDClassifier` with 'log'
thus be used to perform feature selection, as detailed in
:ref:`l1_feature_selection`.

:class:`LogisticRegressionCV` implements Logistic Regression with builtin
cross-validation to find out the optimal C parameter. "newton-cg", "sag" and
"lbfgs" solvers are found to be faster for high-dimensional dense data, due to
warm-starting. For the multiclass case, if `multi_class` option is set to
"ovr", an optimal C is obtained for each class and if the `multi_class` option
is set to "multinomial", an optimal C is obtained by minimizing the cross-
entropy loss.
:class:`LogisticRegressionCV` implements Logistic Regression with
builtin cross-validation to find out the optimal C parameter.
"newton-cg", "sag", "saga" and "lbfgs" solvers are found to be faster
for high-dimensional dense data, due to warm-starting. For the
multiclass case, if `multi_class` option is set to "ovr", an optimal C
is obtained for each class and if the `multi_class` option is set to
"multinomial", an optimal C is obtained by minimizing the cross-entropy
loss.
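
For example, the regularization strength can be cross-validated with the
"saga" solver as follows (a minimal sketch; the dataset and the ``Cs`` grid
are illustrative)::

    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegressionCV

    X, y = load_digits(return_X_y=True)
    clf = LogisticRegressionCV(Cs=10, solver='saga',
                               multi_class='multinomial').fit(X, y)
    print(clf.C_)  # C selected by minimizing the cross-entropy loss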

.. topic:: References:

.. [5] Christopher M. Bishop: Pattern Recognition and Machine Learning, Chapter 4.3.4

.. [6] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. <https://hal.inria.fr/hal-00860051/document>`_

.. [7] Aaron Defazio, Francis Bach, Simon Lacoste-Julien: `SAGA: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives. <https://arxiv.org/abs/1407.0202>`_

Stochastic Gradient Descent - SGD
=================================

8 changes: 8 additions & 0 deletions doc/whats_new.rst
@@ -50,6 +50,13 @@ New features
particularly useful for targets with an exponential trend.
:issue:`7655` by :user:`Karan Desai <karandesai-96>`.

- Added solver ``saga`` that implements the improved version of Stochastic
Average Gradient, in :class:`linear_model.LogisticRegression` and
:class:`linear_model.Ridge`. It allows the use of L1 penalty with
multinomial logistic loss, and behaves marginally better than 'sag'
during the first epochs of ridge and logistic regression.
By `Arthur Mensch`_.

Enhancements
............

@@ -5037,3 +5044,4 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
.. _Anish Shah: https://github.com/AnishShah

.. _Neeraj Gangwar: http://neerajgangwar.in
.. _Arthur Mensch: https://amensch.fr