[MRG after #12145] Add "Randomized SVD" solver option to KernelPCA for faster partial decompositions, like in PCA (#12069)

Co-authored-by: Sylvain MARIE <sylvain.marie@se.com>
Co-authored-by: Thomas J Fan <thomasjpfan@gmail.com>
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Olivier Grisel <olivier.grisel@gmail.com>
Co-authored-by: Tom Dupré la Tour <tom.dupre-la-tour@m4x.org>
8 people committed Apr 27, 2021
1 parent 674e7df commit bf0886b
Showing 8 changed files with 1,017 additions and 82 deletions.
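
For context, the option this commit introduces can be requested directly when
constructing a KernelPCA. A minimal usage sketch (the dataset and parameter
values below are illustrative, not taken from the commit itself):

    from sklearn.datasets import make_circles
    from sklearn.decomposition import KernelPCA

    X, _ = make_circles(n_samples=1000, factor=.3, noise=.05, random_state=0)
    # request a truncated decomposition: the randomized solver approximates
    # the top eigenvectors of the centered kernel matrix instead of running
    # a full eigendecomposition
    kpca = KernelPCA(n_components=10, eigen_solver="randomized", random_state=0)
    X_transformed = kpca.fit_transform(X)
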
148 changes: 148 additions & 0 deletions benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -0,0 +1,148 @@
"""
=============================================================
Kernel PCA Solvers comparison benchmark: time vs n_components
=============================================================

This benchmark shows that the approximate solvers provided in Kernel PCA can
significantly improve its execution speed when an approximate solution (small
`n_components`) is acceptable. In many real-world datasets, a few hundred
principal components are often sufficient to capture the underlying
distribution.

Description:
------------
A fixed number of training (default: 2000) and test (default: 1000) samples
with 2 features is generated using the `make_circles` helper method.

KernelPCA models are trained on the training set with an increasing number of
principal components, between 1 and `max_n_compo` (default: 1999), with
`n_compo_grid_size` positions (default: 10). For each value of `n_components`
to try, KernelPCA models are trained for the various possible `eigen_solver`
values. The execution times are displayed in a plot at the end of the
experiment.

What you can observe:
---------------------
When the number of requested principal components is small, the dense solver
takes more time to complete, while the randomized method returns similar
results with shorter execution times.

Going further:
--------------
You can adjust `max_n_compo` and `n_compo_grid_size` if you wish to explore a
different range of values for `n_components`.

You can also set `arpack_all=True` to activate the arpack solver for large
numbers of components (this takes more time).
"""
# Author: Sylvain MARIE, Schneider Electric

import time

import numpy as np
import matplotlib.pyplot as plt

from numpy.testing import assert_array_almost_equal
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles


print(__doc__)


# 1- Design the Experiment
# ------------------------
n_train, n_test = 2000, 1000 # the sample sizes to use
max_n_compo = 1999 # max n_components to try
n_compo_grid_size = 10 # nb of positions in the grid to try
# generate the grid
n_compo_range = [np.round(np.exp((x / (n_compo_grid_size - 1))
                                 * np.log(max_n_compo)))
                 for x in range(0, n_compo_grid_size)]
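# note: apart from the rounding, this geometric grid is equivalent to
# np.logspace(0, np.log10(max_n_compo), n_compo_grid_size)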

n_iter = 3 # the number of times each experiment will be repeated
arpack_all = False # set to True if you wish to run arpack for all n_compo


# 2- Generate random data
# -----------------------
n_features = 2
X, y = make_circles(n_samples=(n_train + n_test), factor=.3, noise=.05,
                    random_state=0)
X_train, X_test = X[:n_train, :], X[n_train:, :]


# 3- Benchmark
# ------------
# init
ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
# loop
for j, n_components in enumerate(n_compo_range):

    n_components = int(n_components)
    print("Performing kPCA with n_components = %i" % n_components)

    # A- reference (dense)
    print(" - dense solver")
    for i in range(n_iter):
        start_time = time.perf_counter()
        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
            .fit(X_train).transform(X_test)
        ref_time[j, i] = time.perf_counter() - start_time

    # B- arpack (for small number of components only, too slow otherwise)
    if arpack_all or n_components < 100:
        print(" - arpack solver")
        for i in range(n_iter):
            start_time = time.perf_counter()
            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
                .fit(X_train).transform(X_test)
            a_time[j, i] = time.perf_counter() - start_time
        # check that the result is still correct despite the approx
        assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))

    # C- randomized
    print(" - randomized solver")
    for i in range(n_iter):
        start_time = time.perf_counter()
        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
            .fit(X_train).transform(X_test)
        r_time[j, i] = time.perf_counter() - start_time
    # check that the result is still correct despite the approximation
    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))

# Compute statistics for the 3 methods
avg_ref_time = ref_time.mean(axis=1)
std_ref_time = ref_time.std(axis=1)
avg_a_time = a_time.mean(axis=1)
std_a_time = a_time.std(axis=1)
avg_r_time = r_time.mean(axis=1)
std_r_time = r_time.std(axis=1)


# 4- Plots
# --------
fig, ax = plt.subplots(figsize=(12, 8))

# Display 1 plot with error bars per method
ax.errorbar(n_compo_range, avg_ref_time, yerr=std_ref_time,
            marker='x', linestyle='', color='r', label='full')
ax.errorbar(n_compo_range, avg_a_time, yerr=std_a_time, marker='x',
            linestyle='', color='g', label='arpack')
ax.errorbar(n_compo_range, avg_r_time, yerr=std_r_time, marker='x',
            linestyle='', color='b', label='randomized')
ax.legend(loc='upper left')

# customize axes
ax.set_xscale('log')
ax.set_xlim(1, max(n_compo_range) * 1.1)
ax.set_ylabel("Execution time (s)")
ax.set_xlabel("n_components")

ax.set_title("kPCA Execution time comparison on %i samples with %i "
"features, according to the choice of `eigen_solver`"
"" % (n_train, n_features))

plt.show()
153 changes: 153 additions & 0 deletions benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
@@ -0,0 +1,153 @@
"""
==========================================================
Kernel PCA Solvers comparison benchmark: time vs n_samples
==========================================================

This benchmark shows that the approximate solvers provided in Kernel PCA can
significantly improve its execution speed when an approximate solution (small
`n_components`) is acceptable. In many real-world datasets the number of
samples is very large, but a few hundred principal components are sufficient
to capture the underlying distribution.

Description:
------------
An increasing number of examples is used to train a KernelPCA, between
`min_n_samples` (default: 101) and `max_n_samples` (default: 4000) with
`n_samples_grid_size` positions (default: 4). Samples have 2 features, and are
generated using `make_circles`. For each training sample size, KernelPCA models
are trained for the various possible `eigen_solver` values. All of them are
trained to obtain `n_components` principal components (default: 100). The
execution times are displayed in a plot at the end of the experiment.

What you can observe:
---------------------
When the number of samples provided gets large, the dense solver takes a lot
of time to complete, while the randomized method returns similar results in
much shorter execution times.

Going further:
--------------
You can increase `max_n_samples` and `n_samples_grid_size` if you wish to
explore a wider range of values for `n_samples`.

You can also set `include_arpack=True` to include the arpack solver in the
experiments (it is much slower).

Finally, you can have a look at the second example of this series, "Kernel PCA
Solvers comparison benchmark: time vs n_components", where this time the number
of examples is fixed and the desired number of components varies.
"""
# Author: Sylvain MARIE, Schneider Electric

import time

import numpy as np
import matplotlib.pyplot as plt

from numpy.testing import assert_array_almost_equal
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles


print(__doc__)


# 1- Design the Experiment
# ------------------------
min_n_samples, max_n_samples = 101, 4000 # min and max n_samples to try
n_samples_grid_size = 4 # nb of positions in the grid to try
# generate the grid
n_samples_range = [min_n_samples + np.floor((x / (n_samples_grid_size - 1))
                                            * (max_n_samples - min_n_samples))
                   for x in range(0, n_samples_grid_size)]
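# note: apart from the flooring, this grid is equivalent to
# np.linspace(min_n_samples, max_n_samples, n_samples_grid_size)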

n_components = 100 # the number of principal components we want to use
n_iter = 3 # the number of times each experiment will be repeated
include_arpack = False # set this to True to include arpack solver (slower)


# 2- Generate random data
# -----------------------
n_features = 2
X, y = make_circles(n_samples=max_n_samples, factor=.3, noise=.05,
                    random_state=0)


# 3- Benchmark
# ------------
# init
ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan
a_time = np.empty((len(n_samples_range), n_iter)) * np.nan
r_time = np.empty((len(n_samples_range), n_iter)) * np.nan

# loop
for j, n_samples in enumerate(n_samples_range):

    n_samples = int(n_samples)
    print("Performing kPCA with n_samples = %i" % n_samples)

    X_train = X[:n_samples, :]
    X_test = X_train

    # A- reference (dense)
    print(" - dense")
    for i in range(n_iter):
        start_time = time.perf_counter()
        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
            .fit(X_train).transform(X_test)
        ref_time[j, i] = time.perf_counter() - start_time

    # B- arpack
    if include_arpack:
        print(" - arpack")
        for i in range(n_iter):
            start_time = time.perf_counter()
            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
                .fit(X_train).transform(X_test)
            a_time[j, i] = time.perf_counter() - start_time
        # check that the result is still correct despite the approx
        assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))

    # C- randomized
    print(" - randomized")
    for i in range(n_iter):
        start_time = time.perf_counter()
        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
            .fit(X_train).transform(X_test)
        r_time[j, i] = time.perf_counter() - start_time
    # check that the result is still correct despite the approximation
    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))

# Compute statistics for the 3 methods
avg_ref_time = ref_time.mean(axis=1)
std_ref_time = ref_time.std(axis=1)
avg_a_time = a_time.mean(axis=1)
std_a_time = a_time.std(axis=1)
avg_r_time = r_time.mean(axis=1)
std_r_time = r_time.std(axis=1)


# 4- Plots
# --------
fig, ax = plt.subplots(figsize=(12, 8))

# Display 1 plot with error bars per method
ax.errorbar(n_samples_range, avg_ref_time, yerr=std_ref_time,
            marker='x', linestyle='', color='r', label='full')
if include_arpack:
    ax.errorbar(n_samples_range, avg_a_time, yerr=std_a_time, marker='x',
                linestyle='', color='g', label='arpack')
ax.errorbar(n_samples_range, avg_r_time, yerr=std_r_time, marker='x',
            linestyle='', color='b', label='randomized')
ax.legend(loc='upper left')

# customize axes
ax.set_xlim(min(n_samples_range) * 0.9, max(n_samples_range) * 1.1)
ax.set_ylabel("Execution time (s)")
ax.set_xlabel("n_samples")

ax.set_title("Execution time comparison of kPCA with %i components on samples "
"with %i features, according to the choice of `eigen_solver`"
"" % (n_components, n_features))

plt.show()
