import warnings
import numpy as np
import pytest
import sklearn
from sklearn.datasets import load_iris, make_blobs, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Perceptron
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import (
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import _check_psd_eigenvalues


def test_kernel_pca():
    """Nominal test for all solvers and all known kernels + a custom one.

    It tests
    - that fit_transform is equivalent to fit+transform
    - that the shapes of transforms and inverse transforms are correct
    """
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
def histogram(x, y, **kwargs):
# Histogram kernel implemented as a callable.
assert kwargs == {} # no kernel_params that we didn't ask for
return np.minimum(x, y).sum()
for eigen_solver in ("auto", "dense", "arpack", "randomized"):
for kernel in ("linear", "rbf", "poly", histogram):
# histogram kernel produces singular matrix inside linalg.solve
# XXX use a least-squares approximation?
inv = not callable(kernel)
# transform fit data
kpca = KernelPCA(
4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv
)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(
np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
)
# non-regression test: previously, gamma would be 0 by default,
# forcing all eigenvalues to 0 under the poly kernel
assert X_fit_transformed.size != 0
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]
# inverse transform
if inv:
X_pred2 = kpca.inverse_transform(X_pred_transformed)
assert X_pred2.shape == X_pred.shape
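

# Editorial sketch, not part of the original suite: the ``np.abs`` comparisons
# used throughout this file are needed because principal axes are only defined
# up to sign, so two equivalent fits may return sign-flipped projections. The
# leading underscore keeps pytest from collecting this helper as a test.
def _sketch_sign_indeterminacy():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    proj1 = KernelPCA(n_components=2, kernel="rbf").fit_transform(X)
    proj2 = KernelPCA(n_components=2, kernel="rbf").fit(X).transform(X)
    # Component signs may differ between the two code paths, but the
    # magnitudes must agree.
    assert_array_almost_equal(np.abs(proj1), np.abs(proj2))
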
def test_kernel_pca_invalid_parameters():
    """Check that kPCA raises an error if the parameters are invalid.

    Tests that fitting the inverse transform with a precomputed kernel raises
    a ValueError.
    """
estimator = KernelPCA(
n_components=10, fit_inverse_transform=True, kernel="precomputed"
)
err_ms = "Cannot fit_inverse_transform with a precomputed kernel"
with pytest.raises(ValueError, match=err_ms):
        estimator.fit(np.random.randn(10, 10))


def test_kernel_pca_consistent_transform():
    """Check robustness to mutations in the original training array.

    Test that after fitting a kPCA model, it stays independent of any
    mutation of the values of the original data object by relying on an
    internal copy.
    """
# X_fit_ needs to retain the old, unmodified copy of X
state = np.random.RandomState(0)
X = state.rand(10, 10)
kpca = KernelPCA(random_state=state).fit(X)
transformed1 = kpca.transform(X)
X_copy = X.copy()
X[:, 0] = 666
transformed2 = kpca.transform(X_copy)
    assert_array_almost_equal(transformed1, transformed2)


def test_kernel_pca_deterministic_output():
    """Test that Kernel PCA produces deterministic output.

    Tests that the same inputs and random state produce the same output.
    """
rng = np.random.RandomState(0)
X = rng.rand(10, 10)
eigen_solver = ("arpack", "dense")
for solver in eigen_solver:
transformed_X = np.zeros((20, 2))
for i in range(20):
kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng)
transformed_X[i, :] = kpca.fit_transform(X)[0]
        assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_kernel_pca_sparse(csr_container):
    """Test that kPCA works on a sparse data input.

    Same test as ``test_kernel_pca``, except for ``inverse_transform``, since
    it is not implemented for sparse matrices.
    """
rng = np.random.RandomState(0)
X_fit = csr_container(rng.random_sample((5, 4)))
X_pred = csr_container(rng.random_sample((2, 4)))
for eigen_solver in ("auto", "arpack", "randomized"):
for kernel in ("linear", "rbf", "poly"):
# transform fit data
kpca = KernelPCA(
4,
kernel=kernel,
eigen_solver=eigen_solver,
fit_inverse_transform=False,
random_state=0,
)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(
np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
)
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]
# inverse transform: not available for sparse matrices
# XXX: should we raise another exception type here? For instance:
# NotImplementedError.
with pytest.raises(NotFittedError):
                kpca.inverse_transform(X_pred_transformed)


@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
@pytest.mark.parametrize("n_features", [4, 10])
def test_kernel_pca_linear_kernel(solver, n_features):
    """Test that kPCA with a linear kernel is equivalent to PCA for all solvers.

    KernelPCA with a linear kernel should produce the same output as PCA.
    """
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, n_features))
X_pred = rng.random_sample((2, n_features))
# for a linear kernel, kernel PCA should find the same projection as PCA
# modulo the sign (direction)
# fit only the first four components: fifth is near zero eigenvalue, so
# can be trimmed due to roundoff error
n_comps = 3 if solver == "arpack" else 4
assert_array_almost_equal(
np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)),
np.abs(
PCA(n_comps, svd_solver=solver if solver != "dense" else "full")
.fit(X_fit)
.transform(X_pred)
),
)
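

# Editorial sketch, not part of the original suite: a minimal standalone
# version of the equivalence checked by the parametrized test above, using
# the dense/full solvers on training data.
def _sketch_linear_kernel_matches_pca():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Z_kpca = KernelPCA(n_components=3, kernel="linear").fit_transform(X)
    Z_pca = PCA(n_components=3, svd_solver="full").fit_transform(X)
    # The two projections agree up to the sign of each component.
    assert_array_almost_equal(np.abs(Z_kpca), np.abs(Z_pca))
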
def test_kernel_pca_n_components():
    """Test that ``n_components`` is correctly taken into account for projections.

    For all solvers, this tests that the output has the correct shape
    depending on the selected number of components.
    """
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack", "randomized"):
for c in [1, 2, 4]:
kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
shape = kpca.fit(X_fit).transform(X_pred).shape
            assert shape == (2, c)


def test_remove_zero_eig():
    """Check that the ``remove_zero_eig`` parameter works correctly.

    Tests that the null-space (zero) eigenvalues are removed when
    ``remove_zero_eig=True``, whereas they are not removed by default.
    """
X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
# n_components=None (default) => remove_zero_eig is True
kpca = KernelPCA()
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 0)
kpca = KernelPCA(n_components=2)
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 2)
kpca = KernelPCA(n_components=2, remove_zero_eig=True)
Xt = kpca.fit_transform(X)
    assert Xt.shape == (3, 0)


def test_leave_zero_eig():
    """Non-regression test for issue #12141 (PR #12143).

    This test checks that ``fit().transform()`` returns the same result as
    ``fit_transform()`` in case of a non-removed zero eigenvalue.
    """
X_fit = np.array([[1, 1], [0, 0]])
    # Assert that even with all numpy warnings on, there is no
    # division-by-zero warning.
    with warnings.catch_warnings():
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (The numpy division-by-zero warning can have many message variants,
        # but at least we know that it is a RuntimeWarning, so let's check
        # only for this type.)
warnings.simplefilter("error", RuntimeWarning)
with np.errstate(all="warn"):
k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense")
# Fit, then transform
A = k.fit(X_fit).transform(X_fit)
# Do both at once
B = k.fit_transform(X_fit)
# Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))


def test_kernel_pca_precomputed():
    """Test that kPCA works with a precomputed kernel, for all solvers."""
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack", "randomized"):
X_kpca = (
KernelPCA(4, eigen_solver=eigen_solver, random_state=0)
.fit(X_fit)
.transform(X_pred)
)
X_kpca2 = (
KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
)
.fit(np.dot(X_fit, X_fit.T))
.transform(np.dot(X_pred, X_fit.T))
)
X_kpca_train = KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
).fit_transform(np.dot(X_fit, X_fit.T))
X_kpca_train2 = (
KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
)
.fit(np.dot(X_fit, X_fit.T))
.transform(np.dot(X_fit, X_fit.T))
)
assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2))
assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2))
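

# Editorial sketch, not part of the original suite: the "precomputed" pattern
# exercised above, spelled out once. The Gram matrix of a linear kernel is
# ``X @ X.T``, and at transform time the kernel between the new points and the
# training points is passed instead of the raw features.
def _sketch_precomputed_linear_kernel():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_new = rng.random_sample((2, 4))
    kpca = KernelPCA(n_components=2, kernel="precomputed")
    kpca.fit(X_fit @ X_fit.T)  # train-vs-train Gram matrix
    Z = kpca.transform(X_new @ X_fit.T)  # new-vs-train Gram matrix
    assert Z.shape == (2, 2)
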
@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_kernel_pca_precomputed_non_symmetric(solver):
"""Check that the kernel centerer works.
Tests that a non symmetric precomputed kernel is actually accepted
because the kernel centerer does its job correctly.
"""
    # a non-symmetric Gram matrix
K = [[1, 2], [3, 40]]
kpca = KernelPCA(
kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
)
kpca.fit(K) # no error
# same test with centered kernel
Kc = [[9, -9], [-9, 9]]
kpca_c = KernelPCA(
kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
)
kpca_c.fit(Kc)
# comparison between the non-centered and centered versions
assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_)
    assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_)


def test_gridsearch_pipeline():
    """Check that kPCA works as expected in a grid search pipeline.

    Test if we can do a grid search to find parameters to separate
    circles with a perceptron model.
    """
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
kpca = KernelPCA(kernel="rbf", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
grid_search.fit(X, y)
    assert grid_search.best_score_ == 1


def test_gridsearch_pipeline_precomputed():
    """Check that kPCA works as expected in a grid search pipeline (2).

    Test if we can do a grid search to find parameters to separate
    circles with a perceptron model. This test uses a precomputed kernel.
    """
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
kpca = KernelPCA(kernel="precomputed", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
X_kernel = rbf_kernel(X, gamma=2.0)
grid_search.fit(X_kernel, y)
    assert grid_search.best_score_ == 1


def test_nested_circles():
    """Check that kPCA projects into a space where nested circles are separable.

    Tests that 2D nested circles become separable with a perceptron when
    projected onto the first 2 kPCA components using an RBF kernel, while the
    raw samples are not directly separable in the original space.
    """
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
# 2D nested circles are not linearly separable
train_score = Perceptron(max_iter=5).fit(X, y).score(X, y)
assert train_score < 0.8
    # Project the circles data into the first 2 components of an RBF Kernel
    # PCA model.
# Note that the gamma value is data dependent. If this test breaks
# and the gamma value has to be updated, the Kernel PCA example will
# have to be updated too.
kpca = KernelPCA(
kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0
)
X_kpca = kpca.fit_transform(X)
# The data is perfectly linearly separable in that space
train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y)
    assert train_score == 1.0


def test_kernel_conditioning():
    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA.

    Non-regression test for issue #12140 (PR #12145).
    """
# create a pathological X leading to small non-zero eigenvalue
X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]]
kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True)
kpca.fit(X)
# check that the small non-zero eigenvalue was correctly set to zero
assert kpca.eigenvalues_.min() == 0
    assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_))


@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_precomputed_kernel_not_psd(solver):
    """Check how KernelPCA works with non-PSD kernels depending on n_components.

    Tests for all methods what happens with a non-PSD Gram matrix (this
    can happen in an isomap scenario, or with custom kernel functions, or
    maybe with ill-posed datasets).

    When ``n_components`` is large enough to capture a negative eigenvalue, an
    error should be raised. Otherwise, KernelPCA should run without error
    since the negative eigenvalues are not selected.
    """
    # A non-PSD kernel with large eigenvalues, already centered. It was
    # captured from an isomap call and multiplied by 100 for compactness.
K = [
[4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78],
[-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49],
[8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23],
[2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9],
[2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9],
[2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9],
[-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75],
[-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46],
]
# this gram matrix has 5 positive eigenvalues and 3 negative ones
# [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11]
# 1. ask for enough components to get a significant negative one
kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7)
# make sure that the appropriate error is raised
with pytest.raises(ValueError, match="There are significant negative eigenvalues"):
kpca.fit(K)
# 2. ask for a small enough n_components to get only positive ones
kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2)
if solver == "randomized":
        # The randomized method is still inconsistent with the others on this
        # input, since it selects the eigenvalues with the largest 2 moduli
        # (absolute values), not the largest 2 algebraic values.
        #
        # At least we can ensure that an error is raised instead of returning
        # the wrong eigenvalues.
with pytest.raises(
ValueError, match="There are significant negative eigenvalues"
):
kpca.fit(K)
else:
# general case: make sure that it works
kpca.fit(K)
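

# Editorial sketch, not part of the original suite: "not PSD" simply means the
# symmetric matrix has at least one negative eigenvalue, which numpy can show
# directly on a small toy matrix.
def _sketch_non_psd_toy_matrix():
    K = np.array([[4.0, -1.0], [-1.0, -2.0]])  # symmetric but indefinite
    eigenvalues = np.linalg.eigvalsh(K)
    assert eigenvalues.min() < 0  # hence K is not a valid PSD kernel matrix
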
@pytest.mark.parametrize("n_components", [4, 10, 20])
def test_kernel_pca_solvers_equivalence(n_components):
"""Check that 'dense' 'arpack' & 'randomized' solvers give similar results"""
# Generate random data
n_train, n_test = 1_000, 100
X, _ = make_circles(
n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0
)
X_fit, X_pred = X[:n_train, :], X[n_train:, :]
# reference (full)
ref_pred = (
KernelPCA(n_components, eigen_solver="dense", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
# arpack
a_pred = (
KernelPCA(n_components, eigen_solver="arpack", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
    # check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
# randomized
r_pred = (
KernelPCA(n_components, eigen_solver="randomized", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
# check that the result is still correct despite the approximation
    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))


def test_kernel_pca_inverse_transform_reconstruction():
    """Test if the reconstruction is a good approximation.

    Note that in general it is not possible to get an arbitrarily good
    reconstruction because of kernel centering that does not
    preserve all the information of the original data.
    """
X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
kpca = KernelPCA(
n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3
)
X_trans = kpca.fit_transform(X)
X_reconst = kpca.inverse_transform(X_trans)
    assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1


def test_kernel_pca_raise_not_fitted_error():
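    """Check that ``inverse_transform`` raises ``NotFittedError`` when the
    inverse transform was never fitted (``fit_inverse_transform`` defaults
    to False).
    """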
X = np.random.randn(15).reshape(5, 3)
kpca = KernelPCA()
kpca.fit(X)
with pytest.raises(NotFittedError):
        kpca.inverse_transform(X)


def test_32_64_decomposition_shape():
    """Test that the decomposition is similar for 32-bit and 64-bit data.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18146
    """
X, y = make_blobs(
n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1
)
X = StandardScaler().fit_transform(X)
X -= X.min()
# Compare the shapes (corresponds to the number of non-zero eigenvalues)
kpca = KernelPCA()
    assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape


def test_kernel_pca_feature_names_out():
    """Check feature names out for KernelPCA."""
X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
kpca = KernelPCA(n_components=2).fit(X)
names = kpca.get_feature_names_out()
assert_array_equal([f"kernelpca{i}" for i in range(2)], names)
def test_kernel_pca_inverse_correct_gamma():
"""Check that gamma is set correctly when not provided.
Non-regression test for #26280
"""
rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
kwargs = {
"n_components": 2,
"random_state": rng,
"fit_inverse_transform": True,
"kernel": "rbf",
}
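    # Editorial note, not in the original: with ``gamma=None``, KernelPCA
    # falls back to 1 / n_features, and the value actually used is exposed
    # as the fitted ``gamma_`` attribute (asserted below).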
expected_gamma = 1 / X.shape[1]
kpca1 = KernelPCA(gamma=None, **kwargs).fit(X)
kpca2 = KernelPCA(gamma=expected_gamma, **kwargs).fit(X)
assert kpca1.gamma_ == expected_gamma
assert kpca2.gamma_ == expected_gamma
X1_recon = kpca1.inverse_transform(kpca1.transform(X))
X2_recon = kpca2.inverse_transform(kpca1.transform(X))
    assert_allclose(X1_recon, X2_recon)


def test_kernel_pca_pandas_output():
    """Check that KernelPCA works with pandas output when the solver is arpack.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27579
    """
pytest.importorskip("pandas")
X, _ = load_iris(as_frame=True, return_X_y=True)
with sklearn.config_context(transform_output="pandas"):
KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X)
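

# Editorial note, not part of the original suite: the context manager above
# scopes the setting to the enclosed block; the same output format can be
# enabled globally with ``sklearn.set_config(transform_output="pandas")``.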