From d84f6753e79d45986214e1e7edf68cf932e796a1 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Mon, 24 Sep 2018 13:49:13 +0200 Subject: [PATCH 001/106] Minor edits for clarity: - Added a few comments to clarify `_fit_transform`, `fit_transform` and `transform`. - Uniformized the numpy coding style for `fit` and `fit_transform`. --- sklearn/decomposition/kernel_pca.py | 33 ++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 133717e13f677..455a47a2a828e 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -215,11 +215,31 @@ def _fit_transform(self, K): self.lambdas_ = self.lambdas_[indices] self.alphas_ = self.alphas_[:, indices] - # remove eigenvectors with a zero eigenvalue + # remove eigenvectors with a zero eigenvalue (null space) if required if self.remove_zero_eig or self.n_components is None: self.alphas_ = self.alphas_[:, self.lambdas_ > 0] self.lambdas_ = self.lambdas_[self.lambdas_ > 0] + # Maintenance note on Eigenvectors normalization + # ---------------------------------------------- + # there is a link between + # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)' + # if v is an eigenvector of K + # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' + # if u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # + # At this stage our self.alphas_ (the v) have norm 1, we need to scale + # them so that eigenvectors in kernel feature space (the u) have norm=1 + # instead + # + # We COULD scale them here: + # self.alphas_ = self.alphas_ / np.sqrt(self.lambdas_) + # + # But the original choice was to perform that LATER when needed, in + # fit() and in transform(). We keep it to preserve the meaning of + # self.alphas_ across sk-learn versions. + return K def _fit_inverse_transform(self, X_transformed, X): @@ -253,8 +273,10 @@ def fit(self, X, y=None): self._fit_transform(K) if self.fit_inverse_transform: - sqrt_lambdas = np.diag(np.sqrt(self.lambdas_)) - X_transformed = np.dot(self.alphas_, sqrt_lambdas) + # Transform X + # (shortcut since we transform the same X that was used to fit) + X_transformed = self.alphas_ * np.sqrt(self.lambdas_) + self._fit_inverse_transform(X_transformed, X) self.X_fit_ = X @@ -275,6 +297,8 @@ def fit_transform(self, X, y=None, **params): """ self.fit(X, **params) + # Transform X + # (shortcut since we transform the same X that was used to fit) X_transformed = self.alphas_ * np.sqrt(self.lambdas_) if self.fit_inverse_transform: @@ -295,7 +319,10 @@ def transform(self, X): """ check_is_fitted(self, 'X_fit_') + # Compute centered gram matrix between X and training data X_fit_ K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) + + # Project by doing a scalar product between K and the scaled eigenvects return np.dot(K, self.alphas_ / np.sqrt(self.lambdas_)) def inverse_transform(self, X): From 27b4de590132316be7351164d3dde603792950cd Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Mon, 24 Sep 2018 13:50:24 +0200 Subject: [PATCH 002/106] Fixed #12141. 
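In `transform()`, the columns of the scaled eigenvectors that correspond to zero eigenvalues are now explicitly set to zero, so the projection ignores the null space and `fit().transform()` agrees with `fit_transform()` when `remove_zero_eig=False`. A minimal sketch of the expected behaviour after this fix, using a hypothetical rank-deficient toy input (illustration only, not part of the patch):

    import numpy as np
    from sklearn.decomposition import KernelPCA

    # rank-deficient data: one eigenvalue of the centered kernel is exactly zero
    X = np.array([[1., 1.], [0., 0.]])
    kpca = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense")
    A = kpca.fit(X).transform(X)  # projection onto alphas_ / sqrt(lambdas_)
    B = kpca.fit_transform(X)     # shortcut alphas_ * sqrt(lambdas_)
    # both embeddings should agree up to sign, with a zero column for the
    # zero eigenvalue instead of NaN/inf values
    np.testing.assert_allclose(np.abs(A), np.abs(B), atol=1e-12)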
--- sklearn/decomposition/kernel_pca.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 455a47a2a828e..cc258a786747f 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -322,8 +322,14 @@ def transform(self, X): # Compute centered gram matrix between X and training data X_fit_ K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) + # scale eigenvectors + scaled_alphas = self.alphas_ / np.sqrt(self.lambdas_) + + # properly take null-space into account for the dot product + scaled_alphas[:, self.lambdas_ == 0] = 0 + # Project by doing a scalar product between K and the scaled eigenvects - return np.dot(K, self.alphas_ / np.sqrt(self.lambdas_)) + return np.dot(K, scaled_alphas) def inverse_transform(self, X): """Transform X back to original space. From cb7cd15a80a6ebda4e453d61996af72e237f5477 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Mon, 24 Sep 2018 16:01:43 +0200 Subject: [PATCH 003/106] Added exceptions and warnings in case of numerical issues and bad kernel conditioning. Fixes #12140 --- sklearn/decomposition/kernel_pca.py | 47 ++++++++++++++++++++++++++++- sklearn/exceptions.py | 10 +++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index cc258a786747f..35ace99d8125c 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -4,12 +4,13 @@ # License: BSD 3 clause import numpy as np +import warnings from scipy import linalg from scipy.sparse.linalg import eigsh from ..utils import check_random_state from ..utils.validation import check_is_fitted, check_array -from ..exceptions import NotFittedError +from ..exceptions import NotFittedError, KernelWarning from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -210,6 +211,50 @@ def _fit_transform(self, K): maxiter=self.max_iter, v0=v0) + # Check that there are no significant imaginary parts + if not np.isreal(self.lambdas_).all(): + max_imag_abs = abs(np.imag(self.lambdas_)).max() + max_real_abs = abs(np.real(self.lambdas_)).max() + if max_imag_abs > 1e-5 * max_real_abs: + raise ValueError( + "there are significant imaginary parts in eigenvalues (%f " + "of the max real part). Something may be wrong with the " + "kernel. Setting all imaginary parts to zero." + "" % (max_imag_abs / max_real_abs)) + + # Remove the insignificant imaginary parts + self.lambdas_ = np.real(self.lambdas_) + + # Check that there are no significant negative eigenvalues + min_eig = self.lambdas_.min() + max_eig = self.lambdas_.max() + if -min_eig > 1e-5 * max(max_eig, 0) and -min_eig > 1e-10: + # If kernel has been computed with single precision we would + # probably need more tolerant thresholds such as: + # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) + if max_eig >= 0: + warnings.warn("There are significant negative eigenvalues " + "(%f of the max positive). Something may be " + "wrong with the kernel. Replacing them by " + "zero." % (-min_eig / max_eig), KernelWarning) + else: + raise ValueError("There are significant negative eigenvalues " + "(such as %f) and no positive one. Something " + "may be wrong with the kernel." 
% min_eig) + + # Remove the insignificant negative values + self.lambdas_[self.lambdas_ < 0] = 0 + + # Finally check for conditioning + max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) + too_small_lambdas = self.lambdas_ < max_eig / max_conditioning + if too_small_lambdas.any(): + warnings.warn("The kernel is badly conditioned: the largest " + "eigenvalue is more than %.2E times the smallest. " + "Small eigenvalues will be replaced " + "by 0" % max_conditioning, KernelWarning) + self.lambdas_[too_small_lambdas] = 0 + # sort eigenvectors in descending order indices = self.lambdas_.argsort()[::-1] self.lambdas_ = self.lambdas_[indices] diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 9cf207e40fdd6..c589015fee50a 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -12,7 +12,8 @@ 'FitFailedWarning', 'NonBLASDotWarning', 'SkipTestWarning', - 'UndefinedMetricWarning'] + 'UndefinedMetricWarning', + 'KernelWarning'] class NotFittedError(ValueError, AttributeError): @@ -154,3 +155,10 @@ class UndefinedMetricWarning(UserWarning): .. versionchanged:: 0.18 Moved from sklearn.base. """ + + +class KernelWarning(UserWarning): + """Custom warning to capture kernel issues + + .. versionadded:: 0.21 + """ From 0c4ebaa642dc91c7b97d4310fce3ee962437c7c6 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 11:25:57 +0200 Subject: [PATCH 004/106] Added a `check_kernel_eigenvalues` validation method (with doctests). It is used in `KernelPCA`. Added corresponding test `test_errors_and_warnings` for KernelPCA. --- sklearn/decomposition/kernel_pca.py | 51 +-------- .../decomposition/tests/test_kernel_pca.py | 80 ++++++++++++- sklearn/utils/__init__.py | 4 +- sklearn/utils/validation.py | 106 +++++++++++++++++- 4 files changed, 191 insertions(+), 50 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 35ace99d8125c..43fc338d817cc 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -4,13 +4,13 @@ # License: BSD 3 clause import numpy as np -import warnings from scipy import linalg from scipy.sparse.linalg import eigsh from ..utils import check_random_state -from ..utils.validation import check_is_fitted, check_array -from ..exceptions import NotFittedError, KernelWarning +from ..utils.validation import check_is_fitted, check_array, \ + check_kernel_eigenvalues +from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -211,49 +211,8 @@ def _fit_transform(self, K): maxiter=self.max_iter, v0=v0) - # Check that there are no significant imaginary parts - if not np.isreal(self.lambdas_).all(): - max_imag_abs = abs(np.imag(self.lambdas_)).max() - max_real_abs = abs(np.real(self.lambdas_)).max() - if max_imag_abs > 1e-5 * max_real_abs: - raise ValueError( - "there are significant imaginary parts in eigenvalues (%f " - "of the max real part). Something may be wrong with the " - "kernel. Setting all imaginary parts to zero." 
- "" % (max_imag_abs / max_real_abs)) - - # Remove the insignificant imaginary parts - self.lambdas_ = np.real(self.lambdas_) - - # Check that there are no significant negative eigenvalues - min_eig = self.lambdas_.min() - max_eig = self.lambdas_.max() - if -min_eig > 1e-5 * max(max_eig, 0) and -min_eig > 1e-10: - # If kernel has been computed with single precision we would - # probably need more tolerant thresholds such as: - # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) - if max_eig >= 0: - warnings.warn("There are significant negative eigenvalues " - "(%f of the max positive). Something may be " - "wrong with the kernel. Replacing them by " - "zero." % (-min_eig / max_eig), KernelWarning) - else: - raise ValueError("There are significant negative eigenvalues " - "(such as %f) and no positive one. Something " - "may be wrong with the kernel." % min_eig) - - # Remove the insignificant negative values - self.lambdas_[self.lambdas_ < 0] = 0 - - # Finally check for conditioning - max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) - too_small_lambdas = self.lambdas_ < max_eig / max_conditioning - if too_small_lambdas.any(): - warnings.warn("The kernel is badly conditioned: the largest " - "eigenvalue is more than %.2E times the smallest. " - "Small eigenvalues will be replaced " - "by 0" % max_conditioning, KernelWarning) - self.lambdas_[too_small_lambdas] = 0 + # make sure that there are no numerical or conditioning issues + self.lambdas_ = check_kernel_eigenvalues(self.lambdas_) # sort eigenvectors in descending order indices = self.lambdas_.argsort()[::-1] diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index b0f2c5aeae52a..f4c7cadeeb356 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -1,10 +1,13 @@ import numpy as np import scipy.sparse as sp import pytest +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import KernelWarning +from sklearn.utils.validation import check_kernel_eigenvalues from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, ignore_warnings) + assert_raises, assert_warns) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -225,3 +228,78 @@ def test_nested_circles(): # The data is perfectly linearly separable in that space train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) assert_equal(train_score, 1.0) + + +def test_errors_and_warnings(): + """Tests that bad kernels raise error and warnings""" + + solvers = ['dense', 'arpack', 'randomized'] + solvers_except_arpack = ['dense', 'randomized'] + + # First create an identity transformer class + # ------------------------------------------ + class IdentityKernelTransformer(BaseEstimator, TransformerMixin): + """We will use this transformer so that the passed kernel matrix is + not centered when kPCA is fit""" + + def __init__(self): + pass + + def fit(self, K, y=None): + return self + + def transform(self, K, y=None, copy=True): + return K + + # Significant imaginary parts: error + # ---------------------------------- + # As of today it seems that the kernel matrix is always cast as a float + # whatever the method (precomputed or callable). + # The following test therefore fails ("did not raise"). 
+ K = [[5, 0], + [0, 6 * 1e-5j]] + # for solver in solvers: + # kpca = KernelPCA(kernel=kernel_getter, eigen_solver=solver, + # fit_inverse_transform=False) + # kpca.n_jobs = 1 + # kpca._centerer = IdentityKernelTransformer() + # with pytest.raises(ValueError): + # kpca.fit(K) + # + # For safety concerning future evolutions the corresponding code is left in + # KernelPCA, and we test it directly by calling the inner method here: + with pytest.raises(ValueError): + check_kernel_eigenvalues((K[0][0], K[1][1])) + + # All negative eigenvalues: error + # ------------------------------- + K = [[-5, 0], + [0, -6e-5]] + for solver in solvers: + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, + fit_inverse_transform=False) + kpca._centerer = IdentityKernelTransformer() + with pytest.raises(ValueError): + kpca.fit(K) + + # Significant negative eigenvalue: warning + # ---------------------------------------- + K = [[5, 0], + [0, -6e-5]] + for solver in solvers_except_arpack: + # Note: arpack detects this case and raises an error already + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, + fit_inverse_transform=False) + kpca._centerer = IdentityKernelTransformer() + assert_warns(KernelWarning, lambda: kpca.fit(K)) + + # Bad conditionning + # ----------------- + K = [[5, 0], + [0, 4e-12]] + for solver in solvers_except_arpack: + # Note: arpack detects this case and raises an error already + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, + fit_inverse_transform=False) + kpca._centerer = IdentityKernelTransformer() + assert_warns(KernelWarning, lambda: kpca.fit(K)) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4c22752030703..a33676ab3b9ef 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -11,7 +11,7 @@ from .murmurhash import murmurhash3_32 from .validation import (as_float_array, - assert_all_finite, + assert_all_finite, check_kernel_eigenvalues, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric) @@ -26,7 +26,7 @@ __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", - "check_random_state", + "check_random_state", "check_kernel_eigenvalues", "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index facc51e2c5655..8c5e96cc7c7c5 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -21,7 +21,7 @@ from ..externals import six from ..utils.fixes import signature from .. import get_config as _get_config -from ..exceptions import NonBLASDotWarning +from ..exceptions import NonBLASDotWarning, KernelWarning from ..exceptions import NotFittedError from ..exceptions import DataConversionWarning from ..utils._joblib import Memory @@ -957,3 +957,107 @@ def check_non_negative(X, whom): X = X.data if sp.issparse(X) else X if (X < 0).any(): raise ValueError("Negative values in data passed to %s" % whom) + + +def check_kernel_eigenvalues(lambdas): + """Checks that the provided array of kernel eigenvalues for numerical or + conditioning issues and returns a fixed validated version. + + It checks: + + - that there are no significant imaginary parts in eigenvalues (more than + 1e-5 times the maximum real part). If this check fails, it raises a + ValueError. Otherwise all non-significant imaginary parts that may remain + are removed. 
+ + - that eigenvalues are not all negative. If this check fails, it raises a + ValueError + + - that there are no significant negative eigenvalues (with absolute value + more than 1e-10 and more than 1e-5 times the largest positive eigenvalue). + If this check fails, it raises a KernelWarning. All negative eigenvalues + are set to zero in all cases. + + - that the eigenvalues are well conditioned. That means, that the + eigenvalues are all greater than the maximum eigenvalue divided by 1e12. + If this check fails, it raises a KernelWarning and all the eigenvalues + that are too small are set to zero. + + Note: the returned array is converted to numpy array. + + >>> check_kernel_eigenvalues((1, 2)) # nominal case + array([1, 2]) + >>> check_kernel_eigenvalues((5, 5j)) # significant imag part + Traceback (most recent call last): + ... + ValueError: There are significant imaginary parts in eigenvalues (1.000000 + of the max real part). Something may be wrong with the kernel + >>> check_kernel_eigenvalues((5, 5e-5j)) # insignificant imag part + array([ 5., 0.]) + >>> check_kernel_eigenvalues((-5, -1)) # all negative + Traceback (most recent call last): + ... + ValueError: All eigenvalues are negative (max is -1.000000). Something may + be wrong with the kernel. + >>> check_kernel_eigenvalues((5, -1)) # significant negative + array([5, 0]) + >>> check_kernel_eigenvalues((5, -5e-5)) # insignificant negative + array([ 5., 0.]) + >>> check_kernel_eigenvalues((5, 4e-12)) # bad conditioning (too small) + array([ 5., 0.]) + + Parameters + ---------- + lambdas : array-like + Array of eigenvalues to check / fix. + + Returns + ------- + lambdas_fixed : array-like + The fixed validated array of eigenvalues. + """ + + # Check that there are no significant imaginary parts + if not np.isreal(lambdas).all(): + max_imag_abs = abs(np.imag(lambdas)).max() + max_real_abs = abs(np.real(lambdas)).max() + if max_imag_abs > 1e-5 * max_real_abs: + raise ValueError( + "There are significant imaginary parts in eigenvalues (%f " + "of the max real part). Something may be wrong with the " + "kernel." % (max_imag_abs / max_real_abs)) + + # Remove the insignificant imaginary parts + lambdas = np.real(lambdas) + + # Check that there are no significant negative eigenvalues + max_eig = lambdas.max() + if max_eig < 0: + raise ValueError("All eigenvalues are negative (max is %f). Something " + "may be wrong with the kernel." % max_eig) + + else: + min_eig = lambdas.min() + if -min_eig > 1e-5 * max(max_eig, 0) and -min_eig > 1e-10: + # If kernel has been computed with single precision we would + # probably need more tolerant thresholds such as: + # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) + warnings.warn("There are significant negative eigenvalues " + "(%f of the max positive). Something may be " + "wrong with the kernel. Replacing them by " + "zero." % (-min_eig / max_eig), KernelWarning) + + # Remove all negative values in all cases + lambdas[lambdas < 0] = 0 + + # Finally check for conditioning + max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) + too_small_lambdas = lambdas < max_eig / max_conditioning + if too_small_lambdas.any(): + warnings.warn("The kernel is badly conditioned: the largest " + "eigenvalue is more than %.2E times the smallest. 
" + "Small eigenvalues will be replaced " + "by 0" % max_conditioning, KernelWarning) + lambdas[too_small_lambdas] = 0 + + return lambdas From 5285a4ed703c88615e4edf31da2fa2ff9846078c Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 13:13:40 +0200 Subject: [PATCH 005/106] Added an additional check in the test, in order to understand the strange failure in Travis. --- sklearn/decomposition/tests/test_kernel_pca.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index f4c7cadeeb356..7117ba5cac1c4 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -275,6 +275,10 @@ def transform(self, K, y=None, copy=True): # ------------------------------- K = [[-5, 0], [0, -6e-5]] + # check that the inner method still works + with pytest.raises(ValueError): + check_kernel_eigenvalues((K[0][0], K[1][1])) + for solver in solvers: kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, fit_inverse_transform=False) From 66ce769ff78e806c2868c4880e145673289a68c5 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 13:45:04 +0200 Subject: [PATCH 006/106] Fixed test_errors_and_warnings: a wrong solver name had been introduced ('randomized' is not available in this branch !) --- sklearn/decomposition/tests/test_kernel_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 7117ba5cac1c4..e42f42acff28a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -233,8 +233,8 @@ def test_nested_circles(): def test_errors_and_warnings(): """Tests that bad kernels raise error and warnings""" - solvers = ['dense', 'arpack', 'randomized'] - solvers_except_arpack = ['dense', 'randomized'] + solvers = ['dense', 'arpack'] + solvers_except_arpack = ['dense'] # First create an identity transformer class # ------------------------------------------ From 98ae1acecc4868a59e4826bf02719b37c5c3c8af Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 14:29:21 +0200 Subject: [PATCH 007/106] kPCA test_errors_and_warnings: Added a few extra checks in the test to always check the inner check_kernel_eigenvalues method before the kPCA fit. 
--- sklearn/decomposition/tests/test_kernel_pca.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index e42f42acff28a..eca95bbfe651a 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -231,7 +231,7 @@ def test_nested_circles(): def test_errors_and_warnings(): - """Tests that bad kernels raise error and warnings""" + """Tests that bad kernel decomposition raise error and warnings""" solvers = ['dense', 'arpack'] solvers_except_arpack = ['dense'] @@ -290,6 +290,10 @@ def transform(self, K, y=None, copy=True): # ---------------------------------------- K = [[5, 0], [0, -6e-5]] + # check that the inner method works + assert_warns(KernelWarning, + lambda: check_kernel_eigenvalues((K[0][0], K[1][1]))) + for solver in solvers_except_arpack: # Note: arpack detects this case and raises an error already kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, @@ -297,10 +301,14 @@ def transform(self, K, y=None, copy=True): kpca._centerer = IdentityKernelTransformer() assert_warns(KernelWarning, lambda: kpca.fit(K)) - # Bad conditionning - # ----------------- + # Bad conditioning + # ---------------- K = [[5, 0], [0, 4e-12]] + # check that the inner method works + assert_warns(KernelWarning, + lambda: check_kernel_eigenvalues((K[0][0], K[1][1]))) + for solver in solvers_except_arpack: # Note: arpack detects this case and raises an error already kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, From 12d1a6fd9eee1e7ed51f42c2dd63b84632e07f29 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 14:47:37 +0200 Subject: [PATCH 008/106] Fixed test `test_errors_and_warnings`. Our identity centerer was replaced during call to fit() - we now execute the test directly on `_fit_transform` instead, so that the matrix is untouched. 
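For context: `KernelPCA.fit()` assigns a fresh `KernelCenterer` to `self._centerer`, so the identity centerer installed by the test is discarded and the crafted kernel gets centered anyway. The test now follows roughly this pattern instead (sketch matching the diff below; `solver`, `K` and `IdentityKernelTransformer` are the test's own fixtures):

    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver,
                     fit_inverse_transform=False)
    kpca._centerer = IdentityKernelTransformer()
    K = kpca._get_kernel(K)
    with pytest.raises(ValueError):
        # fit() cannot be used here: it would re-create _centerer and center K
        kpca._fit_transform(K)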
--- .../decomposition/tests/test_kernel_pca.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index eca95bbfe651a..17183a1d22442 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -231,7 +231,7 @@ def test_nested_circles(): def test_errors_and_warnings(): - """Tests that bad kernel decomposition raise error and warnings""" + """Tests that bad kernels raise error and warnings""" solvers = ['dense', 'arpack'] solvers_except_arpack = ['dense'] @@ -261,10 +261,11 @@ def transform(self, K, y=None, copy=True): # for solver in solvers: # kpca = KernelPCA(kernel=kernel_getter, eigen_solver=solver, # fit_inverse_transform=False) - # kpca.n_jobs = 1 # kpca._centerer = IdentityKernelTransformer() + # K = kpca._get_kernel(K) # with pytest.raises(ValueError): - # kpca.fit(K) + # # note: we can not test 'fit' because _centerer would be replaced + # kpca._fit_transform(K) # # For safety concerning future evolutions the corresponding code is left in # KernelPCA, and we test it directly by calling the inner method here: @@ -275,7 +276,7 @@ def transform(self, K, y=None, copy=True): # ------------------------------- K = [[-5, 0], [0, -6e-5]] - # check that the inner method still works + # check that the inner method works with pytest.raises(ValueError): check_kernel_eigenvalues((K[0][0], K[1][1])) @@ -283,8 +284,10 @@ def transform(self, K, y=None, copy=True): kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, fit_inverse_transform=False) kpca._centerer = IdentityKernelTransformer() + K = kpca._get_kernel(K) with pytest.raises(ValueError): - kpca.fit(K) + # note: we can not test 'fit' because _centerer would be replaced + kpca._fit_transform(K) # Significant negative eigenvalue: warning # ---------------------------------------- @@ -299,7 +302,9 @@ def transform(self, K, y=None, copy=True): kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, fit_inverse_transform=False) kpca._centerer = IdentityKernelTransformer() - assert_warns(KernelWarning, lambda: kpca.fit(K)) + K = kpca._get_kernel(K) + # note: we can not test 'fit' because _centerer would be replaced + assert_warns(KernelWarning, lambda: kpca._fit_transform(K)) # Bad conditioning # ---------------- @@ -314,4 +319,6 @@ def transform(self, K, y=None, copy=True): kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, fit_inverse_transform=False) kpca._centerer = IdentityKernelTransformer() - assert_warns(KernelWarning, lambda: kpca.fit(K)) + K = kpca._get_kernel(K) + # note: we can not test 'fit' because _centerer would be replaced + assert_warns(KernelWarning, lambda: kpca._fit_transform(K)) From 381fbb0538416e91633155c85a52d3343fb752d7 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 15:14:21 +0200 Subject: [PATCH 009/106] Fixed doctest in `check_kernel_eigenvalues` --- sklearn/utils/validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 8c5e96cc7c7c5..2c5349df68fdf 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -987,18 +987,18 @@ def check_kernel_eigenvalues(lambdas): >>> check_kernel_eigenvalues((1, 2)) # nominal case array([1, 2]) - >>> check_kernel_eigenvalues((5, 5j)) # significant imag part + >>> check_kernel_eigenvalues((5, 5j)) # doctest: +NORMALIZE_WHITESPACE Traceback (most 
recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 - of the max real part). Something may be wrong with the kernel + of the max real part). Something may be wrong with the kernel >>> check_kernel_eigenvalues((5, 5e-5j)) # insignificant imag part array([ 5., 0.]) - >>> check_kernel_eigenvalues((-5, -1)) # all negative + >>> check_kernel_eigenvalues((-5, -1)) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: All eigenvalues are negative (max is -1.000000). Something may - be wrong with the kernel. + be wrong with the kernel. >>> check_kernel_eigenvalues((5, -1)) # significant negative array([5, 0]) >>> check_kernel_eigenvalues((5, -5e-5)) # insignificant negative From 7d985020a85b39e48ed1da8bf8e323b10170549e Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 15:56:41 +0200 Subject: [PATCH 010/106] Final doctest fixes for `check_kernel_eigenvalues` (locally validated this time :) ). --- sklearn/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2c5349df68fdf..6d03761db062e 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -987,18 +987,20 @@ def check_kernel_eigenvalues(lambdas): >>> check_kernel_eigenvalues((1, 2)) # nominal case array([1, 2]) - >>> check_kernel_eigenvalues((5, 5j)) # doctest: +NORMALIZE_WHITESPACE + >>> check_kernel_eigenvalues((5, 5j)) # significant imag part + ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 - of the max real part). Something may be wrong with the kernel + of the max real part). Something may be wrong with the kernel. >>> check_kernel_eigenvalues((5, 5e-5j)) # insignificant imag part array([ 5., 0.]) - >>> check_kernel_eigenvalues((-5, -1)) # doctest: +NORMALIZE_WHITESPACE + >>> check_kernel_eigenvalues((-5, -1)) # all negative + ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: All eigenvalues are negative (max is -1.000000). Something may - be wrong with the kernel. + be wrong with the kernel. >>> check_kernel_eigenvalues((5, -1)) # significant negative array([5, 0]) >>> check_kernel_eigenvalues((5, -5e-5)) # insignificant negative From ea92a520339b04c5764908c79054efa3a2b3045e Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 25 Sep 2018 16:29:33 +0200 Subject: [PATCH 011/106] pytest doc fix: added normalize whitespace everywhere since there were still errors of this kind. --- sklearn/utils/validation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6d03761db062e..0b2478d90c867 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -986,6 +986,7 @@ def check_kernel_eigenvalues(lambdas): Note: the returned array is converted to numpy array. >>> check_kernel_eigenvalues((1, 2)) # nominal case + ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) >>> check_kernel_eigenvalues((5, 5j)) # significant imag part ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE @@ -994,7 +995,8 @@ def check_kernel_eigenvalues(lambdas): ValueError: There are significant imaginary parts in eigenvalues (1.000000 of the max real part). Something may be wrong with the kernel. >>> check_kernel_eigenvalues((5, 5e-5j)) # insignificant imag part - array([ 5., 0.]) + ... 
# doctest: +NORMALIZE_WHITESPACE + array([5., 0.]) >>> check_kernel_eigenvalues((-5, -1)) # all negative ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): @@ -1002,11 +1004,14 @@ def check_kernel_eigenvalues(lambdas): ValueError: All eigenvalues are negative (max is -1.000000). Something may be wrong with the kernel. >>> check_kernel_eigenvalues((5, -1)) # significant negative + ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) >>> check_kernel_eigenvalues((5, -5e-5)) # insignificant negative - array([ 5., 0.]) + ... # doctest: +NORMALIZE_WHITESPACE + array([5., 0.]) >>> check_kernel_eigenvalues((5, 4e-12)) # bad conditioning (too small) - array([ 5., 0.]) + ... # doctest: +NORMALIZE_WHITESPACE + array([5., 0.]) Parameters ---------- From 9d6ee75c16e97d5de1c5598f06340d76b731269d Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Mon, 8 Oct 2018 10:52:24 +0200 Subject: [PATCH 012/106] Added test corresponding to issue #12141 --- sklearn/decomposition/tests/test_kernel_pca.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index b0f2c5aeae52a..fab3739e8322f 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -140,6 +140,18 @@ def test_remove_zero_eig(): assert_equal(Xt.shape, (3, 0)) +def test_leave_zero_eig(): + """This test checks that fit().transform() returns the same result than + fit_transform() in case of non-removed zero eigenvalue""" + X_fit = np.array([[1, 1], [0, 0]]) + + k = KernelPCA(n_components=2, remove_zero_eig=False, + eigen_solver="dense") + A = k.fit(X_fit).transform(X_fit) + B = k.fit_transform(X_fit) + assert_array_almost_equal(np.abs(A), np.abs(B)) + + def test_kernel_pca_precomputed(): rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) From 3206a438f05a14c944167bb669a4d119138f6784 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 9 Oct 2018 17:52:39 +0200 Subject: [PATCH 013/106] Following code review from @NicolasHug : now only scaling eigenvectors where there is no zero division, to avoid numpy warnings. --- sklearn/decomposition/kernel_pca.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index cc258a786747f..1d4850afcf574 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -322,11 +322,10 @@ def transform(self, X): # Compute centered gram matrix between X and training data X_fit_ K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) - # scale eigenvectors - scaled_alphas = self.alphas_ / np.sqrt(self.lambdas_) - - # properly take null-space into account for the dot product - scaled_alphas[:, self.lambdas_ == 0] = 0 + # scale eigenvectors (properly account for null-space for dot product) + nz = np.flatnonzero(self.lambdas_) + scaled_alphas = np.zeros(self.alphas_.shape) + scaled_alphas[:, nz] = self.alphas_[:, nz] / np.sqrt(self.lambdas_[nz]) # Project by doing a scalar product between K and the scaled eigenvects return np.dot(K, scaled_alphas) From 9fbbbb690b2ad5250575dcdf0f804d71f7db2f41 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Tue, 9 Oct 2018 18:16:48 +0200 Subject: [PATCH 014/106] Following code review from @NicolasHug : now checking that there is no zero division numpy warning. 
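Mechanically (see the diff below): `np.errstate(all='warn')` turns floating-point errors such as division by zero into Python warnings, `pytest.warns(None)` records every warning emitted inside the block, and the test then checks that none of the recorded warnings is a `RuntimeWarning`. A rough sketch of the pattern:

    with pytest.warns(None) as record:
        with np.errstate(all='warn'):
            ...  # the fit / fit_transform / transform calls under test
    for w in record:
        assert w.category is not RuntimeWarning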
--- sklearn/decomposition/tests/test_kernel_pca.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index fab3739e8322f..5c3f31ae4bd92 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, ignore_warnings) + assert_raises, assert_no_warnings) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -145,11 +145,17 @@ def test_leave_zero_eig(): fit_transform() in case of non-removed zero eigenvalue""" X_fit = np.array([[1, 1], [0, 0]]) - k = KernelPCA(n_components=2, remove_zero_eig=False, - eigen_solver="dense") - A = k.fit(X_fit).transform(X_fit) - B = k.fit_transform(X_fit) - assert_array_almost_equal(np.abs(A), np.abs(B)) + # Assert that even with all np warnings on, there is no div by zero warning + with np.errstate(all='warn'): + k = KernelPCA(n_components=2, remove_zero_eig=False, + eigen_solver="dense") + # Fit, then transform + A = assert_no_warnings(assert_no_warnings(k.fit, X_fit) + .transform, X_fit) + # Do both at once + B = assert_no_warnings(k.fit_transform, X_fit) + # Compare + assert_array_almost_equal(np.abs(A), np.abs(B)) def test_kernel_pca_precomputed(): From b09a1e7d582550c0049aca564698e63018df8ad2 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Thu, 11 Oct 2018 10:32:01 +0200 Subject: [PATCH 015/106] Following code review from @NicolasHug : now using pytest warning capture to perform fine-grain warning assertion --- .../decomposition/tests/test_kernel_pca.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 5c3f31ae4bd92..376bd873cb236 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -146,16 +146,23 @@ def test_leave_zero_eig(): X_fit = np.array([[1, 1], [0, 0]]) # Assert that even with all np warnings on, there is no div by zero warning - with np.errstate(all='warn'): - k = KernelPCA(n_components=2, remove_zero_eig=False, - eigen_solver="dense") - # Fit, then transform - A = assert_no_warnings(assert_no_warnings(k.fit, X_fit) - .transform, X_fit) - # Do both at once - B = assert_no_warnings(k.fit_transform, X_fit) - # Compare - assert_array_almost_equal(np.abs(A), np.abs(B)) + with pytest.warns(None) as record: + with np.errstate(all='warn'): + k = KernelPCA(n_components=2, remove_zero_eig=False, + eigen_solver="dense") + # Fit, then transform + A = k.fit(X_fit).transform(X_fit) + # Do both at once + B = k.fit_transform(X_fit) + # Compare + assert_array_almost_equal(np.abs(A), np.abs(B)) + + for w in record: + # There might be warnings about the kernel being badly conditioned, + # but there should not be warnings about division by zero. 
+ # (Numpy division by zero warning can have many message variants, but + # at least we know that it is a RuntimeWarning so lets check only this) + assert w.category is not RuntimeWarning def test_kernel_pca_precomputed(): From 7fe9f06a639eedfedfbbc360b89555014520ad68 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Fri, 12 Oct 2018 09:38:28 +0200 Subject: [PATCH 016/106] Merge was wrong - removing useless import --- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 376bd873cb236..0ee93b1454dd0 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -4,7 +4,7 @@ from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, assert_no_warnings) + assert_raises) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles From 6124783eb970b748b1c8303e389a2d726f998932 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Wed, 17 Oct 2018 09:56:24 +0200 Subject: [PATCH 017/106] Afterthoughts on the need to warn when gram matrix eigenvalues are small: this is most probably a common case especially when the number of samples gets high. Removing the warning by default. --- sklearn/utils/validation.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f6d7620b26ccc..f479803c0e369 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -973,7 +973,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_kernel_eigenvalues(lambdas): +def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): """Checks that the provided array of kernel eigenvalues for numerical or conditioning issues and returns a fixed validated version. @@ -994,8 +994,8 @@ def check_kernel_eigenvalues(lambdas): - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails, it raises a KernelWarning and all the eigenvalues - that are too small are set to zero. + If this check fails and `warn_on_zeros=True`, it raises a KernelWarning. + All the eigenvalues that are too small are then set to zero. Note: the returned array is converted to numpy array. @@ -1032,6 +1032,12 @@ def check_kernel_eigenvalues(lambdas): lambdas : array-like Array of eigenvalues to check / fix. + warn_on_zeros : boolean (default: False) + When this is set to `True`, a `KernelWarning` will be raised when there + are extremely small eigenvalues. Otherwise no warning will be raised. + Note that in both cases, extremely small eigenvalues will be set to + zero. + Returns ------- lambdas_fixed : array-like @@ -1075,10 +1081,11 @@ def check_kernel_eigenvalues(lambdas): max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) too_small_lambdas = lambdas < max_eig / max_conditioning if too_small_lambdas.any(): - warnings.warn("The kernel is badly conditioned: the largest " - "eigenvalue is more than %.2E times the smallest. " - "Small eigenvalues will be replaced " - "by 0" % max_conditioning, KernelWarning) + if warn_on_zeros: + warnings.warn("The kernel is badly conditioned: the largest " + "eigenvalue is more than %.2E times the smallest. 
" + "Small eigenvalues will be replaced " + "by 0" % max_conditioning, KernelWarning) lambdas[too_small_lambdas] = 0 return lambdas From 2112f5d0278e6342d93e9649d3b0da6021d30f98 Mon Sep 17 00:00:00 2001 From: Sylvain Marie Date: Wed, 17 Oct 2018 16:19:30 +0200 Subject: [PATCH 018/106] Fixed test: no warning is raised in presence of small eigenvalues anymore, this is normal behaviour as the number of samples increases. --- sklearn/decomposition/tests/test_kernel_pca.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 2c17e5ce59342..983dbe3eb985e 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -7,7 +7,8 @@ from sklearn.utils.validation import check_kernel_eigenvalues from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, assert_warns) + assert_raises, assert_warns, + assert_no_warnings) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -337,8 +338,11 @@ def transform(self, K, y=None, copy=True): [0, 4e-12]] # check that the inner method works assert_warns(KernelWarning, - lambda: check_kernel_eigenvalues((K[0][0], K[1][1]))) + lambda: check_kernel_eigenvalues((K[0][0], K[1][1]), + warn_on_zeros=True)) + # This is actually a normal behaviour when the number of samples is big, + # so no special warning should be raised in this case for solver in solvers_except_arpack: # Note: arpack detects this case and raises an error already kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, @@ -346,4 +350,4 @@ def transform(self, K, y=None, copy=True): kpca._centerer = IdentityKernelTransformer() K = kpca._get_kernel(K) # note: we can not test 'fit' because _centerer would be replaced - assert_warns(KernelWarning, lambda: kpca._fit_transform(K)) + assert_no_warnings(lambda: kpca._fit_transform(K)) From b62b91c987d0587effb9628f4fffe6959c0251f3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 1 Mar 2019 18:33:10 +0100 Subject: [PATCH 019/106] Improved zero array initialization according to code review https://github.com/scikit-learn/scikit-learn/pull/12145/files/2112f5d0278e6342d93e9649d3b0da6021d30f98#r261052208 --- sklearn/decomposition/kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 46db8be0e0723..635d6de5170a8 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -329,7 +329,7 @@ def transform(self, X): # scale eigenvectors (properly account for null-space for dot product) nz = np.flatnonzero(self.lambdas_) - scaled_alphas = np.zeros(self.alphas_.shape) + scaled_alphas = np.zeros_like(self.alphas_) scaled_alphas[:, nz] = self.alphas_[:, nz] / np.sqrt(self.lambdas_[nz]) # Project by doing a scalar product between K and the scaled eigenvects From 3fdec0ef8f8e3eefa5efdb7833988e0e7079df6e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 1 Mar 2019 18:51:41 +0100 Subject: [PATCH 020/106] Improved import style according to code review https://github.com/scikit-learn/scikit-learn/pull/12145#discussion_r261052208 --- sklearn/decomposition/kernel_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index c06311097b235..ecfdb0200b638 100644 
--- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -9,8 +9,8 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip -from ..utils.validation import check_is_fitted, check_array, \ - check_kernel_eigenvalues +from ..utils.validation import (check_is_fitted, check_array, + check_kernel_eigenvalues) from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer From b8782a7ab1a826f34102600df6fe242ed67aae4c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 2 Mar 2019 14:10:38 +0100 Subject: [PATCH 021/106] Updated what's new for this PR --- doc/whats_new/v0.21.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 6f504a721ec75..fcf57c67d416b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -79,6 +79,13 @@ Support for Python 3.4 and below has been officially dropped. (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). :issue:`13241` by :user:`Aurélien Bellet `. +- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`, `fit().transform()` + now produces the correct output (the same than `fit_transform()`) in case + of non-removed zero eigenvalues (`remove_zero_eig=False`). + `fit_inverse_transform` was also accelerated by using the same trick than + `fit_transform` to compute the transform of `X`. + :issue:`12141` by :user:`Sylvain Marié ` + - |Fix| Fixed a bug in :class:`decomposition.NMF` where `init = 'nndsvd'`, `init = 'nndsvda'`, and `init = 'nndsvdar'` are allowed when `n_components < n_features` instead of From f34883ecf2aece1765b05324bcddfa581924405e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 2 Mar 2019 14:35:35 +0100 Subject: [PATCH 022/106] Updated what's new for this PR --- doc/whats_new/v0.21.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index fcf57c67d416b..6cc81501a20be 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -75,6 +75,14 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.decomposition` ............................ +- |Enhancement| :class:`decomposition.KernelPCA` now properly checks the + eigenvalues found by the solver for numerical or conditioning issues, relying + on new common util `utils.validation.check_kernel_eigenvalues`. This ensures + consistency of results across solvers (different choices for `eigen_solver`), + including approximate solvers such as `'randomized'` and `'lobpcg'` (see + :issue:`12068`). + :issue:`12140` by :user:`Sylvain Marié ` + - |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). :issue:`13241` by :user:`Aurélien Bellet `. From 44d27fb87dc093c8465c1232320364969d060bf9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 14 Mar 2019 18:33:40 +0100 Subject: [PATCH 023/106] Update doc/whats_new/v0.21.rst Co-Authored-By: smarie --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index fcf57c67d416b..bd8e4b20e2d58 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -82,7 +82,7 @@ Support for Python 3.4 and below has been officially dropped. 
- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`, `fit().transform()` now produces the correct output (the same than `fit_transform()`) in case of non-removed zero eigenvalues (`remove_zero_eig=False`). - `fit_inverse_transform` was also accelerated by using the same trick than + `fit_inverse_transform` was also accelerated by using the same trick as `fit_transform` to compute the transform of `X`. :issue:`12141` by :user:`Sylvain Marié ` From 6684f8dcd1ce99231ec7935e29c35e117c33bd8b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 14 Mar 2019 18:36:09 +0100 Subject: [PATCH 024/106] grammar fix --- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index d8044bf3e7351..6d224fab5ff1d 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -156,7 +156,7 @@ def test_remove_zero_eig(): def test_leave_zero_eig(): - """This test checks that fit().transform() returns the same result than + """This test checks that fit().transform() returns the same result as fit_transform() in case of non-removed zero eigenvalue""" X_fit = np.array([[1, 1], [0, 0]]) From b47d53a763c54633f69aaf291111fa1325ad5164 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 14 Mar 2019 18:38:23 +0100 Subject: [PATCH 025/106] Referenced the issue --- sklearn/decomposition/tests/test_kernel_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 6d224fab5ff1d..c9c0de4aa19a9 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -157,7 +157,8 @@ def test_remove_zero_eig(): def test_leave_zero_eig(): """This test checks that fit().transform() returns the same result as - fit_transform() in case of non-removed zero eigenvalue""" + fit_transform() in case of non-removed zero eigenvalue. + Non-regression test for issue #12141 (PR #12143)""" X_fit = np.array([[1, 1], [0, 0]]) # Assert that even with all np warnings on, there is no div by zero warning From b8a8f8919a18910ee75a2f7e1505f794a5166f0c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 14 Mar 2019 18:39:15 +0100 Subject: [PATCH 026/106] Grammar fix --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index bd8e4b20e2d58..e4d41580cb118 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -80,7 +80,7 @@ Support for Python 3.4 and below has been officially dropped. :issue:`13241` by :user:`Aurélien Bellet `. - |Fix| Fixed a bug in :class:`decomposition.KernelPCA`, `fit().transform()` - now produces the correct output (the same than `fit_transform()`) in case + now produces the correct output (the same as `fit_transform()`) in case of non-removed zero eigenvalues (`remove_zero_eig=False`). `fit_inverse_transform` was also accelerated by using the same trick as `fit_transform` to compute the transform of `X`. 
From 5f5604d31048e1a550b8d1d1d692c04417e58b86 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 14 Mar 2019 18:40:04 +0100 Subject: [PATCH 027/106] Issue number replaced with PR number --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index e4d41580cb118..f96e182dd4f0f 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -84,7 +84,7 @@ Support for Python 3.4 and below has been officially dropped. of non-removed zero eigenvalues (`remove_zero_eig=False`). `fit_inverse_transform` was also accelerated by using the same trick as `fit_transform` to compute the transform of `X`. - :issue:`12141` by :user:`Sylvain Marié ` + :issue:`12143` by :user:`Sylvain Marié ` - |Fix| Fixed a bug in :class:`decomposition.NMF` where `init = 'nndsvd'`, `init = 'nndsvda'`, and `init = 'nndsvdar'` are allowed when From 7f40c3e7afd19bda20acd3d65e699cde9f84682e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 15 Mar 2019 09:27:17 +0100 Subject: [PATCH 028/106] renamed `nz` `nonzeros` following review --- sklearn/decomposition/kernel_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 635d6de5170a8..662648cf58ba7 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -328,9 +328,9 @@ def transform(self, X): K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) # scale eigenvectors (properly account for null-space for dot product) - nz = np.flatnonzero(self.lambdas_) + non_zeros = np.flatnonzero(self.lambdas_) scaled_alphas = np.zeros_like(self.alphas_) - scaled_alphas[:, nz] = self.alphas_[:, nz] / np.sqrt(self.lambdas_[nz]) + scaled_alphas[:, non_zeros] = self.alphas_[:, non_zeros] / np.sqrt(self.lambdas_[non_zeros]) # Project by doing a scalar product between K and the scaled eigenvects return np.dot(K, scaled_alphas) From d085cb5b00ff8f4ff47b367171297898ecf92496 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 15 Mar 2019 09:33:56 +0100 Subject: [PATCH 029/106] Fixed 80 characters max issue + improved comment as per review --- sklearn/decomposition/kernel_pca.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 662648cf58ba7..9bb8b4c1f8e61 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -330,9 +330,10 @@ def transform(self, X): # scale eigenvectors (properly account for null-space for dot product) non_zeros = np.flatnonzero(self.lambdas_) scaled_alphas = np.zeros_like(self.alphas_) - scaled_alphas[:, non_zeros] = self.alphas_[:, non_zeros] / np.sqrt(self.lambdas_[non_zeros]) + scaled_alphas[:, non_zeros] = (self.alphas_[:, non_zeros] + / np.sqrt(self.lambdas_[non_zeros])) - # Project by doing a scalar product between K and the scaled eigenvects + # Project with a scalar product between K and the scaled eigenvectors return np.dot(K, scaled_alphas) def inverse_transform(self, X): From 7a45ea25a871473059a70d5eb96ffcc159e9d1b0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 15 Mar 2019 09:38:43 +0100 Subject: [PATCH 030/106] Fixed comment as per review --- sklearn/decomposition/kernel_pca.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 9bb8b4c1f8e61..0d2e0e05c76ea 100644 --- 
a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -241,9 +241,8 @@ def _fit_transform(self, K): # We COULD scale them here: # self.alphas_ = self.alphas_ / np.sqrt(self.lambdas_) # - # But the original choice was to perform that LATER when needed, in - # fit() and in transform(). We keep it to preserve the meaning of - # self.alphas_ across sk-learn versions. + # But choose to perform that LATER when needed, in `fit()` and in + # `transform()`. return K From a6c6ee19b9038ea35f82661931c95fb9e8430c8e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 11:49:44 +0100 Subject: [PATCH 031/106] Fixed comment as per review --- sklearn/decomposition/kernel_pca.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 0d2e0e05c76ea..c1c695c96d82b 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -277,8 +277,7 @@ def fit(self, X, y=None): self._fit_transform(K) if self.fit_inverse_transform: - # Transform X - # (shortcut since we transform the same X that was used to fit) + # no need to use the kernel to transform X, use shortcut expression X_transformed = self.alphas_ * np.sqrt(self.lambdas_) self._fit_inverse_transform(X_transformed, X) @@ -301,8 +300,7 @@ def fit_transform(self, X, y=None, **params): """ self.fit(X, **params) - # Transform X - # (shortcut since we transform the same X that was used to fit) + # no need to use the kernel to transform X, use shortcut expression X_transformed = self.alphas_ * np.sqrt(self.lambdas_) if self.fit_inverse_transform: From 4f647ab71de659938d901b921cfa950f71352672 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 11:50:11 +0100 Subject: [PATCH 032/106] Fixed test as per review --- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index c9c0de4aa19a9..e470ec640a79b 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -178,7 +178,7 @@ def test_leave_zero_eig(): # but there should not be warnings about division by zero. # (Numpy division by zero warning can have many message variants, but # at least we know that it is a RuntimeWarning so lets check only this) - assert w.category is not RuntimeWarning + assert not issubclass(w.category, RuntimeWarning) def test_kernel_pca_precomputed(): From 98b5eeea0ed6afe9e3ac8fbaa9534b290b227b14 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 13:43:43 +0100 Subject: [PATCH 033/106] Fixed `check_kernel_eigenvalues` to return a copy (as per code review) --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5762bb62837dd..a2d3921cec88b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1066,7 +1066,7 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): "kernel." 
% (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts - lambdas = np.real(lambdas) + lambdas = np.real(np.copy(lambdas)) # Check that there are no significant negative eigenvalues max_eig = lambdas.max() From 234a4d1ef8ef7536bd56e6d7a39bb1de54fb5566 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 13:44:17 +0100 Subject: [PATCH 034/106] Fixed `check_kernel_eigenvalues` docstring as per code review --- sklearn/utils/validation.py | 65 +++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a2d3921cec88b..561f0b45463bb 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -985,31 +985,49 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): - """Checks that the provided array of kernel eigenvalues for numerical or + """Checks kernel eigenvalues for conditioning issues. + + Checks the provided array of kernel eigenvalues for numerical or conditioning issues and returns a fixed validated version. It checks: - - that there are no significant imaginary parts in eigenvalues (more than - 1e-5 times the maximum real part). If this check fails, it raises a - ValueError. Otherwise all non-significant imaginary parts that may remain - are removed. + - that there are no significant imaginary parts in eigenvalues (more than + 1e-5 times the maximum real part). If this check fails, it raises a + ValueError. Otherwise all non-significant imaginary parts that may remain + are removed. + + - that eigenvalues are not all negative. If this check fails, it raises a + ValueError - - that eigenvalues are not all negative. If this check fails, it raises a - ValueError + - that there are no significant negative eigenvalues (with absolute value + more than 1e-10 and more than 1e-5 times the largest positive eigenvalue). + If this check fails, it raises a KernelWarning. All negative eigenvalues + are set to zero in all cases. - - that there are no significant negative eigenvalues (with absolute value - more than 1e-10 and more than 1e-5 times the largest positive eigenvalue). - If this check fails, it raises a KernelWarning. All negative eigenvalues - are set to zero in all cases. + - that the eigenvalues are well conditioned. That means, that the + eigenvalues are all greater than the maximum eigenvalue divided by 1e12. + If this check fails and `warn_on_zeros=True`, it raises a KernelWarning. + All the eigenvalues that are too small are then set to zero. + + Parameters + ---------- + lambdas : array-like + Array of eigenvalues to check / fix. - - that the eigenvalues are well conditioned. That means, that the - eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and `warn_on_zeros=True`, it raises a KernelWarning. - All the eigenvalues that are too small are then set to zero. + warn_on_zeros : boolean (default: False) + When this is set to `True`, a `KernelWarning` will be raised when there + are extremely small eigenvalues. Otherwise no warning will be raised. + Note that in both cases, extremely small eigenvalues will be set to + zero. - Note: the returned array is converted to numpy array. + Returns + ------- + lambdas_fixed : array-like + A fixed validated copy of the array of eigenvalues. + Examples + -------- >>> check_kernel_eigenvalues((1, 2)) # nominal case ... 
# doctest: +NORMALIZE_WHITESPACE array([1, 2]) @@ -1038,21 +1056,6 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - Parameters - ---------- - lambdas : array-like - Array of eigenvalues to check / fix. - - warn_on_zeros : boolean (default: False) - When this is set to `True`, a `KernelWarning` will be raised when there - are extremely small eigenvalues. Otherwise no warning will be raised. - Note that in both cases, extremely small eigenvalues will be set to - zero. - - Returns - ------- - lambdas_fixed : array-like - The fixed validated array of eigenvalues. """ # Check that there are no significant imaginary parts From 2a1b4cf258efc6bf173375c414f2d5dac9ce00cd Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 13:44:41 +0100 Subject: [PATCH 035/106] Added name in authors list --- sklearn/utils/validation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 561f0b45463bb..dc4e6c9be871d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -6,6 +6,7 @@ # Lars Buitinck # Alexandre Gramfort # Nicolas Tresegnie +# Sylvain Marie # License: BSD 3 clause import warnings From 04b486e43f513df849cba0e9f676a12b06e3a684 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 19 Mar 2019 13:50:08 +0100 Subject: [PATCH 036/106] Fixed flake8 error --- sklearn/utils/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index dc4e6c9be871d..be02cfeec8c54 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1002,9 +1002,9 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): ValueError - that there are no significant negative eigenvalues (with absolute value - more than 1e-10 and more than 1e-5 times the largest positive eigenvalue). - If this check fails, it raises a KernelWarning. All negative eigenvalues - are set to zero in all cases. + more than 1e-10 and more than 1e-5 times the largest positive + eigenvalue). If this check fails, it raises a KernelWarning. All negative + eigenvalues are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. From 2e326b252fa6ee6624b7b01b754fb191e4a3a81b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 22 Mar 2019 11:48:43 +0100 Subject: [PATCH 037/106] Fixed issue number > PR number --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index a1d8a56811071..44d5c13761c0e 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -95,7 +95,7 @@ Support for Python 3.4 and below has been officially dropped. consistency of results across solvers (different choices for `eigen_solver`), including approximate solvers such as `'randomized'` and `'lobpcg'` (see :issue:`12068`). - :issue:`12140` by :user:`Sylvain Marié ` + :issue:`12145` by :user:`Sylvain Marié ` - |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). 
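For readers following the series, the spectrum cleaning that the `check_kernel_eigenvalues` docstring above describes can be sketched as a small standalone function. This is only an illustrative approximation of the documented checks (imaginary parts, all-negative spectra, clipping of negative eigenvalues, 1e12 conditioning cutoff) under double-precision assumptions; the name `clean_psd_spectrum_sketch` is made up here and is not the helper added by these patches, which additionally emits the warnings discussed above.

import numpy as np

def clean_psd_spectrum_sketch(lambdas):
    # illustrative only -- mirrors the documented checks, without the warnings
    lambdas = np.asarray(lambdas)
    if np.iscomplexobj(lambdas):
        # refuse significant imaginary parts, drop insignificant ones
        if np.abs(lambdas.imag).max() > 1e-5 * np.abs(lambdas.real).max():
            raise ValueError("significant imaginary parts in eigenvalues")
        lambdas = lambdas.real
    max_eig = lambdas.max()
    if max_eig < 0:
        raise ValueError("all eigenvalues are negative")
    # clip negative eigenvalues (significant ones would trigger a warning)
    lambdas = np.where(lambdas < 0, 0, lambdas)
    # zero out eigenvalues that are tiny compared to the largest one
    return np.where(lambdas < max_eig / 1e12, 0, lambdas)

print(clean_psd_spectrum_sketch((1, 2)))      # nominal case, returned unchanged
print(clean_psd_spectrum_sketch((5, -5e-5)))  # insignificant negative -> [5. 0.]
print(clean_psd_spectrum_sketch((5, 4e-12)))  # badly conditioned -> [5. 0.]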
From a817f7a5517a0e79182bb57f4c58a6fca8ea6e83 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 2019 17:23:39 +0100 Subject: [PATCH 038/106] Removed 1 blank line --- sklearn/decomposition/kernel_pca.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 885145443ba91..b0cb40dd0f2c0 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -215,7 +215,6 @@ def _fit_transform(self, K): # make sure that there are no numerical or conditioning issues self.lambdas_ = check_kernel_eigenvalues(self.lambdas_) - # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, np.empty_like(self.alphas_).T) From fecd5d3be8e1d0e336324b7ceb71c1d6516dfbbb Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 2019 17:39:46 +0100 Subject: [PATCH 039/106] As per code review: since `assert_warns` and `assert_no_warnings` are to be rolled out, replaced them with `pytest.warns`. --- .../decomposition/tests/test_kernel_pca.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index ca39e41d49ec4..7dd71b71ccd80 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -7,8 +7,7 @@ from sklearn.utils.validation import check_kernel_eigenvalues from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, - assert_raises, assert_allclose, - assert_warns, assert_no_warnings) + assert_raises, assert_allclose) from sklearn.decomposition import PCA, KernelPCA from sklearn.datasets import make_circles @@ -342,8 +341,9 @@ def transform(self, K, y=None, copy=True): K = [[5, 0], [0, -6e-5]] # check that the inner method works - assert_warns(KernelWarning, - lambda: check_kernel_eigenvalues((K[0][0], K[1][1]))) + w_msg = "There are significant negative eigenvalues" + with pytest.warns(KernelWarning, match=w_msg): + check_kernel_eigenvalues((K[0][0], K[1][1])) for solver in solvers_except_arpack: # Note: arpack detects this case and raises an error already @@ -352,16 +352,17 @@ def transform(self, K, y=None, copy=True): kpca._centerer = IdentityKernelTransformer() K = kpca._get_kernel(K) # note: we can not test 'fit' because _centerer would be replaced - assert_warns(KernelWarning, lambda: kpca._fit_transform(K)) + with pytest.warns(KernelWarning, match=w_msg): + kpca._fit_transform(K) # Bad conditioning # ---------------- K = [[5, 0], [0, 4e-12]] # check that the inner method works - assert_warns(KernelWarning, - lambda: check_kernel_eigenvalues((K[0][0], K[1][1]), - warn_on_zeros=True)) + w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" + with pytest.warns(KernelWarning, match=w_msg): + check_kernel_eigenvalues((K[0][0], K[1][1]), warn_on_zeros=True) # This is actually a normal behaviour when the number of samples is big, # so no special warning should be raised in this case @@ -372,4 +373,6 @@ def transform(self, K, y=None, copy=True): kpca._centerer = IdentityKernelTransformer() K = kpca._get_kernel(K) # note: we can not test 'fit' because _centerer would be replaced - assert_no_warnings(lambda: kpca._fit_transform(K)) + with pytest.warns(None) as w: + kpca._fit_transform(K) + assert not w From 6edf1f658e6310e81acadbc68a0f957977cb1350 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 
2019 18:12:21 +0100 Subject: [PATCH 040/106] As per code review: renamed `check_kernel_eigenvalues` into `check_psd_eigenvalues` and `KernelWarning` into `PSDSpectrumWarning` --- doc/whats_new/v0.21.rst | 2 +- sklearn/decomposition/kernel_pca.py | 4 +- .../decomposition/tests/test_kernel_pca.py | 18 ++--- sklearn/exceptions.py | 11 ++- sklearn/utils/__init__.py | 4 +- sklearn/utils/validation.py | 81 ++++++++++--------- 6 files changed, 67 insertions(+), 53 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 44d5c13761c0e..be4b2eb9857e4 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -91,7 +91,7 @@ Support for Python 3.4 and below has been officially dropped. - |Enhancement| :class:`decomposition.KernelPCA` now properly checks the eigenvalues found by the solver for numerical or conditioning issues, relying - on new common util `utils.validation.check_kernel_eigenvalues`. This ensures + on new common util `utils.validation.check_psd_eigenvalues`. This ensures consistency of results across solvers (different choices for `eigen_solver`), including approximate solvers such as `'randomized'` and `'lobpcg'` (see :issue:`12068`). diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index b0cb40dd0f2c0..2dd2bc6ae71d4 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -10,7 +10,7 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip from ..utils.validation import (check_is_fitted, check_array, - check_kernel_eigenvalues) + check_psd_eigenvalues) from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer @@ -213,7 +213,7 @@ def _fit_transform(self, K): v0=v0) # make sure that there are no numerical or conditioning issues - self.lambdas_ = check_kernel_eigenvalues(self.lambdas_) + self.lambdas_ = check_psd_eigenvalues(self.lambdas_) # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 7dd71b71ccd80..15d256769a0e7 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -3,8 +3,8 @@ import pytest from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.exceptions import KernelWarning -from sklearn.utils.validation import check_kernel_eigenvalues +from sklearn.exceptions import PSDSpectrumWarning +from sklearn.utils.validation import check_psd_eigenvalues from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, assert_raises, assert_allclose) @@ -317,7 +317,7 @@ def transform(self, K, y=None, copy=True): # For safety concerning future evolutions the corresponding code is left in # KernelPCA, and we test it directly by calling the inner method here: with pytest.raises(ValueError): - check_kernel_eigenvalues((K[0][0], K[1][1])) + check_psd_eigenvalues((K[0][0], K[1][1])) # All negative eigenvalues: error # ------------------------------- @@ -325,7 +325,7 @@ def transform(self, K, y=None, copy=True): [0, -6e-5]] # check that the inner method works with pytest.raises(ValueError): - check_kernel_eigenvalues((K[0][0], K[1][1])) + check_psd_eigenvalues((K[0][0], K[1][1])) for solver in solvers: kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, @@ -342,8 +342,8 @@ def transform(self, K, y=None, 
copy=True): [0, -6e-5]] # check that the inner method works w_msg = "There are significant negative eigenvalues" - with pytest.warns(KernelWarning, match=w_msg): - check_kernel_eigenvalues((K[0][0], K[1][1])) + with pytest.warns(PSDSpectrumWarning, match=w_msg): + check_psd_eigenvalues((K[0][0], K[1][1])) for solver in solvers_except_arpack: # Note: arpack detects this case and raises an error already @@ -352,7 +352,7 @@ def transform(self, K, y=None, copy=True): kpca._centerer = IdentityKernelTransformer() K = kpca._get_kernel(K) # note: we can not test 'fit' because _centerer would be replaced - with pytest.warns(KernelWarning, match=w_msg): + with pytest.warns(PSDSpectrumWarning, match=w_msg): kpca._fit_transform(K) # Bad conditioning @@ -361,8 +361,8 @@ def transform(self, K, y=None, copy=True): [0, 4e-12]] # check that the inner method works w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" - with pytest.warns(KernelWarning, match=w_msg): - check_kernel_eigenvalues((K[0][0], K[1][1]), warn_on_zeros=True) + with pytest.warns(PSDSpectrumWarning, match=w_msg): + check_psd_eigenvalues((K[0][0], K[1][1]), warn_on_zeros=True) # This is actually a normal behaviour when the number of samples is big, # so no special warning should be raised in this case diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index c589015fee50a..711af578309dd 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -13,7 +13,7 @@ 'NonBLASDotWarning', 'SkipTestWarning', 'UndefinedMetricWarning', - 'KernelWarning'] + 'PSDSpectrumWarning'] class NotFittedError(ValueError, AttributeError): @@ -157,8 +157,13 @@ class UndefinedMetricWarning(UserWarning): """ -class KernelWarning(UserWarning): - """Custom warning to capture kernel issues +class PSDSpectrumWarning(UserWarning): + """Warning raised when the eigenvalues of a PSD matrix have issues + + This warning is typically raised by `check_psd_eigenvalues` when the + eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix + (kernel) present significant negative eigenvalues, or bad conditioning i.e. + very small non-zero eigenvalues compared to the largest eigenvalue. .. versionadded:: 0.21 """ diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e944a456f44de..6a199ec91460b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -16,7 +16,7 @@ from ..exceptions import DataConversionWarning from .deprecation import deprecated from .validation import (as_float_array, - assert_all_finite, check_kernel_eigenvalues, + assert_all_finite, check_psd_eigenvalues, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric, check_scalar) @@ -57,7 +57,7 @@ class Parallel(_joblib.Parallel): __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", - "check_random_state", "check_kernel_eigenvalues", + "check_random_state", "check_psd_eigenvalues", "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", "check_scalar", 'indexable', diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5a6904947f92c..99a6bc5856775 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -21,7 +21,7 @@ from .fixes import _object_dtype_isnan from .. 
import get_config as _get_config -from ..exceptions import NonBLASDotWarning, KernelWarning +from ..exceptions import NonBLASDotWarning, PSDSpectrumWarning from ..exceptions import NotFittedError from ..exceptions import DataConversionWarning from ._joblib import Memory @@ -969,31 +969,36 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): - """Checks kernel eigenvalues for conditioning issues. +def check_psd_eigenvalues(lambdas, warn_on_zeros=False): + """Checks the eigenvalues of a positive semidefinite (PSD) matrix. - Checks the provided array of kernel eigenvalues for numerical or - conditioning issues and returns a fixed validated version. + Checks the provided array of PSD matrix eigenvalues for numerical or + conditioning issues and returns a fixed validated version. This method + should typically be used if the PSD matrix is user-provided (e.g. a + Gram matrix) or computed using a user-provided dissimilarity metric + (e.g. kernel function), or if the decomposition process uses approximation + methods (randomized SVD, etc.). It checks: - that there are no significant imaginary parts in eigenvalues (more than 1e-5 times the maximum real part). If this check fails, it raises a - ValueError. Otherwise all non-significant imaginary parts that may remain - are removed. + `ValueError`. Otherwise all non-significant imaginary parts that may + remain are removed. - that eigenvalues are not all negative. If this check fails, it raises a - ValueError + `ValueError` - that there are no significant negative eigenvalues (with absolute value more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a KernelWarning. All negative - eigenvalues are set to zero in all cases. + eigenvalue). If this check fails, it raises a `PSDSpectrumWarning`. All + negative eigenvalues (even smaller ones) are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and `warn_on_zeros=True`, it raises a KernelWarning. - All the eigenvalues that are too small are then set to zero. + If this check fails and `warn_on_zeros=True`, it raises a + `PSDSpectrumWarning`. All the eigenvalues that are too small are then set + to zero. Parameters ---------- @@ -1001,10 +1006,10 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): Array of eigenvalues to check / fix. warn_on_zeros : boolean (default: False) - When this is set to `True`, a `KernelWarning` will be raised when there - are extremely small eigenvalues. Otherwise no warning will be raised. - Note that in both cases, extremely small eigenvalues will be set to - zero. + When this is set to `True`, a `PSDSpectrumWarning` will be raised when + there are extremely small eigenvalues. Otherwise no warning will be + raised. Note that in both cases, extremely small eigenvalues will be + set to zero. Returns ------- @@ -1013,31 +1018,32 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): Examples -------- - >>> check_kernel_eigenvalues((1, 2)) # nominal case + >>> check_psd_eigenvalues((1, 2)) # nominal case ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) - >>> check_kernel_eigenvalues((5, 5j)) # significant imag part + >>> check_psd_eigenvalues((5, 5j)) # significant imag part ... 
# doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 - of the max real part). Something may be wrong with the kernel. - >>> check_kernel_eigenvalues((5, 5e-5j)) # insignificant imag part + of the max real part). The matrix is maybe not PSD, or something went + wrong with the eigenvalues decomposition. + >>> check_psd_eigenvalues((5, 5e-5j)) # insignificant imag part ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_kernel_eigenvalues((-5, -1)) # all negative + >>> check_psd_eigenvalues((-5, -1)) # all negative ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: All eigenvalues are negative (max is -1.000000). Something may - be wrong with the kernel. - >>> check_kernel_eigenvalues((5, -1)) # significant negative + ValueError: All eigenvalues are negative (max is -1.000000). The matrix is + maybe not PSD, or something went wrong with the eigenvalues decomposition. + >>> check_psd_eigenvalues((5, -1)) # significant negative ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) - >>> check_kernel_eigenvalues((5, -5e-5)) # insignificant negative + >>> check_psd_eigenvalues((5, -5e-5)) # insignificant negative ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_kernel_eigenvalues((5, 4e-12)) # bad conditioning (too small) + >>> check_psd_eigenvalues((5, 4e-12)) # bad conditioning (too small) ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) @@ -1050,8 +1056,9 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): if max_imag_abs > 1e-5 * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%f " - "of the max real part). Something may be wrong with the " - "kernel." % (max_imag_abs / max_real_abs)) + "of the max real part). The matrix is maybe not PSD, or " + "something went wrong with the eigenvalues decomposition." + "" % (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts lambdas = np.real(np.copy(lambdas)) @@ -1059,8 +1066,9 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): # Check that there are no significant negative eigenvalues max_eig = lambdas.max() if max_eig < 0: - raise ValueError("All eigenvalues are negative (max is %f). Something " - "may be wrong with the kernel." % max_eig) + raise ValueError("All eigenvalues are negative (max is %f). The matrix" + " is maybe not PSD, or something went wrong with the " + "eigenvalues decomposition." % max_eig) else: min_eig = lambdas.min() @@ -1069,9 +1077,10 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): # probably need more tolerant thresholds such as: # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) warnings.warn("There are significant negative eigenvalues " - "(%f of the max positive). Something may be " - "wrong with the kernel. Replacing them by " - "zero." % (-min_eig / max_eig), KernelWarning) + "(%f of the max positive). The matrix is maybe not " + "PSD, or something went wrong with the eigenvalues " + "decomposition. Replacing them with zero." 
+ "" % (-min_eig / max_eig), PSDSpectrumWarning) # Remove all negative values in all cases lambdas[lambdas < 0] = 0 @@ -1081,10 +1090,10 @@ def check_kernel_eigenvalues(lambdas, warn_on_zeros=False): too_small_lambdas = lambdas < max_eig / max_conditioning if too_small_lambdas.any(): if warn_on_zeros: - warnings.warn("The kernel is badly conditioned: the largest " + warnings.warn("Badly conditioned PSD matrix spectrum: the largest " "eigenvalue is more than %.2E times the smallest. " - "Small eigenvalues will be replaced " - "by 0" % max_conditioning, KernelWarning) + "Small eigenvalues will be replaced with 0" + "" % max_conditioning, PSDSpectrumWarning) lambdas[too_small_lambdas] = 0 return lambdas From 2ad637705caaecebf24382ee16cae290f7752708 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 2019 18:14:30 +0100 Subject: [PATCH 041/106] Minor edit: line too long --- sklearn/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 99a6bc5856775..e76eaa4251ac8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1036,7 +1036,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Traceback (most recent call last): ... ValueError: All eigenvalues are negative (max is -1.000000). The matrix is - maybe not PSD, or something went wrong with the eigenvalues decomposition. + maybe not PSD, or something went wrong with the eigenvalues + decomposition. >>> check_psd_eigenvalues((5, -1)) # significant negative ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) From cf2fe2b8275dc7b8aa854fc6ac6afe203e415afc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 2019 21:59:12 +0100 Subject: [PATCH 042/106] Added tests for `check_psd_eigenvalues` --- sklearn/utils/tests/test_validation.py | 65 +++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 67fc9b33d404b..6275b9eed7303 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -40,10 +40,11 @@ check_memory, check_non_negative, _num_samples, - check_scalar) + check_scalar, + check_psd_eigenvalues) import sklearn -from sklearn.exceptions import NotFittedError +from sklearn.exceptions import NotFittedError, PSDSpectrumWarning from sklearn.exceptions import DataConversionWarning from sklearn.utils.testing import assert_raise_message @@ -835,3 +836,63 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, min_val=min_val, max_val=max_val) assert str(raised_error.value) == str(err_msg) assert type(raised_error.value) == type(err_msg) + + +psd_cases = [('nominal', (1., 2.), np.array([1., 2.]), None, ""), + ('insignificant_imag', (5., 5e-5j), np.array([5., 0.]), None, ""), + ('significant neg', (5., -1.), np.array([5., 0.]), + PSDSpectrumWarning, "There are significant negative eigenval"), + ('insignificant neg', (5, -5e-5), np.array([5., 0.]), None, ""),] + + +@pytest.mark.parametrize("input, output, w_type, w_msg", + [c[1:] for c in psd_cases], + ids=[c[0] for c in psd_cases]) +@pytest.mark.parametrize("warn_on_zeros", [True, False]) +def test_check_psd_eigenvalues_valid(input, output, w_type, w_msg, + warn_on_zeros): + """Test that check_psd_eigenvalues returns the right output for valid + input, possibly raising the right warning""" + + with pytest.warns(w_type, match=w_msg) as w: + assert_array_equal( + 
check_psd_eigenvalues(input, warn_on_zeros=warn_on_zeros), output) + if w_type is None: + assert not w + + +def test_check_psd_eigenvalues_bad_conditioning_warning(): + """Test that check_psd_eigenvalues raises a warning for bad conditioning + when warn_on_zeros is set to True, and does not when it is set to False""" + + input = (5, 4e-12) + output = np.array([5., 0.]) + + with pytest.warns(None, ) as w: + assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=False), + output) + assert not w + + w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" + with pytest.warns(PSDSpectrumWarning, match=w_msg) as w: + assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=True), + output) + + +psd_cases2 = [('significant_imag', (5., 5j), + ValueError, "There are significant imaginary parts in eigenv"), + ('all negative', (-5, -1), + ValueError, "All eigenvalues are negative (max is -1.000000)")] + + +@pytest.mark.parametrize("input, err_type, err_msg", + [c[1:] for c in psd_cases2], + ids=[c[0] for c in psd_cases2]) +def test_check_psd_eigenvalues_invalid(input, err_type, err_msg): + """Test that check_psd_eigenvalues raises the right error for invalid + input""" + + with pytest.raises(err_type) as err_info: + check_psd_eigenvalues(input) + + assert err_msg in err_info.value.args[0] From 75c234a666ad44846d9b97f04c86caba740e27f5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 27 Mar 2019 22:00:06 +0100 Subject: [PATCH 043/106] Fixed warning message for bad conditioning: now discarding exact zero eigenvalues. --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e76eaa4251ac8..f854fed5a1655 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1088,7 +1088,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Finally check for conditioning max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) - too_small_lambdas = lambdas < max_eig / max_conditioning + too_small_lambdas = (0 < lambdas) & (lambdas < max_eig / max_conditioning) if too_small_lambdas.any(): if warn_on_zeros: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " From 5669d3f4d634956bd8d695dc8d112a984a3f2325 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 09:42:08 +0200 Subject: [PATCH 044/106] Added `check_psd_eigenvalues` to `classes.rst` for auto documentation --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 7464729a52e0d..917860621475b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1432,6 +1432,7 @@ Low-level methods utils.check_scalar utils.check_consistent_length utils.check_random_state + utils.check_psd_eigenvalues utils.class_weight.compute_class_weight utils.class_weight.compute_sample_weight utils.deprecated From bd99db0f1b916fda817a1845b65e49b40fc6c8e1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 09:47:32 +0200 Subject: [PATCH 045/106] Fixed one-liner docstring as per pep257. 
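The conditioning fix above (PATCH 043, which excludes exact zeros from the "too small" mask) can be illustrated with a tiny numeric example: an eigenvalue that is already exactly zero no longer counts as a conditioning problem, so no spurious warning is emitted. The variable names below (`old_mask`, `new_mask`) are illustrative only and do not appear in the patch.

import numpy as np

lambdas = np.array([5.0, 0.0])   # a spectrum with a legitimate zero eigenvalue
max_eig = lambdas.max()

old_mask = lambdas < max_eig / 1e12                    # flags the exact zero
new_mask = (0 < lambdas) & (lambdas < max_eig / 1e12)  # leaves it alone

print(old_mask)  # the zero eigenvalue is (wrongly) reported as too small
print(new_mask)  # nothing is reported, so warn_on_zeros stays silent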
--- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f854fed5a1655..7cc6e8a0c9614 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -970,7 +970,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def check_psd_eigenvalues(lambdas, warn_on_zeros=False): - """Checks the eigenvalues of a positive semidefinite (PSD) matrix. + """Check the eigenvalues of a positive semidefinite (PSD) matrix. Checks the provided array of PSD matrix eigenvalues for numerical or conditioning issues and returns a fixed validated version. This method From 265be2b2364540dcaf3c7cf618a40d9ad2d28f16 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 09:50:19 +0200 Subject: [PATCH 046/106] Using double barticks in rst docstring as per review --- sklearn/utils/validation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 7cc6e8a0c9614..b20e0f70df9e7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -983,22 +983,22 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): - that there are no significant imaginary parts in eigenvalues (more than 1e-5 times the maximum real part). If this check fails, it raises a - `ValueError`. Otherwise all non-significant imaginary parts that may + ``ValueError``. Otherwise all non-significant imaginary parts that may remain are removed. - that eigenvalues are not all negative. If this check fails, it raises a - `ValueError` + ``ValueError`` - that there are no significant negative eigenvalues (with absolute value more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a `PSDSpectrumWarning`. All + eigenvalue). If this check fails, it raises a ``PSDSpectrumWarning``. All negative eigenvalues (even smaller ones) are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and `warn_on_zeros=True`, it raises a - `PSDSpectrumWarning`. All the eigenvalues that are too small are then set - to zero. + If this check fails and ``warn_on_zeros=True``, it raises a + ``PSDSpectrumWarning``. All the eigenvalues that are too small are then + set to zero. Parameters ---------- @@ -1006,9 +1006,9 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Array of eigenvalues to check / fix. warn_on_zeros : boolean (default: False) - When this is set to `True`, a `PSDSpectrumWarning` will be raised when - there are extremely small eigenvalues. Otherwise no warning will be - raised. Note that in both cases, extremely small eigenvalues will be + When this is set to ``True``, a ``PSDSpectrumWarning`` will be raised + when there are extremely small eigenvalues. Otherwise no warning will + be raised. Note that in both cases, extremely small eigenvalues will be set to zero. 
Returns From cb9993c692cb8ad8caf2b2099482d5976877f01b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 09:51:55 +0200 Subject: [PATCH 047/106] Using double barticks in rst docstring as per review --- sklearn/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 711af578309dd..6d1009f449628 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -160,7 +160,7 @@ class UndefinedMetricWarning(UserWarning): class PSDSpectrumWarning(UserWarning): """Warning raised when the eigenvalues of a PSD matrix have issues - This warning is typically raised by `check_psd_eigenvalues` when the + This warning is typically raised by ``check_psd_eigenvalues`` when the eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix (kernel) present significant negative eigenvalues, or bad conditioning i.e. very small non-zero eigenvalues compared to the largest eigenvalue. From 4dfe22dd2209ed1be40e9aee3694547af59c0f2f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 09:55:17 +0200 Subject: [PATCH 048/106] Using double barticks in rst docstring as per review --- sklearn/decomposition/kernel_pca.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 2dd2bc6ae71d4..d52feb10f7a3b 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -80,14 +80,14 @@ class KernelPCA(BaseEstimator, TransformerMixin): If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``eigen_solver`` == 'arpack'. + by ``np.random``. Used when ``eigen_solver`` == 'arpack'. .. versionadded:: 0.18 copy_X : boolean, default=True - If True, input X is copied and stored by the model in the `X_fit_` + If True, input X is copied and stored by the model in the ``X_fit_`` attribute. If no further changes will be done to X, setting - `copy_X=False` saves memory by storing a reference. + ``copy_X=False`` saves memory by storing a reference. .. versionadded:: 0.18 @@ -103,12 +103,12 @@ class KernelPCA(BaseEstimator, TransformerMixin): ---------- lambdas_ : array, (n_components,) Eigenvalues of the centered kernel matrix in decreasing order. - If `n_components` and `remove_zero_eig` are not set, + If ``n_components`` and ``remove_zero_eig`` are not set, then all values are stored. alphas_ : array, (n_samples, n_components) - Eigenvectors of the centered kernel matrix. If `n_components` and - `remove_zero_eig` are not set, then all components are stored. + Eigenvectors of the centered kernel matrix. If ``n_components`` and + ``remove_zero_eig`` are not set, then all components are stored. dual_coef_ : array, (n_samples, n_features) Inverse transform matrix. Only available when @@ -119,7 +119,7 @@ class KernelPCA(BaseEstimator, TransformerMixin): Only available when ``fit_inverse_transform`` is True. X_fit_ : (n_samples, n_features) - The data used to fit the model. If `copy_X=False`, then `X_fit_` is + The data used to fit the model. If ``copy_X=False``, then ``X_fit_`` is a reference. This attribute is used for the calls to transform. 
Examples From 96033baaf4ac1653aab1b6d7367da5d5eae6be7e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 2 Apr 2019 10:05:07 +0200 Subject: [PATCH 049/106] Update sklearn/utils/validation.py as per code review Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f854fed5a1655..aff7538adb5d7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1005,7 +1005,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): lambdas : array-like Array of eigenvalues to check / fix. - warn_on_zeros : boolean (default: False) + warn_on_zeros : bool, optional (default=False) When this is set to `True`, a `PSDSpectrumWarning` will be raised when there are extremely small eigenvalues. Otherwise no warning will be raised. Note that in both cases, extremely small eigenvalues will be From 5f1db2e6c1950a9ac3601d4d91e1c05a95b722a7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:07:35 +0200 Subject: [PATCH 050/106] End with period (code review) --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4e27d4f08250b..7103e7c6131cd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1093,7 +1093,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): if warn_on_zeros: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " "eigenvalue is more than %.2E times the smallest. " - "Small eigenvalues will be replaced with 0" + "Small eigenvalues will be replaced with 0." "" % max_conditioning, PSDSpectrumWarning) lambdas[too_small_lambdas] = 0 From d40a292e2e62477f46ead8b9fe5616865ee28748 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 2 Apr 2019 10:14:26 +0200 Subject: [PATCH 051/106] Update sklearn/utils/validation.py improved readability Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index aff7538adb5d7..f1a2ceeeeb89a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1073,7 +1073,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): else: min_eig = lambdas.min() - if -min_eig > 1e-5 * max(max_eig, 0) and -min_eig > 1e-10: + if min_eig < -1e-5 * max(max_eig, 0) and min_eig < -1e-10: # If kernel has been computed with single precision we would # probably need more tolerant thresholds such as: # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) From b256bcbcdc53420f6f9f3e93e4c3b020b2957841 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:15:10 +0200 Subject: [PATCH 052/106] max -> maximum (as per review) --- sklearn/utils/validation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 7103e7c6131cd..c668f64e26627 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1057,7 +1057,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): if max_imag_abs > 1e-5 * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%f " - "of the max real part). The matrix is maybe not PSD, or " + "of the maximum real part). The matrix is maybe not PSD, or " "something went wrong with the eigenvalues decomposition." 
"" % (max_imag_abs / max_real_abs)) @@ -1067,9 +1067,9 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Check that there are no significant negative eigenvalues max_eig = lambdas.max() if max_eig < 0: - raise ValueError("All eigenvalues are negative (max is %f). The matrix" - " is maybe not PSD, or something went wrong with the " - "eigenvalues decomposition." % max_eig) + raise ValueError("All eigenvalues are negative (maximum is %f). The " + "matrix is maybe not PSD, or something went wrong " + "with the eigenvalues decomposition." % max_eig) else: min_eig = lambdas.min() @@ -1078,10 +1078,10 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # probably need more tolerant thresholds such as: # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) warnings.warn("There are significant negative eigenvalues " - "(%f of the max positive). The matrix is maybe not " - "PSD, or something went wrong with the eigenvalues " - "decomposition. Replacing them with zero." - "" % (-min_eig / max_eig), PSDSpectrumWarning) + "(%f of the maximum positive). The matrix is maybe " + "not PSD, or something went wrong with the " + "eigenvalues decomposition. Replacing them with " + "zero." % (-min_eig / max_eig), PSDSpectrumWarning) # Remove all negative values in all cases lambdas[lambdas < 0] = 0 From c4b764d21ae95f32cc17bd46babf400328b9dd47 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:33:42 +0200 Subject: [PATCH 053/106] renamed `input` into `lambdas` and `output` into `expected_lambdas`, and transformed cases into dictionaries as per code review --- sklearn/utils/tests/test_validation.py | 46 +++++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6275b9eed7303..67a2d522a25e5 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -838,25 +838,29 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, assert type(raised_error.value) == type(err_msg) -psd_cases = [('nominal', (1., 2.), np.array([1., 2.]), None, ""), - ('insignificant_imag', (5., 5e-5j), np.array([5., 0.]), None, ""), - ('significant neg', (5., -1.), np.array([5., 0.]), - PSDSpectrumWarning, "There are significant negative eigenval"), - ('insignificant neg', (5, -5e-5), np.array([5., 0.]), None, ""),] - - -@pytest.mark.parametrize("input, output, w_type, w_msg", - [c[1:] for c in psd_cases], - ids=[c[0] for c in psd_cases]) +_psd_cases_valid = { + 'nominal': ((1., 2.), np.array([1., 2.]), None, ""), + 'insignificant_imag': ((5., 5e-5j), np.array([5., 0.]), None, ""), + 'significant neg': ((5., -1.), np.array([5., 0.]), PSDSpectrumWarning, + "There are significant negative eigenvalues"), + 'insignificant neg': ((5, -5e-5), np.array([5., 0.]), None, "") +} + + +@pytest.mark.parametrize("lambdas, expected_lambdas, w_type, w_msg", + list(_psd_cases_valid.values()), + ids=list(_psd_cases_valid.keys())) @pytest.mark.parametrize("warn_on_zeros", [True, False]) -def test_check_psd_eigenvalues_valid(input, output, w_type, w_msg, +def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, warn_on_zeros): """Test that check_psd_eigenvalues returns the right output for valid input, possibly raising the right warning""" with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( - check_psd_eigenvalues(input, warn_on_zeros=warn_on_zeros), output) + check_psd_eigenvalues(lambdas, 
warn_on_zeros=warn_on_zeros), + expected_lambdas + ) if w_type is None: assert not w @@ -879,16 +883,18 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): output) -psd_cases2 = [('significant_imag', (5., 5j), - ValueError, "There are significant imaginary parts in eigenv"), - ('all negative', (-5, -1), - ValueError, "All eigenvalues are negative (max is -1.000000)")] +_psd_cases_invalid = { + 'significant_imag': ((5., 5j), ValueError, + "There are significant imaginary parts in eigenv"), + 'all negative': ((-5, -1), ValueError, + "All eigenvalues are negative \\(maximum is -1.000") +} -@pytest.mark.parametrize("input, err_type, err_msg", - [c[1:] for c in psd_cases2], - ids=[c[0] for c in psd_cases2]) -def test_check_psd_eigenvalues_invalid(input, err_type, err_msg): +@pytest.mark.parametrize("lambdas, err_type, err_msg", + list(_psd_cases_invalid.values()), + ids=list(_psd_cases_invalid.keys())) +def test_check_psd_eigenvalues_invalid(lambdas, err_type, err_msg): """Test that check_psd_eigenvalues raises the right error for invalid input""" From 4dca96cbb55a162b72f2133be309a995c010214c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:34:05 +0200 Subject: [PATCH 054/106] using `match` as per review --- sklearn/utils/tests/test_validation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 67a2d522a25e5..2908b2b73ac70 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -898,7 +898,5 @@ def test_check_psd_eigenvalues_invalid(lambdas, err_type, err_msg): """Test that check_psd_eigenvalues raises the right error for invalid input""" - with pytest.raises(err_type) as err_info: - check_psd_eigenvalues(input) - - assert err_msg in err_info.value.args[0] + with pytest.raises(err_type, match=err_msg): + check_psd_eigenvalues(lambdas) From 256bb161425f4734de34795015c3ff196dbec495 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:34:35 +0200 Subject: [PATCH 055/106] changed comment to match expression --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5fabff3bebc8c..165a7ae2eb5eb 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1076,7 +1076,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): if min_eig < -1e-5 * max(max_eig, 0) and min_eig < -1e-10: # If kernel has been computed with single precision we would # probably need more tolerant thresholds such as: - # (-min_eig > 5e-3 * max(max_eig, 0) and -min_eig > 1e-8) + # (min_eig < -5e-3 * max(max_eig, 0) and min_eig < -1e-8) warnings.warn("There are significant negative eigenvalues " "(%f of the maximum positive). 
The matrix is maybe " "not PSD, or something went wrong with the " From 21b3ed31707d89083637edde648dff5f9545aca8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 10:46:07 +0200 Subject: [PATCH 056/106] Removed explicit floats as per review --- sklearn/utils/tests/test_validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 2908b2b73ac70..42122e58a58a1 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -839,11 +839,11 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, _psd_cases_valid = { - 'nominal': ((1., 2.), np.array([1., 2.]), None, ""), - 'insignificant_imag': ((5., 5e-5j), np.array([5., 0.]), None, ""), - 'significant neg': ((5., -1.), np.array([5., 0.]), PSDSpectrumWarning, + 'nominal': ((1, 2), np.array([1, 2]), None, ""), + 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), None, ""), + 'significant neg': ((5, -1), np.array([5, 0]), PSDSpectrumWarning, "There are significant negative eigenvalues"), - 'insignificant neg': ((5, -5e-5), np.array([5., 0.]), None, "") + 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, "") } @@ -870,7 +870,7 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): when warn_on_zeros is set to True, and does not when it is set to False""" input = (5, 4e-12) - output = np.array([5., 0.]) + output = np.array([5, 0]) with pytest.warns(None, ) as w: assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=False), @@ -884,7 +884,7 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): _psd_cases_invalid = { - 'significant_imag': ((5., 5j), ValueError, + 'significant_imag': ((5, 5j), ValueError, "There are significant imaginary parts in eigenv"), 'all negative': ((-5, -1), ValueError, "All eigenvalues are negative \\(maximum is -1.000") From dd7a5a6632223bf9d73b728f1221044ee82b7739 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 11:14:34 +0200 Subject: [PATCH 057/106] As per code review: cleaned the kPCA kernel conditioning test as most of it is in the test of `check_psd_eigenvalues`. 
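As a quick sanity check of the negative-eigenvalue rule discussed above, applied to the two parametrized test cases, (5, -1) is expected to warn while (5, -5e-5) is not. The helper below is illustrative only; it simply re-evaluates the documented double-precision condition outside of the validation code.

def is_significant_negative(min_eig, max_eig):
    # same condition as in the validation code above (double-precision case)
    return min_eig < -1e-5 * max(max_eig, 0) and min_eig < -1e-10

print(is_significant_negative(-1.0, 5.0))   # True  -> PSDSpectrumWarning expected
print(is_significant_negative(-5e-5, 5.0))  # False -> no warning expected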
--- .../decomposition/tests/test_kernel_pca.py | 115 ++---------------- 1 file changed, 12 insertions(+), 103 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 15d256769a0e7..674012d684063 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -1,10 +1,6 @@ import numpy as np import scipy.sparse as sp import pytest -from sklearn.base import BaseEstimator, TransformerMixin - -from sklearn.exceptions import PSDSpectrumWarning -from sklearn.utils.validation import check_psd_eigenvalues from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, assert_raises, assert_allclose) @@ -277,102 +273,15 @@ def test_nested_circles(): assert_equal(train_score, 1.0) -def test_errors_and_warnings(): - """Tests that bad kernels raise error and warnings""" - - solvers = ['dense', 'arpack'] - solvers_except_arpack = ['dense'] - - # First create an identity transformer class - # ------------------------------------------ - class IdentityKernelTransformer(BaseEstimator, TransformerMixin): - """We will use this transformer so that the passed kernel matrix is - not centered when kPCA is fit""" - - def __init__(self): - pass - - def fit(self, K, y=None): - return self - - def transform(self, K, y=None, copy=True): - return K - - # Significant imaginary parts: error - # ---------------------------------- - # As of today it seems that the kernel matrix is always cast as a float - # whatever the method (precomputed or callable). - # The following test therefore fails ("did not raise"). - K = [[5, 0], - [0, 6 * 1e-5j]] - # for solver in solvers: - # kpca = KernelPCA(kernel=kernel_getter, eigen_solver=solver, - # fit_inverse_transform=False) - # kpca._centerer = IdentityKernelTransformer() - # K = kpca._get_kernel(K) - # with pytest.raises(ValueError): - # # note: we can not test 'fit' because _centerer would be replaced - # kpca._fit_transform(K) - # - # For safety concerning future evolutions the corresponding code is left in - # KernelPCA, and we test it directly by calling the inner method here: - with pytest.raises(ValueError): - check_psd_eigenvalues((K[0][0], K[1][1])) - - # All negative eigenvalues: error - # ------------------------------- - K = [[-5, 0], - [0, -6e-5]] - # check that the inner method works - with pytest.raises(ValueError): - check_psd_eigenvalues((K[0][0], K[1][1])) - - for solver in solvers: - kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, - fit_inverse_transform=False) - kpca._centerer = IdentityKernelTransformer() - K = kpca._get_kernel(K) - with pytest.raises(ValueError): - # note: we can not test 'fit' because _centerer would be replaced - kpca._fit_transform(K) - - # Significant negative eigenvalue: warning - # ---------------------------------------- - K = [[5, 0], - [0, -6e-5]] - # check that the inner method works - w_msg = "There are significant negative eigenvalues" - with pytest.warns(PSDSpectrumWarning, match=w_msg): - check_psd_eigenvalues((K[0][0], K[1][1])) - - for solver in solvers_except_arpack: - # Note: arpack detects this case and raises an error already - kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, - fit_inverse_transform=False) - kpca._centerer = IdentityKernelTransformer() - K = kpca._get_kernel(K) - # note: we can not test 'fit' because _centerer would be replaced - with pytest.warns(PSDSpectrumWarning, match=w_msg): - kpca._fit_transform(K) - - # Bad 
conditioning - # ---------------- - K = [[5, 0], - [0, 4e-12]] - # check that the inner method works - w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" - with pytest.warns(PSDSpectrumWarning, match=w_msg): - check_psd_eigenvalues((K[0][0], K[1][1]), warn_on_zeros=True) - - # This is actually a normal behaviour when the number of samples is big, - # so no special warning should be raised in this case - for solver in solvers_except_arpack: - # Note: arpack detects this case and raises an error already - kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, - fit_inverse_transform=False) - kpca._centerer = IdentityKernelTransformer() - K = kpca._get_kernel(K) - # note: we can not test 'fit' because _centerer would be replaced - with pytest.warns(None) as w: - kpca._fit_transform(K) - assert not w +def test_kernel_conditioning(): + """Test that ``check_psd_eigenvalues`` is correctly called.""" + + # create a pathological X leading to small non-zero eigenvalue + X = [[5, 1], + [5+1e-8, 1e-8], + [5+1e-8, 0]] + kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True) + kpca.fit(X) + + # check that the small non-zero eigenvalue was correctly set to zero + assert kpca.lambdas_.min() == 0 From b35bcfb4073f04f44e1f5123b4a4c390773c7ea9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2019 11:25:54 +0200 Subject: [PATCH 058/106] fixed line length --- sklearn/decomposition/tests/test_kernel_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 674012d684063..4485593ed441f 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -280,7 +280,8 @@ def test_kernel_conditioning(): X = [[5, 1], [5+1e-8, 1e-8], [5+1e-8, 0]] - kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True) + kpca = KernelPCA(kernel="linear", n_components=2, + fit_inverse_transform=True) kpca.fit(X) # check that the small non-zero eigenvalue was correctly set to zero From a7c3616597a742051178126668895f5b40255bee Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2019 17:47:06 +0200 Subject: [PATCH 059/106] reverted double barticks fix --- sklearn/decomposition/kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index d52feb10f7a3b..307a1d18bc593 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -80,7 +80,7 @@ class KernelPCA(BaseEstimator, TransformerMixin): If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used - by ``np.random``. Used when ``eigen_solver`` == 'arpack'. + by `np.random`. Used when ``eigen_solver`` == 'arpack'. .. 
versionadded:: 0.18 From 43ea82a7ed8c241fd35cc040169ae54272b65851 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2019 17:48:54 +0200 Subject: [PATCH 060/106] minor test update as per review: added an assert --- sklearn/decomposition/tests/test_kernel_pca.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 4485593ed441f..0c27f3f033e05 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -11,6 +11,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.metrics.pairwise import rbf_kernel +from sklearn.utils import check_psd_eigenvalues def test_kernel_pca(): @@ -286,3 +287,4 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 + assert kpca.lambdas_ == check_psd_eigenvalues(kpca.lambdas_) From f6ee5684dcebb32934623ab9801374983efe4d5d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2019 17:52:55 +0200 Subject: [PATCH 061/106] fixed last test update --- sklearn/decomposition/tests/test_kernel_pca.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 0c27f3f033e05..9f0218dc746c9 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -1,7 +1,8 @@ import numpy as np import scipy.sparse as sp import pytest -from sklearn.utils.testing import (assert_array_almost_equal, assert_less, +from sklearn.utils.testing import (assert_array_equal, + assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, assert_raises, assert_allclose) @@ -287,4 +288,4 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 - assert kpca.lambdas_ == check_psd_eigenvalues(kpca.lambdas_) + assert_array_equal(kpca.lambdas_, check_psd_eigenvalues(kpca.lambdas_)) From 55e63da4a52fafcd666f5bd0c3af3f8f724d14f9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2019 17:56:25 +0200 Subject: [PATCH 062/106] fixed doctest --- sklearn/utils/validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 165a7ae2eb5eb..dc4c8052c5132 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1026,8 +1026,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 - of the max real part). The matrix is maybe not PSD, or something went - wrong with the eigenvalues decomposition. + of the maximum real part). The matrix is maybe not PSD, or something + went wrong with the eigenvalues decomposition. >>> check_psd_eigenvalues((5, 5e-5j)) # insignificant imag part ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) @@ -1035,8 +1035,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: All eigenvalues are negative (max is -1.000000). The matrix is - maybe not PSD, or something went wrong with the eigenvalues + ValueError: All eigenvalues are negative (maximum is -1.000000). 
The matrix + is maybe not PSD, or something went wrong with the eigenvalues decomposition. >>> check_psd_eigenvalues((5, -1)) # significant negative ... # doctest: +NORMALIZE_WHITESPACE From ef07045d2db085a73a542bfe460a22ee984384c7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 5 Apr 2019 09:59:17 +0200 Subject: [PATCH 063/106] added double backticks as per review --- doc/whats_new/v0.21.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 00683481930dc..0a27c5147be65 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -105,10 +105,10 @@ Support for Python 3.4 and below has been officially dropped. - |Enhancement| :class:`decomposition.KernelPCA` now properly checks the eigenvalues found by the solver for numerical or conditioning issues, relying - on new common util `utils.validation.check_psd_eigenvalues`. This ensures - consistency of results across solvers (different choices for `eigen_solver`), - including approximate solvers such as `'randomized'` and `'lobpcg'` (see - :issue:`12068`). + on new common util :func:`utils.validation.check_psd_eigenvalues`. This + ensures consistency of results across solvers (different choices for + ``eigen_solver``), including approximate solvers such as ``'randomized'`` and + ``'lobpcg'`` (see :issue:`12068`). :issue:`12145` by :user:`Sylvain Marié ` - |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output From ba576e4599a7dbd4645d2c9de4c6306f63a7f927 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 5 Apr 2019 10:03:19 +0200 Subject: [PATCH 064/106] Removed double backticks in unrelated places --- sklearn/decomposition/kernel_pca.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 307a1d18bc593..2dd2bc6ae71d4 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -85,9 +85,9 @@ class KernelPCA(BaseEstimator, TransformerMixin): .. versionadded:: 0.18 copy_X : boolean, default=True - If True, input X is copied and stored by the model in the ``X_fit_`` + If True, input X is copied and stored by the model in the `X_fit_` attribute. If no further changes will be done to X, setting - ``copy_X=False`` saves memory by storing a reference. + `copy_X=False` saves memory by storing a reference. .. versionadded:: 0.18 @@ -103,12 +103,12 @@ class KernelPCA(BaseEstimator, TransformerMixin): ---------- lambdas_ : array, (n_components,) Eigenvalues of the centered kernel matrix in decreasing order. - If ``n_components`` and ``remove_zero_eig`` are not set, + If `n_components` and `remove_zero_eig` are not set, then all values are stored. alphas_ : array, (n_samples, n_components) - Eigenvectors of the centered kernel matrix. If ``n_components`` and - ``remove_zero_eig`` are not set, then all components are stored. + Eigenvectors of the centered kernel matrix. If `n_components` and + `remove_zero_eig` are not set, then all components are stored. dual_coef_ : array, (n_samples, n_features) Inverse transform matrix. Only available when @@ -119,7 +119,7 @@ class KernelPCA(BaseEstimator, TransformerMixin): Only available when ``fit_inverse_transform`` is True. X_fit_ : (n_samples, n_features) - The data used to fit the model. If ``copy_X=False``, then ``X_fit_`` is + The data used to fit the model. If `copy_X=False`, then `X_fit_` is a reference. 
This attribute is used for the calls to transform. Examples From 48bf481f1735ebfad38a01a0e66a3342717617b5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 5 Apr 2019 17:20:23 +0200 Subject: [PATCH 065/106] using `assert np.all` instead of `assert_array_equal` as per review --- sklearn/decomposition/tests/test_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 9f0218dc746c9..afa43e8a74089 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -288,4 +288,4 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 - assert_array_equal(kpca.lambdas_, check_psd_eigenvalues(kpca.lambdas_)) + assert np.all(kpca.lambdas_ == check_psd_eigenvalues(kpca.lambdas_)) From 4f3fb72329a89cf83886b8c20636fc212105c48d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 5 Apr 2019 18:13:43 +0200 Subject: [PATCH 066/106] All thresholds are now variables. Also, added a section about precision. --- sklearn/utils/validation.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index dc4c8052c5132..08c6f5669c8b7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1049,12 +1049,31 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): array([5., 0.]) """ + # is the provided array in double precision (float64) ? + is_double_precision = True + try: + is_double_precision = lambdas[0].dtype == np.dtype('f') + except AttributeError: # not np array + pass + except IndexError: # zero-length + pass + # note: the minimum value available is + # - single-precision: np.finfo('float32').eps = 1.2e-07 + # - double-precision: np.finfo('float64').eps = 2.2e-16 + + # the various thresholds used for validation + # we may wish to change the value according to precision. + # currently we do it only for the 'absolute' threshold. + significant_imag_ratio = 1e-5 + significant_neg_ratio = 1e-5 # if is_double_precision else 5e-3 + significant_neg_value = 1e-10 if is_double_precision else 1e-6 + small_pos_ratio = 1e-12 # Check that there are no significant imaginary parts if not np.isreal(lambdas).all(): max_imag_abs = abs(np.imag(lambdas)).max() max_real_abs = abs(np.real(lambdas)).max() - if max_imag_abs > 1e-5 * max_real_abs: + if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%f " "of the maximum real part). The matrix is maybe not PSD, or " @@ -1073,10 +1092,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): else: min_eig = lambdas.min() - if min_eig < -1e-5 * max(max_eig, 0) and min_eig < -1e-10: - # If kernel has been computed with single precision we would - # probably need more tolerant thresholds such as: - # (min_eig < -5e-3 * max(max_eig, 0) and min_eig < -1e-8) + if (min_eig < -significant_neg_ratio * max(max_eig, 0) + and min_eig < -significant_neg_value): warnings.warn("There are significant negative eigenvalues " "(%f of the maximum positive). 
The matrix is maybe " "not PSD, or something went wrong with the " @@ -1087,14 +1104,14 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): lambdas[lambdas < 0] = 0 # Finally check for conditioning - max_conditioning = 1e12 # Max allowed conditioning (ratio big/small) - too_small_lambdas = (0 < lambdas) & (lambdas < max_eig / max_conditioning) + too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) if too_small_lambdas.any(): if warn_on_zeros: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " "eigenvalue is more than %.2E times the smallest. " "Small eigenvalues will be replaced with 0." - "" % max_conditioning, PSDSpectrumWarning) + "" % (1 / small_pos_ratio), + PSDSpectrumWarning) lambdas[too_small_lambdas] = 0 return lambdas From dd1ec9e5790013eee62b7b2cf3a7f3d1b3b778d8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 9 Apr 2019 22:43:49 +0200 Subject: [PATCH 067/106] Update sklearn/utils/validation.py Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 08c6f5669c8b7..226242d831391 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1018,7 +1018,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Examples -------- - >>> check_psd_eigenvalues((1, 2)) # nominal case + >>> check_psd_eigenvalues([1, 2]) # nominal case ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) >>> check_psd_eigenvalues((5, 5j)) # significant imag part From f394570d6263ee2e0fd13fe696c9f47d28a2ccc0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 Apr 2019 17:33:18 +0200 Subject: [PATCH 068/106] Fixed brackets as per review --- sklearn/utils/validation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 226242d831391..b12bce81a17ef 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1021,30 +1021,30 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): >>> check_psd_eigenvalues([1, 2]) # nominal case ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) - >>> check_psd_eigenvalues((5, 5j)) # significant imag part + >>> check_psd_eigenvalues([5, 5j]) # significant imag part ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 of the maximum real part). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. - >>> check_psd_eigenvalues((5, 5e-5j)) # insignificant imag part + >>> check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_psd_eigenvalues((-5, -1)) # all negative + >>> check_psd_eigenvalues([-5, -1]) # all negative ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: All eigenvalues are negative (maximum is -1.000000). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. - >>> check_psd_eigenvalues((5, -1)) # significant negative + >>> check_psd_eigenvalues([5, -1]) # significant negative ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) - >>> check_psd_eigenvalues((5, -5e-5)) # insignificant negative + >>> check_psd_eigenvalues([5, -5e-5]) # insignificant negative ... 
# doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_psd_eigenvalues((5, 4e-12)) # bad conditioning (too small) + >>> check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) From 153c7ec1c853f8af40c26a767799ca8047597700 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 Apr 2019 17:59:33 +0200 Subject: [PATCH 069/106] changed double precision check as per review --- sklearn/utils/validation.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b12bce81a17ef..79fb3fe05773a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1050,13 +1050,12 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): """ # is the provided array in double precision (float64) ? - is_double_precision = True - try: - is_double_precision = lambdas[0].dtype == np.dtype('f') - except AttributeError: # not np array - pass - except IndexError: # zero-length - pass + if isinstance(lambdas, np.ndarray): + is_double_precision = (lambdas.dtype == np.dtype('f')) + else: + # default for non-numpy inputs + is_double_precision = True + # note: the minimum value available is # - single-precision: np.finfo('float32').eps = 1.2e-07 # - double-precision: np.finfo('float64').eps = 2.2e-16 From fda29c8ec702f6c42ae331d0c4a28cc83e63068d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 23 Apr 2019 17:28:19 +0200 Subject: [PATCH 070/106] Fixed double precision check as per review, and added corresponding test. --- sklearn/utils/tests/test_validation.py | 15 ++++++++++++++- sklearn/utils/validation.py | 11 ++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 42122e58a58a1..3548728d39973 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -840,10 +840,23 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, _psd_cases_valid = { 'nominal': ((1, 2), np.array([1, 2]), None, ""), + 'nominal_np_array': (np.array([1, 2]), np.array([1, 2]), None, ""), 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), None, ""), 'significant neg': ((5, -1), np.array([5, 0]), PSDSpectrumWarning, "There are significant negative eigenvalues"), - 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, "") + 'significant neg float32': (np.array([3e-4, -2e-6], dtype=np.float32), + np.array([3e-4, 0], dtype=np.float32), + PSDSpectrumWarning, + "There are significant negative eigenvalues"), + 'significant neg float64': (np.array([1e-5, -2e-10], dtype=np.float64), + np.array([1e-5, 0], dtype=np.float64), + PSDSpectrumWarning, + "There are significant negative eigenvalues"), + 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, ""), + 'insignificant neg float32': (np.array([1, -1e-6], dtype=np.float32), + np.array([1, 0], dtype=np.float32), None, ""), + 'insignificant neg float64': (np.array([1, -1e-10], dtype=np.float64), + np.array([1, 0], dtype=np.float64), None, ""), } diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 79fb3fe05773a..261ee997d0c22 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1051,7 +1051,13 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): """ # is the provided array in double precision (float64) ? 
if isinstance(lambdas, np.ndarray): - is_double_precision = (lambdas.dtype == np.dtype('f')) + # From https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html + # and https://docs.scipy.org/doc/numpy/user/basics.types.html + if lambdas.dtype.kind == 'f': + is_double_precision = (lambdas.dtype.itemsize >= 8) + else: + # default for non-float dtypes + is_double_precision = True else: # default for non-numpy inputs is_double_precision = True @@ -1062,9 +1068,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # the various thresholds used for validation # we may wish to change the value according to precision. - # currently we do it only for the 'absolute' threshold. significant_imag_ratio = 1e-5 - significant_neg_ratio = 1e-5 # if is_double_precision else 5e-3 + significant_neg_ratio = 1e-5 if is_double_precision else 5e-3 significant_neg_value = 1e-10 if is_double_precision else 1e-6 small_pos_ratio = 1e-12 From 3bc1b6c812c96b806742d5add0f363b563f7ed6d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 23 Apr 2019 17:31:45 +0200 Subject: [PATCH 071/106] Fixed line length --- sklearn/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6911759db5392..704fe2cf0da82 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -873,9 +873,11 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, "There are significant negative eigenvalues"), 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, ""), 'insignificant neg float32': (np.array([1, -1e-6], dtype=np.float32), - np.array([1, 0], dtype=np.float32), None, ""), + np.array([1, 0], dtype=np.float32), + None, ""), 'insignificant neg float64': (np.array([1, -1e-10], dtype=np.float64), - np.array([1, 0], dtype=np.float64), None, ""), + np.array([1, 0], dtype=np.float64), + None, ""), } From 25a6908f393c3890449157e9d50bd24e0122d7a3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 23 Apr 2019 17:53:40 +0200 Subject: [PATCH 072/106] Fixed unused import --- sklearn/decomposition/tests/test_kernel_pca.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index afa43e8a74089..6f9d95254a585 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -1,8 +1,7 @@ import numpy as np import scipy.sparse as sp import pytest -from sklearn.utils.testing import (assert_array_equal, - assert_array_almost_equal, assert_less, +from sklearn.utils.testing import (assert_array_almost_equal, assert_less, assert_equal, assert_not_equal, assert_raises, assert_allclose) From ac18f82cdef8cb856f441f69e4b888387adf8fa1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 26 Apr 2019 18:13:38 +0200 Subject: [PATCH 073/106] Update sklearn/utils/tests/test_validation.py Co-Authored-By: smarie --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 704fe2cf0da82..3321a5c1e143d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -906,7 +906,7 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): input = (5, 4e-12) output = np.array([5, 0]) - with pytest.warns(None, ) as w: + with 
pytest.warns(None) as w: assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=False), output) assert not w From 1e0fc68b3a9b8b44fa7c9523a06b75857c22eba5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 26 Apr 2019 18:14:15 +0200 Subject: [PATCH 074/106] Update sklearn/utils/validation.py Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3c5eff8dd3691..bf63a36ed3a89 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1090,7 +1090,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Check that there are no significant imaginary parts if not np.isreal(lambdas).all(): - max_imag_abs = abs(np.imag(lambdas)).max() + max_imag_abs = np.abs(np.imag(lambdas)).max() max_real_abs = abs(np.real(lambdas)).max() if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( From e5d1db4ad8909026edf43d897b38af8e9c618cd6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 26 Apr 2019 18:14:39 +0200 Subject: [PATCH 075/106] Update sklearn/utils/validation.py Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index bf63a36ed3a89..0fbcbd7fbdc06 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1091,7 +1091,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Check that there are no significant imaginary parts if not np.isreal(lambdas).all(): max_imag_abs = np.abs(np.imag(lambdas)).max() - max_real_abs = abs(np.real(lambdas)).max() + max_real_abs = np.abs(np.real(lambdas)).max() if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%f " From e3d5ec062fc481d8f6703e135647723c5f9d3c0a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 26 Apr 2019 18:14:52 +0200 Subject: [PATCH 076/106] Update sklearn/utils/validation.py Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 0fbcbd7fbdc06..b15c0e4e0d2e4 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1017,7 +1017,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Parameters ---------- - lambdas : array-like + lambdas : array-like, shape (n_eigenvalues,) Array of eigenvalues to check / fix. warn_on_zeros : bool, optional (default=False) From 2f4e4e4f84b26e0f455e1a1b558ded4abe5a324d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 26 Apr 2019 18:15:30 +0200 Subject: [PATCH 077/106] Update sklearn/utils/validation.py Co-Authored-By: smarie --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b15c0e4e0d2e4..1e5fd94c529ac 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1028,7 +1028,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Returns ------- - lambdas_fixed : array-like + lambdas_fixed : ndarray, shape (n_eigenvalues,) A fixed validated copy of the array of eigenvalues. 
Examples From 9a0753ff2c5d4a4cfd83ec94e14738c9317431d0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 26 Apr 2019 18:22:09 +0200 Subject: [PATCH 078/106] `PSDSpectrumWarning` renamed `PositiveSpectrumWarning` as per review --- sklearn/exceptions.py | 4 ++-- sklearn/utils/tests/test_validation.py | 10 +++++----- sklearn/utils/validation.py | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 6d1009f449628..139f06b628331 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -13,7 +13,7 @@ 'NonBLASDotWarning', 'SkipTestWarning', 'UndefinedMetricWarning', - 'PSDSpectrumWarning'] + 'PositiveSpectrumWarning'] class NotFittedError(ValueError, AttributeError): @@ -157,7 +157,7 @@ class UndefinedMetricWarning(UserWarning): """ -class PSDSpectrumWarning(UserWarning): +class PositiveSpectrumWarning(UserWarning): """Warning raised when the eigenvalues of a PSD matrix have issues This warning is typically raised by ``check_psd_eigenvalues`` when the diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 3321a5c1e143d..defd2d7d03b25 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -44,7 +44,7 @@ check_psd_eigenvalues) import sklearn -from sklearn.exceptions import NotFittedError, PSDSpectrumWarning +from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning from sklearn.exceptions import DataConversionWarning from sklearn.utils.testing import assert_raise_message @@ -861,15 +861,15 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, 'nominal': ((1, 2), np.array([1, 2]), None, ""), 'nominal_np_array': (np.array([1, 2]), np.array([1, 2]), None, ""), 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), None, ""), - 'significant neg': ((5, -1), np.array([5, 0]), PSDSpectrumWarning, + 'significant neg': ((5, -1), np.array([5, 0]), PositiveSpectrumWarning, "There are significant negative eigenvalues"), 'significant neg float32': (np.array([3e-4, -2e-6], dtype=np.float32), np.array([3e-4, 0], dtype=np.float32), - PSDSpectrumWarning, + PositiveSpectrumWarning, "There are significant negative eigenvalues"), 'significant neg float64': (np.array([1e-5, -2e-10], dtype=np.float64), np.array([1e-5, 0], dtype=np.float64), - PSDSpectrumWarning, + PositiveSpectrumWarning, "There are significant negative eigenvalues"), 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, ""), 'insignificant neg float32': (np.array([1, -1e-6], dtype=np.float32), @@ -912,7 +912,7 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): assert not w w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" - with pytest.warns(PSDSpectrumWarning, match=w_msg) as w: + with pytest.warns(PositiveSpectrumWarning, match=w_msg) as w: assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=True), output) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1e5fd94c529ac..9257a2be79cd1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -21,7 +21,7 @@ from .fixes import _object_dtype_isnan from .. 
import get_config as _get_config -from ..exceptions import NonBLASDotWarning, PSDSpectrumWarning +from ..exceptions import NonBLASDotWarning, PositiveSpectrumWarning from ..exceptions import NotFittedError from ..exceptions import DataConversionWarning from ._joblib import Memory @@ -1006,13 +1006,13 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): - that there are no significant negative eigenvalues (with absolute value more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a ``PSDSpectrumWarning``. All + eigenvalue). If this check fails, it raises a ``PositiveSpectrumWarning``. All negative eigenvalues (even smaller ones) are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. If this check fails and ``warn_on_zeros=True``, it raises a - ``PSDSpectrumWarning``. All the eigenvalues that are too small are then + ``PositiveSpectrumWarning``. All the eigenvalues that are too small are then set to zero. Parameters @@ -1021,7 +1021,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Array of eigenvalues to check / fix. warn_on_zeros : bool, optional (default=False) - When this is set to ``True``, a ``PSDSpectrumWarning`` will be raised + When this is set to ``True``, a ``PositiveSpectrumWarning`` will be raised when there are extremely small eigenvalues. Otherwise no warning will be raised. Note that in both cases, extremely small eigenvalues will be set to zero. @@ -1117,7 +1117,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): "(%f of the maximum positive). The matrix is maybe " "not PSD, or something went wrong with the " "eigenvalues decomposition. Replacing them with " - "zero." % (-min_eig / max_eig), PSDSpectrumWarning) + "zero." % (-min_eig / max_eig), PositiveSpectrumWarning) # Remove all negative values in all cases lambdas[lambdas < 0] = 0 @@ -1130,7 +1130,7 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): "eigenvalue is more than %.2E times the smallest. " "Small eigenvalues will be replaced with 0." "" % (1 / small_pos_ratio), - PSDSpectrumWarning) + PositiveSpectrumWarning) lambdas[too_small_lambdas] = 0 return lambdas From 187c37888024b219a5c10bcbdf486baf6ad3d514 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 26 Apr 2019 19:07:53 +0200 Subject: [PATCH 079/106] Hopefully fixed the line length warning --- sklearn/utils/validation.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9257a2be79cd1..b6cd1339fd0b8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1006,14 +1006,14 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): - that there are no significant negative eigenvalues (with absolute value more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a ``PositiveSpectrumWarning``. All - negative eigenvalues (even smaller ones) are set to zero in all cases. + eigenvalue). If this check fails, it raises a ``PositiveSpectrumWarning``. + All negative eigenvalues (even smaller ones) are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. If this check fails and ``warn_on_zeros=True``, it raises a - ``PositiveSpectrumWarning``. 
All the eigenvalues that are too small are then - set to zero. + ``PositiveSpectrumWarning``. All the eigenvalues that are too small are + then set to zero. Parameters ---------- @@ -1021,10 +1021,10 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Array of eigenvalues to check / fix. warn_on_zeros : bool, optional (default=False) - When this is set to ``True``, a ``PositiveSpectrumWarning`` will be raised - when there are extremely small eigenvalues. Otherwise no warning will - be raised. Note that in both cases, extremely small eigenvalues will be - set to zero. + When this is set to ``True``, a ``PositiveSpectrumWarning`` will be + raised when there are extremely small eigenvalues. Otherwise no warning + will be raised. Note that in both cases, extremely small eigenvalues + will be set to zero. Returns ------- @@ -1116,8 +1116,8 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): warnings.warn("There are significant negative eigenvalues " "(%f of the maximum positive). The matrix is maybe " "not PSD, or something went wrong with the " - "eigenvalues decomposition. Replacing them with " - "zero." % (-min_eig / max_eig), PositiveSpectrumWarning) + "eigenvalues decomposition. Replacing them with zero." + "" % (-min_eig / max_eig), PositiveSpectrumWarning) # Remove all negative values in all cases lambdas[lambdas < 0] = 0 From e301a34af427694040dddda73e4d28f175776bea Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 26 Apr 2019 19:22:33 +0200 Subject: [PATCH 080/106] Fixed line length --- sklearn/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b6cd1339fd0b8..e709364850d01 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1006,8 +1006,9 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): - that there are no significant negative eigenvalues (with absolute value more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a ``PositiveSpectrumWarning``. - All negative eigenvalues (even smaller ones) are set to zero in all cases. + eigenvalue). If this check fails, it raises a + ``PositiveSpectrumWarning``. All negative eigenvalues (even smaller ones) + are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. @@ -1116,8 +1117,9 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): warnings.warn("There are significant negative eigenvalues " "(%f of the maximum positive). The matrix is maybe " "not PSD, or something went wrong with the " - "eigenvalues decomposition. Replacing them with zero." - "" % (-min_eig / max_eig), PositiveSpectrumWarning) + "eigenvalues decomposition. Replacing them with " + "zero." % (-min_eig / max_eig), + PositiveSpectrumWarning) # Remove all negative values in all cases lambdas[lambdas < 0] = 0 From f5d21e4c7075af258e2999c17750d14d3c06793b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 22 May 2019 10:10:26 +0200 Subject: [PATCH 081/106] `check_psd_eigenvalues` is now private: renamed to `_check_psd_eigenvalues` and removed from `classes.rst` and `utils/__init__.py`. 
--- doc/modules/classes.rst | 1 - doc/whats_new/v0.21.rst | 2 +- sklearn/decomposition/kernel_pca.py | 4 ++-- .../decomposition/tests/test_kernel_pca.py | 6 +++--- sklearn/exceptions.py | 2 +- sklearn/utils/__init__.py | 5 ++--- sklearn/utils/tests/test_validation.py | 19 ++++++++++--------- sklearn/utils/validation.py | 16 ++++++++-------- 8 files changed, 27 insertions(+), 28 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 12a0f50ef9420..e06e5139ea8f3 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1433,7 +1433,6 @@ Low-level methods utils.check_scalar utils.check_consistent_length utils.check_random_state - utils.check_psd_eigenvalues utils.class_weight.compute_class_weight utils.class_weight.compute_sample_weight utils.deprecated diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index b4bf0cac9a9a7..dfb48fd1212b5 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -118,7 +118,7 @@ Support for Python 3.4 and below has been officially dropped. - |Enhancement| :class:`decomposition.KernelPCA` now properly checks the eigenvalues found by the solver for numerical or conditioning issues, relying - on new common util :func:`utils.validation.check_psd_eigenvalues`. This + on new common util :func:`utils.validation._check_psd_eigenvalues`. This ensures consistency of results across solvers (different choices for ``eigen_solver``), including approximate solvers such as ``'randomized'`` and ``'lobpcg'`` (see :issue:`12068`). diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 2dd2bc6ae71d4..244845e2428a7 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -10,7 +10,7 @@ from ..utils import check_random_state from ..utils.extmath import svd_flip from ..utils.validation import (check_is_fitted, check_array, - check_psd_eigenvalues) + _check_psd_eigenvalues) from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer @@ -213,7 +213,7 @@ def _fit_transform(self, K): v0=v0) # make sure that there are no numerical or conditioning issues - self.lambdas_ = check_psd_eigenvalues(self.lambdas_) + self.lambdas_ = _check_psd_eigenvalues(self.lambdas_) # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 6f9d95254a585..ac21690c144e7 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -11,7 +11,7 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.metrics.pairwise import rbf_kernel -from sklearn.utils import check_psd_eigenvalues +from sklearn.utils.validation import _check_psd_eigenvalues def test_kernel_pca(): @@ -275,7 +275,7 @@ def test_nested_circles(): def test_kernel_conditioning(): - """Test that ``check_psd_eigenvalues`` is correctly called.""" + """Test that ``_check_psd_eigenvalues`` is correctly called.""" # create a pathological X leading to small non-zero eigenvalue X = [[5, 1], @@ -287,4 +287,4 @@ def test_kernel_conditioning(): # check that the small non-zero eigenvalue was correctly set to zero assert kpca.lambdas_.min() == 0 - assert np.all(kpca.lambdas_ == check_psd_eigenvalues(kpca.lambdas_)) + assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_)) diff --git 
a/sklearn/exceptions.py b/sklearn/exceptions.py index 139f06b628331..1012ceacdea5f 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -160,7 +160,7 @@ class UndefinedMetricWarning(UserWarning): class PositiveSpectrumWarning(UserWarning): """Warning raised when the eigenvalues of a PSD matrix have issues - This warning is typically raised by ``check_psd_eigenvalues`` when the + This warning is typically raised by ``_check_psd_eigenvalues`` when the eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix (kernel) present significant negative eigenvalues, or bad conditioning i.e. very small non-zero eigenvalues compared to the largest eigenvalue. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 5d3ebe754f78d..1c8c7565ef91c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -17,8 +17,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated -from .validation import (as_float_array, - assert_all_finite, check_psd_eigenvalues, +from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric, check_scalar) @@ -59,7 +58,7 @@ class Parallel(_joblib.Parallel): __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", - "check_random_state", "check_psd_eigenvalues", + "check_random_state", "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", "check_scalar", 'indexable', diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index defd2d7d03b25..9a1afdd43f46e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -41,7 +41,7 @@ check_non_negative, _num_samples, check_scalar, - check_psd_eigenvalues) + _check_psd_eigenvalues) import sklearn from sklearn.exceptions import NotFittedError, PositiveSpectrumWarning @@ -887,12 +887,12 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, @pytest.mark.parametrize("warn_on_zeros", [True, False]) def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, warn_on_zeros): - """Test that check_psd_eigenvalues returns the right output for valid + """Test that ``_check_psd_eigenvalues`` returns the right output for valid input, possibly raising the right warning""" with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( - check_psd_eigenvalues(lambdas, warn_on_zeros=warn_on_zeros), + _check_psd_eigenvalues(lambdas, warn_on_zeros=warn_on_zeros), expected_lambdas ) if w_type is None: @@ -900,20 +900,21 @@ def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, def test_check_psd_eigenvalues_bad_conditioning_warning(): - """Test that check_psd_eigenvalues raises a warning for bad conditioning - when warn_on_zeros is set to True, and does not when it is set to False""" + """Test that ``_check_psd_eigenvalues`` raises a warning for bad + conditioning when warn_on_zeros is set to True, and does not when it is set + to False""" input = (5, 4e-12) output = np.array([5, 0]) with pytest.warns(None) as w: - assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=False), + assert_array_equal(_check_psd_eigenvalues(input, warn_on_zeros=False), output) assert not w w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" with pytest.warns(PositiveSpectrumWarning, 
match=w_msg) as w: - assert_array_equal(check_psd_eigenvalues(input, warn_on_zeros=True), + assert_array_equal(_check_psd_eigenvalues(input, warn_on_zeros=True), output) @@ -929,8 +930,8 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): list(_psd_cases_invalid.values()), ids=list(_psd_cases_invalid.keys())) def test_check_psd_eigenvalues_invalid(lambdas, err_type, err_msg): - """Test that check_psd_eigenvalues raises the right error for invalid + """Test that ``_check_psd_eigenvalues`` raises the right error for invalid input""" with pytest.raises(err_type, match=err_msg): - check_psd_eigenvalues(lambdas) + _check_psd_eigenvalues(lambdas) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e709364850d01..c120fa15012cb 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -984,7 +984,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def check_psd_eigenvalues(lambdas, warn_on_zeros=False): +def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): """Check the eigenvalues of a positive semidefinite (PSD) matrix. Checks the provided array of PSD matrix eigenvalues for numerical or @@ -1034,33 +1034,33 @@ def check_psd_eigenvalues(lambdas, warn_on_zeros=False): Examples -------- - >>> check_psd_eigenvalues([1, 2]) # nominal case + >>> _check_psd_eigenvalues([1, 2]) # nominal case ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) - >>> check_psd_eigenvalues([5, 5j]) # significant imag part + >>> _check_psd_eigenvalues([5, 5j]) # significant imag part ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 of the maximum real part). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. - >>> check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part + >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_psd_eigenvalues([-5, -1]) # all negative + >>> _check_psd_eigenvalues([-5, -1]) # all negative ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: All eigenvalues are negative (maximum is -1.000000). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. - >>> check_psd_eigenvalues([5, -1]) # significant negative + >>> _check_psd_eigenvalues([5, -1]) # significant negative ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) - >>> check_psd_eigenvalues([5, -5e-5]) # insignificant negative + >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) - >>> check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) + >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) ... 
# doctest: +NORMALIZE_WHITESPACE array([5., 0.]) From 9442d881cab46aeb6bcaad2b44ef4ee097ca2c35 Mon Sep 17 00:00:00 2001 From: smarie Date: Tue, 2 Jul 2019 09:22:02 +0200 Subject: [PATCH 082/106] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index c75b1958647aa..4c84f1cad1b0c 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1097,7 +1097,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): "There are significant imaginary parts in eigenvalues (%f " "of the maximum real part). The matrix is maybe not PSD, or " "something went wrong with the eigenvalues decomposition." - "" % (max_imag_abs / max_real_abs)) + % (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts lambdas = np.real(np.copy(lambdas)) From 907e9b24656a246f4c87612913d982ace285c82c Mon Sep 17 00:00:00 2001 From: smarie Date: Tue, 2 Jul 2019 09:23:27 +0200 Subject: [PATCH 083/106] Update doc/whats_new/v0.21.rst Co-Authored-By: Guillaume Lemaitre --- doc/whats_new/v0.21.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 467a74300ae0c..0ae7b6c8ece34 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -264,7 +264,7 @@ Support for Python 3.4 and below has been officially dropped. ensures consistency of results across solvers (different choices for ``eigen_solver``), including approximate solvers such as ``'randomized'`` and ``'lobpcg'`` (see :issue:`12068`). - :issue:`12145` by :user:`Sylvain Marié ` + :pr:`12145` by :user:`Sylvain Marié ` - |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). From 788d40d9f7909f8db8b54bf0e6d7ab6f3b0ecdae Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Jul 2019 09:28:02 +0200 Subject: [PATCH 084/106] Fixed lint issue introduced by code review --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4c84f1cad1b0c..1fc980db86dd4 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1097,7 +1097,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): "There are significant imaginary parts in eigenvalues (%f " "of the maximum real part). The matrix is maybe not PSD, or " "something went wrong with the eigenvalues decomposition." - % (max_imag_abs / max_real_abs)) + % (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts lambdas = np.real(np.copy(lambdas)) From 71dd21f287a949dca5bfd74fd004cfdcdb824a07 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 24 Oct 2019 14:32:04 -0400 Subject: [PATCH 085/106] moved whatsnew to 22 --- doc/whats_new/v0.21.rst | 8 -------- doc/whats_new/v0.22.rst | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index f4258e082db62..588299546fb42 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -333,14 +333,6 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.decomposition` ............................ 
-- |Enhancement| :class:`decomposition.KernelPCA` now properly checks the - eigenvalues found by the solver for numerical or conditioning issues, relying - on new common util :func:`utils.validation._check_psd_eigenvalues`. This - ensures consistency of results across solvers (different choices for - ``eigen_solver``), including approximate solvers such as ``'randomized'`` and - ``'lobpcg'`` (see :issue:`12068`). - :pr:`12145` by :user:`Sylvain Marié ` - - |Enhancement| :class:`decomposition.KernelPCA` now has deterministic output (resolved sign ambiguity in eigenvalue decomposition of the kernel matrix). :pr:`13241` by :user:`Aurélien Bellet `. diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 6e331f6d0f1f8..ff0752cea7f7c 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -113,6 +113,14 @@ Changelog :mod:`sklearn.cross_decomposition` .................................. +- |Enhancement| :class:`decomposition.KernelPCA` now properly checks the + eigenvalues found by the solver for numerical or conditioning issues, relying + on new common util :func:`utils.validation._check_psd_eigenvalues`. This + ensures consistency of results across solvers (different choices for + ``eigen_solver``), including approximate solvers such as ``'randomized'`` and + ``'lobpcg'`` (see :issue:`12068`). + :pr:`12145` by :user:`Sylvain Marié ` + - |Fix| Fixed a bug where :class:`cross_decomposition.PLSCanonical` and :class:`cross_decomposition.PLSRegression` were raising an error when fitted with a target matrix `Y` in which the first column was constant. From cb9b29c981aa63bfd6b1c0a473026c66561eaf95 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 24 Oct 2019 14:59:18 -0400 Subject: [PATCH 086/106] simplify double precision detection --- sklearn/utils/validation.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9f032acd8fa73..0f0a30890929c 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1100,18 +1100,9 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): array([5., 0.]) """ - # is the provided array in double precision (float64) ? 
- if isinstance(lambdas, np.ndarray): - # From https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html - # and https://docs.scipy.org/doc/numpy/user/basics.types.html - if lambdas.dtype.kind == 'f': - is_double_precision = (lambdas.dtype.itemsize >= 8) - else: - # default for non-float dtypes - is_double_precision = True - else: - # default for non-numpy inputs - is_double_precision = True + + lambdas = np.array(lambdas) + is_double_precision = lambdas.dtype == np.float64 # note: the minimum value available is # - single-precision: np.finfo('float32').eps = 1.2e-07 @@ -1136,7 +1127,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): % (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts - lambdas = np.real(np.copy(lambdas)) + lambdas = np.real(lambdas) # Check that there are no significant negative eigenvalues max_eig = lambdas.max() From 4615bc79d634d8440b0533684fe23e84740c8644 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 24 Oct 2019 15:01:46 -0400 Subject: [PATCH 087/106] reduced diff --- sklearn/utils/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 49bdd297dcc23..b4df37945ab20 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -912,8 +912,8 @@ def __init__(self): [(3, int, 2, 5), (2.5, float, 2, 5)]) def test_check_scalar_valid(x, target_type, min_val, max_val): - # Test that check_scalar returns no error/warning if valid inputs are - # provided + """ Test that check_scalar returns no error/warning if valid inputs are + provided """ with pytest.warns(None) as record: check_scalar(x, "test_name", target_type, min_val, max_val) assert len(record) == 0 @@ -930,8 +930,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): ValueError('`test_name3`= 5, must be <= 4.'))]) def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, err_msg): - # Test that check_scalar returns the right error if a wrong input is - # given + """ Test that check_scalar returns the right error if a wrong input is + given""" with pytest.raises(Exception) as raised_error: check_scalar(x, target_name, target_type=target_type, min_val=min_val, max_val=max_val) From 56a37d7e74147d74fedded808622d3c3840e86a3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 24 Oct 2019 15:02:52 -0400 Subject: [PATCH 088/106] update whatsnew --- doc/whats_new/v0.22.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index ff0752cea7f7c..0ff874dc0a5ee 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -114,8 +114,7 @@ Changelog .................................. - |Enhancement| :class:`decomposition.KernelPCA` now properly checks the - eigenvalues found by the solver for numerical or conditioning issues, relying - on new common util :func:`utils.validation._check_psd_eigenvalues`. This + eigenvalues found by the solver for numerical or conditioning issues. This ensures consistency of results across solvers (different choices for ``eigen_solver``), including approximate solvers such as ``'randomized'`` and ``'lobpcg'`` (see :issue:`12068`). 
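For readers following the refactor of the precision check in the patches above, here is a minimal standalone sketch (not part of the patch series; the helper name is purely illustrative) showing how NumPy classifies the kinds of inputs exercised by these tests once the value is coerced with np.array, which is what the simplified check relies on:

    import numpy as np

    def is_double_precision(lambdas):
        # Illustrative only: coerce to ndarray first, then compare the dtype.
        return np.array(lambdas).dtype == np.float64

    print(is_double_precision((5.0, -5e-5)))                      # True  -> Python floats coerce to float64
    print(is_double_precision(np.array([1, -1e-6], np.float32)))  # False -> a float32 array keeps its dtype
    print(is_double_precision([5, 5e-5j]))                        # False -> complex128; imaginary parts handled separately
    print(is_double_precision([1, 2]))                            # False -> integer dtype, not float64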
From d0cb0339952b9c7e3a00c5649d1d924601776d04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 24 Oct 2019 15:03:33 -0400 Subject: [PATCH 089/106] reduce diff again --- sklearn/utils/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b4df37945ab20..e533e106966d8 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -912,8 +912,8 @@ def __init__(self): [(3, int, 2, 5), (2.5, float, 2, 5)]) def test_check_scalar_valid(x, target_type, min_val, max_val): - """ Test that check_scalar returns no error/warning if valid inputs are - provided """ + """Test that check_scalar returns no error/warning if valid inputs are + provided""" with pytest.warns(None) as record: check_scalar(x, "test_name", target_type, min_val, max_val) assert len(record) == 0 @@ -930,8 +930,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): ValueError('`test_name3`= 5, must be <= 4.'))]) def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, err_msg): - """ Test that check_scalar returns the right error if a wrong input is - given""" + """Test that check_scalar returns the right error if a wrong input is + given""" with pytest.raises(Exception) as raised_error: check_scalar(x, target_name, target_type=target_type, min_val=min_val, max_val=max_val) From aa21b9f4bb0f6364ce0c609dd250447f2d8aa5da Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 1 Nov 2019 11:26:59 -0400 Subject: [PATCH 090/106] Addressed most comments --- sklearn/exceptions.py | 2 +- sklearn/utils/validation.py | 23 ++++++++--------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index a682a3efae479..590fa416dd807 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -182,5 +182,5 @@ class PositiveSpectrumWarning(UserWarning): (kernel) present significant negative eigenvalues, or bad conditioning i.e. very small non-zero eigenvalues compared to the largest eigenvalue. - .. versionadded:: 0.21 + .. versionadded:: 0.22 """ diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 59769b8eb82b7..f107e7bd52154 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1041,11 +1041,11 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): - that eigenvalues are not all negative. If this check fails, it raises a ``ValueError`` - - that there are no significant negative eigenvalues (with absolute value - more than 1e-10 and more than 1e-5 times the largest positive - eigenvalue). If this check fails, it raises a - ``PositiveSpectrumWarning``. All negative eigenvalues (even smaller ones) - are set to zero in all cases. + - that there are no significant negative eigenvalues with absolute value + more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest + positive eigenvalue in double (simple) precision. If this check fails, + it raises a ``PositiveSpectrumWarning``. All negative eigenvalues + (even smaller ones) are set to zero in all cases. - that the eigenvalues are well conditioned. That means, that the eigenvalues are all greater than the maximum eigenvalue divided by 1e12. @@ -1055,10 +1055,10 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): Parameters ---------- - lambdas : array-like, shape (n_eigenvalues,) + lambdas : array-like of shape (n_eigenvalues,) Array of eigenvalues to check / fix. 
- warn_on_zeros : bool, optional (default=False) + warn_on_zeros : bool, default=False When this is set to ``True``, a ``PositiveSpectrumWarning`` will be raised when there are extremely small eigenvalues. Otherwise no warning will be raised. Note that in both cases, extremely small eigenvalues @@ -1066,39 +1066,32 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): Returns ------- - lambdas_fixed : ndarray, shape (n_eigenvalues,) + lambdas_fixed : ndarray of shape (n_eigenvalues,) A fixed validated copy of the array of eigenvalues. Examples -------- >>> _check_psd_eigenvalues([1, 2]) # nominal case - ... # doctest: +NORMALIZE_WHITESPACE array([1, 2]) >>> _check_psd_eigenvalues([5, 5j]) # significant imag part - ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 of the maximum real part). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part - ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) >>> _check_psd_eigenvalues([-5, -1]) # all negative - ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: All eigenvalues are negative (maximum is -1.000000). The matrix is maybe not PSD, or something went wrong with the eigenvalues decomposition. >>> _check_psd_eigenvalues([5, -1]) # significant negative - ... # doctest: +NORMALIZE_WHITESPACE array([5, 0]) >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative - ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) - ... # doctest: +NORMALIZE_WHITESPACE array([5., 0.]) """ From 072910a966fae406a35b37c164204e550b2e7606 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 5 Nov 2019 08:53:11 -0500 Subject: [PATCH 091/106] update error message --- sklearn/utils/validation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f107e7bd52154..c35ea70b09946 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1127,9 +1127,10 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Check that there are no significant negative eigenvalues max_eig = lambdas.max() if max_eig < 0: - raise ValueError("All eigenvalues are negative (maximum is %f). The " - "matrix is maybe not PSD, or something went wrong " - "with the eigenvalues decomposition." % max_eig) + raise ValueError("All eigenvalues are negative (maximum is %f). " + "Either the matrix is not PSD, or there was an " + "issue while computing the eigendecomposition of " + "the matrix" % max_eig) else: min_eig = lambdas.min() From f87ec5a858f32fa325794b5be02fd417926e00dc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 6 Nov 2019 13:26:55 +0100 Subject: [PATCH 092/106] Fixed error message (2) and updated doctests accordingly --- sklearn/utils/validation.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index c35ea70b09946..b57131bce5a5f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1077,16 +1077,16 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): Traceback (most recent call last): ... ValueError: There are significant imaginary parts in eigenvalues (1.000000 - of the maximum real part). 
The matrix is maybe not PSD, or something - went wrong with the eigenvalues decomposition. + of the maximum real part). Either the matrix is not PSD, or there was + an issue while computing the eigendecomposition of the matrix. >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part array([5., 0.]) >>> _check_psd_eigenvalues([-5, -1]) # all negative Traceback (most recent call last): ... - ValueError: All eigenvalues are negative (maximum is -1.000000). The matrix - is maybe not PSD, or something went wrong with the eigenvalues - decomposition. + ValueError: All eigenvalues are negative (maximum is -1.000000). Either the + matrix is not PSD, or there was an issue while computing the + eigendecomposition of the matrix. >>> _check_psd_eigenvalues([5, -1]) # significant negative array([5, 0]) >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative @@ -1117,8 +1117,9 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%f " - "of the maximum real part). The matrix is maybe not PSD, or " - "something went wrong with the eigenvalues decomposition." + "of the maximum real part). Either the matrix is not PSD, or " + "there was an issue while computing the eigendecomposition of " + "the matrix." % (max_imag_abs / max_real_abs)) # Remove the insignificant imaginary parts @@ -1130,7 +1131,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): raise ValueError("All eigenvalues are negative (maximum is %f). " "Either the matrix is not PSD, or there was an " "issue while computing the eigendecomposition of " - "the matrix" % max_eig) + "the matrix." % max_eig) else: min_eig = lambdas.min() From a2d01930d627792ef13b704147da3c1655c59dbd Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 6 Nov 2019 17:29:15 +0100 Subject: [PATCH 093/106] Renamed `warn_on_zeros` to `bad_conditioning_warning` as per review, and made its default value to `True`. 
--- sklearn/decomposition/_kernel_pca.py | 3 ++- sklearn/utils/tests/test_validation.py | 11 ++++++----- sklearn/utils/validation.py | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index c7ab03fec23c3..4e8a6c3a395e7 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -213,7 +213,8 @@ def _fit_transform(self, K): v0=v0) # make sure that there are no numerical or conditioning issues - self.lambdas_ = _check_psd_eigenvalues(self.lambdas_) + self.lambdas_ = _check_psd_eigenvalues(self.lambdas_, + bad_conditioning_warning=False) # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 42d9f9ec8bc58..70591ba2121fb 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -974,7 +974,8 @@ def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( - _check_psd_eigenvalues(lambdas, warn_on_zeros=warn_on_zeros), + _check_psd_eigenvalues(lambdas, + bad_conditioning_warning=warn_on_zeros), expected_lambdas ) if w_type is None: @@ -990,14 +991,14 @@ def test_check_psd_eigenvalues_bad_conditioning_warning(): output = np.array([5, 0]) with pytest.warns(None) as w: - assert_array_equal(_check_psd_eigenvalues(input, warn_on_zeros=False), - output) + checked = _check_psd_eigenvalues(input, bad_conditioning_warning=False) + assert_array_equal(checked, output) assert not w w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" with pytest.warns(PositiveSpectrumWarning, match=w_msg) as w: - assert_array_equal(_check_psd_eigenvalues(input, warn_on_zeros=True), - output) + checked = _check_psd_eigenvalues(input, bad_conditioning_warning=True) + assert_array_equal(checked, output) _psd_cases_invalid = { diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b57131bce5a5f..1c6d92c1e8210 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1021,7 +1021,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): +def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): """Check the eigenvalues of a positive semidefinite (PSD) matrix. Checks the provided array of PSD matrix eigenvalues for numerical or @@ -1058,7 +1058,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): lambdas : array-like of shape (n_eigenvalues,) Array of eigenvalues to check / fix. - warn_on_zeros : bool, default=False + bad_conditioning_warning : bool, default=True When this is set to ``True``, a ``PositiveSpectrumWarning`` will be raised when there are extremely small eigenvalues. Otherwise no warning will be raised. Note that in both cases, extremely small eigenvalues @@ -1150,7 +1150,7 @@ def _check_psd_eigenvalues(lambdas, warn_on_zeros=False): # Finally check for conditioning too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) if too_small_lambdas.any(): - if warn_on_zeros: + if bad_conditioning_warning: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " "eigenvalue is more than %.2E times the smallest. " "Small eigenvalues will be replaced with 0." 
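A minimal, self-contained sketch of the behaviour covered by the test above; the
import paths of the private helper and of ``PositiveSpectrumWarning`` are assumed
to be ``sklearn.utils.validation`` and ``sklearn.exceptions``, as in this branch:

    import warnings
    import numpy as np
    from sklearn.utils.validation import _check_psd_eigenvalues
    from sklearn.exceptions import PositiveSpectrumWarning

    lambdas = (5, 4e-12)  # largest / smallest ratio is above 1e12
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fixed = _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True)

    assert np.array_equal(fixed, np.array([5, 0]))   # tiny eigenvalue zeroed
    assert any(issubclass(w.category, PositiveSpectrumWarning) for w in caught)

With ``bad_conditioning_warning=False`` (as ``KernelPCA`` now calls it) the same
zeroing happens silently.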
From a433b9a0d750afc07709e2ab831f0b9af5edce8f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 6 Nov 2019 17:37:11 +0100 Subject: [PATCH 094/106] Fixed comment for clarity on why we use `_check_psd_eigenvalues` --- sklearn/decomposition/_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 4e8a6c3a395e7..fe95616619672 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -212,7 +212,7 @@ def _fit_transform(self, K): maxiter=self.max_iter, v0=v0) - # make sure that there are no numerical or conditioning issues + # make sure that the eigenvalues are ok and fix numerical issues self.lambdas_ = _check_psd_eigenvalues(self.lambdas_, bad_conditioning_warning=False) From bd132895159e1395acd267814aab0aeaf1e6d687 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 6 Nov 2019 18:21:59 +0100 Subject: [PATCH 095/106] Minor docstring update --- sklearn/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1c6d92c1e8210..eca87079ff3f0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1047,9 +1047,9 @@ def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): it raises a ``PositiveSpectrumWarning``. All negative eigenvalues (even smaller ones) are set to zero in all cases. - - that the eigenvalues are well conditioned. That means, that the + - that the eigenvalues are well conditioned. That means, that the non-zero eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and ``warn_on_zeros=True``, it raises a + If this check fails and ``bad_conditioning_warning=True``, it raises a ``PositiveSpectrumWarning``. All the eigenvalues that are too small are then set to zero. From 25c5251e7bf20aa044dc0321e89616d717d112dd Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 11:59:39 +0100 Subject: [PATCH 096/106] `bad_conditioning_warning` renamed to `small_nonzeros_warning` because "bad conditioning" does not really apply when there are also exact zeros. 
This warning is now raised if there are small non-zero **negative or positive** eigenvalues (the warning was raised for positive small previously but not for negative small, which was not consistent) --- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/utils/tests/test_validation.py | 16 ++++++++-------- sklearn/utils/validation.py | 25 +++++++++++++------------ 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index fe95616619672..d0292e8ed5d2d 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -214,7 +214,7 @@ def _fit_transform(self, K): # make sure that the eigenvalues are ok and fix numerical issues self.lambdas_ = _check_psd_eigenvalues(self.lambdas_, - bad_conditioning_warning=False) + small_nonzeros_warning=False) # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 70591ba2121fb..1554a218b4939 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -966,38 +966,38 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, @pytest.mark.parametrize("lambdas, expected_lambdas, w_type, w_msg", list(_psd_cases_valid.values()), ids=list(_psd_cases_valid.keys())) -@pytest.mark.parametrize("warn_on_zeros", [True, False]) +@pytest.mark.parametrize("small_nonzeros_warning", [True, False]) def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, - warn_on_zeros): + small_nonzeros_warning): # Test that ``_check_psd_eigenvalues`` returns the right output for valid # input, possibly raising the right warning with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( _check_psd_eigenvalues(lambdas, - bad_conditioning_warning=warn_on_zeros), + small_nonzeros_warning=small_nonzeros_warning), expected_lambdas ) if w_type is None: assert not w -def test_check_psd_eigenvalues_bad_conditioning_warning(): +def test_check_psd_eigenvalues_small_nonzeros_warning(): # Test that ``_check_psd_eigenvalues`` raises a warning for bad - # conditioning when warn_on_zeros is set to True, and does not when it is - # set to False + # conditioning when small_nonzeros_warning is set to True, and does not + # when it is set to False input = (5, 4e-12) output = np.array([5, 0]) with pytest.warns(None) as w: - checked = _check_psd_eigenvalues(input, bad_conditioning_warning=False) + checked = _check_psd_eigenvalues(input, small_nonzeros_warning=False) assert_array_equal(checked, output) assert not w w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" with pytest.warns(PositiveSpectrumWarning, match=w_msg) as w: - checked = _check_psd_eigenvalues(input, bad_conditioning_warning=True) + checked = _check_psd_eigenvalues(input, small_nonzeros_warning=True) assert_array_equal(checked, output) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index eca87079ff3f0..3aa4f07b6f4a9 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1021,7 +1021,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): +def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=True): """Check the eigenvalues of a positive semidefinite (PSD) matrix. 
Checks the provided array of PSD matrix eigenvalues for numerical or @@ -1049,7 +1049,7 @@ def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): - that the eigenvalues are well conditioned. That means, that the non-zero eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and ``bad_conditioning_warning=True``, it raises a + If this check fails and ``small_nonzeros_warning=True``, it raises a ``PositiveSpectrumWarning``. All the eigenvalues that are too small are then set to zero. @@ -1058,11 +1058,11 @@ def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): lambdas : array-like of shape (n_eigenvalues,) Array of eigenvalues to check / fix. - bad_conditioning_warning : bool, default=True + small_nonzeros_warning : bool, default=True When this is set to ``True``, a ``PositiveSpectrumWarning`` will be - raised when there are extremely small eigenvalues. Otherwise no warning - will be raised. Note that in both cases, extremely small eigenvalues - will be set to zero. + raised when there are extremely small non-zero eigenvalues (positive or + negative). Otherwise no warning will be raised in this case. Note that + in both cases, extremely small eigenvalues will be set to zero. Returns ------- @@ -1144,13 +1144,11 @@ def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): "zero." % (-min_eig / max_eig), PositiveSpectrumWarning) - # Remove all negative values in all cases - lambdas[lambdas < 0] = 0 - - # Finally check for conditioning - too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) + # Check for conditioning (both positive and negative small non-zeros) + too_small_lambdas = ((0 != lambdas) + & (np.abs(lambdas) < small_pos_ratio * max_eig)) if too_small_lambdas.any(): - if bad_conditioning_warning: + if small_nonzeros_warning: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " "eigenvalue is more than %.2E times the smallest. " "Small eigenvalues will be replaced with 0." 
@@ -1158,6 +1156,9 @@ def _check_psd_eigenvalues(lambdas, bad_conditioning_warning=True): PositiveSpectrumWarning) lambdas[too_small_lambdas] = 0 + # Finally remove all negative values (the big ones were remaining) + lambdas[lambdas < 0] = 0 + return lambdas From 13e2ba6de478ce6c3d2b59c84e1ab07eebdc51c1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 14:29:07 +0100 Subject: [PATCH 097/106] fixed linter error --- sklearn/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 1554a218b4939..aa27e876989ce 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -974,8 +974,10 @@ def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( - _check_psd_eigenvalues(lambdas, - small_nonzeros_warning=small_nonzeros_warning), + _check_psd_eigenvalues( + lambdas, + small_nonzeros_warning=small_nonzeros_warning + ), expected_lambdas ) if w_type is None: From 71ed9fdc94bee57fae7df2b5b509fba0ee9db81c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 16:19:48 +0100 Subject: [PATCH 098/106] `_check_psd_eigenvalues` : as per code review, `small_nonzeros_warning` is now `False` by default --- sklearn/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3aa4f07b6f4a9..fa68dc3ffc8ef 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1021,7 +1021,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=True): +def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): """Check the eigenvalues of a positive semidefinite (PSD) matrix. Checks the provided array of PSD matrix eigenvalues for numerical or @@ -1058,7 +1058,7 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=True): lambdas : array-like of shape (n_eigenvalues,) Array of eigenvalues to check / fix. - small_nonzeros_warning : bool, default=True + small_nonzeros_warning : bool, default=False When this is set to ``True``, a ``PositiveSpectrumWarning`` will be raised when there are extremely small non-zero eigenvalues (positive or negative). Otherwise no warning will be raised in this case. Note that From 78936b550b264f165fe5f2f6bbb13c6f38c3472b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 19:18:14 +0100 Subject: [PATCH 099/106] (as per code review) * Renamed `small_nonzeros_warning` into `enable_warnings`. * now consistent warnings are raised for all three cases (imaginary parts, negative, small non zero), and the parameter disables all of them. 
* improved string formatting using `%g` instead of `%f` or other things --- sklearn/decomposition/_kernel_pca.py | 2 +- sklearn/utils/tests/test_validation.py | 70 ++++++++----------- sklearn/utils/validation.py | 97 +++++++++++++++----------- 3 files changed, 87 insertions(+), 82 deletions(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index d0292e8ed5d2d..169b0942e74bd 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -214,7 +214,7 @@ def _fit_transform(self, K): # make sure that the eigenvalues are ok and fix numerical issues self.lambdas_ = _check_psd_eigenvalues(self.lambdas_, - small_nonzeros_warning=False) + enable_warnings=False) # flip eigenvectors' sign to enforce deterministic output self.alphas_, _ = svd_flip(self.alphas_, diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index aa27e876989ce..ef869957be7e9 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -942,72 +942,60 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, _psd_cases_valid = { 'nominal': ((1, 2), np.array([1, 2]), None, ""), 'nominal_np_array': (np.array([1, 2]), np.array([1, 2]), None, ""), - 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), None, ""), - 'significant neg': ((5, -1), np.array([5, 0]), PositiveSpectrumWarning, - "There are significant negative eigenvalues"), - 'significant neg float32': (np.array([3e-4, -2e-6], dtype=np.float32), - np.array([3e-4, 0], dtype=np.float32), - PositiveSpectrumWarning, - "There are significant negative eigenvalues"), - 'significant neg float64': (np.array([1e-5, -2e-10], dtype=np.float64), - np.array([1e-5, 0], dtype=np.float64), - PositiveSpectrumWarning, - "There are significant negative eigenvalues"), - 'insignificant neg': ((5, -5e-5), np.array([5, 0]), None, ""), + 'insignificant_imag': ((5, 5e-5j), np.array([5, 0]), + PositiveSpectrumWarning, + "There are imaginary parts in eigenvalues " + "\\(1e\\-05 of the maximum real part"), + 'insignificant neg': ((5, -5e-5), np.array([5, 0]), + PositiveSpectrumWarning, ""), 'insignificant neg float32': (np.array([1, -1e-6], dtype=np.float32), np.array([1, 0], dtype=np.float32), - None, ""), + PositiveSpectrumWarning, + "There are negative eigenvalues \\(1e\\-06 " + "of the maximum positive"), 'insignificant neg float64': (np.array([1, -1e-10], dtype=np.float64), np.array([1, 0], dtype=np.float64), - None, ""), + PositiveSpectrumWarning, + "There are negative eigenvalues \\(1e\\-10 " + "of the maximum positive"), } @pytest.mark.parametrize("lambdas, expected_lambdas, w_type, w_msg", list(_psd_cases_valid.values()), ids=list(_psd_cases_valid.keys())) -@pytest.mark.parametrize("small_nonzeros_warning", [True, False]) +@pytest.mark.parametrize("enable_warnings", [True, False]) def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, - small_nonzeros_warning): + enable_warnings): # Test that ``_check_psd_eigenvalues`` returns the right output for valid # input, possibly raising the right warning + if not enable_warnings: + w_type = None + w_msg = "" + with pytest.warns(w_type, match=w_msg) as w: assert_array_equal( - _check_psd_eigenvalues( - lambdas, - small_nonzeros_warning=small_nonzeros_warning - ), + _check_psd_eigenvalues(lambdas, enable_warnings=enable_warnings), expected_lambdas ) - if w_type is None: + if w_type is None or not enable_warnings: assert not w -def 
test_check_psd_eigenvalues_small_nonzeros_warning(): - # Test that ``_check_psd_eigenvalues`` raises a warning for bad - # conditioning when small_nonzeros_warning is set to True, and does not - # when it is set to False - - input = (5, 4e-12) - output = np.array([5, 0]) - - with pytest.warns(None) as w: - checked = _check_psd_eigenvalues(input, small_nonzeros_warning=False) - assert_array_equal(checked, output) - assert not w - - w_msg = "the largest eigenvalue is more than 1.00E\\+12 times the smallest" - with pytest.warns(PositiveSpectrumWarning, match=w_msg) as w: - checked = _check_psd_eigenvalues(input, small_nonzeros_warning=True) - assert_array_equal(checked, output) - - _psd_cases_invalid = { 'significant_imag': ((5, 5j), ValueError, "There are significant imaginary parts in eigenv"), 'all negative': ((-5, -1), ValueError, - "All eigenvalues are negative \\(maximum is -1.000") + "All eigenvalues are negative \\(maximum is -1"), + 'significant neg': ((5, -1), ValueError, + "There are significant negative eigenvalues"), + 'significant neg float32': (np.array([3e-4, -2e-6], dtype=np.float32), + ValueError, + "There are significant negative eigenvalues"), + 'significant neg float64': (np.array([1e-5, -2e-10], dtype=np.float64), + ValueError, + "There are significant negative eigenvalues"), } diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index fa68dc3ffc8ef..fc2e316d5a064 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1021,7 +1021,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): +def _check_psd_eigenvalues(lambdas, enable_warnings=False): """Check the eigenvalues of a positive semidefinite (PSD) matrix. Checks the provided array of PSD matrix eigenvalues for numerical or @@ -1031,12 +1031,13 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): (e.g. kernel function), or if the decomposition process uses approximation methods (randomized SVD, etc.). - It checks: + It checks for three things: - that there are no significant imaginary parts in eigenvalues (more than 1e-5 times the maximum real part). If this check fails, it raises a ``ValueError``. Otherwise all non-significant imaginary parts that may - remain are removed. + remain are set to zero. This operation is traced with a + ``PositiveSpectrumWarning`` when ``enable_warnings=True``. - that eigenvalues are not all negative. If this check fails, it raises a ``ValueError`` @@ -1044,25 +1045,26 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): - that there are no significant negative eigenvalues with absolute value more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest positive eigenvalue in double (simple) precision. If this check fails, - it raises a ``PositiveSpectrumWarning``. All negative eigenvalues - (even smaller ones) are set to zero in all cases. + it raises a ``ValueError``. Otherwise all negative eigenvalues that may + remain are set to zero. This operation is traced with a + ``PositiveSpectrumWarning`` when ``enable_warnings=True``. - - that the eigenvalues are well conditioned. That means, that the non-zero - eigenvalues are all greater than the maximum eigenvalue divided by 1e12. - If this check fails and ``small_nonzeros_warning=True``, it raises a - ``PositiveSpectrumWarning``. All the eigenvalues that are too small are - then set to zero. 
+ Finally, all the positive eigenvalues that are too small (with a value + smaller than the maximum eigenvalue divided by 1e12) are set to zero. + This operation is traced with a ``PositiveSpectrumWarning`` when + ``enable_warnings=True``. Parameters ---------- lambdas : array-like of shape (n_eigenvalues,) Array of eigenvalues to check / fix. - small_nonzeros_warning : bool, default=False + enable_warnings : bool, default=False When this is set to ``True``, a ``PositiveSpectrumWarning`` will be - raised when there are extremely small non-zero eigenvalues (positive or - negative). Otherwise no warning will be raised in this case. Note that - in both cases, extremely small eigenvalues will be set to zero. + raised when there are imaginary parts, negative eigenvalues, or + extremely small non-zero eigenvalues. Otherwise no warning will be + raised. In both cases, imaginary parts, negative eigenvalues, and + extremely small non-zero eigenvalues will be set to zero. Returns ------- @@ -1076,7 +1078,7 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): >>> _check_psd_eigenvalues([5, 5j]) # significant imag part Traceback (most recent call last): ... - ValueError: There are significant imaginary parts in eigenvalues (1.000000 + ValueError: There are significant imaginary parts in eigenvalues (1 of the maximum real part). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix. >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part @@ -1084,11 +1086,15 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): >>> _check_psd_eigenvalues([-5, -1]) # all negative Traceback (most recent call last): ... - ValueError: All eigenvalues are negative (maximum is -1.000000). Either the + ValueError: All eigenvalues are negative (maximum is -1). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix. >>> _check_psd_eigenvalues([5, -1]) # significant negative - array([5, 0]) + Traceback (most recent call last): + ... + ValueError: There are significant negative eigenvalues (0.2 of the + maximum positive). The matrix is maybe not PSD, or something went wrong + with the eigenvalues decomposition. >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative array([5., 0.]) >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) @@ -1116,49 +1122,60 @@ def _check_psd_eigenvalues(lambdas, small_nonzeros_warning=False): max_real_abs = np.abs(np.real(lambdas)).max() if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( - "There are significant imaginary parts in eigenvalues (%f " - "of the maximum real part). Either the matrix is not PSD, or " - "there was an issue while computing the eigendecomposition of " - "the matrix." + "There are significant imaginary parts in eigenvalues (%g " + "of the maximum real part). The matrix is maybe not PSD, or " + "something went wrong with the eigenvalues decomposition." % (max_imag_abs / max_real_abs)) - # Remove the insignificant imaginary parts - lambdas = np.real(lambdas) + # Remove all imaginary parts and warn about it + if enable_warnings: + warnings.warn("There are imaginary parts in eigenvalues (%g " + "of the maximum real part). Either the matrix is not" + " PSD, or there was an issue while computing the " + "eigendecomposition of the matrix." 
+ % (max_imag_abs / max_real_abs), + PositiveSpectrumWarning) + lambdas = np.real(lambdas) # Check that there are no significant negative eigenvalues max_eig = lambdas.max() if max_eig < 0: - raise ValueError("All eigenvalues are negative (maximum is %f). " + raise ValueError("All eigenvalues are negative (maximum is %g). " "Either the matrix is not PSD, or there was an " "issue while computing the eigendecomposition of " "the matrix." % max_eig) else: min_eig = lambdas.min() - if (min_eig < -significant_neg_ratio * max(max_eig, 0) + if (min_eig < -significant_neg_ratio * max_eig and min_eig < -significant_neg_value): - warnings.warn("There are significant negative eigenvalues " - "(%f of the maximum positive). The matrix is maybe " - "not PSD, or something went wrong with the " - "eigenvalues decomposition. Replacing them with " - "zero." % (-min_eig / max_eig), - PositiveSpectrumWarning) - - # Check for conditioning (both positive and negative small non-zeros) - too_small_lambdas = ((0 != lambdas) - & (np.abs(lambdas) < small_pos_ratio * max_eig)) + raise ValueError("There are significant negative eigenvalues (%g" + " of the maximum positive). Either the matrix is " + "not PSD, or there was an issue while computing " + "the eigendecomposition of the matrix." + % (-min_eig / max_eig)) + elif min_eig < 0: + # Remove all negative values and warn about it + if enable_warnings: + warnings.warn("There are negative eigenvalues (%g of the " + "maximum positive). Either the matrix is not " + "PSD, or there was an issue while computing the" + " eigendecomposition of the matrix." + % (-min_eig / max_eig), + PositiveSpectrumWarning) + lambdas[lambdas < 0] = 0 + + # Check for conditioning (small positive non-zeros) + too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) if too_small_lambdas.any(): - if small_nonzeros_warning: + if enable_warnings: warnings.warn("Badly conditioned PSD matrix spectrum: the largest " - "eigenvalue is more than %.2E times the smallest. " + "eigenvalue is more than %g times the smallest. " "Small eigenvalues will be replaced with 0." "" % (1 / small_pos_ratio), PositiveSpectrumWarning) lambdas[too_small_lambdas] = 0 - # Finally remove all negative values (the big ones were remaining) - lambdas[lambdas < 0] = 0 - return lambdas From 7fdbbd50da95129d9288bbce9d2f8073e503b958 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 22:07:39 +0100 Subject: [PATCH 100/106] Fixed doctest + One of the error messages was still not consistent with the others. Now adopting the same message everywhere. --- sklearn/utils/validation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index fc2e316d5a064..38242f96fbdb6 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1093,8 +1093,8 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): Traceback (most recent call last): ... ValueError: There are significant negative eigenvalues (0.2 of the - maximum positive). The matrix is maybe not PSD, or something went wrong - with the eigenvalues decomposition. + maximum positive). Either the matrix is not PSD, or there was an issue + while computing the eigendecomposition of the matrix. 
>>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative array([5., 0.]) >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) @@ -1123,8 +1123,9 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): if max_imag_abs > significant_imag_ratio * max_real_abs: raise ValueError( "There are significant imaginary parts in eigenvalues (%g " - "of the maximum real part). The matrix is maybe not PSD, or " - "something went wrong with the eigenvalues decomposition." + "of the maximum real part). Either the matrix is not PSD, or " + "there was an issue while computing the eigendecomposition " + "of the matrix." % (max_imag_abs / max_real_abs)) # Remove all imaginary parts and warn about it From d06e71ab670e5971b7fc8b2f6cd1aec127b429b3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 22:08:13 +0100 Subject: [PATCH 101/106] One test case (small pos eigenvals) had been mistakenly removed and not copied back. Added it. --- sklearn/utils/tests/test_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ef869957be7e9..48acffea128da 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -958,6 +958,10 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, PositiveSpectrumWarning, "There are negative eigenvalues \\(1e\\-10 " "of the maximum positive"), + 'insignificant pos': ((5, 4e-12), np.array([5, 0]), + PositiveSpectrumWarning, + "the largest eigenvalue is more than 1e\\+12 " + "times the smallest"), } From b58f1a066a15d5bac29b9e5da6428cca07f2f3a8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 7 Nov 2019 22:22:07 +0100 Subject: [PATCH 102/106] Reverted bad line: `np.real(lambdas)` should also be done if imaginary parts are all zeros, to convert to float dtype --- sklearn/utils/validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 38242f96fbdb6..39a5e42c5f33a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1128,7 +1128,7 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): "of the matrix." % (max_imag_abs / max_real_abs)) - # Remove all imaginary parts and warn about it + # warn about imaginary parts being removed if enable_warnings: warnings.warn("There are imaginary parts in eigenvalues (%g " "of the maximum real part). Either the matrix is not" @@ -1136,7 +1136,9 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): "eigendecomposition of the matrix." % (max_imag_abs / max_real_abs), PositiveSpectrumWarning) - lambdas = np.real(lambdas) + + # Remove all imaginary parts (even if zero) + lambdas = np.real(lambdas) # Check that there are no significant negative eigenvalues max_eig = lambdas.max() From 581aca08e331a1c612887f784c38bb368455f903 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Nov 2019 09:15:57 +0100 Subject: [PATCH 103/106] Two very minor improvements of the warning messages: now all warning messages explicitly state what is happening (setting to zero the imag/negative/small value). 
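For context, a minimal sketch of the resulting behaviour (import path of the
private helper assumed; expected outputs taken from the doctests above):

    from sklearn.utils.validation import _check_psd_eigenvalues

    _check_psd_eigenvalues([5, 5e-5j])   # tiny imaginary part -> array([5., 0.])
    _check_psd_eigenvalues([5, -5e-5])   # tiny negative part  -> array([5., 0.])
    _check_psd_eigenvalues([5, 4e-12])   # tiny positive part  -> array([5., 0.])

With ``enable_warnings=True`` each of these silent fixes instead raises a
``PositiveSpectrumWarning`` whose message now states explicitly that the offending
values are set to zero; significant imaginary parts or significant negative
eigenvalues keep raising ``ValueError``.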
--- sklearn/utils/validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 39a5e42c5f33a..424cf4b5180a3 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1133,7 +1133,8 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): warnings.warn("There are imaginary parts in eigenvalues (%g " "of the maximum real part). Either the matrix is not" " PSD, or there was an issue while computing the " - "eigendecomposition of the matrix." + "eigendecomposition of the matrix. Only the real " + "parts will be kept." % (max_imag_abs / max_real_abs), PositiveSpectrumWarning) @@ -1163,7 +1164,8 @@ def _check_psd_eigenvalues(lambdas, enable_warnings=False): warnings.warn("There are negative eigenvalues (%g of the " "maximum positive). Either the matrix is not " "PSD, or there was an issue while computing the" - " eigendecomposition of the matrix." + " eigendecomposition of the matrix. Negative " + "eigenvalues will be replaced with 0." % (-min_eig / max_eig), PositiveSpectrumWarning) lambdas[lambdas < 0] = 0 From 4d05d0a686eea1babc6b9e0bb4d2f3aa5d7c886c Mon Sep 17 00:00:00 2001 From: smarie Date: Fri, 8 Nov 2019 15:53:45 +0100 Subject: [PATCH 104/106] Update sklearn/utils/tests/test_validation.py Co-Authored-By: Nicolas Hug --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 48acffea128da..56efb98a8b2d8 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -983,7 +983,7 @@ def test_check_psd_eigenvalues_valid(lambdas, expected_lambdas, w_type, w_msg, _check_psd_eigenvalues(lambdas, enable_warnings=enable_warnings), expected_lambdas ) - if w_type is None or not enable_warnings: + if w_type is None: assert not w From a1c64c415e4174b512b9627f801da3c9016ff80e Mon Sep 17 00:00:00 2001 From: smarie Date: Thu, 14 Nov 2019 17:13:33 +0100 Subject: [PATCH 105/106] Update sklearn/decomposition/tests/test_kernel_pca.py --- sklearn/decomposition/tests/test_kernel_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 7d70124611012..c64dcd3078119 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -274,7 +274,8 @@ def test_nested_circles(): def test_kernel_conditioning(): - """Test that ``_check_psd_eigenvalues`` is correctly called.""" + """ Test that ``_check_psd_eigenvalues`` is correctly called (see + :issue:`12140`).""" # create a pathological X leading to small non-zero eigenvalue X = [[5, 1], From b49f6ab7897765d4501394ffbeb6a4ba45e608c6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 14 Nov 2019 17:17:16 +0100 Subject: [PATCH 106/106] Fixed docstring --- sklearn/decomposition/tests/test_kernel_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index c64dcd3078119..39fc16b5ff5fb 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -274,8 +274,8 @@ def test_nested_circles(): def test_kernel_conditioning(): - """ Test that ``_check_psd_eigenvalues`` is correctly called (see - :issue:`12140`).""" + """ Test that 
``_check_psd_eigenvalues`` is correctly called + Non-regression test for issue #12140 (PR #12145)""" # create a pathological X leading to small non-zero eigenvalue X = [[5, 1],