From 3d7b5967c788ad07d5273370b2cae04480c6ae58 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 11 Dec 2012 11:31:43 -0500 Subject: [PATCH 01/12] enh: added diffusion embedding sty: removed old code enh: diffusion time parameter enh: separated diffusion embedding into it's own function fix: remove spectral clustering changes fix: reverting old changes fix: change spectral clustering to use new call for spectral_embedding fix: reverting to original drop_first fix: change final place where spectral embedding is called fix: change to private function fix: revert spectral clustering to standard api --- sklearn/cluster/spectral.py | 2 +- sklearn/manifold/__init__.py | 6 +- sklearn/manifold/spectral_embedding_.py | 281 ++++++++++++++++++++++-- 3 files changed, 269 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 8480e836952e2..b9a0bef9850c8 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -157,7 +157,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, def spectral_clustering(affinity, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, - eigen_tol=0.0, assign_labels='kmeans'): + k=None, eigen_tol=0.0, assign_labels='kmeans'): """Apply clustering to a projection to the normalized laplacian. In practice Spectral Clustering is very useful when the structure of diff --git a/sklearn/manifold/__init__.py b/sklearn/manifold/__init__.py index 16de0d3aeb808..6462ae5fedf2f 100644 --- a/sklearn/manifold/__init__.py +++ b/sklearn/manifold/__init__.py @@ -5,7 +5,9 @@ from .locally_linear import locally_linear_embedding, LocallyLinearEmbedding from .isomap import Isomap from .mds import MDS -from .spectral_embedding_ import SpectralEmbedding, spectral_embedding +from .spectral_embedding_ import (SpectralEmbedding, spectral_embedding, + diffusion_embedding, DiffusionEmbedding) __all__ = ['locally_linear_embedding', 'LocallyLinearEmbedding', 'Isomap', - 'MDS', 'SpectralEmbedding', 'spectral_embedding'] + 'MDS', 'SpectralEmbedding', 'spectral_embedding', + 'diffusion_embedding', 'DiffusionEmbedding'] diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 059b746b3268c..1d037930e210d 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -117,10 +117,9 @@ def _set_diag(laplacian, value): return laplacian -def spectral_embedding(adjacency, n_components=8, eigen_solver=None, - random_state=None, eigen_tol=0.0, - norm_laplacian=True, drop_first=True, - mode=None): +def _solve_eigenvalue_problem(adjacency, n_components=1, eigen_solver=None, + random_state=None, eigen_tol=0.0, + norm_laplacian=True, mode=None): """Project the sample on the first eigen vectors of the graph Laplacian. The adjacency matrix is used to compute a normalized graph Laplacian @@ -167,8 +166,12 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, Returns ------- - embedding : array, shape=(n_samples, n_components) - The reduced samples. + lambdas : array, shape=(n_components,) + The eigenvalues of the Laplacian + vectors : array, shape=(n_components, n_samples) + The eigenvectors of the Laplacian + degrees : array, shape=(n_samples,) + The degrees of the graph Notes ----- @@ -209,9 +212,6 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, random_state = check_random_state(random_state) n_nodes = adjacency.shape[0] - # Whether to drop the first eigenvector - if drop_first: - n_components = n_components + 1 # Check that the matrices given is symmetric if ((not sparse.isspmatrix(adjacency) and not np.all((adjacency - adjacency.T) < 1e-10)) or @@ -258,7 +258,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, lambdas, diffusion_map = eigsh(-laplacian, k=n_components, sigma=1.0, which='LM', tol=eigen_tol) - embedding = diffusion_map.T[n_components::-1] * dd + vectors = diffusion_map.T[n_components::-1] except RuntimeError: # When submatrices are exactly singular, an LU decomposition # in arpack fails. We fallback to lobpcg @@ -277,8 +277,8 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, X[:, 0] = dd.ravel() lambdas, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-12, largest=False) - embedding = diffusion_map.T * dd - if embedding.shape[0] == 1: + vectors = diffusion_map.T + if vectors.shape[0] == 1: raise ValueError elif eigen_solver == "lobpcg": @@ -290,7 +290,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, if sparse.isspmatrix(laplacian): laplacian = laplacian.todense() lambdas, diffusion_map = symeig(laplacian) - embedding = diffusion_map.T[:n_components] * dd + vectors = diffusion_map.T[:n_components] else: # lobpcg needs native floats laplacian = laplacian.astype(np.float) @@ -301,13 +301,98 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, X[:, 0] = dd.ravel() lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15, largest=False, maxiter=2000) - embedding = diffusion_map.T[:n_components] * dd - if embedding.shape[0] == 1: + vectors = diffusion_map.T[:n_components] + if vectors.shape[0] == 1: raise ValueError + return lambdas[:n_components], vectors, dd + + +def spectral_embedding(adjacency, n_components=8, eigen_solver=None, + random_state=None, eigen_tol=0.0, + norm_laplacian=True, drop_first=True, + mode=None, diffusion_time=None): + """Project the sample on the first eigen vectors of the graph Laplacian + + The adjacency matrix is used to compute a normalized graph Laplacian + whose spectrum (especially the eigen vectors associated to the + smallest eigen values) has an interpretation in terms of minimal + number of cuts necessary to split the graph into comparably sized + components. + + This embedding can also 'work' even if the ``adjacency`` variable is + not strictly the adjacency matrix of a graph but more generally + an affinity or similarity matrix between samples (for instance the + heat kernel of a euclidean distance matrix or a k-NN matrix). + + However care must taken to always make the affinity matrix symmetric + so that the eigen vector decomposition works as expected. + + Parameters + ---------- + adjacency : array-like or sparse matrix, shape: (n_samples, n_samples) + The adjacency matrix of the graph to embed. + + n_components : integer, optional + The dimension of the projection subspace. + + eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. + + random_state : int seed, RandomState instance, or None (default) + A pseudo random number generator used for the initialization of the + lobpcg eigen vectors decomposition when eigen_solver == 'amg'. + By default, arpack is used. + + eigen_tol : float, optional, default=0.0 + Stopping criterion for eigendecomposition of the Laplacian matrix + when using arpack eigen_solver. + + drop_first : bool, optional, default=True + Whether to drop the first eigenvector. For spectral embedding, this + should be True as the first eigenvector should be constant vector for + connected graph, but for spectral clustering, this should be kept as + False to retain the first eigenvector. + + diffusion_time: float, optional, default=None + Determines the scaling of the eigenvalues of the Laplacian + + Returns + ------- + embedding : array, shape=(n_samples, n_components) + The reduced samples. + + Notes + ----- + Spectral embedding is most useful when the graph has one connected + component. If there graph has many components, the first few eigenvectors + will simply uncover the connected components of the graph. + + References + ---------- + * http://en.wikipedia.org/wiki/LOBPCG + + * Toward the Optimal Preconditioned Eigensolver: Locally Optimal + Block Preconditioned Conjugate Gradient Method + Andrew V. Knyazev + http://dx.doi.org/10.1137%2FS1064827500366124 + """ + # Whether to drop the first eigenvector if drop_first: - return embedding[1:n_components].T + n_components += 1 + _, vectors, dd = _solve_eigenvalue_problem(adjacency=adjacency, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, + norm_laplacian=norm_laplacian, + mode=mode) + embedding = vectors * dd + if drop_first: + return embedding[1:].T else: - return embedding[:n_components].T + return embedding.T class SpectralEmbedding(BaseEstimator, TransformerMixin): @@ -486,3 +571,165 @@ def fit_transform(self, X, y=None): """ self.fit(X) return self.embedding_ + + +def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, + eigen_solver=None, random_state=None, eigen_tol=0.0, + norm_laplacian=True, drop_first=True): + """Project the sample on the first eigenvectors of the graph Laplacian + + The adjacency matrix is used to compute a normalized graph Laplacian + whose spectrum (especially the eigen vectors associated to the + smallest eigen values) has an interpretation in terms of minimal + number of cuts necessary to split the graph into comparably sized + components. + + This embedding can also 'work' even if the ``adjacency`` variable is + not strictly the adjacency matrix of a graph but more generally + an affinity or similarity matrix between samples (for instance the + heat kernel of a euclidean distance matrix or a k-NN matrix). + + However care must taken to always make the affinity matrix symmetric + so that the eigen vector decomposition works as expected. + + Parameters + ---------- + adjacency : array-like or sparse matrix, shape: (n_samples, n_samples) + The adjacency matrix of the graph to embed. + + n_components : integer, optional + The dimension of the projection subspace. + + diffusion_time: float, optional, default=None + Determines the scaling of the eigenvalues of the Laplacian + + eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. + + random_state : int seed, RandomState instance, or None (default) + A pseudo random number generator used for the initialization of the + lobpcg eigen vectors decomposition when eigen_solver == 'amg'. + By default, arpack is used. + + eigen_tol : float, optional, default=0.0 + Stopping criterion for eigendecomposition of the Laplacian matrix + when using arpack eigen_solver. + + drop_first : bool, optional, default=True + Whether to drop the first eigenvector. For spectral embedding, this + should be True as the first eigenvector should be constant vector for + connected graph, but for spectral clustering, this should be kept as + False to retain the first eigenvector. + + Returns + ------- + embedding : array, shape=(n_samples, n_components) + The reduced samples. + + Notes + ----- + Diffusion embedding is most useful when the graph has one connected + component. If there graph has many components, the first few eigenvectors + will simply uncover the connected components of the graph. + + References + ---------- + + - Lafon, Stephane, and Ann B. Lee. "Diffusion maps and coarse-graining: A + unified framework for dimensionality reduction, graph partitioning, and + data set parameterization." Pattern Analysis and Machine Intelligence, + IEEE Transactions on 28.9 (2006): 1393-1403. + - Coifman, Ronald R., and Stephane Lafon. Diffusion maps. Applied and + Computational Harmonic Analysis 21.1 (2006): 5-30. + + """ + # Whether to drop the first eigenvector + if drop_first: + n_components += 1 + lambdas, vectors, dd = _solve_eigenvalue_problem(adjacency=adjacency, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, + norm_laplacian=norm_laplacian) + embedding = vectors * dd + if diffusion_time is not None: + norm_factor = np.sqrt(np.sum(dd)) + psi = (norm_factor * np.diag(1. / (dd ** 0.5))).dot(vectors) + Lt = np.power(1 + lambdas, diffusion_time) + embedding = psi * Lt + if drop_first: + return embedding[1:].T + else: + return embedding.T + + +class DiffusionEmbedding(SpectralEmbedding): + """Diffusion embedding for nonlinear dimensionality reduction + + Diffusion embedding adds an extra parameter `diffusion_time` to spectral + embedding. + + Parameters + ----------- + diffusion_time : float + Determines the scaling of the eigenvalues of the Laplacian + + For all other parameters see `SpectralEmbedding` + + References + ---------- + + - Lafon, Stephane, and Ann B. Lee. "Diffusion maps and coarse-graining: A + unified framework for dimensionality reduction, graph partitioning, and + data set parameterization." Pattern Analysis and Machine Intelligence, + IEEE Transactions on 28.9 (2006): 1393-1403. + - Coifman, Ronald R., and Stephane Lafon. Diffusion maps. Applied and + Computational Harmonic Analysis 21.1 (2006): 5-30. + + """ + + def __init__(self, diffusion_time=0., **kwargs): + super(DiffusionEmbedding, self).__init__(**kwargs) + self.diffusion_time = diffusion_time + + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples in the number of samples + and n_features is the number of features. + + If affinity is "precomputed" + X : array-like, shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. + + Returns + ------- + self : object + Returns the instance itself. + """ + self.random_state = check_random_state(self.random_state) + if isinstance(self.affinity, basestring): + if self.affinity not in set(("nearest_neighbors", "rbf", + "precomputed")): + raise ValueError(("%s is not a valid affinity. Expected " + "'precomputed', 'rbf', 'nearest_neighbors' " + "or a callable.") % self.affinity) + elif not hasattr(self.affinity, "__call__"): + raise ValueError(("'affinity' is expected to be an an affinity " + "name or a callable. Got: %s") % self.affinity) + + affinity_matrix = self._get_affinity_matrix(X) + self.embedding_ = diffusion_embedding(affinity_matrix, + n_components=self.n_components, + eigen_solver=self.eigen_solver, + random_state=self.random_state, + diffusion_time=self.diffusion_time, + drop_first=False) + return self From 3f80f83c9483a95b1b9c2e1c89faae4d7ba1116d Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 11 Jan 2014 15:34:27 -0500 Subject: [PATCH 02/12] fix: remove kwargs --- sklearn/manifold/spectral_embedding_.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 1d037930e210d..137c302ff249a 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -691,8 +691,15 @@ class DiffusionEmbedding(SpectralEmbedding): """ - def __init__(self, diffusion_time=0., **kwargs): - super(DiffusionEmbedding, self).__init__(**kwargs) + def __init__(self, diffusion_time=0., n_components=2, + affinity="nearest_neighbors", gamma=None, random_state=None, + eigen_solver=None, n_neighbors=None): + super(DiffusionEmbedding, self).__init__(n_components=n_components, + affinity=affinity, + gamma=gamma, + random_state=random_state, + eigen_solver=eigen_solver, + n_neighbors=n_neighbors) self.diffusion_time = diffusion_time def fit(self, X, y=None): From 0c8aac608be12f60cb30bb2111f8168beabfb75a Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 11 Jan 2014 15:37:19 -0500 Subject: [PATCH 03/12] fix: reverting the spectral clustering api to current master --- sklearn/cluster/spectral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index b9a0bef9850c8..8480e836952e2 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -157,7 +157,7 @@ def discretize(vectors, copy=True, max_svd_restarts=30, n_iter_max=20, def spectral_clustering(affinity, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, - k=None, eigen_tol=0.0, assign_labels='kmeans'): + eigen_tol=0.0, assign_labels='kmeans'): """Apply clustering to a projection to the normalized laplacian. In practice Spectral Clustering is very useful when the structure of From 5f48d8c5b077af30b0f07358bea4f82b39391cd9 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sat, 11 Jan 2014 17:11:42 -0500 Subject: [PATCH 04/12] fix: properly ordered lambdas and taking care of diffusion time is None --- sklearn/manifold/spectral_embedding_.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 137c302ff249a..b87218130d50a 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -259,6 +259,7 @@ def _solve_eigenvalue_problem(adjacency, n_components=1, eigen_solver=None, sigma=1.0, which='LM', tol=eigen_tol) vectors = diffusion_map.T[n_components::-1] + lambdas = lambdas[n_components::-1] except RuntimeError: # When submatrices are exactly singular, an LU decomposition # in arpack fails. We fallback to lobpcg @@ -654,16 +655,17 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, random_state=random_state, eigen_tol=eigen_tol, norm_laplacian=norm_laplacian) - embedding = vectors * dd - if diffusion_time is not None: - norm_factor = np.sqrt(np.sum(dd)) - psi = (norm_factor * np.diag(1. / (dd ** 0.5))).dot(vectors) + norm_factor = np.sqrt(np.sum(dd)) + psi = (norm_factor * np.diag(1. / (np.sqrt(dd)))).dot(vectors.T) + if diffusion_time is None: + embedding = psi + else: Lt = np.power(1 + lambdas, diffusion_time) embedding = psi * Lt if drop_first: - return embedding[1:].T + return embedding[:, 1:] else: - return embedding.T + return embedding class DiffusionEmbedding(SpectralEmbedding): From 7d1e6b5acafdedbcde79186d82971e6a04b52163 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Sun, 12 Jan 2014 14:59:04 -0500 Subject: [PATCH 05/12] fix: random state check --- sklearn/manifold/spectral_embedding_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index b87218130d50a..19c8aea968821 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -723,7 +723,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self.random_state = check_random_state(self.random_state) + random_state = check_random_state(self.random_state) if isinstance(self.affinity, basestring): if self.affinity not in set(("nearest_neighbors", "rbf", "precomputed")): @@ -738,7 +738,7 @@ def fit(self, X, y=None): self.embedding_ = diffusion_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, - random_state=self.random_state, + random_state=random_state, diffusion_time=self.diffusion_time, drop_first=False) return self From f27bf7a294aabca2b981ba2179e842dd68107f3c Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Thu, 23 Jan 2014 08:53:45 -0500 Subject: [PATCH 06/12] trying out clustering --- sklearn/cluster/__init__.py | 3 +- sklearn/cluster/spectral.py | 112 +++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index 26ddb689ee0ac..67f6d611f32e5 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -3,7 +3,7 @@ algorithms. """ -from .spectral import spectral_clustering, SpectralClustering +from .spectral import spectral_clustering, SpectralClustering, diffusion_clustering from .mean_shift_ import mean_shift, MeanShift, estimate_bandwidth, \ get_bin_seeds from .affinity_propagation_ import affinity_propagation, AffinityPropagation @@ -28,6 +28,7 @@ 'k_means', 'mean_shift', 'spectral_clustering', + 'diffusion_clustering', 'ward_tree', 'SpectralBiclustering', 'SpectralCoclustering'] diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 8480e836952e2..14770d890b193 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -14,7 +14,7 @@ from ..utils.extmath import norm from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph -from ..manifold import spectral_embedding +from ..manifold import spectral_embedding, diffusion_embedding from .k_means_ import k_means @@ -450,3 +450,113 @@ def fit(self, X): @property def _pairwise(self): return self.affinity == "precomputed" + + +def diffusion_clustering(affinity, n_clusters=8, n_components=None, + eigen_solver=None, random_state=None, n_init=10, + eigen_tol=0.0, assign_labels='kmeans', + diffusion_time=None): + """Apply clustering to a projection to the normalized laplacian. + + In practice Spectral Clustering is very useful when the structure of + the individual clusters is highly non-convex or more generally when + a measure of the center and spread of the cluster is not a suitable + description of the complete cluster. For instance when clusters are + nested circles on the 2D plan. + + If affinity is the adjacency matrix of a graph, this method can be + used to find normalized graph cuts. + + Parameters + ----------- + affinity: array-like or sparse matrix, shape: (n_samples, n_samples) + The affinity matrix describing the relationship of the samples to + embed. **Must be symmetric**. + + Possible examples: + - adjacency matrix of a graph, + - heat kernel of the pairwise distance matrix of the samples, + - symmetric k-nearest neighbours connectivity matrix of the samples. + + n_clusters: integer, optional + Number of clusters to extract. + + n_components: integer, optional, default is k + Number of eigen vectors to use for the spectral embedding + + eigen_solver: {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities + + random_state: int seed, RandomState instance, or None (default) + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when eigen_solver == 'amg' + and by the K-Means initialization. + + n_init: int, optional, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + eigen_tol : float, optional, default: 0.0 + Stopping criterion for eigendecomposition of the Laplacian matrix + when using arpack eigen_solver. + + assign_labels : {'kmeans', 'discretize'}, default: 'kmeans' + The strategy to use to assign labels in the embedding + space. There are two ways to assign labels after the laplacian + embedding. k-means can be applied and is a popular choice. But it can + also be sensitive to initialization. Discretization is another + approach which is less sensitive to random initialization. See + the 'Multiclass spectral clustering' paper referenced below for + more details on the discretization approach. + + Returns + ------- + labels: array of integers, shape: n_samples + The labels of the clusters. + + References + ---------- + + - Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 + + - A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323 + + - Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi + http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf + + Notes + ------ + The graph should contain only one connect component, elsewhere + the results make little sense. + + This algorithm solves the normalized cut for k=2: it is a + normalized spectral clustering. + """ + if not assign_labels in ('kmeans', 'discretize'): + raise ValueError("The 'assign_labels' parameter should be " + "'kmeans' or 'discretize', but '%s' was given" + % assign_labels) + + random_state = check_random_state(random_state) + n_components = n_clusters if n_components is None else n_components + maps = diffusion_embedding(affinity, n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, drop_first=False, + diffusion_time=diffusion_time) + + if assign_labels == 'kmeans': + _, labels, _ = k_means(maps, n_clusters, random_state=random_state, + n_init=n_init) + else: + labels = discretize(maps, random_state=random_state) + + return labels From 739f128651b86fad3961c4825e2a6b24a0121dfd Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Mon, 24 Feb 2014 14:18:22 -0500 Subject: [PATCH 07/12] fix: drop the first eigenvector --- sklearn/manifold/spectral_embedding_.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 19c8aea968821..316d5dacb48f4 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -228,6 +228,7 @@ def _solve_eigenvalue_problem(adjacency, n_components=1, eigen_solver=None, laplacian, dd = graph_laplacian(adjacency, normed=norm_laplacian, return_diag=True) + if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and (not sparse.isspmatrix(laplacian) @@ -740,5 +741,5 @@ def fit(self, X, y=None): eigen_solver=self.eigen_solver, random_state=random_state, diffusion_time=self.diffusion_time, - drop_first=False) + drop_first=True) return self From 4d84948c04fa5be617237f3278b3dff490e13955 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 16 Sep 2014 21:11:30 -0400 Subject: [PATCH 08/12] fix: remove method options --- sklearn/cluster/spectral.py | 6 +-- sklearn/manifold/spectral_embedding_.py | 68 ++++++++++++++----------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index d35e0bdc03570..90d99419eae1c 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -454,10 +454,10 @@ def _pairwise(self): return self.affinity == "precomputed" -def diffusion_clustering(affinity, n_clusters=8, n_components=None, +def diffusion_clustering(affinity, n_clusters=8, n_components='auto', eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans', - diffusion_time=None): + diffusion_time=None, method=1): """Apply clustering to a projection to the normalized laplacian. In practice Spectral Clustering is very useful when the structure of @@ -552,7 +552,7 @@ def diffusion_clustering(affinity, n_clusters=8, n_components=None, maps = diffusion_embedding(affinity, n_components=n_components, eigen_solver=eigen_solver, random_state=random_state, - eigen_tol=eigen_tol, drop_first=False, + eigen_tol=eigen_tol, diffusion_time=diffusion_time) if assign_labels == 'kmeans': diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 338c0b7050f60..4d106a4e89bed 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -571,7 +571,7 @@ def fit_transform(self, X, y=None): def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, eigen_solver=None, random_state=None, eigen_tol=0.0, - norm_laplacian=True, drop_first=True): + norm_laplacian=True): """Project the sample on the first eigenvectors of the graph Laplacian The adjacency matrix is used to compute a normalized graph Laplacian @@ -613,12 +613,6 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, Stopping criterion for eigendecomposition of the Laplacian matrix when using arpack eigen_solver. - drop_first : bool, optional, default=True - Whether to drop the first eigenvector. For spectral embedding, this - should be True as the first eigenvector should be constant vector for - connected graph, but for spectral clustering, this should be kept as - False to retain the first eigenvector. - Returns ------- embedding : array, shape=(n_samples, n_components) @@ -641,26 +635,32 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, Computational Harmonic Analysis 21.1 (2006): 5-30. """ - # Whether to drop the first eigenvector - if drop_first: - n_components += 1 - lambdas, vectors, dd = _solve_eigenvalue_problem(adjacency=adjacency, - n_components=n_components, - eigen_solver=eigen_solver, - random_state=random_state, - eigen_tol=eigen_tol, - norm_laplacian=norm_laplacian) - norm_factor = np.sqrt(np.sum(dd)) - psi = (norm_factor * np.diag(1. / (np.sqrt(dd)))).dot(vectors.T) - if diffusion_time is None: - embedding = psi + + if not _graph_is_connected(adjacency): + warnings.warn("Graph is not fully connected, spectral embedding" + " may not work as expected.") + ndim = adjacency.shape[0] + v = np.sqrt(np.sum(adjacency, axis=1)) + A = adjacency/(v[:, None] * v[None, :]) + A = np.squeeze(A * [A > 1e-5]) + if n_components is not None: + lambdas, vectors = eigsh(A, k=n_components) else: - Lt = np.power(1 + lambdas, diffusion_time) - embedding = psi * Lt - if drop_first: - return embedding[:, 1:] + lambdas, vectors = eigsh(A, k=max(2, int(np.sqrt(ndim/2)))) + lambdas = lambdas[::-1] + vectors = vectors[:, ::-1] + + psi = vectors/vectors[:, 0][:, None] + if diffusion_time <= 0: + lambdas = lambdas[1:] / (1 - lambdas[1:]) else: - return embedding + lambdas = lambdas[1:]**diffusion_time + if n_components is None: + lambda_ratio = lambdas/lambdas[0] + n_components = np.amin(np.nonzero(lambda_ratio < .05)[0]) + n_components = min(n_components, ndim) + embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :] + return embedding class DiffusionEmbedding(SpectralEmbedding): @@ -688,7 +688,7 @@ class DiffusionEmbedding(SpectralEmbedding): """ - def __init__(self, diffusion_time=0., n_components=2, + def __init__(self, diffusion_time=0., n_components='auto', affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None): super(DiffusionEmbedding, self).__init__(n_components=n_components, @@ -721,19 +721,25 @@ def fit(self, X, y=None): random_state = check_random_state(self.random_state) if isinstance(self.affinity, basestring): if self.affinity not in set(("nearest_neighbors", "rbf", - "precomputed")): - raise ValueError(("%s is not a valid affinity. Expected " + "precomputed", "markov")): + raise ValueError(("%s is not a valid affinity. Expected 'markov', " "'precomputed', 'rbf', 'nearest_neighbors' " "or a callable.") % self.affinity) elif not hasattr(self.affinity, "__call__"): raise ValueError(("'affinity' is expected to be an an affinity " "name or a callable. Got: %s") % self.affinity) - affinity_matrix = self._get_affinity_matrix(X) + if self.affinity == 'markov': + from ..metrics import pairwise_distances + D = pairwise_distances(X) + k = int(max(2, np.round(D.shape[0] * 0.01))) + eps = 2 * np.median(np.sort(D, axis=0)[k+1, :])**2 + affinity_matrix = np.exp(-(D * D) / eps) + else: + affinity_matrix = self._get_affinity_matrix(X) self.embedding_ = diffusion_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state, - diffusion_time=self.diffusion_time, - drop_first=True) + diffusion_time=self.diffusion_time) return self From 625e2cc02e6ccb48136f25a1972c80df73259352 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 16 Sep 2014 21:59:29 -0400 Subject: [PATCH 09/12] fix: a few fixes for components --- sklearn/cluster/spectral.py | 15 ++++++++------- sklearn/manifold/spectral_embedding_.py | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 90d99419eae1c..dcbf8d656bd7b 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -454,10 +454,10 @@ def _pairwise(self): return self.affinity == "precomputed" -def diffusion_clustering(affinity, n_clusters=8, n_components='auto', +def diffusion_clustering(affinity, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans', - diffusion_time=None, method=1): + diffusion_time=0): """Apply clustering to a projection to the normalized laplacian. In practice Spectral Clustering is very useful when the structure of @@ -549,11 +549,12 @@ def diffusion_clustering(affinity, n_clusters=8, n_components='auto', random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components - maps = diffusion_embedding(affinity, n_components=n_components, - eigen_solver=eigen_solver, - random_state=random_state, - eigen_tol=eigen_tol, - diffusion_time=diffusion_time) + maps = diffusion_embedding(affinity, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, + diffusion_time=diffusion_time) if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters, random_state=random_state, diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 4d106a4e89bed..6e0242ba15e33 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -643,6 +643,8 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, v = np.sqrt(np.sum(adjacency, axis=1)) A = adjacency/(v[:, None] * v[None, :]) A = np.squeeze(A * [A > 1e-5]) + print A.shape + print n_components if n_components is not None: lambdas, vectors = eigsh(A, k=n_components) else: From 3b3b739e343828ecf671ff35c9ce5306969fff61 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 16 Sep 2014 22:02:02 -0400 Subject: [PATCH 10/12] sty: remove debug prints --- sklearn/manifold/spectral_embedding_.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 6e0242ba15e33..4d106a4e89bed 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -643,8 +643,6 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, v = np.sqrt(np.sum(adjacency, axis=1)) A = adjacency/(v[:, None] * v[None, :]) A = np.squeeze(A * [A > 1e-5]) - print A.shape - print n_components if n_components is not None: lambdas, vectors = eigsh(A, k=n_components) else: From 4c9619376e39aa80e1ff5af06cdf6c977665ec5e Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 16 Sep 2014 22:26:32 -0400 Subject: [PATCH 11/12] ensure an extra component is added because the first component is dropped --- sklearn/manifold/spectral_embedding_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 4d106a4e89bed..938a37f4c96cb 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -644,7 +644,7 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, A = adjacency/(v[:, None] * v[None, :]) A = np.squeeze(A * [A > 1e-5]) if n_components is not None: - lambdas, vectors = eigsh(A, k=n_components) + lambdas, vectors = eigsh(A, k=n_components + 1) else: lambdas, vectors = eigsh(A, k=max(2, int(np.sqrt(ndim/2)))) lambdas = lambdas[::-1] @@ -688,7 +688,7 @@ class DiffusionEmbedding(SpectralEmbedding): """ - def __init__(self, diffusion_time=0., n_components='auto', + def __init__(self, diffusion_time=0., n_components=None, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None): super(DiffusionEmbedding, self).__init__(n_components=n_components, From 50fda64bf636ceccc6ab110ff5bcd39994f0f7e1 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Tue, 17 Mar 2015 11:03:35 -0400 Subject: [PATCH 12/12] enh: added new metrics and improved embedding evaluation --- sklearn/manifold/spectral_embedding_.py | 69 ++++++++++++++++++------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index 938a37f4c96cb..0eec7676bf8e8 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -17,7 +17,7 @@ from ..utils.validation import check_array from ..utils.graph import graph_laplacian from ..utils.sparsetools import connected_components -from ..utils.arpack import eigsh +from ..utils.arpack import eigsh, eigs from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph @@ -639,26 +639,45 @@ def diffusion_embedding(adjacency, n_components=8, diffusion_time=None, if not _graph_is_connected(adjacency): warnings.warn("Graph is not fully connected, spectral embedding" " may not work as expected.") - ndim = adjacency.shape[0] - v = np.sqrt(np.sum(adjacency, axis=1)) - A = adjacency/(v[:, None] * v[None, :]) - A = np.squeeze(A * [A > 1e-5]) + K = sparse.csr_matrix(adjacency) + ndim = K.shape[0] + v = np.array(np.sqrt(K.sum(axis=1))).flatten() + A = K.copy() + del K + A.data /= v[A.indices] + A = sparse.csr_matrix(A.transpose().toarray()) + A.data /= v[A.indices] + A = sparse.csr_matrix(A.transpose().toarray()) + + func = eigs if n_components is not None: - lambdas, vectors = eigsh(A, k=n_components + 1) + lambdas, vectors = func(A, k=n_components + 1) else: - lambdas, vectors = eigsh(A, k=max(2, int(np.sqrt(ndim/2)))) - lambdas = lambdas[::-1] - vectors = vectors[:, ::-1] + lambdas, vectors = func(A, k=max(2, int(np.sqrt(ndim)))) + del A - psi = vectors/vectors[:, 0][:, None] + if func == eigsh: + lambdas = lambdas[::-1] + vectors = vectors[:, ::-1] + else: + lambdas = np.real(lambdas) + vectors = np.real(vectors) + lambda_idx = np.argsort(lambdas)[::-1] + lambdas = lambdas[lambda_idx] + vectors = vectors[:, lambda_idx] + + psi = vectors/vectors[:, [0]] if diffusion_time <= 0: lambdas = lambdas[1:] / (1 - lambdas[1:]) else: - lambdas = lambdas[1:]**diffusion_time + lambdas = lambdas[1:] ** float(diffusion_time) + lambda_ratio = lambdas/lambdas[0] + threshold = max(0.05, lambda_ratio[-1]) + + n_components_auto = np.amax(np.nonzero(lambda_ratio > threshold)[0]) + n_components_auto = min(n_components_auto, ndim) if n_components is None: - lambda_ratio = lambdas/lambdas[0] - n_components = np.amin(np.nonzero(lambda_ratio < .05)[0]) - n_components = min(n_components, ndim) + n_components = n_components_auto embedding = psi[:, 1:(n_components + 1)] * lambdas[:n_components][None, :] return embedding @@ -720,7 +739,7 @@ def fit(self, X, y=None): """ random_state = check_random_state(self.random_state) if isinstance(self.affinity, basestring): - if self.affinity not in set(("nearest_neighbors", "rbf", + if self.affinity not in set(("nearest_neighbors", "rbf", "cauchy", "precomputed", "markov")): raise ValueError(("%s is not a valid affinity. Expected 'markov', " "'precomputed', 'rbf', 'nearest_neighbors' " @@ -729,14 +748,24 @@ def fit(self, X, y=None): raise ValueError(("'affinity' is expected to be an an affinity " "name or a callable. Got: %s") % self.affinity) - if self.affinity == 'markov': + from ..decomposition import RandomizedPCA + pca = RandomizedPCA(n_components=self.n_components, + random_state=random_state) + #X = pca.fit_transform(X) + eps = self.gamma + if self.affinity in ['markov', 'cauchy']: from ..metrics import pairwise_distances - D = pairwise_distances(X) - k = int(max(2, np.round(D.shape[0] * 0.01))) - eps = 2 * np.median(np.sort(D, axis=0)[k+1, :])**2 - affinity_matrix = np.exp(-(D * D) / eps) + D = pairwise_distances(X, metric='euclidean') #, squared=True) + if eps is None: + k = int(max(2, np.round(D.shape[0] * 0.01))) + eps = 2 * np.median(np.sort(D, axis=0)[k+1, :])**2 + if self.affinity == 'markov': + affinity_matrix = np.exp(-(D * D) / eps) + elif self.affinity == 'cauchy': + affinity_matrix = 1./(D * D + eps) else: affinity_matrix = self._get_affinity_matrix(X) + self.eps_ = eps self.embedding_ = diffusion_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver,