ENH add n_components kwarg to SpectralClustering. See #13698 (#13726)

scikit-learn · May 27, 2019 · db48ebc · db48ebc
1 parent f3a6a1a
commit db48ebc
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 8 deletions.
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -57,6 +57,16 @@ Changelog
   ``decision_function_shape='ovr'``, and the number of target classes > 2.
   :pr:`12557` by `Adrin Jalali`_.
 
+
+:mod:`sklearn.cluster`
+......................
+
+- |Enhancement| :class:`cluster.SpectralClustering` now accepts an
+  ``n_components`` parameter. This parameter extends the functionality of the
+  ``SpectralClustering`` class to match that of ``spectral_clustering``.
+  :pr:`13726` by :user:`Shuzhe Xiao <fdas3213>`.
+
+
 Miscellaneous
 .............
 

diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py
@@ -307,6 +307,9 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
         to be installed. It can be faster on very large, sparse problems,
         but may also lead to instabilities.
 
+    n_components : integer, optional, default=n_clusters
+        Number of eigenvectors to use for the spectral embedding.
+
     random_state : int, RandomState instance or None (default)
         A pseudo random number generator used for the initialization of the
         lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by
@@ -387,8 +390,8 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
     >>> clustering # doctest: +NORMALIZE_WHITESPACE
     SpectralClustering(affinity='rbf', assign_labels='discretize', coef0=1,
               degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
-              kernel_params=None, n_clusters=2, n_init=10, n_jobs=None,
-              n_neighbors=10, random_state=0)
+              kernel_params=None, n_clusters=2, n_components=None, n_init=10,
+              n_jobs=None, n_neighbors=10, random_state=0)
 
     Notes
     -----
@@ -425,12 +428,13 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
       https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
     """
 
-    def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
-                 n_init=10, gamma=1., affinity='rbf', n_neighbors=10,
-                 eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1,
-                 kernel_params=None, n_jobs=None):
+    def __init__(self, n_clusters=8, eigen_solver=None, n_components=None,
+                 random_state=None, n_init=10, gamma=1., affinity='rbf',
+                 n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans',
+                 degree=3, coef0=1, kernel_params=None, n_jobs=None):
         self.n_clusters = n_clusters
         self.eigen_solver = eigen_solver
+        self.n_components = n_components
         self.random_state = random_state
         self.n_init = n_init
         self.gamma = gamma
@@ -486,6 +490,7 @@ def fit(self, X, y=None):
         random_state = check_random_state(self.random_state)
         self.labels_ = spectral_clustering(self.affinity_matrix_,
                                            n_clusters=self.n_clusters,
+                                           n_components=self.n_components,
                                            eigen_solver=self.eigen_solver,
                                            random_state=random_state,
                                            n_init=self.n_init,

diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py
@@ -107,8 +107,7 @@ def test_affinities():
     # a dataset that yields a stable eigen decomposition both when built
     # on OSX and Linux
     X, y = make_blobs(n_samples=20, random_state=0,
-                      centers=[[1, 1], [-1, -1]], cluster_std=0.01
-                     )
+                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)
     # nearest neighbors affinity
     sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                             random_state=0)
@@ -204,3 +203,23 @@ def test_spectral_clustering_with_arpack_amg_solvers():
         assert_raises(
             ValueError, spectral_clustering,
             graph, n_clusters=2, eigen_solver='amg', random_state=0)
+
+
+def test_n_components():
+    # Test that n_components defaults to n_clusters and that setting
+    # n_components explicitly can change the result
+    X, y = make_blobs(n_samples=20, random_state=0,
+                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)
+    sp = SpectralClustering(n_clusters=2, random_state=0)
+    labels = sp.fit(X).labels_
+    # set n_components = n_clusters and test that the result is the same
+    labels_same_ncomp = SpectralClustering(n_clusters=2, n_components=2,
+                                           random_state=0).fit(X).labels_
+    # test that n_components=n_clusters by default
+    assert_array_equal(labels, labels_same_ncomp)
+
+    # test that n_components affects the result
+    # n_clusters=8 by default, and set n_components=2
+    labels_diff_ncomp = SpectralClustering(n_components=2,
+                                           random_state=0).fit(X).labels_
+    assert not np.array_equal(labels, labels_diff_ncomp)