
Added distance_threshold parameter to hierarchical clustering #9069

Merged
Commits (29)
- 6c5f957 Distance threshold added to hierarchical clustering (VathsalaAchar, Jun 8, 2017)
- 8ea9afa Changes based on review (VathsalaAchar, Oct 30, 2017)
- e56670d Updates based on review (VathsalaAchar, Dec 6, 2017)
- 03709c8 Updates based on comments (VathsalaAchar, Dec 7, 2017)
- fda248d Documentation for new attribute (VathsalaAchar, Dec 7, 2017)
- c45ec1f Merge remote-tracking branch 'upstream/master' into hierarchical_clus… (adrinjalali, Apr 9, 2019)
- 6efc3ec fix n_components (adrinjalali, Apr 9, 2019)
- 47790a3 move parameter to the end of the list (adrinjalali, Apr 9, 2019)
- 71dd010 add whats_new entry (adrinjalali, Apr 9, 2019)
- 0764729 minor fix on n_clusters_ (adrinjalali, Apr 9, 2019)
- 0649b29 fix tests (adrinjalali, Apr 9, 2019)
- 83600ea fix docstrings (adrinjalali, Apr 9, 2019)
- 43ef071 minor fix (adrinjalali, Apr 9, 2019)
- 7a8fc68 remove assert_true (adrinjalali, Apr 9, 2019)
- b37c183 remove unrelated change (adrinjalali, Apr 9, 2019)
- 38659e4 add a more explicit test (adrinjalali, Apr 10, 2019)
- b17a818 merge upstream/master (adrinjalali, Apr 16, 2019)
- fd44b65 merge upstream/master (adrinjalali, Apr 22, 2019)
- cd6c9aa remove sentence, add to FeatureAgglomeration (adrinjalali, Apr 22, 2019)
- c07fa4d code style change (adrinjalali, Apr 22, 2019)
- ba326ac check compute_full_tree, and docstring fix (adrinjalali, Apr 22, 2019)
- 0b94bef apply more comments (adrinjalali, Apr 22, 2019)
- 680e515 force only one of the parameters to be non-None (adrinjalali, Apr 23, 2019)
- c84f429 merge upstream/master (adrinjalali, Apr 23, 2019)
- 3981918 Merge remote-tracking branch 'upstream/master' into hierarchical_clus… (adrinjalali, Apr 25, 2019)
- 3f1a6be improve docstrings (adrinjalali, Apr 26, 2019)
- 36cd205 merge upstream/master (adrinjalali, Apr 26, 2019)
- 4ae66fd improve tests and apply Nicolas's comments (adrinjalali, Apr 26, 2019)
- 8060020 merge upstream/master (adrinjalali, Apr 28, 2019)
4 changes: 2 additions & 2 deletions doc/modules/clustering.rst
@@ -73,13 +73,13 @@ Overview of clustering methods
- Graph distance (e.g. nearest-neighbor graph)

* - :ref:`Ward hierarchical clustering <hierarchical_clustering>`
- number of clusters
- number of clusters or distance threshold
- Large ``n_samples`` and ``n_clusters``
- Many clusters, possibly connectivity constraints
- Distances between points

* - :ref:`Agglomerative clustering <hierarchical_clustering>`
- number of clusters, linkage type, distance
- number of clusters or distance threshold, linkage type, distance
- Large ``n_samples`` and ``n_clusters``
- Many clusters, possibly connectivity constraints, non Euclidean
distances
5 changes: 5 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -90,6 +90,11 @@ Support for Python 3.4 and below has been officially dropped.
``n_connected_components_``.
:issue:`13427` by :user:`Stephane Couvreur <scouvreur>`.

- |Enhancement| :class:`cluster.AgglomerativeClustering` and
:class:`cluster.FeatureAgglomeration` now accept a ``distance_threshold``
parameter which can be used to find the clusters instead of ``n_clusters``.
:issue:`9069` by :user:`Vathsala Achar <VathsalaAchar>`.

- |Fix| Fixed a bug in :class:`KMeans` where empty clusters weren't correctly
relocated when using sample weights. :issue:`13486`
by :user:`Jérémie du Boisberranger <jeremiedbb>`.
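The whats_new entry above can be sketched in use. A minimal example with made-up data: two well-separated pairs of 1-D points, clustered with the new ``distance_threshold`` parameter instead of a fixed ``n_clusters``.

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Illustrative data: two tight pairs, far apart from each other.
X = np.array([[0.0], [1.0], [10.0], [11.0]])

# With distance_threshold set, n_clusters must be None; the tree is cut
# where merge distances reach the threshold rather than at a fixed count.
clustering = AgglomerativeClustering(n_clusters=None,
                                     distance_threshold=5.0).fit(X)
print(clustering.n_clusters_)  # 2: the pairs merge well below 5, the
                               # final merge is far above it
```

The number of clusters found is then reported in the fitted ``n_clusters_`` attribute introduced by this PR.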
75 changes: 61 additions & 14 deletions sklearn/cluster/hierarchical.py
@@ -711,8 +711,21 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin):
``pooling_func`` has been deprecated in 0.20 and will be removed
in 0.22.

distance_threshold : float (optional)
The distance threshold to cluster at.
NOTE: You should set either ``n_clusters`` or ``distance_threshold``,
NOT both. If the ``distance_threshold`` is set then ``n_clusters`` is
ignored.

.. versionadded:: 0.21

Attributes
----------
n_clusters_ : int
The number of clusters found by the algorithm. If
``distance_threshold=None``, it will be equal to the given
``n_clusters``. Otherwise it is set to the number of reported clusters.

labels_ : array [n_samples]
cluster labels for each point

@@ -739,8 +752,9 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin):
>>> clustering = AgglomerativeClustering().fit(X)
>>> clustering # doctest: +NORMALIZE_WHITESPACE
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
connectivity=None, linkage='ward', memory=None, n_clusters=2,
pooling_func='deprecated')
connectivity=None, distance_threshold=None,
linkage='ward', memory=None, n_clusters=2,
pooling_func='deprecated')
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])

@@ -749,8 +763,10 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin):
def __init__(self, n_clusters=2, affinity="euclidean",
memory=None,
connectivity=None, compute_full_tree='auto',
linkage='ward', pooling_func='deprecated'):
linkage='ward', pooling_func='deprecated',
distance_threshold=None):
self.n_clusters = n_clusters
self.distance_threshold = distance_threshold
self.memory = memory
self.connectivity = connectivity
self.compute_full_tree = compute_full_tree
@@ -788,10 +804,14 @@ def fit(self, X, y=None):
X = check_array(X, ensure_min_samples=2, estimator=self)
memory = check_memory(self.memory)

if self.n_clusters <= 0:
if self.n_clusters is not None and self.n_clusters <= 0:
raise ValueError("n_clusters should be an integer greater than 0."
" %s was provided." % str(self.n_clusters))

if self.n_clusters is None and self.distance_threshold is None:
raise ValueError("n_clusters and distance_threshold cannot be "
"both None.")

if self.linkage == "ward" and self.affinity != "euclidean":
raise ValueError("%s was provided as affinity. Ward can only "
"work with euclidean distances." %
@@ -814,7 +834,7 @@ def fit(self, X, y=None):
compute_full_tree = self.compute_full_tree
if self.connectivity is None:
compute_full_tree = True
if compute_full_tree == 'auto':
if compute_full_tree == 'auto' and self.distance_threshold is None:
# Early stopping is likely to give a speed up only for
# a large number of clusters. The actual threshold
# implemented here is heuristic
@@ -828,14 +848,31 @@ def fit(self, X, y=None):
if self.linkage != 'ward':
kwargs['linkage'] = self.linkage
kwargs['affinity'] = self.affinity
(self.children_, self.n_connected_components_, self.n_leaves_,
parents) = memory.cache(tree_builder)(X, connectivity,
n_clusters=n_clusters,
**kwargs)

distance_threshold = self.distance_threshold
# if distance_threshold is set then distances is returned
if distance_threshold is not None:
ch, n_comps, n_lvs, parents, distances = \
memory.cache(tree_builder)(X, connectivity,
n_clusters=n_clusters,
return_distance=True,
**kwargs)
self.n_clusters_ = np.count_nonzero(
distances >= distance_threshold) + 1
else:
ch, n_comps, n_lvs, parents = \
memory.cache(tree_builder)(X, connectivity,
n_clusters=n_clusters,
**kwargs)
self.n_clusters_ = self.n_clusters

self.children_ = ch
self.n_connected_components_ = n_comps
self.n_leaves_ = n_lvs

# Cut the tree
if compute_full_tree:
self.labels_ = _hc_cut(self.n_clusters, self.children_,
self.labels_ = _hc_cut(self.n_clusters_, self.children_,
self.n_leaves_)
else:
labels = _hierarchical.hc_get_heads(parents, copy=False)
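The branch above derives ``n_clusters_`` from the merge distances returned by the tree builder. A standalone numpy sketch of the same counting rule, with made-up merge heights (not taken from any real tree):

```python
import numpy as np

# Hypothetical merge heights of an agglomerative tree over 5 samples:
# 4 merges, non-decreasing by construction of the tree.
distances = np.array([0.5, 1.2, 3.0, 7.5])
distance_threshold = 2.0

# Every merge at a height >= the threshold is undone, and undoing a merge
# splits one cluster into two, starting from the single root cluster.
n_clusters_ = np.count_nonzero(distances >= distance_threshold) + 1
print(n_clusters_)  # 3: the merges at 3.0 and 7.5 are cut
```

This is why a full tree is needed when ``distance_threshold`` is set: the count depends on all merge heights, not just the first ``n_clusters`` of them.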
@@ -904,6 +941,14 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
value, and should accept an array of shape [M, N] and the keyword
argument `axis=1`, and reduce it to an array of size [M].

distance_threshold : float (optional)
The distance threshold to cluster at.
NOTE: You should set either ``n_clusters`` or ``distance_threshold``,
NOT both. If the ``distance_threshold`` is set then ``n_clusters`` is
ignored.

.. versionadded:: 0.21

Attributes
----------
labels_ : array-like, (n_features,)
@@ -933,8 +978,9 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
>>> agglo = cluster.FeatureAgglomeration(n_clusters=32)
>>> agglo.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
connectivity=None, linkage='ward', memory=None, n_clusters=32,
pooling_func=...)
connectivity=None, distance_threshold=None, linkage='ward',
memory=None, n_clusters=32,
pooling_func=...)
>>> X_reduced = agglo.transform(X)
>>> X_reduced.shape
(1797, 32)
@@ -943,11 +989,12 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
def __init__(self, n_clusters=2, affinity="euclidean",
memory=None,
connectivity=None, compute_full_tree='auto',
linkage='ward', pooling_func=np.mean):
linkage='ward', pooling_func=np.mean,
distance_threshold=None):
super().__init__(
n_clusters=n_clusters, memory=memory, connectivity=connectivity,
compute_full_tree=compute_full_tree, linkage=linkage,
affinity=affinity)
affinity=affinity, distance_threshold=distance_threshold)
self.pooling_func = pooling_func

def fit(self, X, y=None, **params):
100 changes: 100 additions & 0 deletions sklearn/cluster/tests/test_hierarchical.py
@@ -14,6 +14,7 @@
from scipy import sparse
from scipy.cluster import hierarchy

from sklearn.metrics.cluster.supervised import adjusted_rand_score
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
@@ -573,6 +574,21 @@ def test_agg_n_clusters():
assert_raise_message(ValueError, msg, agc.fit, X)


def test_agg_n_cluster_and_distance_threshold():
# Test that when distance_threshold is set that n_clusters is ignored

n_clus, dist_thresh = None, 10
rng = np.random.RandomState(0)
X = rng.rand(20, 10)
agc = AgglomerativeClustering(n_clusters=n_clus,
                              distance_threshold=dist_thresh)

Review comment (Member): Ideally we should check that you get the same behaviour if n_clus is an int.

agc.fit(X)
# Expecting no errors here
assert agc.n_clusters == n_clus
assert agc.n_clusters_ != n_clus
assert agc.n_clusters_ > 0


def test_affinity_passed_to_fix_connectivity():
# Test that the affinity parameter is actually passed to the pairwise
# function
@@ -600,6 +616,90 @@ def increment(self, *args, **kwargs):
assert_equal(fa.counter, 3)


def test_agglomerative_clustering_with_distance_threshold():
# Check that we obtain the correct number of clusters with
# agglomerative clustering with distance_threshold.

rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
# test when distance threshold is set to 10
distance_threshold = 10
for linkage in ("ward", "complete", "average"):
for conn in [None, connectivity]:
clustering = AgglomerativeClustering(
distance_threshold=distance_threshold,
connectivity=conn, linkage=linkage)
clustering.fit(X)
clusters_produced = clustering.labels_
num_clusters_produced = len(np.unique(clustering.labels_))
# test if the clusters produced match the point in the linkage tree
# where the distance exceeds the threshold
tree_builder = _TREE_BUILDERS[linkage]
children, n_components, n_leaves, parent, distances = \
tree_builder(X, connectivity=conn, n_clusters=None,
return_distance=True)
num_clusters_at_threshold = np.count_nonzero(
    distances >= distance_threshold) + 1

Review thread:

jnothman (Member, Nov 9, 2017): Perhaps we should test something more explicit, just to be sure that your logic here is correct, like check that in single linkage, the maximum within-cluster pairwise distance for each sample is under the threshold and the minimum out-of-cluster pairwise distance is greater.

VathsalaAchar (Contributor, Author): Do you mean along the lines of this test? I could use the same dataset to do a more explicit test.

jnothman (Member, Dec 7, 2017): I don't see how it relates to that test. I mean that for some X and some predicted labels:

D = pairwise_distances(X, metric=metric)
for i in range(len(X)):
    in_cluster_mask = labels == labels[i]
    max_in_cluster_distance = D[i, in_cluster_mask].max()
    min_out_cluster_distance = D[i, ~in_cluster_mask].min()
    # XXX: there should be equality on one of these conditions
    assert max_in_cluster_distance < threshold
    assert min_out_cluster_distance > threshold

VathsalaAchar (Contributor, Author): Apologies for the delayed response, but as far as I understand the pairwise distance will give the distance between each point in X, not the distance between the clusters as they join up. The distance between each cluster is in the distances matrix calculated using the scipy.cluster.hierarchy.linkage method. So is there still a need to have an explicit test?

jnothman (Member): True. But surely a similar invariance could be constructed about the average distances with average linkage...? I've not thought about it too rigorously.

VathsalaAchar (Contributor, Author): Although what I said is true only when connectivity is None. If a connectivity matrix is passed in then calculating the clusters would mean deciphering what is happening in the ward_tree and linkage_tree methods, and I'm not sure it's worth the effort. I suppose, but is there a need to do this? I'm really not sure how to do a more explicit test, so I'd really appreciate help with this.

# test number of clusters produced
assert num_clusters_at_threshold == num_clusters_produced
# test clusters produced
clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
children=children,
n_leaves=n_leaves)
assert np.array_equiv(clusters_produced,
clusters_at_threshold)

rng = np.random.RandomState(0)
n_samples = 10
X = rng.randint(-3, 3, size=(n_samples, 3))
# this should result in all data in their own clusters
clustering = AgglomerativeClustering(
distance_threshold=1,
linkage="single").fit(X)
assert len(np.unique(clustering.labels_)) == 10

# check the distances within the clusters and with other clusters
threshold = 2
clustering = AgglomerativeClustering(
distance_threshold=threshold,
linkage="single").fit(X)
labels = clustering.labels_
D = pairwise_distances(X, metric="euclidean")
# to avoid taking the 0 diagonal in min()
np.fill_diagonal(D, np.inf)
for i in np.unique(labels):
in_cluster_mask = labels == i
max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
.min(axis=0).max())
min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
.min(axis=0).min())
# single data point clusters only have that inf diagonal here
if in_cluster_mask.sum() > 1:
assert max_in_cluster_distance < threshold
assert min_out_cluster_distance >= threshold
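The single-linkage invariant the test above checks can be restated on a toy example. A self-contained sketch with assumed labels for two well-separated groups (the labels are hand-written here, not produced by the estimator):

```python
import numpy as np

# Illustrative 1-D points in two separated groups, with the labels single
# linkage would assign at a threshold of 3.0 (assumed for this sketch).
X = np.array([0.0, 1.0, 10.0, 11.0])
labels = np.array([0, 0, 1, 1])
threshold = 3.0

D = np.abs(X[:, None] - X[None, :])   # pairwise distances for 1-D data
np.fill_diagonal(D, np.inf)           # ignore self-distances in the minima

for i in range(len(X)):
    same = labels == labels[i]
    # Single linkage: each point has an in-cluster neighbour closer than
    # the threshold, and its nearest out-of-cluster point is at least as far.
    assert D[i, same].min() < threshold
    assert D[i, ~same].min() >= threshold
```

As the thread above notes, this invariant is specific to single linkage with no connectivity constraint; for other linkages the merge heights are not plain pairwise distances.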


def test_agglomerative_clustering_with_distance_threshold_edge_case():
# test boundary case of distance_threshold matching the distance
X = [[0], [1]]
for linkage in ("ward", "complete", "average"):
for threshold, y_true in [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]:
clusterer = AgglomerativeClustering(distance_threshold=threshold,
linkage=linkage)
y_pred = clusterer.fit_predict(X)
assert_equal(1, adjusted_rand_score(y_true, y_pred))
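The boundary behaviour this test exercises can be run directly. A minimal sketch, assuming merges at distances greater than or equal to the threshold are cut (so a threshold equal to the single pairwise distance of 1.0 still yields two clusters):

```python
from sklearn.cluster import AgglomerativeClustering

X = [[0], [1]]  # exactly one possible merge, at distance 1.0
for threshold, expected in [(1.0, 2), (1.5, 1)]:
    model = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    linkage="complete").fit(X)
    # merges with distance >= threshold are undone, hence the >= boundary
    assert model.n_clusters_ == expected
```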


def test_none_dis_threshold_n_clust():
X = [[0], [1]]
with pytest.raises(ValueError, match="cannot be both None"):
AgglomerativeClustering(n_clusters=None,
distance_threshold=None).fit(X)


def test_n_components_deprecation():
# Test that a Deprecation warning is thrown when n_components_
# attribute is accessed