scikit-learn · ogrisel · Jul 26, 2020 · Apr 12, 2020 · Apr 12, 2020 · Jul 25, 2020
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -70,6 +70,15 @@ Changelog
   `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by
   :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Feature| :class:`cluster.AgglomerativeClustering` has a new parameter
+  `compute_distances`. When set to `True`, distances between clusters are
+  computed and stored in the `distances_` attribute even when the parameter
+  `distance_threshold` is not used. This new parameter is useful to produce
+  dendrogram visualizations, but introduces a computational and memory
+  overhead. :pr:`17984` by :user:`Michael Riedmann <mriedmann>`,
+  :user:`Emilie Delattre <EmilieDel>`, and
+  :user:`Francesco Casalegno <FrancescoCasalegno>`.
+
 :mod:`sklearn.covariance`
 .........................
 

diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
@@ -747,6 +747,13 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
 
         .. versionadded:: 0.21
 
+    compute_distances : bool, default=False
+        Computes distances between clusters even if no `distance_threshold` is
+        used. This can be used to produce dendrogram visualizations, but
+        introduces a computational and memory overhead.
+
+        .. versionadded:: 0.24
+
     Attributes
     ----------
     n_clusters_ : int
@@ -776,7 +783,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
 
     distances_ : array-like of shape (n_nodes-1,)
         Distances between nodes in the corresponding place in `children_`.
-        Only computed if distance_threshold is not None.
+        Only computed if `distance_threshold` is used or `compute_distances`
+        is set to `True`.
 
     Examples
     --------
@@ -795,14 +803,16 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
     def __init__(self, n_clusters=2, *, affinity="euclidean",
                  memory=None,
                  connectivity=None, compute_full_tree='auto',
-                 linkage='ward', distance_threshold=None):
+                 linkage='ward', distance_threshold=None,
+                 compute_distances=False):
         self.n_clusters = n_clusters
         self.distance_threshold = distance_threshold
         self.memory = memory
         self.connectivity = connectivity
         self.compute_full_tree = compute_full_tree
         self.linkage = linkage
         self.affinity = affinity
+        self.compute_distances = compute_distances
 
     def fit(self, X, y=None):
         """Fit the hierarchical clustering from features, or distance matrix.
@@ -879,7 +889,10 @@ def fit(self, X, y=None):
 
         distance_threshold = self.distance_threshold
 
-        return_distance = distance_threshold is not None
+        return_distance = (
+            (distance_threshold is not None) or self.compute_distances
+        )
+
         out = memory.cache(tree_builder)(X, connectivity=connectivity,
                                          n_clusters=n_clusters,
                                          return_distance=return_distance,
@@ -891,9 +904,11 @@ def fit(self, X, y=None):
 
         if return_distance:
             self.distances_ = out[-1]
+
+        if self.distance_threshold is not None:  # distance_threshold is used
             self.n_clusters_ = np.count_nonzero(
                 self.distances_ >= distance_threshold) + 1
-        else:
+        else:  # n_clusters is used
             self.n_clusters_ = self.n_clusters
 
         # Cut the tree
@@ -999,6 +1014,13 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
 
         .. versionadded:: 0.21
 
+    compute_distances : bool, default=False
+        Computes distances between clusters even if no `distance_threshold` is
+        used. This can be used to produce dendrogram visualizations, but
+        introduces a computational and memory overhead.
+
+        .. versionadded:: 0.24
+
     Attributes
     ----------
     n_clusters_ : int
@@ -1028,7 +1050,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):
 
     distances_ : array-like of shape (n_nodes-1,)
         Distances between nodes in the corresponding place in `children_`.
-        Only computed if distance_threshold is not None.
+        Only computed if `distance_threshold` is used or `compute_distances`
+        is set to `True`.
 
     Examples
     --------
@@ -1049,11 +1072,12 @@ def __init__(self, n_clusters=2, *, affinity="euclidean",
                  memory=None,
                  connectivity=None, compute_full_tree='auto',
                  linkage='ward', pooling_func=np.mean,
-                 distance_threshold=None):
+                 distance_threshold=None, compute_distances=False):
         super().__init__(
             n_clusters=n_clusters, memory=memory, connectivity=connectivity,
             compute_full_tree=compute_full_tree, linkage=linkage,
-            affinity=affinity, distance_threshold=distance_threshold)
+            affinity=affinity, distance_threshold=distance_threshold,
+            compute_distances=compute_distances)
         self.pooling_func = pooling_func
 
     def fit(self, X, y=None, **params):

diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
@@ -143,6 +143,37 @@ def test_zero_cosine_linkage_tree():
     assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine')
 
 
+@pytest.mark.parametrize('n_clusters, distance_threshold',
+                         [(None, 0.5), (10, None)])
+@pytest.mark.parametrize('compute_distances', [True, False])
+@pytest.mark.parametrize('linkage', ["ward", "complete", "average", "single"])
+def test_agglomerative_clustering_distances(n_clusters,
+                                            compute_distances,
+                                            distance_threshold,
+                                            linkage):
+    # Check that the fitted model has a `distances_` attribute if and only if
+    # `compute_distances` is True or `distance_threshold` is given.
+    rng = np.random.RandomState(0)
+    mask = np.ones([10, 10], dtype=bool)
+    n_samples = 100
+    X = rng.randn(n_samples, 50)
+    connectivity = grid_to_graph(*mask.shape)
+
+    clustering = AgglomerativeClustering(n_clusters=n_clusters,
+                                         connectivity=connectivity,
+                                         linkage=linkage,
+                                         distance_threshold=distance_threshold,
+                                         compute_distances=compute_distances)
+    clustering.fit(X)
+    if compute_distances or (distance_threshold is not None):
+        assert hasattr(clustering, 'distances_')
+        n_children = clustering.children_.shape[0]
+        n_nodes = n_children + 1
+        assert clustering.distances_.shape == (n_nodes-1, )
+    else:
+        assert not hasattr(clustering, 'distances_')
+
+
 def test_agglomerative_clustering():
     # Check that we obtain the correct number of clusters with
     # agglomerative clustering.