Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distances for agglomerativeclustering #17984

Merged
merged 14 commits into from
Jul 26, 2020
9 changes: 9 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,15 @@ Changelog
`init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by
:user:`Jérémie du Boisberranger <jeremiedbb>`.

- |Feature| :class:`cluster.AgglomerativeClustering` has a new parameter
ogrisel marked this conversation as resolved.
Show resolved Hide resolved
`compute_distances`. When set to `True`, distances between clusters are
computed and stored in the `distances_` attribute even when the parameter
`distance_threshold` is not used. This new parameter is useful to produce
dendrogram visualizations, but introduces a computational and memory
overhead. :pr:`17984` by :user:`Michael Riedmann <mriedmann>`,
:user:`Emilie Delattre <EmilieDel>`, and
:user:`Francesco Casalegno <FrancescoCasalegno>`.

:mod:`sklearn.covariance`
.........................

Expand Down
38 changes: 31 additions & 7 deletions sklearn/cluster/_agglomerative.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,13 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):

.. versionadded:: 0.21

compute_distances : bool, default=False
rth marked this conversation as resolved.
Show resolved Hide resolved
Computes distances between clusters even if `distance_threshold` is not
used. This can be used to produce dendrogram visualizations, but
introduces a computational and memory overhead.

.. versionadded:: 0.24

Attributes
----------
ogrisel marked this conversation as resolved.
Show resolved Hide resolved
n_clusters_ : int
Expand Down Expand Up @@ -776,7 +783,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):

distances_ : array-like of shape (n_nodes-1,)
Distances between nodes in the corresponding place in `children_`.
Only computed if distance_threshold is not None.
Only computed if `distance_threshold` is used or `compute_distances`
is set to `True`.

Examples
--------
Expand All @@ -795,14 +803,16 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
def __init__(self, n_clusters=2, *, affinity="euclidean",
memory=None,
connectivity=None, compute_full_tree='auto',
linkage='ward', distance_threshold=None):
linkage='ward', distance_threshold=None,
compute_distances=False):
self.n_clusters = n_clusters
self.distance_threshold = distance_threshold
self.memory = memory
self.connectivity = connectivity
self.compute_full_tree = compute_full_tree
self.linkage = linkage
self.affinity = affinity
self.compute_distances = compute_distances

def fit(self, X, y=None):
"""Fit the hierarchical clustering from features, or distance matrix.
Expand Down Expand Up @@ -879,7 +889,10 @@ def fit(self, X, y=None):

distance_threshold = self.distance_threshold

return_distance = distance_threshold is not None
return_distance = (
(distance_threshold is not None) or self.compute_distances
)

out = memory.cache(tree_builder)(X, connectivity=connectivity,
n_clusters=n_clusters,
return_distance=return_distance,
Expand All @@ -891,9 +904,11 @@ def fit(self, X, y=None):

if return_distance:
self.distances_ = out[-1]

if self.distance_threshold is not None: # distance_threshold is used
self.n_clusters_ = np.count_nonzero(
self.distances_ >= distance_threshold) + 1
else:
else: # n_clusters is used
self.n_clusters_ = self.n_clusters

# Cut the tree
Expand Down Expand Up @@ -999,6 +1014,13 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):

.. versionadded:: 0.21

compute_distances : bool, default=False
Computes distances between clusters even if `distance_threshold` is not
used. This can be used to produce dendrogram visualizations, but
introduces a computational and memory overhead.

.. versionadded:: 0.24

ogrisel marked this conversation as resolved.
Show resolved Hide resolved
Attributes
----------
n_clusters_ : int
Expand Down Expand Up @@ -1028,7 +1050,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):

distances_ : array-like of shape (n_nodes-1,)
Distances between nodes in the corresponding place in `children_`.
Only computed if distance_threshold is not None.
Only computed if `distance_threshold` is used or `compute_distances`
is set to `True`.

Examples
--------
Expand All @@ -1049,11 +1072,12 @@ def __init__(self, n_clusters=2, *, affinity="euclidean",
memory=None,
connectivity=None, compute_full_tree='auto',
linkage='ward', pooling_func=np.mean,
distance_threshold=None):
distance_threshold=None, compute_distances=False):
super().__init__(
n_clusters=n_clusters, memory=memory, connectivity=connectivity,
compute_full_tree=compute_full_tree, linkage=linkage,
affinity=affinity, distance_threshold=distance_threshold)
affinity=affinity, distance_threshold=distance_threshold,
compute_distances=compute_distances)
self.pooling_func = pooling_func

def fit(self, X, y=None, **params):
Expand Down
31 changes: 31 additions & 0 deletions sklearn/cluster/tests/test_hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,37 @@ def test_zero_cosine_linkage_tree():
assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine')


@pytest.mark.parametrize('n_clusters, distance_threshold',
                         [(None, 0.5), (10, None)])
@pytest.mark.parametrize('compute_distances', [True, False])
@pytest.mark.parametrize('linkage', ["ward", "complete", "average", "single"])
def test_agglomerative_clustering_distances(n_clusters,
                                            compute_distances,
                                            distance_threshold,
                                            linkage):
    """Check when the fitted estimator exposes a `distances_` attribute.

    `distances_` must be present whenever `compute_distances` is True or a
    `distance_threshold` is given, and absent otherwise. When present, it
    must hold one entry per row of `children_`.
    """
    # A 10 x 10 grid yields a connectivity graph over 100 samples, matching
    # the number of rows drawn for X below.
    grid_shape = (10, 10)
    n_samples, n_features = 100, 50

    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    connectivity = grid_to_graph(*grid_shape)

    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    connectivity=connectivity,
                                    linkage=linkage,
                                    distance_threshold=distance_threshold,
                                    compute_distances=compute_distances)
    model.fit(X)

    distances_expected = compute_distances or distance_threshold is not None
    if not distances_expected:
        assert not hasattr(model, 'distances_')
        return

    assert hasattr(model, 'distances_')
    n_merges = model.children_.shape[0]
    n_nodes = n_merges + 1
    assert model.distances_.shape == (n_nodes - 1,)


def test_agglomerative_clustering():
# Check that we obtain the correct number of clusters with
# agglomerative clustering.
Expand Down