diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 188e52a27d925..7be27894abe0f 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -224,8 +224,15 @@ Changelog to return root mean squared error. :pr:`13467` by :user:`Urvang Patel <urvang96>`. +:mod:`sklearn.metrics` +...................... + +- |Fix| Raise a ValueError in :func:`metrics.silhouette_score` when a + precomputed distance matrix contains non-zero diagonal entries. + :pr:`12258` by :user:`Stephen Tierney <sjtrny>`. + :mod:`sklearn.model_selection` -............................... +.............................. - |Enhancement| :class:`model_selection.learning_curve` now accepts parameter ``return_times`` which can be used to retrieve computation times in order to diff --git a/sklearn/metrics/cluster/tests/test_unsupervised.py b/sklearn/metrics/cluster/tests/test_unsupervised.py index 02a4e85501e77..8e88247db7f0b 100644 --- a/sklearn/metrics/cluster/tests/test_unsupervised.py +++ b/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -168,6 +168,22 @@ def test_non_numpy_labels(): silhouette_score(list(X), list(y)) == silhouette_score(X, y)) +def test_silhouette_nonzero_diag(): + # Construct a zero-diagonal matrix + dists = pairwise_distances( + np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]]).transpose()) + + # Construct a nonzero-diagonal distance matrix + diag_dists = dists.copy() + np.fill_diagonal(diag_dists, 1) + + labels = [0, 0, 0, 1, 1, 1] + + assert_raise_message(ValueError, "distance matrix contains non-zero", + silhouette_samples, + diag_dists, labels, metric='precomputed') + + def assert_raises_on_only_one_label(func): """Assert message when there is only one label""" rng = np.random.RandomState(seed=0) diff --git a/sklearn/metrics/cluster/unsupervised.py b/sklearn/metrics/cluster/unsupervised.py index 05206ab42a291..0e12c06b41799 100644 --- a/sklearn/metrics/cluster/unsupervised.py +++ b/sklearn/metrics/cluster/unsupervised.py @@ -185,7 +185,8 @@ def silhouette_samples(X, 
labels, metric='euclidean', **kwds): The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`. If X is - the distance array itself, use "precomputed" as the metric. + the distance array itself, use "precomputed" as the metric. Precomputed + distance matrices must have 0 along the diagonal. `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. @@ -210,6 +211,15 @@ def silhouette_samples(X, labels, metric='euclidean', **kwds): """ X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr']) + + # Check for non-zero diagonal entries in precomputed distance matrix + if metric == 'precomputed': + if np.any(np.diagonal(X)): + raise ValueError( + 'The precomputed distance matrix contains non-zero ' + 'elements on the diagonal. Use np.fill_diagonal(X, 0).' + ) + le = LabelEncoder() labels = le.fit_transform(labels) n_samples = len(labels)