
Merge pull request #364 from robertlayton/silhouette

Silhouette Coefficient added to metrics
robertlayton committed Oct 6, 2011
2 parents 543bc78 + e2888ed commit bc9ad28741cab0cf38748041290926c2c17b7130
@@ -621,3 +621,72 @@ mean of homogeneity and completeness**:
measure <http://acl.ldc.upenn.edu/D/D07/D07-1043.pdf>`_
Andrew Rosenberg and Julia Hirschberg, 2007
+Silhouette Coefficient
+----------------------
+
+Presentation and usage
+~~~~~~~~~~~~~~~~~~~~~~
+
+If the ground truth labels are not known, evaluation must be performed using
+the model itself. The Silhouette Coefficient is an example of this, where a
+higher Silhouette Coefficient score relates to a model with better defined
+clusters. The Silhouette Coefficient is defined for each sample and is composed
+of two scores:
+
+- **a**: The mean distance between a sample and all other points in the same
+ class.
+
+- **b**: The mean distance between a sample and all other points in the *next
+ nearest cluster*.
+
+The Silhouette Coefficient *s* for a single sample is then given as:
+
+.. math:: s = \frac{b - a}{\max(a, b)}
+
+The Silhouette Coefficient for a set of samples is given as the mean of the
+Silhouette Coefficient for each sample.
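+
+For example, a sample with a mean intra-cluster distance of :math:`a = 1` and
+a mean distance of :math:`b = 3` to the next nearest cluster gets
+:math:`s = (3 - 1) / 3 \approx 0.67`, whereas swapping the two values gives
+:math:`s \approx -0.67`.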
+
+
+ >>> from sklearn import metrics
+ >>> from sklearn.metrics import pairwise_distances
+ >>> from sklearn import datasets
+ >>> dataset = datasets.load_iris()
+ >>> X = dataset.data
+ >>> y = dataset.target
+
+In normal usage, the Silhouette Coefficient is applied to the results of a
+cluster analysis.
+
+ >>> import numpy as np
+ >>> np.random.seed(1)
+ >>> from sklearn.cluster import KMeans
+ >>> kmeans_model = KMeans(init="k-means++", k=3).fit(X)
+ >>> labels = kmeans_model.labels_
+ >>> metrics.silhouette_score(X, labels, metric='euclidean')
+ ... # doctest: +ELLIPSIS
+ 0.5525...
+
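+The score can also be computed from a precomputed distance matrix by passing
+``metric='precomputed'``; a usage sketch reusing ``X`` and ``labels`` from
+above, which should give the same score as the ``euclidean`` call:
+
+ >>> D = pairwise_distances(X, metric='euclidean')
+ >>> metrics.silhouette_score(D, labels, metric='precomputed')
+ ... # doctest: +ELLIPSIS
+ 0.5525...
+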
+.. topic:: References
+
+ * Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
+ Interpretation and Validation of Cluster Analysis". Computational
+ and Applied Mathematics 20: 53–65. doi:10.1016/0377-0427(87)90125-7.
+
+
+Advantages
+~~~~~~~~~~
+
+- The score is bounded between -1 for incorrect clustering and +1 for highly
+ dense clustering. Scores around zero indicate overlapping clusters.
+
+- The score is higher when clusters are dense and well separated, which relates
+ to a standard concept of a cluster.
+
+
+Drawbacks
+~~~~~~~~~
+
+- The Silhouette Coefficient is generally higher for convex clusters than other
+ concepts of clusters, such as density based clusters like those obtained
+ through DBSCAN.
+
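+As a rough illustration of this drawback (a hand-rolled sketch, not part of
+this change), the ring labelling below matches how the data were generated,
+yet it typically scores lower than the convex partition that k-means finds on
+the same points::
+
+    import numpy as np
+    from sklearn import metrics
+    from sklearn.cluster import KMeans
+
+    # Two concentric rings: non-convex, but arguably the "correct" clusters.
+    rng = np.random.RandomState(0)
+    theta = rng.uniform(0, 2 * np.pi, 500)
+    radius = np.where(rng.rand(500) < 0.5, 1.0, 2.0)
+    X = np.c_[radius * np.cos(theta), radius * np.sin(theta)]
+    ring_labels = (radius > 1.5).astype(int)
+
+    kmeans_labels = KMeans(k=2).fit(X).labels_
+
+    metrics.silhouette_score(X, ring_labels, metric='euclidean')
+    metrics.silhouette_score(X, kmeans_labels, metric='euclidean')
+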
@@ -9,6 +9,9 @@
- Faster tests by `Fabian Pedregosa`_.
+ - Silhouette Coefficient cluster analysis evaluation metric added as
+ ``sklearn.metrics.silhouette_score`` by Robert Layton.
+
API changes summary
-------------------
@@ -34,6 +37,11 @@ version 0.9:
- The :ref:`covariance` module now has a robust estimator of
covariance, the Minimum Covariance Determinant estimator.
+ - The cluster evaluation metrics in ``metrics.cluster`` have been refactored,
+ but the changes are backwards compatible. They have been moved into the
+ ``metrics.cluster.supervised`` submodule, alongside a new
+ ``metrics.cluster.unsupervised`` submodule which contains the Silhouette
+ Coefficient.
+
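(The backwards compatibility is easy to check: assuming the refactored layout
described in the entry above, both imports in this quick sketch resolve to the
same function.)

    from sklearn.metrics import v_measure_score          # unchanged public path
    from sklearn.metrics.cluster import v_measure_score  # refactored location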
Changelog
---------
@@ -40,6 +40,9 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels_true, labels)
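+# S (built earlier in this example) holds non-positive similarities, so
+# dividing by its most negative entry yields a non-negative, normalized
+# dissimilarity matrix that silhouette_score accepts with metric='precomputed'.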
+D = (S / np.min(S))
+print ("Silhouette Coefficient: %0.3f" %
+ metrics.silhouette_score(D, labels, metric='precomputed'))
##############################################################################
# Plot result
@@ -41,6 +41,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels_true, labels)
+print ("Silhouette Coefficient: %0.3f" %
+ metrics.silhouette_score(D, labels, metric='precomputed'))
##############################################################################
# Plot result
@@ -31,6 +31,8 @@
n_digits = len(np.unique(digits.target))
labels = digits.target
+sample_size = 300
+
print "n_digits: %d" % n_digits
print "n_features: %d" % n_features
print "n_samples: %d" % n_samples
@@ -46,6 +48,11 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
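+# The silhouette needs pairwise distances, so it is evaluated on a random
+# subsample of `sample_size` points to keep the cost down on the digits set.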
+print ("Silhouette Coefficient: %0.3f" %
+ metrics.silhouette_score(data, km.labels_,
+ metric='euclidean', sample_size=sample_size))
print
print "Raw k-means with random centroid init..."
@@ -58,6 +65,11 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
+print ("Silhouette Coefficient: %0.3f" %
+ metrics.silhouette_score(data, km.labels_,
+ metric='euclidean', sample_size=sample_size))
print
print "Raw k-means with PCA-based centroid init..."
@@ -73,6 +85,11 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
+print ("Silhouette Coefficient: %0.3f" %
+ metrics.silhouette_score(data, km.labels_,
+ metric='euclidean', sample_size=sample_size))
print
# Plot k-means++ form on a 2D plot using PCA
@@ -9,9 +9,11 @@
precision_recall_curve, explained_variance_score, r2_score, \
zero_one, mean_square_error, hinge_loss
+from . import cluster
from .cluster import adjusted_rand_score
from .cluster import homogeneity_completeness_v_measure
from .cluster import homogeneity_score
from .cluster import completeness_score
from .cluster import v_measure_score
+from .cluster import silhouette_score
from .pairwise import euclidean_distances, pairwise_distances, pairwise_kernels
@@ -0,0 +1,11 @@
+"""
+:mod:`sklearn.metrics.cluster` is a module containing evaluation metrics for
+cluster analysis results. There are two forms of evaluation:
+
+- supervised, which uses ground truth class values for each sample.
+- unsupervised, which does not use ground truth and instead measures the
+  'quality' of the model itself.
+"""
+from supervised import (homogeneity_completeness_v_measure,
+ homogeneity_score, completeness_score,
+ v_measure_score, adjusted_rand_score)
+from unsupervised import silhouette_score, silhouette_samples
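With this layout the Silhouette Coefficient is importable from the top level of
``sklearn.metrics`` as well as from the new cluster subpackage; a quick sketch
based on the imports above (both paths point to the same function):

    from sklearn.metrics import silhouette_score
    from sklearn.metrics.cluster import silhouette_score, silhouette_samples

    # silhouette_samples returns one value per sample; silhouette_score is
    # their mean (optionally computed over a random subsample via sample_size).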
@@ -0,0 +1,29 @@
+from scipy.sparse import csr_matrix
+
+from .... import datasets
+from ..unsupervised import silhouette_score
+from ... import pairwise_distances
+
+
+def test_silhouette():
+ """Tests the Silhouette Coefficient. """
+ dataset = datasets.load_iris()
+ X = dataset.data
+ y = dataset.target
+ D = pairwise_distances(X, metric='euclidean')
+ # Given that the actual labels are used, we can assume that S would be
+ # positive.
+ silhouette = silhouette_score(D, y, metric='precomputed')
+ assert(silhouette > 0)
+ # Test without calculating D
+ silhouette_metric = silhouette_score(X, y, metric='euclidean')
+ assert(silhouette == silhouette_metric)
+ # Test with sampling
+ silhouette = silhouette_score(D, y, metric='precomputed',
+ sample_size=int(X.shape[0] / 2))
+ assert(silhouette > 0)
+ # Test with sparse X
+ X_sparse = csr_matrix(X)
+ D = pairwise_distances(X_sparse, metric='euclidean')
+ silhouette = silhouette_score(D, y, metric='precomputed')
+ assert(silhouette > 0)