In [None]:
import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Single seed shared by every stochastic step in the notebook.
seed = 1

# Synthetic data set: 300 points scattered around 5 blob centers.
# The generated labels are discarded (`_`) because only the points are used below.
X, _ = make_blobs(
    n_samples=300,
    centers=5,
    cluster_std=0.8,
    random_state=seed,
)

# Plot the first column of X against the second to eyeball the raw points.
plt.scatter(*X[:, :2].T)
plt.show()

In [None]:
# Method 1 : K Means
#
# Fit k-means on X with a fixed number of groups, print the fitted centroids and
# the per-point group labels, then plot the points coloured by group with the
# centroids overlaid.

numberOfGroups = 5  # requested number of clusters; reused by the agglomerative cell
kmeans = KMeans(n_clusters=numberOfGroups, random_state=seed)  # seeded so the fit is repeatable
kmeans.fit(X)
groups = kmeans.labels_  # cluster index assigned to each row of X

print(kmeans.cluster_centers_)
print(groups)

plt.scatter(X[:,0],X[:,1], c=groups, cmap='rainbow')   # color each group differently
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], color='black', marker='x')  # plot the centroids
plt.show()  # render explicitly, like the other plotting cells, instead of echoing the artist repr

Warning

There is absolutely no guarantee of recovering a ground truth. First, choosing the right number of clusters is hard. Second, the algorithm is sensitive to initialization, and can fall into local minima, although scikit-learn employs several tricks to mitigate this issue.

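One method to validate the number of clusters is the elbow method. The idea is to run k-means clustering on the dataset for a range of values of k (k from 1 to 9 in the example below) and, for each value of k, calculate the sum of squared errors (SSE): the sum of squared distances from each point to its assigned cluster center. This quantity is also called the within-cluster sum of squares (WCSS); scikit-learn exposes it as `inertia_`.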

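Then plot a line chart of the SSE against k. The SSE always decreases as k grows, so the smallest value is not the goal; if the curve looks like a bent arm, the "elbow" (the point where the SSE stops dropping sharply and begins to level off) is a reasonable choice of k.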

In [None]:
# Elbow method: fit k-means once per candidate k, record the within-cluster sum
# of squares (scikit-learn's `inertia_`), and plot WCSS against k.
k_values = range(1, 10)  # candidate cluster counts 1..9, shared by the fits and the plot

# Each model is built inside the comprehension so the fitted `kmeans` from the
# K Means cell is not overwritten, and `random_state=seed` makes the curve
# reproducible from run to run (k-means++ initialisation is randomised).
wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=seed).fit(X).inertia_
    for k in k_values
]

# NOTE(review): the marker is hard-coded at k=3, while the data are generated
# with 5 centers and clustered with 5 groups -- confirm against the plotted curve.
elbow_k = 3

with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(10,6))
    plt.plot(k_values, wcss)
    plt.title('The Elbow Method with k-means++\n',fontsize=25)
    plt.xlabel('Number of clusters')
    plt.xticks(fontsize=10)
    plt.ylabel('WCSS (within-cluster sums of squares)')
    # Span the marker from 0 to the largest observed WCSS rather than a fixed
    # 0-100, so its height follows the data instead of a magic number.
    plt.vlines(x=elbow_k, ymin=0, ymax=max(wcss), linestyles='--')
    plt.show()

In [None]:
# Method 2 : Agglomerative Clustering
#
# Cluster the same points X into `numberOfGroups` groups with
# AgglomerativeClustering, print the per-point labels, and plot the points
# coloured by group.

# Named for what it holds: this is an AgglomerativeClustering model, not a KMeans one.
agglomerative = AgglomerativeClustering(n_clusters=numberOfGroups)
agglomerative.fit(X)
groups = agglomerative.labels_  # cluster index assigned to each row of X
print(groups)
plt.scatter(X[:,0],X[:,1], c=groups, cmap='rainbow')   # color each group differently
plt.show()  # render explicitly, like the other plotting cells, instead of echoing the artist repr