Skip to content

Latest commit

 

History

History
268 lines (192 loc) · 5.06 KB

File metadata and controls

268 lines (192 loc) · 5.06 KB

Note: This is an auto-generated Markdown export of the Jupyter notebook file clustering_meanshift.ipynb. You can also view the notebook with Jupyter's nbviewer.

Clustering with MeanShift

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

import umap

from sklearn import datasets, cluster
def get_color(i, n_clusters):
    """Return the plotting colour for cluster index ``i``.

    The label ``-1`` is drawn in ``'gray'``. Any other index is divided by
    ``n_clusters`` and the resulting fraction is looked up in matplotlib's
    ``jet`` colormap.
    """
    if i != -1:
        fraction = float(i) / n_clusters
        return plt.cm.jet(fraction)
    return 'gray'

High dimensional data

# Load the scikit-learn digits dataset and preview one sample per subplot,
# titled with its target value.
digits = datasets.load_digits()

fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(10, 3))
for idx, ax in enumerate(axes):
    ax.set_axis_off()
    ax.imshow(digits.images[idx], cmap=plt.cm.gray_r)
    ax.set_title('%i' % digits.target[idx])

png

# Feature matrix and ground-truth labels of the digits dataset.
X, y = digits.data, digits.target
n_clusters = 10

# Cluster with MeanShift using its default parameters.
meanshift = cluster.MeanShift()
label = meanshift.fit_predict(X)

predicted_clusters = np.unique(label)
true_clusters = list(range(n_clusters))

# Embed the features with UMAP into the two columns used for plotting.
embedding = umap.UMAP().fit_transform(X)
df = pd.DataFrame(embedding, columns=['X1', 'X2']).assign(
    true_cluster=y,
    predicted_cluster=label,
)
df.head()
X1 X2 true_cluster predicted_cluster
0 15.643538 7.158202 0 0
1 -3.757154 9.996881 1 0
2 0.414931 9.605992 2 0
3 0.273054 5.647682 3 0
4 4.314103 18.812122 4 0
# Plot the embedded points twice: coloured by true label (top) and by the
# MeanShift assignment (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(10, 10))

fig.suptitle('Clusters in high dimensional data (features = {})'.format(np.shape(X)[1]), fontsize=14, fontweight='bold')

ax1.set_title('True values')
for i in true_clusters:
    # Filter the rows once per cluster rather than once per coordinate.
    members = df[df.true_cluster == i]
    ax1.scatter(members.X1, members.X2, label=i, color=get_color(i, len(true_clusters)))

# Report how many distinct labels MeanShift produced.
ax2.set_title('Predicted cluster = {}'.format(len(predicted_clusters)))
for i in predicted_clusters:
    members = df[df.predicted_cluster == i]
    ax2.scatter(members.X1, members.X2, label=i, color=get_color(i, len(predicted_clusters)))

# Anchor both legends outside the axes, to the right.
ax1.legend(bbox_to_anchor=(1.1, 1))
ax2.legend(bbox_to_anchor=(1.1, 1))

plt.show()

png

Low dimensional data

# Generate 750 two-dimensional points around three centres, each with its
# own standard deviation; the seed is fixed at 0.
X, y = datasets.make_blobs(
    n_samples=750,
    centers=[[3, 4], [-2, 6], [3, 12]],
    cluster_std=[1, 0.8, 1.5],
    random_state=0,
)
n_clusters = 3

# Cluster with MeanShift using its default parameters.
meanshift = cluster.MeanShift()
label = meanshift.fit_predict(X)

predicted_clusters = np.unique(label)
true_clusters = list(range(n_clusters))

# The data is already two-dimensional, so it is plotted without embedding.
df = pd.DataFrame(X, columns=['X1', 'X2']).assign(
    true_cluster=y,
    predicted_cluster=label,
)
df.head()
X1 X2 true_cluster predicted_cluster
0 2.600551 4.370056 0 1
1 -2.309497 5.591766 1 0
2 2.196590 3.310450 0 1
3 0.940436 10.398387 2 2
4 4.230291 5.202380 0 1
# Plot the points twice: coloured by true label (top) and by the MeanShift
# assignment (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(10, 10))

# Title completed with "data" and the feature count.
fig.suptitle('Clusters in low dimensional data (features = {})'.format(np.shape(X)[1]), fontsize=14, fontweight='bold')

ax1.set_title('True values')
for i in true_clusters:
    # Filter the rows once per cluster rather than once per coordinate.
    members = df[df.true_cluster == i]
    ax1.scatter(members.X1, members.X2, label=i, color=get_color(i, len(true_clusters)))

# Report how many distinct labels MeanShift produced.
ax2.set_title('Predicted cluster = {}'.format(len(predicted_clusters)))
for i in predicted_clusters:
    members = df[df.predicted_cluster == i]
    ax2.scatter(members.X1, members.X2, label=i, color=get_color(i, len(predicted_clusters)))

# Anchor both legends outside the axes, to the right.
ax1.legend(bbox_to_anchor=(1.1, 1))
ax2.legend(bbox_to_anchor=(1.1, 1))

plt.show()

png