# K-Means Clustering Evaluation

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics


In [None]:
from sklearn.cluster import KMeans
# Fit K-Means with 4 clusters and scatter-plot the first two columns of X,
# coloured by the predicted cluster label.
# NOTE(review): X is only assigned in a later cell (X=np.asarray(dataset)), so on a
# fresh kernel this cell raises NameError unless it is moved below the data load.
kmeans = KMeans(4, random_state=0)
labels = kmeans.fit(X).predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');

In [5]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn import datasets
# Load the whitespace-separated file 's1.txt' and expose it as a NumPy array X.
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2; parsing is unchanged.
# NOTE(review): no header= is passed, so the first line of the file is consumed as
# column names -- confirm the file has a header row, otherwise pass header=None.
dataset = pd.read_csv('s1.txt', sep=r'\s+')
X = np.asarray(dataset)

### 1. Computing Silhouette Score for Different Numbers of Clusters (k=9,12,15,18,21)

In [6]:
import numpy as np
from sklearn.cluster import KMeans
# Silhouette score (Euclidean distance) of a K-Means clustering of X with k=9.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X).labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.5829953708315514

In [7]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X with k=12.
kmeans_model = KMeans(n_clusters=12, random_state=1)
labels = kmeans_model.fit(X).labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.6317592554828408

In [8]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X with k=15.
kmeans_model = KMeans(n_clusters=15, random_state=1)
labels = kmeans_model.fit(X).labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.711332992937958

In [9]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X with k=18.
kmeans_model = KMeans(n_clusters=18, random_state=1)
labels = kmeans_model.fit(X).labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.6373816715762535

In [10]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X with k=21.
kmeans_model = KMeans(n_clusters=21, random_state=1)
labels = kmeans_model.fit(X).labels_
metrics.silhouette_score(X, labels, metric='euclidean')

0.549398698865092

   ### 2. Computing Calinski-Harabasz Score for Different Numbers of Clusters

In [11]:
    # Calinski-Harabasz index of a K-Means clustering of X with k=9.
    # Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
    # alias was deprecated in scikit-learn 0.20 and removed in 0.23.
    kmeans_model = KMeans(n_clusters=9, random_state=1).fit(X)
    labels = kmeans_model.labels_
    metrics.calinski_harabasz_score(X, labels)


8277.115037478403

In [12]:
# Calinski-Harabasz index of a K-Means clustering of X with k=12.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=12, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X, labels)

10453.220259994929

In [13]:
# Calinski-Harabasz index of a K-Means clustering of X with k=15.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=15, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X, labels)

22679.716928133905

In [14]:
# Calinski-Harabasz index of a K-Means clustering of X with k=18.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=18, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X, labels)

20255.796173810202

In [15]:
# Calinski-Harabasz index of a K-Means clustering of X with k=21.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=21, random_state=1).fit(X)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X, labels)

18646.987405420376

In [16]:
# Load the whitespace-separated file 'dim2.txt' as a DataFrame and a NumPy array.
# sep=r'\s+' replaces delim_whitespace=True (deprecated since pandas 2.2).
# NOTE(review): the first line is consumed as column names because header= is not
# set -- confirm the file has a header row, otherwise pass header=None.
df2 = pd.read_csv('dim2.txt', sep=r'\s+')
X2 = np.asarray(df2)

In [17]:
# Load the whitespace-separated file 'dim5.txt' as a DataFrame and a NumPy array.
# sep=r'\s+' replaces delim_whitespace=True (deprecated since pandas 2.2).
# NOTE(review): the first line is consumed as column names because header= is not
# set -- confirm the file has a header row, otherwise pass header=None.
df5 = pd.read_csv('dim5.txt', sep=r'\s+')
X5 = np.asarray(df5)

In [18]:
# Load the whitespace-separated file 'dim10.txt' as a DataFrame and a NumPy array.
# sep=r'\s+' replaces delim_whitespace=True (deprecated since pandas 2.2).
# NOTE(review): the first line is consumed as column names because header= is not
# set -- confirm the file has a header row, otherwise pass header=None.
df10 = pd.read_csv('dim10.txt', sep=r'\s+')
X10 = np.asarray(df10)

In [19]:
# Load the whitespace-separated file 'dim15.txt' as a DataFrame and a NumPy array.
# sep=r'\s+' replaces delim_whitespace=True (deprecated since pandas 2.2).
# NOTE(review): the first line is consumed as column names because header= is not
# set -- confirm the file has a header row, otherwise pass header=None.
df15 = pd.read_csv('dim15.txt', sep=r'\s+')
X15 = np.asarray(df15)

In [20]:
# Display the value of every top-level expression statement in a cell, not only
# the last one (IPython's default is "last_expr"). The cell that calls len(...)
# on the four *_half frames relies on this to show all four counts.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [21]:
# Keep only the leading rows of each dimension dataset as the "half" subsets.
# .loc slices by index label and includes the end label, so [:674] keeps 675 rows
# with the default integer index (the following cell displays the lengths).
# NOTE(review): the cut-offs are hard-coded; confirm each is really half of len(dfN).
df2_half=df2.loc[:674]
df5_half=df5.loc[:1687]
df10_half=df10.loc[:3374]
df15_half=df15.loc[:6749]

In [22]:
# Sanity check: row count of each truncated frame. Each expression is displayed
# separately, which requires ast_node_interactivity = "all" set earlier.
len(df2_half)
len(df5_half)
len(df10_half)
len(df15_half)

675

1688

3375

6750

In [23]:
# Convert each truncated DataFrame to a NumPy array for use with scikit-learn.
X2_half, X5_half, X10_half, X15_half = (
    np.asarray(frame) for frame in (df2_half, df5_half, df10_half, df15_half)
)

## Evaluating Metrics for varying dimensions (dim=2,5,10,15)

In [24]:
import numpy as np
from sklearn.cluster import KMeans
# Silhouette score (Euclidean distance) of a K-Means clustering of X2 with k=9.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X2).labels_
metrics.silhouette_score(X2, labels, metric='euclidean')

0.9494769434200909

In [25]:

# Silhouette score (Euclidean distance) of a K-Means clustering of X5 with k=9.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X5).labels_
metrics.silhouette_score(X5, labels, metric='euclidean')

0.9212599793207223

In [26]:

# Silhouette score (Euclidean distance) of a K-Means clustering of X10 with k=9.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X10).labels_
metrics.silhouette_score(X10, labels, metric='euclidean')

0.9229225725895405

In [27]:

# Silhouette score (Euclidean distance) of a K-Means clustering of X15 with k=9.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X15).labels_
metrics.silhouette_score(X15, labels, metric='euclidean')

0.9167924566702247

In [30]:
# Calinski-Harabasz index of a K-Means clustering of X2 with k=9.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=9, random_state=1).fit(X2)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X2, labels)

126599.74695659941

In [31]:
# Calinski-Harabasz index of a K-Means clustering of X5 with k=9.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=9, random_state=1).fit(X5)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X5, labels)

137552.4623088356

In [32]:
# Calinski-Harabasz index of a K-Means clustering of X10 with k=9.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=9, random_state=1).fit(X10)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X10, labels)

278319.7188912014

In [33]:
# Calinski-Harabasz index of a K-Means clustering of X15 with k=9.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=9, random_state=1).fit(X15)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X15, labels)

302436.3684358934

### Evaluating the metrics on the halved datasets for varying dimensions

In [62]:
import numpy as np
from sklearn.cluster import KMeans
# Silhouette score (Euclidean distance) of a K-Means clustering of X2_half with k=5.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X2_half).labels_
metrics.silhouette_score(X2_half, labels, metric='euclidean')

0.9535892949328115

In [36]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X5_half with k=5.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X5_half).labels_
metrics.silhouette_score(X5_half, labels, metric='euclidean')

0.9391922769236488

In [37]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X10_half with k=5.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X10_half).labels_
metrics.silhouette_score(X10_half, labels, metric='euclidean')

0.9166824475031685

In [38]:
# Silhouette score (Euclidean distance) of a K-Means clustering of X15_half with k=5.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X15_half).labels_
metrics.silhouette_score(X15_half, labels, metric='euclidean')

0.7237226631125406

In [None]:
# Calinski-Harabasz scores for the halved datasets

In [63]:
# Calinski-Harabasz index of a K-Means clustering of X2_half with k=5.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=5, random_state=1).fit(X2_half)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X2_half, labels)

49626.98784439353

In [40]:
# Calinski-Harabasz index of a K-Means clustering of X5_half with k=5.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=5, random_state=1).fit(X5_half)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X5_half, labels)

103364.18162791427

In [41]:
# Calinski-Harabasz index of a K-Means clustering of X10_half with k=5.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=5, random_state=1).fit(X10_half)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X10_half, labels)

157444.82113231663

In [42]:
# Calinski-Harabasz index of a K-Means clustering of X15_half with k=5.
# Uses the correctly spelled calinski_harabasz_score: the calinski_harabaz_score
# alias was deprecated in scikit-learn 0.20 and removed in 0.23.
kmeans_model = KMeans(n_clusters=5, random_state=1).fit(X15_half)
labels = kmeans_model.labels_
metrics.calinski_harabasz_score(X15_half, labels)

10145.59864211574

### Evaluating Time Performance

In [1]:
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime

UsageError: Line magic function `%install_ext` not found.


In [43]:
import numpy as np
from sklearn.cluster import KMeans


In [44]:
%%timeit
# Timed body: fit K-Means (k=9) on X, then predict labels for the same X.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X).predict(X)

173 ms ± 29.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
%%timeit
# Timed body: fit K-Means (k=12) on X, then predict labels for the same X.
kmeans_model = KMeans(n_clusters=12, random_state=1)
labels = kmeans_model.fit(X).predict(X)

167 ms ± 38.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
%%timeit
# Timed body: fit K-Means (k=15) on X, then predict labels for the same X.
kmeans_model = KMeans(n_clusters=15, random_state=1)
labels = kmeans_model.fit(X).predict(X)

207 ms ± 28.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
%%timeit
# Timed body: fit K-Means (k=18) on X, then predict labels for the same X.
kmeans_model = KMeans(n_clusters=18, random_state=1)
labels = kmeans_model.fit(X).predict(X)

363 ms ± 60.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
%%timeit
# Timed body: fit K-Means (k=21) on X, then predict labels for the same X.
kmeans_model = KMeans(n_clusters=21, random_state=1)
labels = kmeans_model.fit(X).predict(X)

548 ms ± 42.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
%%timeit
# Timed body: fit K-Means (k=9) on X2, then predict labels for the same X2.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X2).predict(X2)

58.9 ms ± 7.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [51]:
%%timeit
# Timed body: fit K-Means (k=9) on X5, then predict labels for the same X5.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X5).predict(X5)

80.1 ms ± 8.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
%%timeit
# Timed body: fit K-Means (k=9) on X10, then predict labels for the same X10.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X10).predict(X10)

175 ms ± 40.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
%%timeit
# Timed body: fit K-Means (k=9) on X15, then predict labels for the same X15.
kmeans_model = KMeans(n_clusters=9, random_state=1)
labels = kmeans_model.fit(X15).predict(X15)

299 ms ± 82.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [60]:
%%timeit
# Timed body: fit K-Means (k=5) on X2_half, then predict labels for the same data.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X2_half).predict(X2_half)

27.7 ms ± 3.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [61]:
%%timeit
# Timed body: fit K-Means (k=5) on X5_half, then predict labels for the same data.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X5_half).predict(X5_half)

41.1 ms ± 5.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [57]:
%%timeit
# Timed body: fit K-Means (k=5) on X10_half, then predict labels for the same data.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X10_half).predict(X10_half)

93.9 ms ± 34.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
%%timeit
# Timed body: fit K-Means (k=5) on X15_half, then predict labels for the same data.
kmeans_model = KMeans(n_clusters=5, random_state=1)
labels = kmeans_model.fit(X15_half).predict(X15_half)

149 ms ± 6.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
