# Implementation of the K-Means algorithm from scratch, measuring its performance using NMI and the Silhouette Index


In [1]:
#importing basic modules for implementation of k-means
import pandas as pd
import numpy as np
import math
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

#importing modules of sklearn for measuring performance of K-mean
from sklearn import metrics
from sklearn.metrics import silhouette_score


In [2]:
#method used for calculating the Euclidean distance between two data points
def euclidean_distance(x1,x2):
    """Return the Euclidean (L2) distance between two data points.

    The points are subtracted element-wise, the squared differences are
    summed over every element, and the square root of that sum is returned
    as a plain Python float.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return math.sqrt(squared_total)

In [3]:
#defining a class for kmeans
class Kmeans:
    """K-means clustering implemented from scratch with NumPy.

    Attributes:
        K: number of clusters to form.
        max_iteration: upper bound on assignment/update rounds.
        clusters: list of K lists holding the sample indices of each cluster.
        centroids: current centroid coordinates (one row per cluster once
            ``predict`` has run).
    """

    def __init__(self,K,max_iteration):
        """Store the hyper-parameters and start with empty clusters/centroids."""
        self.K=K
        self.max_iteration=max_iteration
        self.clusters=[[] for _ in range(self.K)]
        self.centroids=[]

    def predict(self,X):
        """Cluster the rows of ``X`` and return one label per row.

        Steps:
          1. Read the number of samples and features from ``X``.
          2. Draw K distinct random row indices (global NumPy RNG) and use
             those rows as the initial centroids.
          3. Alternate between assigning samples to the nearest centroid and
             recomputing centroids, until the centroids stop moving or
             ``max_iteration`` rounds have been executed.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Float NumPy array of length n_samples holding the 0-based cluster
            index of every sample.

        Raises:
            ValueError: if ``X`` is not 2-D or K is not in 1..n_samples.
        """
        X=np.asarray(X,dtype=float)
        if X.ndim!=2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        self.X=X
        self.no_samples,self.no_features=X.shape

        if not 1<=self.K<=self.no_samples:
            raise ValueError(
                "K must be between 1 and the number of samples (%d), got %r"
                % (self.no_samples,self.K)
            )

        random_K_indxs=np.random.choice(self.no_samples,self.K,replace=False)
        self.centroids=self.X[random_K_indxs]

        for _ in range(self.max_iteration):
            self.clusters=self.createClusters(self.centroids)
            oldCentroids=self.centroids
            # getCentroids reads self.centroids (still the old ones here) as a
            # fallback for clusters that received no samples.
            self.centroids=self.getCentroids(self.clusters)

            if self.isConverge(oldCentroids,self.centroids):
                break

        return self.getClusterLabels(self.clusters)

    def createClusters(self,centroids):
        """Assign every sample in ``self.X`` to its nearest centroid.

        Returns a list of K lists; list ``k`` holds the row indices of the
        samples whose closest centroid is ``k`` (ties go to the lowest index).
        """
        clusters=[[] for _ in range(self.K)]
        centroids=np.asarray(centroids,dtype=float)
        # One column of distances per centroid -> shape (n_samples, K).
        distances=np.stack(
            [np.linalg.norm(self.X-centroid,axis=1) for centroid in centroids],
            axis=1,
        )
        nearest=np.argmin(distances,axis=1)
        for sample_index,centroid_index in enumerate(nearest):
            clusters[centroid_index].append(sample_index)

        return clusters

    def closestCentroid(self,data,centroids):
        """Return the index of the centroid closest (Euclidean) to ``data``."""
        centroids=np.asarray(centroids,dtype=float)
        distances=np.linalg.norm(centroids-np.asarray(data,dtype=float),axis=1)
        return np.argmin(distances)

    def getCentroids(self,clusters):
        """Recompute each centroid as the mean of the samples assigned to it.

        A cluster with no members keeps its previous centroid (taken from
        ``self.centroids``) instead of becoming NaN, which would otherwise
        poison every later distance computation. If no previous centroid of
        matching shape is available the row stays at zero.
        """
        centroids=np.zeros((self.K,self.no_features))
        previous=np.asarray(self.centroids,dtype=float)
        has_previous=previous.shape==centroids.shape

        for clus_index,clus_data in enumerate(clusters):
            if len(clus_data)>0:
                centroids[clus_index]=np.mean(self.X[clus_data],axis=0)
            elif has_previous:
                centroids[clus_index]=previous[clus_index]

        return centroids

    def getClusterLabels(self,clusters):
        """Return a float array mapping every sample index to its cluster index.

        Samples that appear in no cluster (e.g. when no iteration was run)
        are labelled -1 rather than left as uninitialised memory.
        """
        label=np.full(self.no_samples,-1.0)
        for clus_index,clus_data in enumerate(clusters):
            for k in clus_data:
                label[k]=clus_index
        return label

    def isConverge(self,oldCentroids,centroids):
        """Return True when no centroid moved between two rounds.

        Exact equality is intentional: once the assignments stop changing,
        the means are recomputed from the same index sets and come out
        bit-identical.
        """
        old=np.asarray(oldCentroids,dtype=float)
        new=np.asarray(centroids,dtype=float)
        return bool(np.array_equal(old,new))

In [6]:
def main():
    """Cluster the BUPA liver data with K-means, score it, and plot a radar chart.

    Reads ``bupa.txt``, clusters the six feature columns into two groups,
    prints the cluster of every sample, reports NMI (against the ``selector``
    column) and the silhouette score, and finally shows a radar chart of the
    per-cluster feature means.
    """
    feature_columns=["mcv","alkphos","sgpt","sgot","gammagt","drinks"]

    df = pd.read_csv("bupa.txt") # read the data
    # Copy the column subset so that adding the cluster column below modifies
    # an independent frame instead of a slice of df (avoids
    # SettingWithCopyWarning).
    trainData=df[feature_columns].copy()
    labels_original=df["selector"].tolist() # last column holds the class label

    X=trainData.to_numpy() # feature matrix as a NumPy array

    k=Kmeans(2,100) # K clusters, maximum number of iterations
    labels_pred=k.predict(X) # cluster label for every sample

    trainData['clusture']=labels_pred

    #printing the datapoints along with their cluster
    print("------Final Output of K-mean------")
    for i,cluster in enumerate(labels_pred):
        print("data ",i+1,"--> cluster ",int(cluster+1))

    #calculating clustering performance using NMI and silhouette index
    print("----Clustering Performance----")
    label_prediction=labels_pred.tolist()
    NMI_score=metrics.normalized_mutual_info_score(labels_original,label_prediction)
    print("Using Ground Information->",NMI_score)
    # silhouette_score is only defined when at least two clusters are populated.
    if len(set(label_prediction))>1:
        silhouette_avg = silhouette_score(X, labels_pred)
        print("Without using ground Information->",silhouette_avg )
    else:
        print("Without using ground Information->","undefined (only one cluster found)")

    # radar chart of the mean value of every attribute within each cluster
    segments=trainData.pivot_table(
        index=['clusture'],
        values=feature_columns,
        aggfunc={column:'mean' for column in feature_columns},
    )
    print(segments)

    categories=segments.columns

    fig = go.Figure()
    # One trace per populated cluster; the display name is 1-based to match
    # the per-sample printout above.
    for cluster_id,cluster_means in zip(segments.index,segments.values):
        fig.add_trace(go.Scatterpolar(
            r=cluster_means,
            theta=categories,
            fill='toself',
            name='cluster %d' % (int(cluster_id)+1),
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True
            )),
        title="visualisation of cluster according to there centroid"
    )
    fig.show()
   
   

In [7]:
# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__=="__main__":
    main()

------Final Output of K-mean------
data  1 --> cluster  1
data  2 --> cluster  1
data  3 --> cluster  1
data  4 --> cluster  1
data  5 --> cluster  1
data  6 --> cluster  1
data  7 --> cluster  1
data  8 --> cluster  1
data  9 --> cluster  1
data  10 --> cluster  1
data  11 --> cluster  1
data  12 --> cluster  1
data  13 --> cluster  1
data  14 --> cluster  1
data  15 --> cluster  1
data  16 --> cluster  1
data  17 --> cluster  1
data  18 --> cluster  1
data  19 --> cluster  1
data  20 --> cluster  1
data  21 --> cluster  1
data  22 --> cluster  1
data  23 --> cluster  1
data  24 --> cluster  1
data  25 --> cluster  2
data  26 --> cluster  1
data  27 --> cluster  1
data  28 --> cluster  1
data  29 --> cluster  1
data  30 --> cluster  1
data  31 --> cluster  1
data  32 --> cluster  1
data  33 --> cluster  1
data  34 --> cluster  1
data  35 --> cluster  1
data  36 --> cluster  2
data  37 --> cluster  1
data  38 --> cluster  1
data  39 --> cluster  1
data  40 --> cluster  1
data  41 --> c

            alkphos    drinks     gammagt        mcv       sgot       sgpt
clusture                                                                  
0.0       69.175896  3.144951   27.055375  89.970684  22.859935  26.798046
1.0       75.473684  5.960526  129.000000  91.684211  39.052632  59.552632
