# Using the K-means++ clustering algorithm as our heuristic and analyzing its performance using NMI and the Silhouette Index

In [1]:
#importing basic modules for implementation of k-means
import pandas as pd
import numpy as np
import math
import sys
import matplotlib.pyplot as plt
%matplotlib inline

#importing sklearn modules for measuring the performance of K-means
from sklearn import metrics
from sklearn.metrics import silhouette_score

In [2]:
# Helper that computes the Euclidean distance between two data points
def euclidean_distance(x1,x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The element-wise difference of the two points is squared, the squares are
    added up with ``np.sum`` and the square root of that total is returned as
    a Python float.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return math.sqrt(squared_total)

In [3]:

class Kmeans_plus_plus:
    """Select K initial centroids from a data set.

    The first centroid is a sample drawn uniformly at random. Every further
    centroid is the not-yet-chosen sample whose Euclidean distance to its
    nearest already-chosen centroid is largest.

    NOTE(review): this is a deterministic farthest-point rule, not the
    D^2-weighted random draw of textbook k-means++; kept as is because the
    notebook uses it as its heuristic.
    """

    def __init__(self, K, random_state=None):
        """Store the number of centroids to select.

        Parameters
        ----------
        K : int
            Number of centroids to select; must be at least 1.
        random_state : int or None, optional
            Seed used for drawing the first centroid. ``None`` (the default)
            draws from NumPy's global random state, exactly as before.

        Raises
        ------
        ValueError
            If ``K`` is smaller than 1.
        """
        if K < 1:
            raise ValueError("K must be at least 1, got %r" % (K,))
        self.K = K
        self.random_state = random_state

    @staticmethod
    def _distances_to(X, centroid):
        """Return the Euclidean distance from every row of ``X`` to ``centroid``."""
        return np.sqrt(np.sum((X - centroid) ** 2, axis=1))

    def initialization(self, X):
        """Pick ``self.K`` rows of ``X`` as initial centroids.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data set. It is not modified.

        Returns
        -------
        list of numpy.ndarray
            The chosen centroids, in the order they were selected.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, is empty, or holds fewer than
            ``self.K`` samples.

        Side effects
        ------------
        ``self.X`` is set to the samples that were *not* chosen as centroids
        (original row order kept) and ``self.no_samples`` /
        ``self.no_features`` describe that reduced array.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array, got %d dimension(s)" % X.ndim)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if self.K > n_samples:
            raise ValueError(
                "cannot select K=%d centroids from only %d samples" % (self.K, n_samples)
            )

        # Without a seed keep drawing from the global NumPy generator so that
        # callers relying on np.random.seed(...) see unchanged behaviour.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        first = int(rng.randint(n_samples))
        chosen = [first]
        available = np.ones(n_samples, dtype=bool)
        available[first] = False

        # nearest[i] = distance from sample i to its closest chosen centroid.
        # It is updated incrementally instead of being recomputed against
        # every centroid on each round.
        nearest = self._distances_to(X, X[first])

        for _ in range(self.K - 1):
            # Already-chosen samples are masked out so they can never win;
            # argmax returns the first maximum, i.e. the lowest row index.
            candidates = np.where(available, nearest, -np.inf)
            pick = int(np.argmax(candidates))
            chosen.append(pick)
            available[pick] = False
            nearest = np.minimum(nearest, self._distances_to(X, X[pick]))

        self.X = X[available]
        self.no_samples, self.no_features = self.X.shape
        # Copies, so the returned centroids do not alias the caller's array.
        return [X[i].copy() for i in chosen]
        

In [12]:
def _nearest_centroid_labels(points, centroids):
    """Return, for each row of ``points``, the 1-based index of its closest centroid."""
    labels = []
    for point in points:
        best_distance = math.inf
        best_label = -1
        for position, centroid in enumerate(centroids, start=1):
            distance = euclidean_distance(point, centroid)
            # Strict '<' keeps the earliest centroid when distances tie.
            if distance < best_distance:
                best_distance = distance
                best_label = position
        labels.append(best_label)
    return labels


def main(data_path="bupa.txt", n_clusters=2, seed=None):
    """Cluster the data set with the centroid heuristic and report its quality.

    Steps:
      1. Read the CSV at ``data_path`` and take the six feature columns; the
         ``selector`` column is used as the ground-truth class label.
      2. Select ``n_clusters`` centroids with ``Kmeans_plus_plus``.
      3. Assign every data point to its nearest centroid (Euclidean distance)
         and print the assignment.
      4. Print the NMI (uses the ground truth) and the silhouette index
         (does not use the ground truth).

    Parameters
    ----------
    data_path : str, optional
        CSV file to read. Defaults to ``"bupa.txt"``.
    n_clusters : int, optional
        Number of centroids/clusters. Defaults to 2.
    seed : int or None, optional
        When given, NumPy's global random state is seeded with it so the
        random choice of the first centroid is reproducible.
    """
    if seed is not None:
        np.random.seed(seed)

    df = pd.read_csv(data_path)
    feature_columns = ["mcv", "alkphos", "sgpt", "sgot", "gammagt", "drinks"]
    # Explicit copy: a column is added below and must not write into a slice of df.
    trainData = df[feature_columns].copy()
    labels_original = df["selector"].tolist()

    # Taken before the label column is attached, so X holds the features only.
    X = trainData.to_numpy()

    k = Kmeans_plus_plus(n_clusters)
    centroids = k.initialization(X)

    clusterLabel = _nearest_centroid_labels(X, centroids)
    # One assignment with the full label list; assigning a scalar inside the
    # loop would overwrite the whole column with the last point's label.
    trainData["clusture"] = clusterLabel

    for i, label in enumerate(clusterLabel, start=1):
        print("Data ", i, "-->Cluster", label)

    #Analyzing the cluster performance using NMI and silhouette Index
    print("----Clustering Performance----")
    NMI_score = metrics.normalized_mutual_info_score(labels_original, clusterLabel)
    print("Using Ground Information->", NMI_score)

    # silhouette_score is only defined for 2 .. n_samples-1 distinct labels.
    distinct_labels = len(set(clusterLabel))
    if 2 <= distinct_labels <= len(X) - 1:
        silhouette_avg = silhouette_score(X, clusterLabel)
        print("Without using ground Information->", silhouette_avg)
    else:
        print("Without using ground Information-> undefined for",
              distinct_labels, "distinct cluster label(s)")
    

    
        

In [13]:
# Run the clustering demo only when this code is executed directly.
if __name__ == "__main__":
    main()

Data  1 -->Cluster 1
Data  2 -->Cluster 1
Data  3 -->Cluster 1
Data  4 -->Cluster 1
Data  5 -->Cluster 1
Data  6 -->Cluster 1
Data  7 -->Cluster 1
Data  8 -->Cluster 1
Data  9 -->Cluster 1
Data  10 -->Cluster 1
Data  11 -->Cluster 1
Data  12 -->Cluster 1
Data  13 -->Cluster 1
Data  14 -->Cluster 1
Data  15 -->Cluster 1
Data  16 -->Cluster 1
Data  17 -->Cluster 1
Data  18 -->Cluster 1
Data  19 -->Cluster 1
Data  20 -->Cluster 1
Data  21 -->Cluster 1
Data  22 -->Cluster 1
Data  23 -->Cluster 1
Data  24 -->Cluster 1
Data  25 -->Cluster 1
Data  26 -->Cluster 1
Data  27 -->Cluster 1
Data  28 -->Cluster 1
Data  29 -->Cluster 1
Data  30 -->Cluster 1
Data  31 -->Cluster 1
Data  32 -->Cluster 1
Data  33 -->Cluster 1
Data  34 -->Cluster 1
Data  35 -->Cluster 1
Data  36 -->Cluster 1
Data  37 -->Cluster 1
Data  38 -->Cluster 1
Data  39 -->Cluster 1
Data  40 -->Cluster 1
Data  41 -->Cluster 1
Data  42 -->Cluster 1
Data  43 -->Cluster 1
Data  44 -->Cluster 1
Data  45 -->Cluster 1
Data  46 -->Cluster