# Performing the TEST-A algorithm using the K-means clustering algorithm


In [1]:
#importing basic modules for implementation of k-means
import pandas as pd
import numpy as np
import random
import math

import matplotlib.pyplot as plt
%matplotlib inline

#importing the sklearn metrics module for measuring the performance of K-means
from sklearn import metrics

In [2]:
def data_split(df, test_size):
    """Randomly split df into a sampled part and the remainder.

    test_size is either an absolute number of rows, or a float fraction of
    len(df) that is rounded to a row count. Rows are drawn without
    replacement from df's index using the random module.

    Returns a tuple (sampled_rows, remaining_rows).
    """
    # A float is interpreted as a fraction of the frame; anything else is
    # used directly as the number of rows to draw.
    n_picked = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    picked = random.sample(df.index.tolist(), n_picked)

    return df.loc[picked], df.drop(picked)

In [3]:
#function used for calculating the Euclidean distance between two data points
def euclidean_distance(x1,x2):
    """Return the Euclidean (L2) distance between x1 and x2 as a float.

    The inputs must support element-wise subtraction (e.g. numpy arrays of
    matching shape).
    """
    squared_gaps = (x1 - x2) ** 2
    total = np.sum(squared_gaps)
    return math.sqrt(total)

In [4]:
#defining a class for kmeans
class Kmeans:
    """K-means clustering that starts from caller-supplied initial centroids.

    K is the number of clusters and max_iteration is the upper bound on the
    number of assignment/update rounds performed by predict().
    """

    #constructor of the class Kmeans
    def __init__(self,K,max_iteration):
        self.K=K
        self.max_iteration=max_iteration
        # clusters[j] holds the row indices of X currently assigned to centroid j
        self.clusters=[[] for _ in range(self.K)]
        self.centroids=[]
        # per-sample cluster index, filled in by predict()
        self.labels=None

    def predict(self,X,clusterData):
        """Cluster X starting from clusterData and return the final centroids.

        Steps:
          1. Read the number of samples and features from X.
          2. Use clusterData (one row per cluster) as the initial centroids.
          3. Repeat assignment and centroid update until the centroids stop
             moving or max_iteration rounds have run.
          4. Store the per-sample labels in self.labels.

        Parameters:
            X: 2-D array-like of shape (no_samples, no_features).
            clusterData: 2-D array-like of shape (K, no_features).

        Returns:
            ndarray of shape (K, no_features) holding the final centroids.

        Raises:
            ValueError: if X is not 2-D or clusterData has the wrong shape.
        """
        self.X=np.asarray(X,dtype=float)
        if self.X.ndim!=2:
            raise ValueError("X must be a 2-D array of shape (samples, features)")
        self.no_samples,self.no_features=self.X.shape

        # Copy so the caller's initial centroids are never aliased.
        initial=np.array(clusterData,dtype=float)
        if initial.shape!=(self.K,self.no_features):
            raise ValueError(
                "clusterData must have shape (K, no_features) = (%d, %d)"
                % (self.K,self.no_features)
            )
        self.centroids=initial

        for _ in range(self.max_iteration):
            self.clusters=self.createClusters(self.centroids)
            oldCentroids=self.centroids
            self.centroids=self.getCentroids(self.clusters)

            if self.isConverge(oldCentroids,self.centroids):
                break

        # Re-assign against the final centroids so the stored clusters and
        # labels match what is returned, even if the loop stopped at
        # max_iteration (or never ran).
        self.clusters=self.createClusters(self.centroids)
        self.labels=self.getClusterLabels(self.clusters)
        # Return the up-to-date centroids, not the ones from the previous round.
        return self.centroids

    # method is used for finding cluster for each data points in X
    def createClusters(self,centroids):
        """Return K lists; list j holds the row indices of X nearest to centroid j."""
        clusters=[[] for _ in range(self.K)]
        for i,data in enumerate(self.X):
            clusters[self.closestCentroid(data,centroids)].append(i)

        return clusters

    # method is used for finding the closer centroid for a particular given data
    def closestCentroid(self,data,centroids):
        """Return the index of the centroid nearest to data (Euclidean distance).

        Squared distances are compared, which selects the same centroid as
        comparing the distances themselves. Ties go to the lowest index.
        """
        gaps=np.asarray(centroids,dtype=float)-np.asarray(data,dtype=float)
        return np.argmin(np.sum(gaps**2,axis=1))

    #when all data points are assigned to clusters, the centroids are updated
    #to the mean of their members; this method does that
    def getCentroids(self,clusters):
        """Return a (K, no_features) array with the mean of each cluster's members.

        A cluster with no members has no mean, so it keeps its previous
        centroid from self.centroids. If no previous centroid of matching
        shape is available, that row is NaN.
        """
        centroids=np.zeros((self.K,self.no_features))
        previous=np.asarray(self.centroids,dtype=float)
        have_previous=previous.shape==centroids.shape

        for clus_index,clus_data in enumerate(clusters):
            if len(clus_data)>0:
                centroids[clus_index]=np.mean(self.X[clus_data],axis=0)
            elif have_previous:
                centroids[clus_index]=previous[clus_index]
            else:
                centroids[clus_index]=np.nan

        return centroids

    #this method returns the cluster labels assigned to the data points
    def getClusterLabels(self,clusters):
        """Return a float array of length no_samples with each sample's cluster index.

        Samples that appear in no cluster are marked -1.
        """
        label=np.full(self.no_samples,-1.0)
        for clus_index,clus_data in enumerate(clusters):
            label[np.asarray(clus_data,dtype=int)]=clus_index
        return label

    #this method checks whether the centroids stopped moving
    def isConverge(self,oldCentroids,centroids):
        """Return True when every centroid is identical to its previous position."""
        shift=np.asarray(centroids,dtype=float)-np.asarray(oldCentroids,dtype=float)
        return not np.any(shift)

In [5]:
def closestCluster(data,centroids):
    """Return the index of the entry in centroids nearest to data.

    Distance is measured with euclidean_distance; on ties np.argmin picks
    the lowest index.
    """
    distances=[]
    for centroid in centroids:
        distances.append(euclidean_distance(data,centroid))
    return np.argmin(distances)

In [6]:
def main(path="bupa.txt", outer_runs=50, inner_runs=50):
    """Run the repeated K-means / NMI experiment and return the averaged scores.

    For each of outer_runs rounds, two random rows of the data set are taken
    as the initial centroids. The remaining rows are then split inner_runs
    times into 20% test / 80% train; K-means (K=2, at most 50 iterations) is
    fitted on the train part and every test row is labelled with its nearest
    centroid. The normalized mutual information between those labels and the
    "selector" column is averaged over the inner runs and printed.

    Parameters:
        path: CSV file read with pandas.read_csv; it must contain the feature
            columns listed below plus a "selector" column.
        outer_runs: number of centroid initialisations to evaluate.
        inner_runs: number of train/test splits per initialisation.

    Returns:
        list of outer_runs floats, the average NMI score of each round.

    Raises:
        ValueError: if outer_runs is negative or inner_runs is not positive.
    """
    if outer_runs<0 or inner_runs<=0:
        raise ValueError("outer_runs must be >= 0 and inner_runs must be > 0")

    # Columns used as clustering features; "selector" is kept out of them
    # because it serves as the reference label.
    feature_columns=["mcv","alkphos","sgpt","sgot","gammagt","drinks"]

    df = pd.read_csv(path)

    Avg_NMI_Scores=[]
    for i in range(outer_runs):
        # Two random rows seed the two clusters; they are excluded from the
        # train/test pool.
        cluster_df,NonCluster_df=data_split(df,2)
        clusterData=cluster_df[feature_columns].to_numpy()
        sum1=0.0
        for _ in range(inner_runs):
            test_df,train_df=data_split(NonCluster_df,0.2)
            test_data_original_label=test_df["selector"].tolist()
            X=train_df[feature_columns].to_numpy()

            model=Kmeans(2,50)
            centroids=model.predict(X,clusterData)

            Y=test_df[feature_columns].to_numpy()
            # +1 shifts the cluster indices 0/1 to 1/2; NMI itself does not
            # depend on how the labels are named.
            test_data_pred_label=[closestCluster(data,centroids)+1 for data in Y]
            nmi_score=metrics.normalized_mutual_info_score(test_data_original_label,test_data_pred_label)
            sum1=sum1+nmi_score

        avg_nmi=sum1/inner_runs
        print("Iteration",i+1," -->NMI Score",avg_nmi)
        Avg_NMI_Scores.append(avg_nmi)

    return Avg_NMI_Scores
            
        
    

In [7]:
# Run the experiment only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Iteration 1 -->NMI Score 0.018216892943561425
Iteration 2 -->NMI Score 0.019694280116679404
Iteration 3 -->NMI Score 0.011458887857960896
Iteration 4 -->NMI Score 0.01685424959775059
Iteration 5 -->NMI Score 0.015622476695916916
Iteration 6 -->NMI Score 0.014605551934256117
Iteration 7 -->NMI Score 0.008989496522803012
Iteration 8 -->NMI Score 0.017877209459709522
Iteration 9 -->NMI Score 0.006476787447877757
Iteration 10 -->NMI Score 0.010307484999768001
Iteration 11 -->NMI Score 0.010927687659017702
Iteration 12 -->NMI Score 0.013010565507826685
Iteration 13 -->NMI Score 0.01674669214248877
Iteration 14 -->NMI Score 0.010268768793368183
Iteration 15 -->NMI Score 0.009425670744025654
Iteration 16 -->NMI Score 0.018049944940081764
Iteration 17 -->NMI Score 0.017186071723875894
Iteration 18 -->NMI Score 0.013155644661519339
Iteration 19 -->NMI Score 0.016822583571809647
Iteration 20 -->NMI Score 0.01240309387114009
Iteration 21 -->NMI Score 0.012259510260230288
Iteration 22 -->NMI Score