In [1]:
import numpy as np
import pandas as pd
import scipy 
import sklearn
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
from scipy import spatial

In [2]:
data = pd.read_csv(r'C:\Users\keert\kmeans_data\data.csv')
labels = pd.read_csv(r'C:\Users\keert\kmeans_data\label.csv',names=['label'],header=None)

In [3]:
data.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data.count()

0        9999
0.1      9999
0.2      9999
0.3      9999
0.4      9999
         ... 
0.663    9999
0.664    9999
0.665    9999
0.666    9999
0.667    9999
Length: 784, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split( data, test_size=0.07, random_state=42)
train_labels, test_labels = train_test_split( labels, test_size=0.07, random_state=42)


In [6]:
class KMeans:
    
    def SSEcalculate(self, centroid_value_dict, centroid_dict,data):
        ssedata = 0
        for i in centroid_dict:
            ssecluster = 0
           
            for j in centroid_dict[i]:
                dp = list(data.iloc[int(j)])
                for a,b in zip(centroid_value_dict[i],dp):
                    ssecluster += (a-b)**2
            ssedata+=ssecluster
        return ssedata    
    
    def Initialize_Centroids(self,data,K):
        m = data.shape[0]
        centroid_value_dict={}
        for i in range(K):
            r = np.random.randint(0, m-1)
            centroid_value_dict[i] = data.iloc[r]
        return centroid_value_dict
    
    def jaccard_similarity(self,centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

    def train_Kmeans(self,data,K,max_iter=20,mode=1,tol=10):
        #Model 1 => Euclidean
        #Model 2 => Jaccard
        #Model 3 => Cosine
        centroid_value_dict = self.Initialize_Centroids(data,K)
        new_centroid_value_dict = {}
        count = 0
        centroid_dict = {}
        convergence = False
        while((count<max_iter) and not convergence):
            for i in list(centroid_value_dict.keys()):
                centroid_dict[i]=[]
            for i in range(data.shape[0]):
                x = data.iloc[i]
                if mode==1 :
                    distance_measure = [np.linalg.norm(x-centroid_value_dict[j])  for j in centroid_value_dict]
                    idx = np.argmin(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==2 :
                    distance_measure = [self.jaccard_similarity(list(x),centroid_value_dict[j]) for j in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==3 :
                    distance_measure = [1-scipy.spatial.distance.cosine(x,list(centroid_value_dict[j]))  for j in centroid_value_dict]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                
                prev_centroids=dict(centroid_value_dict)
                
            
            for i in centroid_dict:
                if len(centroid_dict[i]):
                    dps_centroid = centroid_dict[i]
                    centroid_value_dict[i] = np.average(data.iloc[dps_centroid],axis=0)
            
            
            current_tol=-1
            for i in centroid_value_dict:
                prev_centroid_point = prev_centroids[i]
                new_centroid_point = centroid_value_dict[i]
                change = np.sum(np.absolute(new_centroid_point-prev_centroid_point))
                current_tol = max(change, current_tol)
                
            print("Tolerance for Iteration",count,":",current_tol)
            
            count+=1
            if (current_tol<10):
                convergence = True
                break
        return centroid_value_dict,centroid_dict

In [7]:
def predict_cluster_labels(C, S, labels):
    '''
    Input : C -> Centroids
            S -> Set of Indicies corresponding to Centroid C
            data -> Data used to form clusters
    Output : Returns an array of size K having labels based on majority voting in the cluster
    '''
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels

def jaccard_similarity(centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

def accuracy(centroids, centroid_Labels, test_data, true_labels, mode=1):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if mode==1:
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif mode==2:
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification]) 
        elif mode==3:
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    accuracy = correctly_classified/denominator
    return accuracy

In [8]:
model1 = KMeans()
centroids1,clusters1 = model1.train_Kmeans(data,10, max_iter=60,mode=1)

Tolerance for Iteration 0 : 25913.7807486631
Tolerance for Iteration 1 : 6657.992029376374
Tolerance for Iteration 2 : 3910.6614699738902
Tolerance for Iteration 3 : 1747.3356462395545
Tolerance for Iteration 4 : 1249.397531254945
Tolerance for Iteration 5 : 1491.925598756513
Tolerance for Iteration 6 : 2995.568722882608
Tolerance for Iteration 7 : 1588.871883793466
Tolerance for Iteration 8 : 1284.6013221820294
Tolerance for Iteration 9 : 1635.5355727543692
Tolerance for Iteration 10 : 1590.2092842669645
Tolerance for Iteration 11 : 1517.0866404209755
Tolerance for Iteration 12 : 879.5473752655369
Tolerance for Iteration 13 : 1070.7503260869566
Tolerance for Iteration 14 : 1136.7258333333336
Tolerance for Iteration 15 : 1142.2128027681658
Tolerance for Iteration 16 : 994.5812156614703
Tolerance for Iteration 17 : 918.7510411175103
Tolerance for Iteration 18 : 776.5528831329017
Tolerance for Iteration 19 : 564.5697213531691
Tolerance for Iteration 20 : 379.88241136131217
Tolerance for 

In [30]:
SSEEuclidean = model1.SSEcalculate(centroids1,clusters1,data)

In [31]:
print("SSE of Euclidean:",SSEEuclidean)

SSE of Euclidean: 25403417962.182713


In [32]:
cluster_labels1 = predict_cluster_labels(centroids1,clusters1,labels)
cluster_labels1

array([0, 5, 6, 8, 1, 9, 2, 0, 7, 3])

In [33]:
EuclideanAccuracy = accuracy(centroids1, cluster_labels1,test_data,test_labels)
EuclideanAccuracy

0.07285714285714286

In [34]:
model2 = KMeans()
centroids2,clusters2 = model2.train_Kmeans(data,10, max_iter=50,mode=2)
SSEJaccard = model2.SSEcalculate(centroids2,clusters2,data)

Tolerance for Iteration 0 : 39012.240188383046
Tolerance for Iteration 1 : 6727.170877222853
Tolerance for Iteration 2 : 2268.9075610359623
Tolerance for Iteration 3 : 1358.8530942962525
Tolerance for Iteration 4 : 2255.9950202867585
Tolerance for Iteration 5 : 1030.7183311797603
Tolerance for Iteration 6 : 1337.2812332080666
Tolerance for Iteration 7 : 1529.2528422418302
Tolerance for Iteration 8 : 1777.1855040983187
Tolerance for Iteration 9 : 1245.6852716583785
Tolerance for Iteration 10 : 1033.7471833830793
Tolerance for Iteration 11 : 975.2692704802714
Tolerance for Iteration 12 : 413.9461673540967
Tolerance for Iteration 13 : 420.5575771494676
Tolerance for Iteration 14 : 0.0


In [35]:
print("SSE of Jacard:",SSEJaccard)

SSE of Jacard: 34361687572.938736


In [36]:
cluster_labels2 = predict_cluster_labels(centroids2,clusters2,labels)
cluster_labels2

array([1, 3, 1, 1, 7, 5, 7, 3, 3, 5])

In [37]:
JaccardAccuracy = accuracy(centroids2, cluster_labels2,test_data,test_labels)
JaccardAccuracy

0.10857142857142857

In [38]:
model3 = KMeans()
centroids3,clusters3 = model3.train_Kmeans(data,10, max_iter = 60,mode=3)
SSECosine = model3.SSEcalculate(centroids3,clusters3,data)

Tolerance for Iteration 0 : 26023.545816733065
Tolerance for Iteration 1 : 4750.702014415996
Tolerance for Iteration 2 : 3831.8703405402093
Tolerance for Iteration 3 : 4541.774698158599
Tolerance for Iteration 4 : 2880.79211853034
Tolerance for Iteration 5 : 1383.9025574135057
Tolerance for Iteration 6 : 961.5543675969049
Tolerance for Iteration 7 : 672.415553358236
Tolerance for Iteration 8 : 608.5931590531376
Tolerance for Iteration 9 : 433.0947794281497
Tolerance for Iteration 10 : 342.307169612219
Tolerance for Iteration 11 : 383.5524118225569
Tolerance for Iteration 12 : 322.37008991683706
Tolerance for Iteration 13 : 335.65569272976666
Tolerance for Iteration 14 : 317.0080019043435
Tolerance for Iteration 15 : 257.74949485638433
Tolerance for Iteration 16 : 204.54036757284354
Tolerance for Iteration 17 : 144.8688805839671
Tolerance for Iteration 18 : 185.9035117822009
Tolerance for Iteration 19 : 149.07196371127037
Tolerance for Iteration 20 : 92.04387142093287
Tolerance for Iter

In [39]:
print("SSE of Euclidean :",SSEEuclidean)
print("SSE of Jacard :",SSEJaccard)
print("SSE of Cosine :",SSECosine)

SSE of Euclidean : 25403417962.182713
SSE of Jacard : 34361687572.938736
SSE of Cosine : 25625283170.03969


In [40]:
cluster_labels3 = predict_cluster_labels(centroids3,clusters3,labels)
cluster_labels3

array([3, 7, 1, 0, 0, 9, 8, 2, 6, 2])

In [41]:
EuclideanAccuracy
JaccardAccuracy
CosineAccuracy = accuracy(centroids3, cluster_labels3,test_data,test_labels)
CosineAccuracy

0.06142857142857143

In [42]:
print("Euclidean accuracy:",EuclideanAccuracy)
print("Jacard accuracy:",JaccardAccuracy)
print("Cosine accuracy :",CosineAccuracy)

Euclidean accuracy: 0.07285714285714286
Jacard accuracy: 0.10857142857142857
Cosine accuracy : 0.06142857142857143
