In [2]:
import scipy 
import sklearn
import numpy as np
import pandas as pd
from scipy import spatial
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix

In [3]:
data = pd.read_csv('hw3_data/data.csv')
label = pd.read_csv('hw3_data/label.csv', header=None, names=['label'])

In [4]:
data.head(10)

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
label.head(10)

Unnamed: 0,label
0,7
1,2
2,1
3,0
4,4
5,1
6,4
7,9
8,5
9,9


In [6]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=5)
train_labels, test_labels = train_test_split(label, test_size=0.1, random_state=5)

In [7]:
class KMeans:
    def cal_sse_value(self, centroid_value, centroid_dict, data):
        sse_count = 0
        for i in centroid_dict:
            clust_sse = 0
            # np.sum()
            for j in centroid_dict[i]:
                datp = list(data.iloc[int(j)])
                for x,y in zip(centroid_value[i],datp):
                    clust_sse += (x-y)**2
            sse_count+=clust_sse
        return sse_count    
    
    def Centroid_intz(self,data,k):
        z = data.shape[0]
        centroid_value={}
        for i in range(k):
            rd = np.random.randint(0, z-1)
            centroid_value[i] = data.iloc[rd]
        return centroid_value
    
    def jaccard_sim(self,centroid, datp):
        intersection = len(list(set(centroid).intersection(datp)))
        union = (len(set(centroid)) + len(set(datp))) - intersection
        return float(intersection) / union

    def train_Kmeans(self,data,K,max_iter=20,mode=1,tol=10):
        #mode 1 is eucledian
        #mode 2 is jaccard
        #mode 3 is cosine
        centroid_value = self.Centroid_intz(data,K)
        new_centroid_value = {}
        cnt = 0
        centroid_dict = {}
        convergence = False
        while((cnt<max_iter) and not convergence):
            
            for i in list(centroid_value.keys()):
                centroid_dict[i]=[]
            for i in range(data.shape[0]):
                x = data.iloc[i]
                if mode==1 :
                    distance_measure = [np.linalg.norm(x-centroid_value[j])  for j in centroid_value]
                    idx = np.argmin(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==2 :
                    distance_measure = [self.jaccard_sim(list(x),centroid_value[j]) for j in centroid_value]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                elif mode==3 :
                    distance_measure = [1-scipy.spatial.distance.cosine(x,list(centroid_value[j]))  for j in centroid_value]
                    idx = np.argmax(distance_measure)
                    centroid_dict[idx].append(i)
                
                prev_centroids=dict(centroid_value)
                
            
            for i in centroid_dict:
                if len(centroid_dict[i]):
                    dps_centroid = centroid_dict[i]
                    centroid_value[i] = np.average(data.iloc[dps_centroid],axis=0)
            
            
            tot_cur=-1
            for i in centroid_value:
                prev_centroid_point = prev_centroids[i]
                new_centroid_point = centroid_value[i]
                change = np.sum(np.absolute(new_centroid_point-prev_centroid_point))
                tot_cur= max(change, tot_cur)
                
            print("Total number of Iterations count ",cnt,": ",tot_cur)
            
            cnt+=1
            if (tot_cur<10):
                convergence = True
                break
        
        return centroid_value,centroid_dict


In [8]:
def forecast_clust_labels(C, S, labels):
    '''
    Input : C -> Centroids
            S -> Set of Indicies corresponding to Centroid C
            data -> Data used to form clusters
    Output : Returns an array of size K having labels based on majority voting in the cluster
    '''
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels

In [9]:
def jacr_similarity(centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

In [10]:
def accuracy(centroids, centroid_Labels, test_data, true_labels, mode=1):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if mode==1:
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif mode==2:
            similarity = [jacr_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification]) 
        elif mode==3:
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    accuracy = correctly_classified/denominator
    return accuracy

In [11]:
model1 = KMeans()
centroids1,clusters1 = model1.train_Kmeans(data,10, max_iter=100,mode=1)

Total number of Iterations count  0 :  25004.56752136752
Total number of Iterations count  1 :  6041.217310166145
Total number of Iterations count  2 :  4119.124118938511
Total number of Iterations count  3 :  3123.6111432734465
Total number of Iterations count  4 :  2678.6287238035657
Total number of Iterations count  5 :  2025.5825393615487
Total number of Iterations count  6 :  1254.4669509768082
Total number of Iterations count  7 :  1194.5314618196771
Total number of Iterations count  8 :  1586.3401342358381
Total number of Iterations count  9 :  1648.3248266240312
Total number of Iterations count  10 :  1582.6314851540722
Total number of Iterations count  11 :  1196.0017030772842
Total number of Iterations count  12 :  770.0147903884213
Total number of Iterations count  13 :  668.2908543615216
Total number of Iterations count  14 :  434.8941918522533
Total number of Iterations count  15 :  487.576139442231
Total number of Iterations count  16 :  513.5633934760449
Total number of 

In [12]:
Euclidean_SSE = model1.cal_sse_value(centroids1,clusters1,data)
print("Euclidean SSE:",Euclidean_SSE)

Euclidean SSE: 25406246552.32479


In [13]:
cluster_labels1 = forecast_clust_labels(centroids1,clusters1,label)
cluster_labels1

array([8, 5, 4, 9, 1, 2, 3, 3, 7, 0])

In [14]:
Accuracy_Euclidean = accuracy(centroids1, cluster_labels1,test_data,test_labels)
Accuracy_Euclidean

0.097

In [15]:
model2 = KMeans()
centroids2,clusters2 = model2.train_Kmeans(data,10, max_iter=100,mode=2)
Jaccard_SSE = model2.cal_sse_value(centroids2,clusters2,data)
print("Jacard SSE:",Jaccard_SSE)

Total number of Iterations count  0 :  38024.47173913043
Total number of Iterations count  1 :  3648.5852858906983
Total number of Iterations count  2 :  3545.5830269005955
Total number of Iterations count  3 :  4832.656344555921
Total number of Iterations count  4 :  1453.293057620369
Total number of Iterations count  5 :  807.3259123151313
Total number of Iterations count  6 :  850.118143687413
Total number of Iterations count  7 :  730.8148685199362
Total number of Iterations count  8 :  1963.3977186511756
Total number of Iterations count  9 :  1468.2206503693844
Total number of Iterations count  10 :  769.8642332312837
Total number of Iterations count  11 :  717.547906069535
Total number of Iterations count  12 :  894.2404720269055
Total number of Iterations count  13 :  518.2119120899575
Total number of Iterations count  14 :  0.0
Jacard SSE: 34361687572.938736


In [16]:
cluster_labels2 = forecast_clust_labels(centroids2,clusters2,label)
cluster_labels2

array([1, 0, 1, 8, 8, 3, 7, 0, 0, 5])

In [17]:
Accuracy_Jaccard = accuracy(centroids2, cluster_labels2,test_data,test_labels)
Accuracy_Jaccard

0.137

In [18]:
model3 = KMeans()
centroids3,clusters3 = model3.train_Kmeans(data,10, max_iter = 100,mode=3)

Total number of Iterations count  0 :  26983.534405719394
Total number of Iterations count  1 :  6821.033987461353
Total number of Iterations count  2 :  5796.926095986944
Total number of Iterations count  3 :  2550.0457006270567
Total number of Iterations count  4 :  2716.1427342419083
Total number of Iterations count  5 :  3036.533637002342
Total number of Iterations count  6 :  1432.465652428018
Total number of Iterations count  7 :  1156.0926618341973
Total number of Iterations count  8 :  1142.8892089913218
Total number of Iterations count  9 :  828.7040589401657
Total number of Iterations count  10 :  917.2211319822046
Total number of Iterations count  11 :  632.5254313319531
Total number of Iterations count  12 :  470.5285633882089
Total number of Iterations count  13 :  409.72263192233925
Total number of Iterations count  14 :  404.3637691745801
Total number of Iterations count  15 :  383.3767304345409
Total number of Iterations count  16 :  436.6426419534243
Total number of It

In [19]:
Cosine_SSE = model3.cal_sse_value(centroids3,clusters3,data)

In [20]:
print("Euclidean Sum of Squares Error:",Euclidean_SSE)
print("Jacard Sum of Squares Error:",Jaccard_SSE)
print("Cosine Sum of Squares Error :",Cosine_SSE)

Euclidean Sum of Squares Error: 25406246552.32479
Jacard Sum of Squares Error: 34361687572.938736
Cosine Sum of Squares Error : 25414653626.769764


In [21]:
cluster_labels3 = forecast_clust_labels(centroids3,clusters3,label)
cluster_labels3

array([8, 3, 3, 7, 1, 0, 9, 5, 2, 0])

In [22]:
Accuracy_Cosine = accuracy(centroids3, cluster_labels3,test_data,test_labels)
print("Euclidean-K-means accuracy:",Accuracy_Euclidean)
print("Jacard-K-means accuracy:",Accuracy_Jaccard)
print("Cosine-K-means accuracy :",Accuracy_Cosine)

Euclidean-K-means accuracy: 0.097
Jacard-K-means accuracy: 0.137
Cosine-K-means accuracy : 0.096
