In [1]:
import kmeans_satvik as km
import numpy as np

In [2]:
# Load the feature rows and their ground-truth label rows from sibling CSVs.
df, labels = (
    km.loadCSV('kmeans_data/data.csv'),
    km.loadCSV('kmeans_data/label.csv'),
)

In [3]:
# Use one cluster per distinct label row found in the label file.
k = len({label_row for label_row in labels})
print(k)

10


In [4]:
# Cluster the same data once per distance measure, each time with k clusters.
c_euclidean, c_cosine, c_jaccard = (
    km.kmeans(df, k, dist=metric)
    for metric in ('euclidean', 'cosine', 'jaccard')
)

In [5]:
# Report the 'withinss' value stored in each clustering result.
for _caption, _run in (
    ('SSE of Euclidean: ', c_euclidean),
    ('SSE of Cosine: ', c_cosine),
    ('SSE of Jaccard: ', c_jaccard),
):
    print(_caption, _run['withinss'])

SSE of Euclidean:  25640844979.153355
SSE of Cosine:  2462.884963406293
SSE of Jaccard:  9999.0


In [7]:
from tqdm import tqdm
def label_clusters(clusters, labels, df):
    """Give every cluster the most frequent ground-truth label of its members.

    Args:
        clusters: Sequence of clusters; each cluster is a sequence of
            instances (rows) whose contents also appear in ``df``.
        labels: Sequence of ground-truth labels where ``labels[i]`` belongs
            to the i-th row of ``df``.
        df: Iterable of data rows, in the same order as ``labels``.

    Returns:
        A list with exactly one entry per cluster, in cluster order: the
        majority label of that cluster, or ``None`` when the cluster is
        empty. Keeping a placeholder for empty clusters keeps the result
        index-aligned with ``clusters``. On a tie, the label encountered
        first inside the cluster wins.

    Raises:
        ValueError: If a cluster member does not match any row of ``df``.
    """
    # Map row contents -> index of the FIRST matching row. This mirrors what
    # ``list.index`` would return (duplicates resolve to the earliest row) but
    # makes each lookup O(1) instead of a scan over the whole data set.
    first_row_index = {}
    for position, row in enumerate(df):
        first_row_index.setdefault(tuple(row), position)

    cluster_labels = []
    for cluster in clusters:
        if not cluster:
            # No members to vote; keep the slot so indices stay aligned.
            cluster_labels.append(None)
            continue
        label_counts = {}
        for instance in cluster:
            key = tuple(instance)
            if key not in first_row_index:
                raise ValueError('{!r} is not in df'.format(list(instance)))
            instance_label = labels[first_row_index[key]]
            label_counts[instance_label] = label_counts.get(instance_label, 0) + 1
        # max() returns the first maximal key, i.e. the earliest-seen label on ties.
        cluster_labels.append(max(label_counts, key=label_counts.get))
    return cluster_labels

def cal_acc(clusters, cluster_labels, actual_labels, df):
    """Compute the fraction of instances whose cluster label equals their true label.

    Args:
        clusters: Sequence of clusters; each cluster is a sequence of
            instances (rows) whose contents also appear in ``df``.
        cluster_labels: Predicted label per cluster. Either one entry per
            cluster (index-aligned with ``clusters``), or one entry per
            NON-EMPTY cluster in order (the compact form produced when empty
            clusters are skipped during labelling).
        actual_labels: Sequence of ground-truth labels where
            ``actual_labels[i]`` belongs to the i-th row of ``df``.
        df: Iterable of data rows, in the same order as ``actual_labels``.

    Returns:
        ``correct / total`` as a float, or ``0`` when there are no instances.

    Raises:
        ValueError: If a cluster member does not match any row of ``df``.
        IndexError: If ``cluster_labels`` cannot be aligned with ``clusters``.
    """
    # Map row contents -> index of the FIRST matching row (same answer as
    # ``list.index``), so each lookup is O(1) instead of a full scan.
    first_row_index = {}
    for position, row in enumerate(df):
        first_row_index.setdefault(tuple(row), position)

    scored_clusters = list(clusters)
    if len(cluster_labels) < len(scored_clusters):
        # Compact label list: it only lines up with the non-empty clusters.
        # Empty clusters contribute no instances, so dropping them is safe.
        non_empty = [cluster for cluster in scored_clusters if cluster]
        if len(non_empty) == len(cluster_labels):
            scored_clusters = non_empty

    correct_assignments = 0
    total_assignments = 0
    for cluster_index, cluster in enumerate(scored_clusters):
        for instance in cluster:
            key = tuple(instance)
            if key not in first_row_index:
                raise ValueError('{!r} is not in df'.format(list(instance)))
            actual_label = actual_labels[first_row_index[key]]
            predicted_label = cluster_labels[cluster_index]
            if actual_label == predicted_label:
                correct_assignments += 1
            total_assignments += 1
    accuracy = correct_assignments / total_assignments if total_assignments > 0 else 0
    return accuracy

# Plain-list copy of the data rows, and the first field of every label row.
df_list = list(map(list, df))
labels_list = [label_row[0] for label_row in labels]

# Euclidean: majority label per cluster, then accuracy against the true labels.
euclidean_c_labels = label_clusters(
    clusters=c_euclidean['clusters'], labels=labels_list, df=df
)
acc_euclidean = cal_acc(
    clusters=c_euclidean['clusters'],
    cluster_labels=euclidean_c_labels,
    actual_labels=labels_list,
    df=df,
)

# Cosine.
cosine_c_labels = label_clusters(
    clusters=c_cosine['clusters'], labels=labels_list, df=df
)
acc_cosine = cal_acc(
    clusters=c_cosine['clusters'],
    cluster_labels=cosine_c_labels,
    actual_labels=labels_list,
    df=df,
)

# Jaccard.
jaccard_c_labels = label_clusters(
    clusters=c_jaccard['clusters'], labels=labels_list, df=df
)
acc_jaccard = cal_acc(
    clusters=c_jaccard['clusters'],
    cluster_labels=jaccard_c_labels,
    actual_labels=labels_list,
    df=df,
)

for _caption, _accuracy in (
    ('Accuracy of Euclidean:', acc_euclidean),
    ('Accuracy of Cosine:', acc_cosine),
    ('Accuracy of Jaccard:', acc_jaccard),
):
    print(_caption, _accuracy)

Accuracy of Euclidean: 0.5344534453445344
Accuracy of Cosine: 0.6189618961896189
Accuracy of Jaccard: 0.11351135113511351


In [15]:
from time import time

def run_condition(condition, df,k):
    """Time km.kmeans for each distance measure and print a one-line summary.

    Runs the Euclidean, cosine and Jaccard variants in that order. For each
    run it prints the elapsed wall-clock seconds and the 'iterations' value
    from the result. Nothing is returned.

    Args:
        condition: Forwarded unchanged to ``km.kmeans(condition=...)``
            (presumably the stopping criterion -- confirm in kmeans_satvik).
        df: Data rows passed straight to ``km.kmeans``.
        k: Number of clusters passed straight to ``km.kmeans``.
    """
    # One (dist argument, report template) pair per run, in execution order.
    runs = (
        ('euclidean', "Name: Euclidean \t Time Taken: {} \t Iteration Count: {}"),
        ('cosine', "Name: Cosine \t Time Taken: {} \t Iteration Count: {}"),
        ('jaccard', "Name: Jaccard \t Time Taken: {} \t Iteration Count: {}"),
    )
    for metric, report in runs:
        started = time()
        result = km.kmeans(df, k, dist=metric, condition=condition)
        elapsed = time() - started
        print(report.format(elapsed, result['iterations']))

In [16]:
# Time all three distance measures with condition='centroid'.
run_condition(condition='centroid', df=df, k=k)

Name: Euclidean 	 Time Taken: 470.15547585487366 	 Iteration Count: 45
Name: Cosine 	 Time Taken: 386.84544682502747 	 Iteration Count: 56
Name: Jaccard 	 Time Taken: 119.29828476905823 	 Iteration Count: 39


In [17]:
# Time all three distance measures with condition='sse'.
run_condition(condition='sse', df=df, k=k)

Name: Euclidean 	 Time Taken: 533.1912040710449 	 Iteration Count: 47
Name: Cosine 	 Time Taken: 325.0952205657959 	 Iteration Count: 43
Name: Jaccard 	 Time Taken: 15.544544219970703 	 Iteration Count: 3


In [18]:
# Time all three distance measures with condition='iteration'.
run_condition(condition='iteration', df=df, k=k)

Name: Euclidean 	 Time Taken: 1046.5089251995087 	 Iteration Count: 100
Name: Cosine 	 Time Taken: 729.0478780269623 	 Iteration Count: 100
Name: Jaccard 	 Time Taken: 204.43774604797363 	 Iteration Count: 100


In [19]:
def run_condition_sse(condition, df,k):
    """Run km.kmeans for each distance measure and print its 'withinss' value.

    Runs the Euclidean, cosine and Jaccard variants in that order, printing
    one line per run. Nothing is returned.

    Args:
        condition: Forwarded unchanged to ``km.kmeans(condition=...)``
            (presumably the stopping criterion -- confirm in kmeans_satvik).
        df: Data rows passed straight to ``km.kmeans``.
        k: Number of clusters passed straight to ``km.kmeans``.
    """
    # One (dist argument, report template) pair per run, in execution order.
    runs = (
        ('euclidean', "Euclidean : {}"),
        ('cosine', "Cosine : {}"),
        ('jaccard', "Jaccard : {}"),
    )
    for metric, report in runs:
        result = km.kmeans(df, k, dist=metric, condition=condition)
        print(report.format(result['withinss']))

In [20]:
# Print 'withinss' for all three distance measures with condition='centroid'.
run_condition_sse(condition='centroid', df=df, k=k)

Euclidean : 25482724134.745213
Cosine : 2491.2243403842886
Jaccard : 9999.0


In [None]:
# Print 'withinss' for all three distance measures with condition='sse'.
run_condition_sse(condition='sse', df=df, k=k)

In [None]:
# Print 'withinss' for all three distance measures with condition='iteration'.
run_condition_sse(condition='iteration', df=df, k=k)