In [1]:
import numpy as np
import pandas as pd
from operator import sub
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read data and labels
data = pd.read_csv("../data/kmeans_data/data.csv", header = None)
labels = pd.read_csv('../data/kmeans_data/label.csv', header = None)

data.shape

(10000, 784)

In [3]:
# Define class to implement kmeans
class KMeans():
    def __init__(self, k, distance, max_iter, evaluation_criteria):
        # Store in the class
        self.k = k
        self.max_iter = max_iter

        # Decide which distance to use
        if distance == "euclidean":
            self.distance_function = self.__euclidean
        elif distance == 'cosine':
            self.distance_function = self.__cosine
        else:
            self.distance_function = self.__jaccard

        # Decide which evaluation function to use
        self.evaluation_criteria = evaluation_criteria

    # Compute euclidean function
    def __euclidean(self, x, centroids):
        euclid = []
        for center in centroids:
            temp_euclid = x - np.array(center)
            euclid.append(np.sqrt(np.dot(temp_euclid, temp_euclid.T)))
        return euclid

    # Compute cosine function
    def __cosine(self, x, centroids):
        cosine = []
        for center in centroids:
            cosine.append(np.dot(x, center) / (np.linalg.norm(x) * np.linalg.norm(center)))
        return cosine

    # Compute jaccard function
    def __jaccard(self, x, centroids):
        jaccard = []
        for center in centroids:
            temp_jaccard = np.sum(min(x, center)) / np.sum(max(x, center))
            jaccard.append(1 - temp_jaccard)
        return jaccard
    
    # Compute Sum Of Squared Error
    def __sumOfSquares(self, x, center):
        sse = 0
        for i in range(len(center)):
            sse += (center[i] - x[i])**2
        return sse

    # Compute Accuracy
    def __accuracy(self, X_train, labels):
        correct = 0
        for index, row in enumerate(X_train.iterrows()):
            if row in self.clusters[labels[index]]:
                correct += 1
        return 100 * correct / X_train.shape[0]

    # Fit the model and find the centroids
    def fit(self, X_train):
        # Initially, randomly select k datapoints as centrod.
        centroids = []
        for _ in range(self.k):
            centroids.append(np.mean(np.array(X_train.sample())))
        
        centroids = np.array(centroids).reshape(10, -1)
        
        # Initialize few required variables
        iterations = 0
        prev_centroids = None
        
        # Iteratively update the centroids
        while(iterations < self.max_iter and np.not_equal(centroids, prev_centroids).any()):
            print("Iteration Number: ", iterations)
                
            # Save the previous centroids so that current ones can be overwritten
            prev_centroids = centroids

            # Increment the number of iterations
            iterations += 1

            # Iterate over all rows
            clusters = [[] for _ in range(self.k)]
            for row in X_train.iterrows():
                distance = self.distance_function(np.array(row[1].T), centroids)
                distance = np.array(distance).flatten()
                cluster_number = np.argmin(distance)
                clusters[cluster_number].append(row)
            
            # Update the new centroid
            centroids = []
            for cluster in clusters:
                centroids.append(np.mean(np.mean(cluster)))
            
            centroids = np.array(centroids).reshape(-1, 1)
    
        self.centroids = centroids
        self.clusters = cluster

    # Evaluate the generated centroids
    def evaluate(self, X_train, labels):
        if self.evaluation_criteria == 'sse':
            sse = 0
            for index, cluster in enumerate(self.clusters):
                for row in cluster:
                    sse += self.__sumOfSquares(row, self.centroids[index])
            return sse
        else:
            return self.__accuracy(X_train, labels)

In [5]:
kmeans_euclid = KMeans(k = 10, distance = 'euclidean', max_iter = 10, evaluation_criteria = 'sse')
kmeans_cosine = KMeans(k = 10, distance = 'cosine', max_iter = 10, evaluation_criteria = 'sse')
kmeans_jaccard = KMeans(k = 10, distance = 'jaccard', max_iter = 10, evaluation_criteria = 'sse')

kmeans_euclid.fit(data)
# kmeans_cosine.fit(data)
# kmeans_jaccard.fit(data)

sse_euclid = kmeans_euclid.evaluate(data, labels)
# sse_cosine = kmeans_cosine.evaluate(data, labels)
# sse_jaccard = kmeans_jaccard.evaluate(data, labels)

print("sum of squares for euclidean distance:", sse_euclid)
# print(sse_cosine)
# print(sse_jaccard)

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
sum of squares for euclidean distance 0
