In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Load the labels and the data from CSV; both files are read without a header row
labels = pd.read_csv('../data/kmeans_data/label.csv', header=None)
data = pd.read_csv("../data/kmeans_data/data.csv", header=None)

# Show the (rows, columns) shape of the data as the cell output
data.shape

(10000, 784)

In [4]:
# Define class to implement kmeans
class KMeans():
    """K-means clustering with a selectable distance measure.

    Parameters
    ----------
    k : int
        Number of clusters / centroids.
    distance : str
        ``"euclidean"`` or ``"cosine"``; any other value selects the
        generalized Jaccard distance.
    max_iter : int
        Upper bound on the number of centroid updates performed by ``fit``.
    evaluation_criteria : str
        ``"sse"`` makes ``evaluate`` return the sum of squared errors; any
        other value makes it return the majority-vote accuracy.

    Attributes set by ``fit``
    -------------------------
    centroids : numpy.ndarray of shape (k, n_features)
    clusters : list of k numpy.ndarray, the training rows assigned to each
        centroid (an empty cluster is an array with zero rows).
    """

    def __init__(self, k, distance, max_iter, evaluation_criteria):
        # Store in the class
        self.k = k
        self.max_iter = max_iter

        # Decide which distance to use. The bound method itself is stored
        # (not the result of calling it) so it can be invoked for every point.
        if distance == "euclidean":
            self.distance_function = self.__euclidean
        elif distance == 'cosine':
            self.distance_function = self.__cosine
        else:
            self.distance_function = self.__jaccard

        # Decide which evaluation function to use
        self.evaluation_criteria = evaluation_criteria

    # Compute euclidean function
    def __euclidean(self, x, centroids):
        """Return the Euclidean distance from point ``x`` to every centroid."""
        x = np.asarray(x, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        return np.linalg.norm(centroids - x, axis=1)

    # Compute cosine function
    def __cosine(self, x, centroids):
        """Return the cosine distance (1 - cosine similarity) to every centroid.

        A distance (smaller is closer) is returned because the caller picks
        the centroid with ``argmin``. When either vector has zero norm the
        similarity is taken as 0, i.e. the distance is 1.
        """
        x = np.asarray(x, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        dots = centroids @ x
        norms = np.linalg.norm(centroids, axis=1) * np.linalg.norm(x)
        similarity = np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)
        return 1.0 - similarity

    # Compute jaccard function
    def __jaccard(self, x, centroids):
        """Return the generalized Jaccard distance to every centroid.

        Computed as ``1 - sum(min(x, c)) / sum(max(x, c))`` with element-wise
        min/max. When the denominator is 0 the similarity is taken as 1
        (distance 0) to avoid a division by zero.
        """
        x = np.asarray(x, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        # np.minimum / np.maximum are element-wise; the builtins min / max are not
        numerator = np.minimum(x, centroids).sum(axis=1)
        denominator = np.maximum(x, centroids).sum(axis=1)
        similarity = np.divide(
            numerator, denominator, out=np.ones_like(numerator), where=denominator != 0
        )
        return 1.0 - similarity

    # Compute Sum Of Squared Error
    def __sumOfSquares(self, x, center):
        """Return the sum of squared differences between ``x`` and ``center``.

        ``x`` may be a single point or a 2-D array of points; ``center`` is
        broadcast against it.
        """
        difference = np.asarray(center, dtype=float) - np.asarray(x, dtype=float)
        return float(np.sum(difference ** 2))

    # Assign every point to its nearest centroid
    def __assign(self, points, centroids):
        """Return, for each row of ``points``, the index of the closest centroid."""
        return np.array(
            [np.argmin(self.distance_function(point, centroids)) for point in points],
            dtype=int,
        )

    # Compute Accuracy
    def __accuracy(self, X_train, labels):
        """Return the majority-vote accuracy in percent.

        Every row of ``X_train`` is assigned to its nearest fitted centroid,
        each cluster is given the most frequent true label among its members,
        and the percentage of rows whose label matches is returned.

        Raises
        ------
        ValueError
            If ``X_train`` is empty or its length differs from ``labels``.
        """
        points = np.asarray(X_train, dtype=float)
        truth = np.asarray(labels).ravel()
        if len(points) == 0:
            raise ValueError("cannot compute accuracy on empty data")
        if len(points) != len(truth):
            raise ValueError("X_train and labels must have the same number of rows")

        assignments = self.__assign(points, self.centroids)
        correct = 0
        for cluster_number in range(len(self.centroids)):
            member_labels = truth[assignments == cluster_number]
            if member_labels.size:
                # The size of the largest label group is the number of
                # members that agree with the cluster's majority label.
                _, counts = np.unique(member_labels, return_counts=True)
                correct += int(counts.max())
        return 100 * correct / len(points)

    # Fit the model and find the centroids
    def fit(self, X_train):
        """Cluster ``X_train`` and store ``self.centroids`` and ``self.clusters``.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            A DataFrame or anything else ``np.asarray`` can turn into a
            2-D float array.

        Raises
        ------
        ValueError
            If ``X_train`` is not a non-empty 2-D collection or ``k`` is not
            in ``[1, n_samples]``.
        """
        points = np.asarray(X_train, dtype=float)
        if points.ndim != 2 or points.shape[0] == 0:
            raise ValueError("X_train must be a non-empty 2-D collection of samples")
        if not 1 <= self.k <= points.shape[0]:
            raise ValueError("k must be between 1 and the number of samples")

        # Initially, randomly select k distinct datapoints as centroids.
        # np.random's global state is used, so np.random.seed() makes it reproducible.
        seed_rows = np.random.choice(points.shape[0], size=self.k, replace=False)
        centroids = points[seed_rows]
        assignments = self.__assign(points, centroids)

        # Iteratively update the centroids
        iterations = 0
        while iterations < self.max_iter:
            iterations += 1

            # New centroid = mean of the members; an empty cluster keeps its
            # previous centroid instead of becoming NaN.
            new_centroids = centroids.copy()
            for cluster_number in range(self.k):
                members = points[assignments == cluster_number]
                if len(members) > 0:
                    new_centroids[cluster_number] = members.mean(axis=0)

            # Stop as soon as an update leaves every centroid unchanged
            if np.array_equal(new_centroids, centroids):
                break

            centroids = new_centroids
            assignments = self.__assign(points, centroids)

        self.centroids = centroids
        self.clusters = [points[assignments == c] for c in range(self.k)]

    # Evaluate the generated centroids
    def evaluate(self, X_train, labels):
        """Score the fitted model according to ``evaluation_criteria``.

        For ``'sse'`` the sum of squared errors of the clusters stored by
        ``fit`` is returned (``X_train`` and ``labels`` are not used). For any
        other criterion the majority-vote accuracy (percent) of ``X_train``
        against ``labels`` is returned.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, "centroids"):
            raise AttributeError("KMeans has no centroids yet; call fit() before evaluate()")

        if self.evaluation_criteria == 'sse':
            sse = 0.0
            for index, cluster in enumerate(self.clusters):
                sse += self.__sumOfSquares(cluster, self.centroids[index])
            return sse
        else:
            return self.__accuracy(X_train, labels)