In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read data and labels
data = pd.read_csv("../data/kmeans_data/data.csv", header = None)
labels = pd.read_csv('../data/kmeans_data/label.csv', header = None)

data.shape

(10000, 784)

In [13]:
# Define class to implement kmeans
class KMeans():
    def __init__(self, k, distance, max_iter, evaluation_criteria):
        # Store in the class
        self.k = k
        self.max_iter = max_iter
        self.distance = distance
        self.evaluation_criteria = evaluation_criteria

        # Decide which distance to use
        if distance == "euclidean":
            self.distance_function = self.__euclidean
        elif distance == 'cosine':
            self.distance_function = self.__cosine
        else:
            self.distance_function = self.__jaccard

    # Compute euclidean function
    def __euclidean(self, x, centroids):
        euclid = []
        for center in centroids:
            temp_euclid = x - np.array(center)
            euclid.append(np.sqrt(np.dot(temp_euclid, temp_euclid.T)))
        return euclid

    # Compute cosine function
    def __cosine(self, x, centroids):
        x = np.array(x).reshape(-1, 1)
        cosine = []
        for center in centroids:
            center = np.array(center).reshape(-1, 1)
            temp_cosine = np.dot(x, center) / (np.linalg.norm(x) * np.linalg.norm(center))
            cosine.append(np.linalg.norm(temp_cosine))
        return cosine

    # Compute jaccard function
    def __jaccard(self, x, centroids):
        jaccard = []
        for center in centroids:
            numerator = 0
            denominator = 0
            for i in range(len(center)):
                numerator += min(x[i], center[i])
                denominator += max(x[i], center[i])
            temp_jaccard = numerator / denominator
            jaccard.append(1 - temp_jaccard)
        return jaccard
    
    # Compute Sum Of Squared Error
    def __sumOfSquares(self, x, center):
        sse = 0
        for i in range(len(center)):
            sse += (center[i] - x[i])**2
        return sse

    # Compute Accuracy
    def __accuracy(self, X_train, labels):
        correct = 0
        for index, row in enumerate(X_train.iterrows()):
            if row in self.clusters[labels[index]]:
                correct += 1
        return 100 * correct / X_train.shape[0]

    # Fit the model and find the centroids
    def fit(self, X_train):
        # Initially, randomly select k datapoints as centrod.
        centroids = []

        # For jaccard, we need entire row instead of just one element
        if self.distance == 'jaccard':
            for _ in range(self.k):
                centroids.append(np.array(X_train.sample()))
        else:
            for _ in range(self.k):
                centroids.append(np.mean(np.array(X_train.sample())))
        
        centroids = np.array(centroids).reshape(10, -1)
        
        # Initialize few required variables
        iterations = 0
        prev_centroids = None
        
        # Iteratively update the centroids
        while(iterations < self.max_iter and np.not_equal(centroids, prev_centroids).any()):
            print("Iteration Number: ", iterations)
                
            # Save the previous centroids so that current ones can be overwritten
            prev_centroids = centroids

            # Increment the number of iterations
            iterations += 1

            # Iterate over all rows
            clusters = [[] for _ in range(self.k)]
            for row in tqdm(X_train.iterrows()):
                distance = self.distance_function(np.array(row[1].T), centroids)
                if self.distance != 'jaccard':
                    print("Flattening...")
                    distance = np.array(distance).flatten()
                cluster_number = np.argmin(distance)
                clusters[cluster_number].append(row)
            
            # Update the new centroid
            centroids = []
            if self.distance == 'jaccard':
                print("Cluster Shape", np.array(clusters).shape)
                print("Value cluster shape", np.array(clusters[0]).shape)
                for cluster in clusters:
                    centroids.append(np.mean(cluster))
            else:
                for cluster in clusters:
                    centroids.append(np.mean(np.mean(cluster)))
                centroids = np.array(centroids).reshape(-1, 1)
        
            print("Centroid Shape", np.array(centroids).shape)
            print("prev centroid shape", np.array(prev_centroids).shape)

        # Save the centroids and cluster in the class
        self.centroids = centroids
        self.clusters = clusters

    # Evaluate the generated centroids
    def evaluate(self, X_train, labels):
        if self.evaluation_criteria is 'sse':
            sse = 0
            for index, cluster in enumerate(self.clusters):
                for row in cluster:
                    sse += self.__sumOfSquares(row, self.centroids[index])
            return sse
        else:
            return self.__accuracy(X_train, labels)

In [14]:
# Jaccard distance, with evaluation_criteria as "sse"
kmeans_jaccard = KMeans(k = 10, distance = 'jaccard', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_jaccard.fit(data)

# Compute sse
sse_jaccard = kmeans_jaccard.evaluate(data, labels)

# Display results
print("Sum of Squares for Jaccard Distance:", sse_jaccard / data.shape[0])

Iteration Number:  0


10000it [00:53, 186.87it/s]


Cluster Shape (10,)
Value cluster shape (202, 2)
Centroid Shape (10, 784)
prev centroid shape (10, 784)
Iteration Number:  1


10000it [00:52, 190.79it/s]


Cluster Shape (10,)
Value cluster shape (0,)
Centroid Shape (10,)
prev centroid shape (10, 784)


ValueError: operands could not be broadcast together with shapes (10,) (10,784) 

In [None]:
# Euclidean distance, with evaluation_criteria as "sse"
kmeans_euclid = KMeans(k = 10, distance = 'euclidean', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_euclid.fit(data)

# Compute sse
sse_euclid = kmeans_euclid.evaluate(data, labels)

# Display results
print("Sum of Squares for Euclidean Distance:", sse_euclid / data.shape[0])

In [None]:
# Cosine Similarity, with evaluation_criteria = 'sse'
kmeans_cosine = KMeans(k = 10, distance = 'cosine', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_cosine.fit(data)

# Compute sse
sse_cosine = kmeans_cosine.evaluate(data, labels)

# Display results
print("Sum of Squares for Euclidean Distance:", sse_cosine / data.shape[0])