In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read data and labels
data = pd.read_csv("../data/kmeans_data/data.csv", header = None)
labels = pd.read_csv('../data/kmeans_data/label.csv', header = None)

data.shape

(10000, 784)

In [16]:
# Define class to implement kmeans
class KMeans():
    def __init__(self, k, distance, max_iter, evaluation_criteria):
        # Store in the class
        self.k = k
        self.max_iter = max_iter
        self.distance = distance
        self.evaluation_criteria = evaluation_criteria

        # Decide which distance to use
        if distance == "euclidean":
            self.distance_function = self.__euclidean
        elif distance == 'cosine':
            self.distance_function = self.__cosine
        else:
            self.distance_function = self.__jaccard

    # Compute euclidean function
    def __euclidean(self, x, centroids):
        euclid = []
        for center in centroids:
            euclid.append(np.sqrt(np.sum((x - center)**2)))
        return euclid

    # Compute cosine function
    def __cosine(self, x, centroids):
        cosine = []
        for center in centroids:
            cosine.append(np.dot(x.T, center) / (np.linalg.norm(x) * np.linalg.norm(center)))
        return cosine

    # Compute jaccard function
    def __jaccard(self, x, centroids):
        jaccard = []

        for center in centroids:
            numerator = np.sum(np.minimum(x, center))
            denominator = np.sum(np.maximum(x, center))
            temp_jaccard = 1 - (numerator / denominator)
            jaccard.append(temp_jaccard)
        
        return jaccard
    
    # Compute Sum Of Squared Error
    def __sumOfSquares(self, x, center):
        sse = 0
        for i in range(len(center)):
            sse += (center[i] - x[i])**2
        return sse

    # Compute Accuracy
    def __accuracy(self, X_train, labels):
        correct = 0
        for index, row in enumerate(X_train.iterrows()):
            flag = True
            for cluster_row in self.clusters[labels.iloc[index].values[0]]:
                if np.not_equal(cluster_row, row[1]).any():
                    flag = False
                    break
                if flag:
                    correct += 1
        return 100 * (correct / X_train.shape[0])

    # Fit the model and find the centroids
    def fit(self, X_train):
        # Initially, randomly select k datapoints as centrod.
        centroids = []

        # For jaccard, we need entire row instead of just one element
        for _ in range(self.k):
            centroids.append(np.array(X_train.sample()))
        
        # Reshape
        centroids = np.array(centroids).reshape(self.k, -1)
        # print("Centroids array:", np.array(centroids).shape)

        # Initialize few required variables
        iterations = 0
        prev_centroids = None
        
        # Iteratively update the centroids
        while(iterations < self.max_iter and np.not_equal(centroids, prev_centroids).any()):
            print("Iteration Number: ", iterations)
            
            # Save the previous centroids so that current ones can be overwritten
            prev_centroids = centroids

            # Increment the number of iterations
            iterations += 1

            # Iterate over all rows
            clusters = [[] for _ in range(self.k)]
            for row in X_train.iterrows():
                distance = self.distance_function(np.array(row[1]), np.array(centroids))
                cluster_number = np.argmin(distance)
                clusters[cluster_number].append(row[1])
            
            # Update the new centroid
            centroids = []
            for cluster in clusters:
                cluster = np.array(cluster)
                if len(cluster) == 0:
                    centroids.append(np.zeros(784))
                else:
                    centroids.append(np.mean(cluster, axis = 0))

        # Save the centroids and cluster in the class
        self.centroids = centroids
        self.clusters = clusters

    # Evaluate the generated centroids
    def evaluate(self, X_train, labels):
        if self.evaluation_criteria == 'sse':
            sse = 0
            for index, cluster in enumerate(self.clusters):
                for row in cluster:
                    sse += self.__sumOfSquares(row, self.centroids[index])
            return sse
        else:
            return self.__accuracy(X_train, labels)

In [12]:
# Cosine Similarity, with evaluation_criteria = 'sse'
kmeans_cosine = KMeans(k = 10, distance = 'cosine', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_cosine.fit(data)

# Compute sse
sse_cosine = kmeans_cosine.evaluate(data, labels)

# Display results
print("Normalized Sum of Squares for Euclidean Distance:", sse_cosine / data.shape[0])

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4


KeyboardInterrupt: 

In [9]:
# Jaccard distance, with evaluation_criteria as "sse"
kmeans_jaccard = KMeans(k = 10, distance = 'jaccard', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_jaccard.fit(data)

# Compute sse
sse_jaccard = kmeans_jaccard.evaluate(data, labels)

# Display results
print("Normalized Sum of Squares for Jaccard Distance:", sse_jaccard / data.shape[0])

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
Normalized Sum of Squares for Jaccard Distance: 2574885.1229187064


In [10]:
# Euclidean distance, with evaluation_criteria as "sse"
kmeans_euclid = KMeans(k = 10, distance = 'euclidean', max_iter = 10, evaluation_criteria = 'sse')

# Fit model
kmeans_euclid.fit(data)

# Compute sse
sse_euclid = kmeans_euclid.evaluate(data, labels)

# Display results
print("Normalized Sum of Squares for Euclidean Distance:", sse_euclid / data.shape[0])

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
Normalized Sum of Squares for Euclidean Distance: 2550361.8446529703


In [17]:
# Euclidean Distance, with evaluation_criteria = 'accuracy'
kmeans_euclid_acc = KMeans(k = 10, distance = 'euclidean', max_iter = 10, evaluation_criteria = 'accuracy')

# Fit Model
kmeans_euclid_acc.fit(data)

# Compute accuracy
accuracy = kmeans_euclid_acc.evaluate(data, labels)

# Display results
print('Accuracy for Euclidean Distance:', accuracy)

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
Accuracy for Euclidean Distance: 0.01


In [18]:
# Cosine Distance, with evaluation_criteria = 'accuracy'
kmeans_cosine_acc = KMeans(k = 10, distance = 'cosine', max_iter = 10, evaluation_criteria = 'accuracy')

# Fit Model
kmeans_cosine_acc.fit(data)

# Compute accuracy
accuracy = kmeans_cosine_acc.evaluate(data, labels)

# Display results
print('Accuracy for Cosine Similarity:', accuracy)

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
Accuracy for Cosine Similarity: 0.0


In [19]:
# Jaccard Distance, with evaluation_criteria = 'accuracy'
kmeans_jaccard_acc = KMeans(k = 10, distance = 'jaccard', max_iter = 10, evaluation_criteria = 'accuracy')

# Fit Model
kmeans_jaccard_acc.fit(data)

# Compute accuracy
accuracy = kmeans_jaccard_acc.evaluate(data, labels)

# Display results
print('Accuracy for Jaccard Similarity:', accuracy)

Iteration Number:  0
Iteration Number:  1
Iteration Number:  2
Iteration Number:  3
Iteration Number:  4
Iteration Number:  5
Iteration Number:  6
Iteration Number:  7
Iteration Number:  8
Iteration Number:  9
Accuracy for Jaccard Similarity: 0.01
