<a href="https://colab.research.google.com/github/shafayat1004/Data-Mining-Assignments/blob/main/Assignment%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KMeans Clustering on MNIST Dataset
## 1910456 Mir Shafayat Ahmed                                                                                                     

In [119]:
import numpy as np
import pandas as pd
import struct
from array import array
from os.path  import join
import plotly.express as px
pd.set_option('display.max_rows', None)

!mkdir -p mnist/t10k-images-idx3-ubyte
!mkdir -p mnist/t10k-labels-idx1-ubyte
!mkdir -p mnist/train-images-idx3-ubyte
!mkdir -p mnist/train-labels-idx1-ubyte

!curl "https://raw.githubusercontent.com/shafayat1004/Data-Mining-Assignments/main/mnist/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte" --output mnist/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte

!curl "https://raw.githubusercontent.com/shafayat1004/Data-Mining-Assignments/main/mnist/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte" --output mnist/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte

!curl "https://raw.githubusercontent.com/shafayat1004/Data-Mining-Assignments/main/mnist/train-images-idx3-ubyte/train-images-idx3-ubyte" --output mnist/train-images-idx3-ubyte/train-images-idx3-ubyte

!curl "https://raw.githubusercontent.com/shafayat1004/Data-Mining-Assignments/main/mnist/train-labels-idx1-ubyte/train-labels-idx1-ubyte" --output mnist/train-labels-idx1-ubyte/train-labels-idx1-ubyte

divider = "--------------------------------------------------------------------------------------"

In [120]:
#
# MNIST Data Loader Class
#
class MnistDataloader(object):
    def __init__(
            self,
            training_images_filepath,
            training_labels_filepath,
            test_images_filepath,
            test_labels_filepath
    ):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath     = test_images_filepath
        self.test_labels_filepath     = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        labels = []

        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))

            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))

            labels = array("B", file.read())

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))

            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))

            image_data = array("B", file.read())

        images = []

        for i in range(size):
            images.append([0] * rows * cols)

        for i in range(size):
            img          = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            images[i][:] = img

        return images, labels

    def load_data(self):
        train_images, train_labels = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        test_images,  test_labels  = self.read_images_labels(self.test_images_filepath,     self.test_labels_filepath)
        return (train_images, train_labels), (test_images, test_labels)

simple centroid initialization function

In [121]:
def initialize_centroids_simple(data, dimension_of_data, no_of_clusters):
    #centroids: [[centroid0:  3 dimensions, , , ]; [centroid1: 3 dimensions ] ... ..]
    centroids = np.zeros((no_of_clusters, dimension_of_data))
    
    random_indexes = np.random.permutation(len(data))                                                # Selecting random data points for each initial Centroid
    for i in range(no_of_clusters):
        centroids[i] = data[random_indexes[i]]
        
    print(f"Initialized Random Centroids =\n{centroids}")
    print(divider)
    
    return centroids

Centroid initilization using min max

Calculate eucledian distance

In [122]:
def get_euclidean_distance(vector1, vector2):
    difference = vector1 - vector2
    sum_of_squared_diff = np.dot (difference, difference)
    distance = np.sqrt(sum_of_squared_diff)
    
    return distance

In [123]:
def clustering_objective (data, centroids):
    total_sum = float(0)                                                                             # Initial total_sum is 0
    for point in data:                                                                               # Iterating through each datapoint
        total_sum += min([np.dot(point - centroid, point - centroid) for centroid in centroids])     # List comprehension to get a list of all the squared distances between...
                                                                                                     # ...the current datapoint and each of the k centroids. Then using min()...
                                                                                                     # ...to get the minimum distance and adding it to sum.
    return total_sum/float(len(data))                                                                # J is found by dividing sum with the number of datapoints. 1/N * sum

KMeans Function


In [124]:
def kmeans(data, dimension_of_data, no_of_clusters, max_iterations):
    no_of_features = np.size(data, 0)
    
    #centroids: [[centroid0:  , , ,3 dimensions, , , ],  [centroid1: , , ,3 dimensions, , , ],  ... ..]
    centroids = initialize_centroids_simple(data, dimension_of_data, no_of_clusters)
    #cluster_affiliation: cluster_affiliation = [clusternumber, clutsernumber, ..., ..., ..., ...]
    
    #initialize the cluster affiliations. Initially assign -1
    cluster_affiliation = (np.zeros(len(data)) - 1).astype(int)
    print(f"Initial Cluster Affiliation =\n{cluster_affiliation}")
    print(divider)
    
    continue_loop = True
    j_prev = np.inf                                                                                  # So that loop does not break immediately after first Iteration.
    for iteration in range(max_iterations):                                                          # Setting a max limit of loops so that it is not infinite.
        print(f"Iteration {iteration}:")
        for i, point in enumerate(data): #use numpy equivalent code
            
            min_distance = float('inf')
            min_distance_index = None

            #find the closest centroids for each data points
            for cluster_index, centroid in enumerate(centroids): #use numpy equivalent code
                distance = get_euclidean_distance(centroid, point)
                if distance < min_distance:
                    min_distance = distance
                    min_distance_index = cluster_index

            #record or update cluster for each data points
            if cluster_affiliation[i] != min_distance_index:
               cluster_affiliation[i] = min_distance_index
                
        #recompute centroids
        for cluster_index in range(no_of_clusters):                                                  # Iterating over all centroid indexes
            grouped_points = data[np.take(cluster_affiliation, cluster_index)]                              # The actual datapoints closest to the current centroid are extracted by using a boolean array 
            one_vector = np.ones(len(grouped_points))
            centroids[cluster_index] = (np.dot(one_vector, grouped_points) / len(grouped_points))    # The new centroid is now the mean of the grouped datapoints

        j = clustering_objective (data, centroids)
    
        print(f"New centroids in this iteration =\n{centroids}")
        print(f"|J - J_prev| = {abs(j - j_prev)}")
        print(divider)
        
        if abs(j - j_prev) <= 10**(-5):                                                              # Breaks the loop if there is no significant change in the value of J over an iteration
            break
        j_prev = j                                                                                   # Updating J_prev for later iterations.

    print(f"Final Centroids =\n{centroids}")
    print(f"Final Cluster Affiliations =\n{cluster_affiliation}")
    return centroids, cluster_affiliation


Driver funtion/Main Function

In [125]:
def create_3d_scatter_plot_of_dataset (data, class_labels):
    flower_names = class_labels.unique()
    symbols_dict = {
        flower_names[0] : "square-open",
        flower_names[1] : "diamond-open",
        flower_names[2] : "circle-open"
    }

    dataset_3d_plot = \
        px.scatter_3d (
            title      = "Dataset",
            data_frame = data,
            x          = data.columns[0],
            y          = data.columns[1],
            z          = data.columns[2],
            color      = class_labels,
            height     = 600
        )
    
    for i, d in enumerate(dataset_3d_plot.data):
        dataset_3d_plot.data[i].marker.symbol = symbols_dict[dataset_3d_plot.data[i].name]
    
    return dataset_3d_plot

In [126]:
def create_3d_scatter_plot_showing_cluster_affiliation (data, cluster_affiliation):
    clusters = np.unique(cluster_affiliation)
    symbols_dict = {
        clusters[0] : "square-open",
        clusters[1] : "diamond-open",
        clusters[2] : "circle-open"
    }
    
    data["Cluster Affiliation"] = cluster_affiliation

    dataset_3d_plot = \
        px.scatter_3d (
            title      = "Cluster Affiliation",
            data_frame = data,
            x          = data.columns[0],
            y          = data.columns[1],
            z          = data.columns[2],
            color      = data.columns[3],
            height     = 600
        )
    
    for i, d in enumerate(dataset_3d_plot.data):
        dataset_3d_plot.data[i].marker.symbol = symbols_dict[dataset_3d_plot.data[i].name]
        
    return dataset_3d_plot

In [127]:
def get_most_frequent (arr):
    return np.argmax(np.bincount(arr))

def create_3d_scatter_plot_showing_wrong_affiliation (data, cluster_affiliation, class_labels):
    data["Cluster Affiliation"] = cluster_affiliation
    data["Actual Class"] = class_labels
    most_common_affiliations = [get_most_frequent(cluster_affiliation[start : end]) for start, end in [(0, 50), (50, 100), (100, 150)]]
    
    affiliation_to_flower_name_dict ={
        most_common_affiliations[0]: class_labels[0],
        most_common_affiliations[1]: class_labels[50],
        most_common_affiliations[2]: class_labels[100]
    }

    data["Cluster Affiliation"] = data["Cluster Affiliation"].map(affiliation_to_flower_name_dict)
    data["Is Correct"] = data["Cluster Affiliation"] == data["Actual Class"]
    
    print("Data:")
    display(data)
    
    cluster_correctness_3d_scatter_plot = \
        px.scatter_3d (
            title      = "Cluster Correctness",
            data_frame = data,
            x          = data.columns[0],
            y          = data.columns[1],
            z          = data.columns[2],
            color      = "Is Correct",
            height     = 600
        )
    
    if cluster_correctness_3d_scatter_plot.data[0].name == "True":
        cluster_correctness_3d_scatter_plot.data[0].marker.color = "green"
    else:
        cluster_correctness_3d_scatter_plot.data[0].marker.color = "red"
        
    if cluster_correctness_3d_scatter_plot.data[1].name == "True":
        cluster_correctness_3d_scatter_plot.data[1].marker.color = "green"
    else:
        cluster_correctness_3d_scatter_plot.data[1].marker.color = "red"

    return cluster_correctness_3d_scatter_plot
    

In [128]:
def main():
    no_of_clusters = 10                                                   # K clusters
    no_of_training_data = 10000
    
    print(f"Number of clusters = {no_of_clusters}")

    # Setting file paths based on added MNIST Datasets
    input_path = 'mnist/'
    training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
    training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
    test_images_filepath     = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
    test_labels_filepath     = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')
    
    # Loading MINST dataset
    mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
    
    (train_images, train_labels), (test_images, test_labels) = mnist_dataloader.load_data()
    
    train_images, train_labels = train_images[:no_of_training_data], train_labels[:no_of_training_data]
    
    
    print(f"Length of train set = {len(train_images)}")
    print(f"Length of test set = {len(test_images)}")
    
    dimension_of_data = np.size(train_images, 1)                                                             # number of  data dimension in the data
    
    print(f"Dimensions being used = {dimension_of_data}")
    print(divider)
    
    centroids, cluster_affiliation = kmeans(train_images, dimension_of_data, no_of_clusters, 300)
    
    display(centroids)
	
if __name__ == "__main__":
	main()
			

Number of clusters = 10
Length of train set = 20000
Length of test set = 10000
Dimensions being used = 784
--------------------------------------------------------------------------------------
Initialized Random Centroids =
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
--------------------------------------------------------------------------------------
Initial Cluster Affiliation =
[-1 -1 -1 ... -1 -1 -1]
--------------------------------------------------------------------------------------
Iteration 0:
New centroids in this iteration =
[[13.86989796 13.86989796 13.86989796 ... 13.86989796 13.86989796
  13.86989796]
 [13.86989796 13.86989796 13.86989796 ... 13.86989796 13.86989796
  13.86989796]
 [21.85586735 21.85586735 21.85586735 ... 21.85586735 21.85586735
  21.85586735]
 ...
 [13.86989796 13.86989796 13.86989796 ... 13.86989796 13.86989796
  13.86989796]
 [37.75637755 3

KeyboardInterrupt: 