In [1]:
import numpy as np
from keras.datasets import mnist
from sklearn.cluster import KMeans

In [2]:
# Load MNIST through Keras: image arrays and their digit labels,
# already split into a training part and a testing part.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [3]:
# dimensions of the training images
f'Training Data: {x_train.shape}'

'Training Data: (60000, 28, 28)'

In [4]:
# dimensions of the training labels
f'Training Labels: {y_train.shape}'

'Training Labels: (60000,)'

In [5]:
# dimensions of the testing images
f'Testing Data: {x_test.shape}'

'Testing Data: (10000, 28, 28)'

In [6]:
# dimensions of the testing labels
f'Testing Labels: {y_test.shape}'

'Testing Labels: (10000,)'

In [7]:
# Preprocess the images: flatten every sample into one feature vector
# and rescale the pixel values to the range 0 - 1.
X = x_train.reshape(x_train.shape[0], -1).astype(float) / 255.
# the labels are used unchanged
Y = y_train

In [8]:
# sanity check: shape of the flattened data set and of one flattened sample
X.shape, X[0].shape

((60000, 784), (784,))

In [9]:
# number of distinct digit classes found in the test labels
n_digits = np.unique(y_test).size
n_digits

10

Experiment logs:

- KMeans with 15 clusters on the MNIST training set takes 37.6 s on a MacBook Air (M2) and reaches a training accuracy of 0.67643.

- KMeans with 20 clusters on the MNIST training set takes 28.8 s on a MacBook Air (M2) and reaches a training accuracy of 0.71753.

In [10]:
# Initialize KMeans model.
# random_state pins the centroid initialization, so a fresh
# "Restart Kernel -> Run All" reproduces the same clustering
# (and therefore the same cluster ids and accuracy) on every run.
kmeans = KMeans(n_clusters = 20, random_state = 42)

# Fit the model to the training data
kmeans.fit(X)

In [11]:
# cluster index assigned to each of the first six samples the model was fitted on
kmeans.labels_[:6]

array([16, 19, 17,  2, 13,  6], dtype=int32)

In [12]:
def infer_cluster_labels(kmeans, ground_truth_labels):
    """
    Associates each cluster with the most frequent true label among its members.

    kmeans: fitted clustering model; only its `n_clusters` and `labels_`
            attributes are read.
    ground_truth_labels: non-negative integer labels, one per entry of
            `kmeans.labels_` (array or plain list).
    returns: dictionary mapping a label to the list of cluster indices whose
            majority label it is. Clusters without any member are left out,
            because they have no majority label.
    """
    # accept plain lists as well as arrays
    ground_truth_labels = np.asarray(ground_truth_labels)
    inferred_labels = {}
    for i in range(kmeans.n_clusters):
        # actual labels of the points in cluster i; ravel so that
        # column-vector labels of shape (n, 1) are handled too
        labels = ground_truth_labels[kmeans.labels_ == i].ravel()
        if labels.size == 0:
            # np.argmax raises on an empty histogram, so skip empty clusters
            continue
        # determine most common label (ties resolve to the smallest label)
        most_common = int(np.argmax(np.bincount(labels)))
        # record the cluster under its label, creating the slot on first use
        inferred_labels.setdefault(most_common, []).append(i)
    return inferred_labels

In [13]:
def infer_data_labels(X_labels, cluster_labels):
    """
    Determines label for each array, depending on the cluster it has been assigned to.

    X_labels: sequence of cluster indices, one per sample.
    cluster_labels: dictionary mapping a label to the list of cluster indices
            that belong to it.
    returns: predicted labels for each array, as a uint8 array of len(X_labels).
            A sample whose cluster is not listed under any label keeps the
            initial value 0.
    """
    # Invert {label: [clusters]} into {cluster: label} once, so every sample
    # needs a single lookup instead of a scan over all labels. Later entries
    # overwrite earlier ones, so if a cluster is listed under several labels
    # the last one wins.
    cluster_to_label = {}
    for label, clusters in cluster_labels.items():
        for cluster in clusters:
            cluster_to_label[cluster] = label

    # empty array of len(X)
    predicted_labels = np.zeros(len(X_labels)).astype(np.uint8)

    for i, cluster in enumerate(X_labels):
        if cluster in cluster_to_label:
            predicted_labels[i] = cluster_to_label[cluster]

    return predicted_labels

In [14]:
# map every cluster to a digit, using the training labels
cluster_labels = infer_cluster_labels(kmeans=kmeans, ground_truth_labels=Y)
# cluster index of each training sample
X_clusters = kmeans.predict(X)
# translate the cluster indices into digit predictions
predicted_labels = infer_data_labels(X_labels=X_clusters, cluster_labels=cluster_labels)
# fraction of training samples whose predicted digit equals the true digit
f'accuracy over training dataset: {(predicted_labels == Y).sum() / len(Y)}'

'accuracy over training dataset: 0.7175333333333334'