# KMeans Clustering On MNIST

In [48]:
import sys
import sklearn
import matplotlib
import numpy as np
from keras.datasets import mnist
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print('Training Data: {}'.format(x_train.shape))
print('Training Labels: {}'.format(y_train.shape))

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
Training Data: (60000, 28, 28)
Training Labels: (60000,)


In [4]:
print('Testing Data: {}'.format(x_test.shape))
print('Testing Labels: {}'.format(y_test.shape))

Testing Data: (10000L, 28L, 28L)
Testing Labels: (10000L,)


In [7]:
x_train[1][5]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,  48, 238, 252, 252, 252, 237,   0,   0,   0,   0,   0,   0,
         0,   0], dtype=uint8)

### 2. Preprocessing the MNIST images

Images stored as NumPy arrays are 2-dimensional arrays.  However, the K-means clustering algorithm provided by scikit-learn ingests 1-dimensional arrays; as a result, we will need to reshape each image.  

Clustering algorithms almost always use 1-dimensional data.  For example, if you were clustering a set of X, Y coordinates, each point would be passed to the clustering algorithm as a 1-dimensional array with a length of two (example: [2,4] or [-1, 4]). If you were using 3-dimensional data, the array would have a length of 3 (example: [2, 4, 1] or [-1, 4, 5]).  

MNIST contains images that are 28 by 28 pixels; as a result, they will have a length of 784 once we reshape them into a 1-dimensional array.  

In [16]:

# convert each image to 1 dimensional array
x_train = x_train.reshape(len(x_train),-1)
y_train = y_train

# normalize the data to 0 - 1
x_train = x_train / 255.0


In [17]:
x_train.shape

(60000, 784)

In [18]:
x_train[1].shape

(784,)

# KMeans


In [24]:

classes = 10 #0,1,2,3,4,5,6,7,8,9

# Initialize KMeans model
kmeans = MiniBatchKMeans(n_clusters = classes)

# use if your PC can handle it
# kmeans = MiniBatchKMeans(n_clusters = classes)

# Fit the model to the training data
kmeans.fit(x_train)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=10, n_init=3, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [29]:
kmeans.labels_

array([8, 7, 2, ..., 8, 6, 9], dtype=int32)

In [32]:
kmeans.n_clusters

10

# Assigning
 

In [52]:
def infer_cluster_labels(kmeans, actual_labels):

    inferred_labels = {}

    for i in range(kmeans.n_clusters):

        #index of cluster points
        labels = []
        index = np.where(kmeans.labels_ == i)

        # append actual labels for each point in cluster
        labels.append(actual_labels[index])

        # get most common label
        if len(labels[0]) == 1:
            counts = np.bincount(labels[0]) 
        else:
            counts = np.bincount(np.squeeze(labels))
# BINCOUNT Count number of occurrences of each value in array of non-negative ints.


        # assign the cluster to a value in the inferred_labels dictionary
        if np.argmax(counts) in inferred_labels:
            inferred_labels[np.argmax(counts)].append(i)
        else:
            # create a new array in this slot
            inferred_labels[np.argmax(counts)] = [i]

#         print(labels)
#         print('Cluster: {}, label: {}'.format(i, np.argmax(counts)))
        
    return inferred_labels  



In [35]:
def infer_data_labels(x_labels, cluster_labels):
    
    predicted_labels = np.zeros(len(x_labels)).astype(np.uint8)
    
    for i, cluster in enumerate(x_labels):
        for key, value in cluster_labels.items():
            if cluster in value:
                predicted_labels[i] = key
                
    return predicted_labels

In [38]:
# test the infer_cluster_labels() and infer_data_labels() functions
cluster_labels = infer_cluster_labels(kmeans, y_train)
k_clusters = kmeans.predict(x_train)
predicted_labels = infer_data_labels(k_clusters, cluster_labels)


[array([3, 2, 3, ..., 3, 5, 5], dtype=uint8)]
Cluster: 0, label: 3
[array([1, 1, 1, ..., 1, 5, 1], dtype=uint8)]
Cluster: 1, label: 1
[array([4, 4, 7, ..., 9, 9, 4], dtype=uint8)]
Cluster: 2, label: 4
[array([2, 2, 2, ..., 2, 2, 2], dtype=uint8)]
Cluster: 3, label: 2
[array([9, 4, 7, ..., 7, 9, 9], dtype=uint8)]
Cluster: 4, label: 7
[array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)]
Cluster: 5, label: 0
[array([6, 6, 6, ..., 6, 6, 6], dtype=uint8)]
Cluster: 6, label: 6
[array([0, 0, 0, ..., 3, 0, 0], dtype=uint8)]
Cluster: 7, label: 0
[array([5, 3, 3, ..., 3, 3, 5], dtype=uint8)]
Cluster: 8, label: 3
[array([3, 8, 8, ..., 8, 8, 8], dtype=uint8)]
Cluster: 9, label: 8


In [44]:
predicted_labels[:10]

array([3, 0, 4, 1, 7, 2, 1, 8, 1, 7], dtype=uint8)

In [45]:
y_train[:10]

array([5, 0, 4, 1, 9, 2, 1, 3, 1, 4], dtype=uint8)

## Good enough on top

## Make the clustering more generic

In [54]:

def calculate_metrics(estimator, data, labels):

    # Calculate and print metrics
    print('Number of Clusters: {}'.format(estimator.n_clusters))

In [56]:
classes = [2, 5, 8, 10, 12, 16, 20, 22, 25]

# test different numbers of clusters
for clusters in classes:
    generic_kmeans = MiniBatchKMeans(n_clusters = clusters)
    generic_kmeans.fit(x_train)
    
    # print cluster metrics
    calculate_metrics(generic_kmeans, x_train, y_train)
    
    # determine predicted labels
    cluster_labels = infer_cluster_labels(generic_kmeans, y_train)
    predicted_Y = infer_data_labels(generic_kmeans.labels_, cluster_labels)
    
    # calculate and print accuracy
    print('Accuracy: ', metrics.accuracy_score(y_train, predicted_Y))
    print('\n')

Number of Clusters: 2
Accuracy:  0.20516666666666666


Number of Clusters: 5
Accuracy:  0.3884166666666667


Number of Clusters: 8
Accuracy:  0.53015


Number of Clusters: 10
Accuracy:  0.50555


Number of Clusters: 12
Accuracy:  0.6029333333333333


Number of Clusters: 16
Accuracy:  0.6398666666666667


Number of Clusters: 20
Accuracy:  0.6792166666666667


Number of Clusters: 22
Accuracy:  0.7006666666666667


Number of Clusters: 25
Accuracy:  0.7121666666666666




## more clusters more accuracy hmmmm.... why???

In [64]:
# test kmeans algorithm on testing dataset

x_test = x_test.reshape(len(x_test),-1)
x_test = x_test / 255.0

# initialize and fit KMeans algorithm on training data
kmeans = MiniBatchKMeans(n_clusters = 10)
kmeans.fit(x_train)
cluster_labels = infer_cluster_labels(kmeans, y_train)

# predictions
test_clusters = kmeans.predict(x_test)
predicted_labels = infer_data_labels(kmeans.predict(x_test), cluster_labels)
    
# calculate and print accuracy
print('Accuracy: ', metrics.accuracy_score(y_test, predicted_labels))

Accuracy:  0.1135
