In [1]:
import numpy as np
import pandas as pd
from keras.datasets import mnist
from sklearn.cluster import AgglomerativeClustering
from statistics import mode
from math import sqrt
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
def get_predicted_label(df):

    pred_cst = {}

    for cn in range(10):

        actual_labels = []

        for i in range(len(df)):
            if cn == df.iloc[i,2]:
                actual_labels.append(df.iloc[i,1])

        pred = mode(actual_labels)

        pred_cst[cn] = pred

    pred_labels = []

    for i in range(len(df)):
        c = df.iloc[i,2]
        pred_labels.append(pred_cst[c])

    return pred_labels

In [3]:
def get_purity(actual, cluster_lb, n):

    purity_df = pd.DataFrame({'actual':actual, 'cluster_lb':cluster_lb})
    purity_df = purity_df.groupby(['actual','cluster_lb'],as_index=False)['cluster_lb'].count().rename(columns={'cluster_lb':'count_cluster_lb'})
    purity_df = purity_df.sort_values(['actual','count_cluster_lb'],ascending=[True,False]).drop_duplicates(['actual'],keep='first')

    return round(purity_df['count_cluster_lb'].sum()/n, 2)

In [4]:
def get_gini(actual, cluster_lb, n):

    cluster_lb = np.array(cluster_lb).astype(int)
    actual = np.array(actual).astype(int)

    gini = 0

    for label in np.unique(actual):

        index = np.where(actual == label)[0]

        if len(index) > 0:
            p = (np.bincount(np.abs(cluster_lb[index]).astype(int))) / len(index)
            gini += len(index) / len(actual) * (1 - np.sum(p ** 2))
            
    return round(gini,2)

#### Data Preprocessing

In [5]:
data = mnist.load_data()
(x_train, y_train), (x_test, y_test) = data
x_train = x_train.reshape((x_train.shape[0], 28*28)).astype('float32')


# Normalization: Scaling pixel values between 0 and 1

x_train = x_train / 255
x_test = x_test / 255


# Converting image pixels and label into train and test dataframe

xtrain = list()
ytrain = list()

for i in range(len(x_train)):
  xtrain.append(x_train[i])
  ytrain.append(y_train[i])

xtrain_df = pd.DataFrame({
    'Data': xtrain,
    'Label': ytrain
})


# Taking a sample of 100 images for each digit from train

mnist_df = xtrain_df.groupby('Label', group_keys=False).apply(pd.DataFrame.sample, n=2000)


# Random shuffling dataframe

mnist_df = mnist_df.sample(frac=1)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


#### Using sklearn library

In [6]:
X = mnist_df['Data'].values.tolist()

cst = AgglomerativeClustering(n_clusters=10, affinity='euclidean').fit(X)

mnist_df['cluster'] = cst.labels_

mnist_df['Predicted Label'] = get_predicted_label(mnist_df)

In [7]:
mnist_df

Unnamed: 0,Data,Label,cluster,Predicted Label
56388,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,2,0
48918,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6,3,6
33926,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,0,4
13122,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8,9,9
33159,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6,3,6
...,...,...,...,...
9084,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5,6,3
32329,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8,1,8
35144,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,0,4
46424,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3,6,3


In [8]:
acc = 0

for i in range(len(mnist_df)):
    if mnist_df.iloc[i,1] == mnist_df.iloc[i,3]:
        acc += 1

print('Accuracy:', round(acc/20000,2))
print('Gini Index:', get_gini(mnist_df['Label'], mnist_df['cluster'], 20000))
print('Purity:', get_purity(mnist_df['Label'], mnist_df['cluster'], 20000))

Accuracy: 0.68
Gini Index: 0.4
Purity: 0.7


#### Using implemented method

In [4]:
def get_euc_dist(x, y):
    return round(sqrt( np.dot(x, x) - 2 * np.dot(x, y) + np.dot(y, y) ), 2)

In [49]:
initial_pts = mnist_df.groupby('Label', group_keys=False).apply(pd.DataFrame.sample, n=1).reset_index()
initial_pts

Unnamed: 0,index,Data,Label
0,8723,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,29057,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,39338,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
3,33407,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
4,12779,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4
5,54361,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5
6,19698,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6
7,39784,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7
8,4261,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8
9,16055,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9


In [215]:
k = 20
initial_pts = mnist_df.groupby('Label', group_keys=False).apply(pd.DataFrame.sample, n=k//10).reset_index()

centroids = [p for p in initial_pts['Data']]
clusters = [[p] for p in initial_pts['Data']]
cluster_els = [[i] for i in initial_pts['index']]
cluster_labels = [[l] for l in initial_pts['Label']]

ed = float('inf')
cst = -1


# Initial assignment

for i in range(len(mnist_df.index)):

    p = mnist_df.iloc[i,0]
    l = mnist_df.iloc[i,1]

    for j, c in enumerate(centroids):
        d = get_euc_dist(c,p)
        if d < ed:
            ed = d
            cst = j

    clusters[cst].append(p)
    cluster_els[cst].append(i)
    cluster_labels[cst].append(l)

In [216]:
# Centroid recomputation

for i in range(len(clusters)):
    centroids[i] = np.mean(clusters[i], axis=0).tolist()

In [219]:
# Combining clusters

new_clusters = clusters
new_cluster_labels = cluster_labels
new_cluster_els = cluster_els

while(len(new_clusters) != 10):

    merge = {}
    clusters = new_clusters
    cluster_labels = new_cluster_labels
    cluster_els = new_cluster_els

    euc_dist = euclidean_distances(centroids).tolist()

    for i in range(len(euc_dist)):
        s = euc_dist[i].index(sorted(euc_dist[i])[1])
        merge[i] = s

    new_clusters = []
    new_cluster_labels = []
    new_cluster_els = []

    print(merge)
    merged = []

    i = 0
    for k, v in merge.items():
        if k not in merged and v not in merged:
            new_clusters[i] = clusters[k] + clusters[v]
            new_cluster_labels[i] = cluster_labels[k] + cluster_labels[v]
            new_cluster_els[i] = cluster_els[k] + cluster_els[v]
            merged.append(k)
            merged.append(v)
            i += 1

    # print(len(clusters))

    if len(new_clusters) != 10:
        centroids = list()
        for i in range(len(new_clusters)):
            centroids[i] = np.mean(new_clusters[i], axis=0).tolist()

    # print(len(centroids))

    print('----')

{0: 1, 1: 13, 2: 7, 3: 13, 4: 1, 5: 13, 6: 13, 7: 13, 8: 13, 9: 1, 10: 3, 11: 13, 12: 13, 13: 3, 14: 1, 15: 3, 16: 3, 17: 15, 18: 19, 19: 18}


IndexError: ignored

In [155]:
clusters[11]

IndexError: ignored

In [126]:
l = [1,2,3,4,5,6,7]
del l[3]
len(l)

6

In [165]:
l = [1,2,3]
p = [4,5,6]
z =l + p
z

[1, 2, 3, 4, 5, 6]