Using Euclidean distance or dot product similarity (choose one per dataset; you may also try other similarity metrics),

A) run KMeans on the MNIST Dataset, try K=10

B) run KMeans on the FASHION Dataset, try K=10

C) run KMeans on the 20NG Dataset, try K=20

In [1]:
# A) run KMeans on the MNIST Dataset, try K=10

import tensorflow as tf
import numpy as np
import pandas as pd
import math
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import nltk
from sklearn import metrics

In [2]:
# Pull the MNIST train/test splits from the Keras dataset loader

mnist_dataset = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist_dataset.load_data()

In [3]:
# Flatten every training image into one vector, stored as a column:
# the result has shape (pixels_per_image, n_images).
#
# Reshaping straight to [-1, n_images] yields the same shape but the wrong
# contents: reshape only re-chunks the flat (C-order) buffer and never
# reorders it, so each column would mix pixels taken from many different
# images. Flatten per image first (one row per image), then transpose.
x_train_reshaped = np.reshape(x_train, [x_train.shape[0], -1]).T
x_train_reshaped.shape

(784, 60000)

In [4]:
# Flatten every test image into one vector, stored as a column:
# the result has shape (pixels_per_image, n_images).
#
# Reshaping straight to [-1, n_images] yields the same shape but the wrong
# contents: reshape only re-chunks the flat (C-order) buffer and never
# reorders it, so each column would mix pixels taken from many different
# images. Flatten per image first (one row per image), then transpose.
x_test_reshaped = np.reshape(x_test, [x_test.shape[0], -1]).T
x_test_reshaped.shape

(784, 10000)

In [5]:
# Standardize the training array: shift by its overall mean and scale by its
# overall standard deviation (a single scalar each, taken over every entry).
train_mean = x_train_reshaped.mean()
train_sd = x_train_reshaped.std()
train_norm = (x_train_reshaped - train_mean) / train_sd

# Same treatment for the test array, using the test array's own statistics.
test_mean = x_test_reshaped.mean()
test_sd = x_test_reshaped.std()
test_norm = (x_test_reshaped - test_mean) / test_sd

In [6]:
# Swap the axes so that rows are what the k-means cells index as samples
# (they read train_norm[i]).
train_norm = train_norm.T

In [7]:
# Swap the axes of the test array the same way as the training array.
test_norm = test_norm.T

In [None]:
# Set-up for k-means with k = 10 clusters

k = 10
n = len(train_norm)

# Membership matrix (pi): one row per sample, one column per cluster,
# all zeros until the first assignment pass fills it in
pi = np.zeros(shape=(n, k))

# Starting centroids (mu): rows of train_norm at k distinct random indices
mu = train_norm[np.random.choice(n, size=k, replace=False)]

In [None]:
# E & M steps for k-means: alternate between assigning every row of
# train_norm to its nearest centroid and recomputing each centroid as the
# mean of its members, until an assignment pass changes no membership.
#
# Reads train_norm, n, k, pi and mu from the initialization cell; leaves the
# final memberships in pi / new_pi and the final centroids in mu / new_mu.

new_pi = pi
new_mu = mu
flag = 0  # set to 1 once the memberships stop changing
z = 1     # 1-based iteration counter (also printed as progress)

while flag == 0:
    # z != 1 skips the test on the first pass, where new_pi and pi are still
    # the same initial array and would trivially compare equal.
    if z != 1 and (new_pi == pi).all():
        flag = 1
    else:
        pi = new_pi
        mu = new_mu
        print ("Iteration ", z)

        # E step: hard-assign each sample to the centroid at the smallest
        # Euclidean distance. One vectorized pass per centroid replaces the
        # per-sample Python loop; distances has shape (k, n). argmin keeps
        # the first index on ties, like the previous idxmin-based lookup.
        distances = np.array(
            [np.linalg.norm(train_norm - centroid, axis=1) for centroid in mu]
        )
        nearest = distances.argmin(axis=0)
        new_pi = np.zeros((n, k))
        new_pi[np.arange(train_norm.shape[0]), nearest] = 1

        # M step: move each centroid to the mean of the samples assigned to
        # it. A cluster that received no samples keeps its previous centroid;
        # dividing by its zero count would turn the centroid into NaN, and it
        # could never win an assignment again.
        cluster_sizes = new_pi.sum(axis=0)
        cluster_sums = np.matmul(new_pi.transpose(), train_norm)
        occupied = cluster_sizes > 0
        new_mu = mu.copy()
        new_mu[occupied] = cluster_sums[occupied] / cluster_sizes[occupied, np.newaxis]
        z = z + 1


Iteration  1
Iteration  2
Iteration  3
Iteration  4
Iteration  5
Iteration  6
Iteration  7
Iteration  8
Iteration  9
Iteration  10
Iteration  11
Iteration  12
Iteration  13
Iteration  14
Iteration  15
Iteration  16
Iteration  17
Iteration  18
Iteration  19
Iteration  20
Iteration  21
Iteration  22
Iteration  23
Iteration  24
Iteration  25
Iteration  26
Iteration  27
Iteration  28
Iteration  29
Iteration  30
Iteration  31
Iteration  32
Iteration  33
Iteration  34
Iteration  35
Iteration  36
Iteration  37
Iteration  38
Iteration  39
Iteration  40
Iteration  41
Iteration  42
Iteration  43
Iteration  44
Iteration  45
Iteration  46
Iteration  47
Iteration  48
Iteration  49
Iteration  50
Iteration  51
Iteration  52
Iteration  53
Iteration  54
Iteration  55
Iteration  56
Iteration  57
Iteration  58
Iteration  59
Iteration  60
Iteration  61
Iteration  62
Iteration  63
Iteration  64
Iteration  65
Iteration  66
Iteration  67


In [25]:
# Set-up for k-means again, this time with a lower k (k = 5)

k = 5
n = len(train_norm)

# Membership matrix (pi): one row per sample, one column per cluster,
# all zeros until the first assignment pass fills it in
pi = np.zeros(shape=(n, k))

# Starting centroids (mu): rows of train_norm at k distinct random indices
mu = train_norm[np.random.choice(n, size=k, replace=False)]

In [26]:
# E & M steps for k-means: alternate between assigning every row of
# train_norm to its nearest centroid and recomputing each centroid as the
# mean of its members, until an assignment pass changes no membership.
#
# Reads train_norm, n, k, pi and mu from the initialization cell; leaves the
# final memberships in pi / new_pi and the final centroids in mu / new_mu.

new_pi = pi
new_mu = mu
flag = 0  # set to 1 once the memberships stop changing
z = 1     # 1-based iteration counter (also printed as progress)

while flag == 0:
    # z != 1 skips the test on the first pass, where new_pi and pi are still
    # the same initial array and would trivially compare equal.
    if z != 1 and (new_pi == pi).all():
        flag = 1
    else:
        pi = new_pi
        mu = new_mu
        print ("Iteration ", z)

        # E step: hard-assign each sample to the centroid at the smallest
        # Euclidean distance. One vectorized pass per centroid replaces the
        # per-sample Python loop; distances has shape (k, n). argmin keeps
        # the first index on ties, like the previous idxmin-based lookup.
        distances = np.array(
            [np.linalg.norm(train_norm - centroid, axis=1) for centroid in mu]
        )
        nearest = distances.argmin(axis=0)
        new_pi = np.zeros((n, k))
        new_pi[np.arange(train_norm.shape[0]), nearest] = 1

        # M step: move each centroid to the mean of the samples assigned to
        # it. A cluster that received no samples keeps its previous centroid;
        # dividing by its zero count would turn the centroid into NaN, and it
        # could never win an assignment again.
        cluster_sizes = new_pi.sum(axis=0)
        cluster_sums = np.matmul(new_pi.transpose(), train_norm)
        occupied = cluster_sizes > 0
        new_mu = mu.copy()
        new_mu[occupied] = cluster_sums[occupied] / cluster_sizes[occupied, np.newaxis]
        z = z + 1


Iteration  1
Iteration  2
Iteration  3
Iteration  4
Iteration  5
Iteration  6
Iteration  7
Iteration  8
Iteration  9
Iteration  10
Iteration  11
Iteration  12
Iteration  13
Iteration  14
Iteration  15
Iteration  16
Iteration  17
Iteration  18
Iteration  19
Iteration  20
Iteration  21
Iteration  22
Iteration  23
Iteration  24
Iteration  25
Iteration  26
Iteration  27
Iteration  28
Iteration  29
Iteration  30
Iteration  31
Iteration  32
Iteration  33
Iteration  34
Iteration  35
Iteration  36
Iteration  37
Iteration  38
Iteration  39
Iteration  40
