In [5]:
######################################################### Intro to ML ###############################################################
######################################################## Assignment-2 ##########################################################



## Extracting the data set downloaded from http://www.cs.toronto.edu/~kriz/cifar.html

import tarfile
file = tarfile.open(r"C:\Users\rames\Downloads\cifar-10-python.tar.gz")
file.extractall(r"C:\Users\rames\Downloads")
file.close()

In [55]:
## Importing the libraries

import cv2
import pickle
import numpy as np
from scipy.spatial import distance
import collections
import random
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances
from validclust import dunn

In [28]:
######################################################### PART 2 ###############################################################
## Unpickling the data set as per instructions in the site http://www.cs.toronto.edu/~kriz/cifar.html 

def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
Dict=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py\test_batch")

In [29]:
## Loading the test batch's  data and labels

data = Dict[b'data']
labels = Dict[b'labels']

In [30]:
## Data preprocessing for test dataset

data_updated=[]
for i in range(len(data)):
    img=data[i].reshape(3,32,32).transpose(1,2,0).astype("uint8")
    gray_image=cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    dummy_norm=np.zeros((32,32))
    min_=gray_image.min()
    max_=gray_image.max()
    norm_gray_image=cv2.normalize(gray_image,dummy_norm,min_,max_,cv2.NORM_MINMAX)
    data_updated.append(norm_gray_image)

In [48]:
## Updated data array

data_updated_arr=np.array(data_updated)
#print(data_updated_arr[0])
data_updated_arr=data_updated_arr.reshape(10000,1024)

[[ 98  97 102 ...  82  79  73]
 [ 98  94  98 ...  81  77  75]
 [ 96  91  94 ...  84  81  76]
 ...
 [133 108  96 ... 105  71  92]
 [125 110  93 ...  90  89  72]
 [117 113  97 ...  85  92  75]]


In [52]:
## K Means clustering algorithm implentation

def initialize_centroids(value, inputs):
    centroids_dict={}
    for i in range(value):
        centroids_dict[str(i)]=inputs[random.randint(0,len(inputs)-1)]
    return centroids_dict

def assign_cluster(image,centroids,cluster):
    min_=float('inf')
    for key,value in centroids.items():
        dst=distance.euclidean(image,value)
        if dst < min_:
            min_=dst
            cluster_id=key
    if cluster_id in cluster:
        cluster[cluster_id].append(image)
    else:
        cluster[cluster_id]=[image]
    return cluster

def compute_centroid(cluster):
    updated_centroids_dict={}
    for key,value in cluster.items():
        value_arr=np.array(value)
        #print('avg',np.mean(value_arr,0))
        #print('len',len(np.mean(value_arr,0)))
        updated_centroids_dict[key]=np.mean(value_arr,0)
    return updated_centroids_dict
    
    

In [53]:
## Training the model

iterations=10
num_clusters=10
centroids_dict=initialize_centroids(num_clusters,data_updated_arr)
for i in range(iterations):
    image_cluster={}
    for img in data_updated_arr:
        image_cluster=assign_cluster(img,centroids_dict,image_cluster)
    centroids_dict=compute_centroid(image_cluster)  
centroids_dict=dict(sorted(centroids_dict.items()))
print(image_cluster)

{'7': [array([ 98,  97, 102, ...,  85,  92,  75], dtype=uint8), array([90, 90, 89, ..., 39, 39, 37], dtype=uint8), array([57, 52, 70, ..., 86, 74, 36], dtype=uint8), array([79, 79, 78, ..., 36, 16, 17], dtype=uint8), array([ 42,  38,  37, ..., 196, 199, 180], dtype=uint8), array([24, 32, 31, ..., 74, 64, 65], dtype=uint8), array([116, 115, 114, ...,  24,  29,  35], dtype=uint8), array([106,  83,  86, ..., 125, 127, 113], dtype=uint8), array([50, 58, 63, ..., 23, 19, 20], dtype=uint8), array([152, 152, 151, ...,  20,  19,  16], dtype=uint8), array([46, 98, 80, ..., 83, 73, 31], dtype=uint8), array([ 63,  61,  60, ..., 100, 100, 100], dtype=uint8), array([146, 141, 144, ...,  35,  35,  34], dtype=uint8), array([17, 17, 15, ..., 12, 12, 12], dtype=uint8), array([ 83,  89,  81, ..., 144, 124, 148], dtype=uint8), array([ 58,  54,  52, ..., 126, 131, 134], dtype=uint8), array([54, 56, 48, ...,  3,  1,  1], dtype=uint8), array([88, 91, 93, ..., 80, 81, 67], dtype=uint8), array([ 85, 106, 101,

In [54]:
##Silhouette score calculation

inputs=[]
outputs=[]
for key,values in image_cluster.items():
    num_items=len(image_cluster[key])
    inputs+=values
    outputs+=num_items*[int(key)]
inputs=np.asarray(inputs)
outputs=np.asarray(outputs)
silhouette_score(inputs, outputs)

0.05406823008462551

In [66]:
print(inputs)

[[  0.      237.3564    0.      ... 323.81488   0.        0.     ]
 [  0.      151.01584   0.      ... 444.0542    0.        0.     ]
 [  0.      206.71893   0.      ... 382.16437   0.        0.     ]
 ...
 [  0.      325.51718   0.      ... 309.48196   0.        0.     ]
 [  0.      155.73557   0.      ... 154.86514   0.        0.     ]
 [  0.      446.00546   0.      ... 463.74463   0.        0.     ]]


In [56]:
## Dunn's index

dist = pairwise_distances(data) 
labels=np.asarray(labels)
dunn(dist, labels)

0.055636835992773456

In [7]:
######################################################### PART 2 ###############################################################

## Unpickling the data set as per instructions in the site http://www.cs.toronto.edu/~kriz/cifar.html 

Dict1=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py\data_batch_1")
data1 = list(Dict1[b'data'])
labels1 = Dict1[b'labels']
Dict2=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py/data_batch_2")
data2 = list(Dict2[b'data'])
labels2 = Dict2[b'labels']
Dict3=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py/data_batch_3")
data3 = list(Dict3[b'data'])
labels3 = Dict3[b'labels']
Dict4=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py/data_batch_4")
data4 = list(Dict4[b'data'])
labels4 = Dict4[b'labels']
Dict5=unpickle(r"C:\Users\rames\Downloads\cifar-10-batches-py/data_batch_5")
data5 = list(Dict5[b'data'])
labels5 = Dict5[b'labels']

In [65]:
data_auto_encoder=[]
labels_auto_encoder=[]
for i in range(1,6):
   data_auto_encoder+=(eval('data'+str(i)))
   labels_auto_encoder+=(eval('data'+str(i)))
data_auto_encoder=np.asarray(data_auto_encoder)
data_auto_encoder.shape

(50000, 3072)

In [58]:
##Data preprocessing of train dataset

data_updated=[]
for i in range(len(data_auto_encoder)):
    img=data_auto_encoder[i].reshape(3,32,32).transpose(1,2,0).astype("uint8")
    gray_image=cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    dummy_norm=np.zeros((32,32))
    min_=gray_image.min()
    max_=gray_image.max()
    norm_gray_image=cv2.normalize(gray_image,dummy_norm,min_,max_,cv2.NORM_MINMAX)
    data_updated.append(norm_gray_image)
data_updated_arr=np.array(data_updated)
data_updated_arr=data_updated_arr.reshape(50000,1024)

In [59]:
import keras
from keras import layers
from keras import regularizers

In [76]:
## Neural Network(Encoder and Decoder)

input_img = keras.Input(shape=(1024,))
encoded = layers.Dense(512, activation='relu')(input_img)
encoded = layers.Dense(256, activation='relu')(encoded)
encoded = layers.Dense(128, activation='relu')(encoded)
decoded = layers.Dense(128, activation='relu')(encoded)
decoded = layers.Dense(256, activation='relu')(decoded)
decoded = layers.Dense(1024, activation='relu')(decoded)

In [77]:
##Training the model

autoencoder = keras.Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

autoencoder.fit(data_updated_arr, data_updated_arr,
                epochs=45,
                batch_size=256,
                shuffle=True,)

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<tensorflow.python.keras.callbacks.History at 0x2288c20c820>

In [78]:
encoder = keras.Model(input_img, encoded)
encoded_data_updated_arr = encoder.predict(data_updated_arr)

In [79]:
##K-Means Clustering from Part-1

iterations=20
num_clusters=10
centroids_dict_auto_encoder=initialize_centroids(num_clusters, encoded_data_updated_arr)
for i in range(iterations):
    image_cluster_auto_encoder={}
    for img in encoded_data_updated_arr:
      image_cluster_auto_encoder=assign_cluster(img,centroids_dict_auto_encoder,image_cluster_auto_encoder)
    centroids_dict_auto_encoder=compute_centroid(image_cluster_auto_encoder)
centroids_dict_auto_encoder=dict(sorted(centroids_dict_auto_encoder.items()))

In [80]:
##Silhouette score calculation

inputs=[]
outputs=[]
for key,values in image_cluster_auto_encoder.items():
    num_items=len(image_cluster_auto_encoder[key])
    inputs+=values
    outputs+=num_items*[int(key)]
inputs=np.asarray(inputs)
outputs=np.asarray(outputs)
silhouette_score(inputs, outputs)

0.06839757