In [None]:
%cd /content/drive/MyDrive/Assignment_2

/content/drive/MyDrive/Assignment_2


In [None]:
import scipy.io
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn import svm
import heapq
from sklearn.metrics import confusion_matrix

In [None]:
#loading svhn dataset
# Reads the SVHN training .mat file and turns every colour image into one
# flattened grayscale row of `Data` (one row per image).
svhn=scipy.io.loadmat("train_32x32.mat")
X=svhn["X"]  # indexed below as X[:,:,:,i]: the last axis enumerates the images
y=svhn['y']
# Derive the sizes from the array itself instead of hard-coding 73257 / 1024,
# so the cell keeps working for other splits of the dataset.
n_images=X.shape[3]
n_pixels=X.shape[0]*X.shape[1]
Data=np.zeros((n_images,n_pixels))
for i in range(n_images):
  # The images are not loaded through cv2.imread, so they are presumably in
  # RGB channel order (TODO confirm against the dataset description);
  # COLOR_BGR2GRAY would swap the red and blue luminance weights.
  temp=cv2.cvtColor(X[:,:,:,i],cv2.COLOR_RGB2GRAY)
  Data[i,:]=temp.flatten()

In [None]:
def PCA(X , k):
    """Project ``X`` onto its ``k`` leading principal components.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Input data; it is mean-centred per feature before projection.
    k : int
        Number of components to keep. If ``k`` exceeds ``n_features`` the
        slice simply returns all available components.

    Returns
    -------
    ndarray of shape (n_samples, min(k, n_features))
        The centred data expressed in the basis of the top-``k``
        eigenvectors of the sample covariance matrix. The sign of each
        column follows whatever sign ``np.linalg.eigh`` returns.
    """
    X_shifted = X - np.mean(X , axis = 0)
    # np.cov collapses to a 0-d array for a single feature; eigh needs 2-D.
    sigma = np.atleast_2d(np.cov(X_shifted , rowvar = False))
    eigen_values , eigen_vectors = np.linalg.eigh(sigma)
    # Order components by decreasing explained variance.
    sorted_index = np.argsort(eigen_values)[::-1]
    eigenvector_subset = eigen_vectors[:, sorted_index[:k]]
    # (n_samples, n_features) @ (n_features, k) -> (n_samples, k)
    return np.dot(X_shifted , eigenvector_subset)

In [None]:
#standardizing data
# Note: this is a global z-score (one scalar mean and one scalar std over all
# entries of Data), not a per-feature standardisation.
mu=np.mean(Data)
sig=np.std(Data)
data=(Data-mu)/sig
k=10  # NOTE(review): not read by the PCA call below; kept in case another cell uses it
N_COMPONENTS=20  # number of principal components kept by the reduction
data_p=PCA(data,N_COMPONENTS)


In [None]:
class K_Means:
    """Plain Lloyd-style k-means clustering.

    Parameters
    ----------
    k : int
        Number of clusters.
    tol : float
        Convergence threshold: fitting stops once, for every centroid, the
        summed absolute percent change of its components is at most ``tol``.
    max_iter : int
        Upper bound on the number of assignment/update rounds.

    Attributes set by ``fit``
    -------------------------
    centroids : dict[int, ndarray]
        Cluster index -> centroid.
    classifications : dict[int, list]
        Cluster index -> list of the training samples assigned to it in the
        last round.
    """

    def __init__(self, k=2, tol=0.00001, max_iter=3000):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    @staticmethod
    def _percent_shift(previous, current):
        """Return the summed absolute percent change from ``previous`` to ``current``.

        Components whose previous value is exactly zero have no defined
        relative change; for those the absolute change (x100) is used
        instead of dividing by zero.
        """
        previous = np.asarray(previous, dtype=float)
        current = np.asarray(current, dtype=float)
        scale = np.abs(previous)
        safe_scale = np.where(scale > 0, scale, 1.0)
        return float(np.sum(np.abs(current - previous) / safe_scale * 100.0))

    def fit(self,data):
        """Cluster ``data`` (indexable sequence of samples) into ``self.k`` groups.

        The first ``k`` samples are used as the initial centroids.

        Raises
        ------
        ValueError
            If ``data`` holds fewer than ``k`` samples.
        """
        if len(data) < self.k:
            raise ValueError(
                "K_Means needs at least k=%d samples, got %d" % (self.k, len(data))
            )

        self.centroids = {}

        for idx in range(self.k):
            self.centroids[idx] = data[idx]

        for _ in range(self.max_iter):
            self.classifications = {idx: [] for idx in range(self.k)}

            # Assignment step: each sample goes to its nearest centroid.
            for featureset in data:
                self.classifications[self.predict(featureset)].append(featureset)

            prev_centroids = dict(self.centroids)

            # Update step. A cluster that received no samples keeps its
            # previous centroid instead of becoming NaN (mean of nothing).
            for label, members in self.classifications.items():
                if members:
                    self.centroids[label] = np.average(members, axis=0)

            # Absolute changes are used so that moves in the negative
            # direction (or moves that cancel across components) are not
            # mistaken for convergence.
            moved = any(
                self._percent_shift(prev_centroids[c], self.centroids[c]) > self.tol
                for c in self.centroids
            )
            if not moved:
                break

    def predict(self,data):
        """Return the index of the centroid closest (Euclidean) to sample ``data``.

        Ties are resolved in favour of the lowest cluster index.
        """
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

In [None]:
#prepares data for the Classifier, labels each cluster as a class.
def make_data(model):
  """Flatten a fitted clustering model into a supervised (samples, labels) pair.

  ``model`` must expose ``centroids`` (iterated for the cluster indices) and
  ``classifications`` (cluster index -> collection of samples).

  Returns a tuple ``(data, y)`` of two equally long lists: the samples,
  grouped cluster by cluster in the iteration order of ``model.centroids``,
  and the cluster index of each sample.
  """
  data=[]
  y=[]
  for label in model.centroids:
    members=model.classifications[label]
    # extend() appends in place; `data = data + members` re-copied the whole
    # accumulated list on every cluster.
    data.extend(members)
    y.extend([label]*len(members))
  return data,y


In [None]:
#prepares label for test data
def make_y(test_x,model):
  """Label every row of ``test_x`` with ``model.predict``.

  ``test_x`` is indexed as ``test_x[row, :]`` and the result is a float64
  vector with one entry per row, holding whatever ``model.predict`` returned
  for that row.
  """
  n_rows=test_x.shape[0]
  labels=[model.predict(test_x[row,:]) for row in range(n_rows)]
  return np.array(labels,dtype=np.float64).reshape(n_rows)

In [None]:
#Supervised learning part
# Draw a random subset of the PCA-reduced samples and split it 80/20.
# A seeded generator makes the split reproducible, and indexing with a
# permutation builds a new array, so data_p itself is no longer reordered
# in place every time this cell is run.
SEED=0
N_SAMPLES=10000
rng=np.random.default_rng(SEED)
data=data_p[rng.permutation(data_p.shape[0])[:N_SAMPLES],:]
n_train=80*data.shape[0]//100  # first 80% of the subset is used for training
train_x=data[:n_train,:]
test_x=data[n_train:,:]

In [None]:
#evaluates SVM over the dataset
# For each cluster count: fit k-means on train_x, use the cluster indices as
# class labels, train a linear SVM on them, and record (accuracy, confusion
# matrix) on test_x labelled by the same k-means model.
ks=[3,5,10,15,20]
results_linear=[]
for i in ks:
  model=K_Means(k=i,max_iter=10000000)
  model.fit(train_x)
  # make_data returns the samples grouped by cluster. They go into separate
  # names so train_x is not overwritten: otherwise every later k-means fit
  # (and any later cell) would start from the cluster-sorted copy.
  svm_train_x,svm_train_y=make_data(model)
  svm_train_x=np.array(svm_train_x)
  svm_train_y=np.array(svm_train_y)
  # make_y returns float labels; cast so they match the integer class labels.
  svm_test_y=make_y(test_x,model).astype(int)

  linear = svm.SVC(kernel='linear',  C=0.1, decision_function_shape='ovo').fit(svm_train_x, svm_train_y)
  # Predict once and derive both metrics from the same predictions.
  preds=linear.predict(test_x)
  accuracy_linear = float(np.mean(preds==svm_test_y))
  cml=confusion_matrix(svm_test_y,preds)
  results_linear.append((accuracy_linear,cml))
 

In [None]:
# Training a neural net for the same purpose.
# For each cluster count: fit k-means on train_x, use the cluster indices as
# one-hot class labels, train the network `itt` times from scratch and keep
# the best test accuracy in models_acc.
from keras.models import Sequential
from keras.layers import Dense,Dropout
import keras
models_acc=[]
noc=3
for i in [3,5,10,15,20]:
  noc=i  # NOTE(review): not read in this cell; kept in case another cell uses it
  cluster_model=K_Means(k=i,max_iter=10000000)
  cluster_model.fit(train_x)
  # make_data returns the samples grouped by cluster. They go into separate
  # names so train_x is not overwritten and the next k-means fit starts from
  # the same data as this one.
  nn_train_x,nn_train_y=make_data(cluster_model)
  nn_train_x=np.array(nn_train_x)
  nn_train_y=np.array(nn_train_y)
  nn_test_y=make_y(test_x,cluster_model)

  nn_train_y=keras.utils.to_categorical(nn_train_y,i)
  nn_test_y=keras.utils.to_categorical(nn_test_y,i)
  n_features=nn_train_x.shape[1]  # taken from the data instead of a hard-coded 20
  itt=5
  best_acc=-1
  for _ in range(itt):
    nn_model=Sequential()
    nn_model.add(Dense(32,input_shape=(n_features,),activation='relu'))
    nn_model.add(Dropout(0.2))
    nn_model.add(Dense(28, activation='relu'))
    nn_model.add(Dropout(0.2))
    nn_model.add(Dense(25, activation='relu'))
    nn_model.add(Dropout(0.2))
    nn_model.add(Dense(25, activation='relu'))
    nn_model.add(Dropout(0.2))
    nn_model.add(Dense(25, activation='relu'))
    nn_model.add(Dropout(0.2))
    # Each sample belongs to exactly one cluster, so the output must be a
    # probability distribution over the i classes: softmax, not sigmoid.
    nn_model.add(Dense(i, activation='softmax'))

    nn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # verbose=0: 5 cluster counts x 5 restarts x 96 epochs would otherwise
    # flood the cell output with per-epoch logs.
    nn_model.fit(nn_train_x,nn_train_y, epochs=96, batch_size=128, verbose=0)
    _, accuracy = nn_model.evaluate(test_x,nn_test_y, verbose=0)
    if best_acc<accuracy:
      best_acc=accuracy
  models_acc.append(best_acc)
  print(f"k={i}: best test accuracy over {itt} runs = {best_acc:.4f}")