## ML Estimation assuming Gaussian Distribution followed by Bayes Rule for classification

#### Importing Data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from utils import mnist_reader
X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

#### Fixing the ill dimensions and flattening the data

In [2]:
# Initial dimensions of the vectors
print('Number of training images:', X_train.shape)
print('Number of training labels:', y_train.shape)

print('Number of testing images:', X_test.shape)
print('Number of testing labels:', y_test.shape)

Y_train = (y_train.reshape((X_train.shape[0]),1))
Y_test = (y_test.reshape((X_test.shape[0]),1))

print('Number of training images:', X_train.shape)
print('Number of training labels:', Y_train.shape)

print('Number of testing images:', X_test.shape)
print('Number of testing labels:', Y_test.shape)

X_train = X_train/255
X_test = X_test/255

X_trainC = X_train.T
X_testC = X_test.T
Y_testC = Y_test.T
Y_trainC = Y_train.T

print("The matrix used for calculation")
print('Number of training images:', X_trainC.shape)
print('Number of training labels:', Y_trainC.shape)

print('Number of testing images:', X_testC.shape)
print('Number of testing labels:', Y_testC.shape)

Number of training images: (60000, 784)
Number of training labels: (60000,)
Number of testing images: (10000, 784)
Number of testing labels: (10000,)
Number of training images: (60000, 784)
Number of training labels: (60000, 1)
Number of testing images: (10000, 784)
Number of testing labels: (10000, 1)
The matrix used for calculation
Number of training images: (784, 60000)
Number of training labels: (1, 60000)
Number of testing images: (784, 10000)
Number of testing labels: (1, 10000)


#### Defining the mean function which returns the mean and covariance matrix of a given class from the training set gives ans input

In [3]:
def mean(trainDataset): #784 x 6000
    mean = []
    covm = np.cov(trainDataset)
    trainDataset = np.array(trainDataset)
    for index in range (0,trainDataset.shape[0]):
        mean.append(np.mean(trainDataset[index]))
    mean = np.array(mean)
    mean = mean.reshape(mean.shape[0],1)
    #print(covm)
    #print(covm.shape)
    #print(mean.shape)
    return mean, covm

#### Segregating the training classes based on the training labels 

In [4]:
id = 0.5*np.identity(784)
count = 0
train = []
m = []
c = []
mean_for_all_class = []
cov_for_all_class = []
i = 0
for lab in range(0,10):
    while (i != X_train.shape[0]):
        if Y_train[i] == lab:
            count = count +1
            train.append(X_train[i])
        i = i+1
    train = np.array(train)
    m, c = mean(train.T)
    c= c+ id
    mean_for_all_class.append(m)
    cov_for_all_class.append(c)
    #print(train.shape)
    train = []
    i = 0
    count = 0
mean_for_all_class = np.array(mean_for_all_class) #10,784,1
cov_for_all_class = np.array(cov_for_all_class) #10,784,784

In [5]:
print(mean_for_all_class[0].shape)

(784, 1)


#### Defining pdf function to get the value for each test sample, based on which bayes rule for classification is applied

In [6]:
def pdf(sample, mean, covariance): #784,1   784,1   784,784
    x = np.subtract(sample, mean)
    #print(x.shape)
    #print(x.T.shape)
    y = np.matmul(x.T, np.linalg.inv(covariance))
    #print('**',y.shape)
    y = np.dot(y,x) 
    #print('****',y.shape)
    z = (-0.5*y)-(0.5*np.log(np.linalg.det(covariance)))
    #print(z)
    return z
#pdf(X_test[3],mean_for_all_class[3],cov_for_all_class[3])

### Normal ML + Bayes rule of classification (with 784 dimension for each test sample)

In [7]:
pd = []
comp = 0
baye = []
for limit in range(0,X_test.shape[0]):
    for ind in range(0,10):
        X_test2 = X_test[limit].reshape(X_test[limit].shape[0],1)
        #print(X_test2.shape)
        #print(mean_for_all_class[ind].shape)
        #print(cov_for_all_class[ind].shape)
        p = pdf(X_test2,mean_for_all_class[ind],cov_for_all_class[ind])
        if p[0]>comp:
            comp = p[0]
            label = ind
    baye.append(label)
    #print(baye)
    comp = 0
    #print(limit)
#print(len(baye))

In [8]:
# accuracy
eff = 0
for loop in range(0,Y_test.shape[0]):
    if baye[loop]==Y_test[loop]:
        eff = eff + 1
print('The accuracy is- ',(eff*100)/Y_test.shape[0])

The accuracy is-  74.47


### Reducing the dimension using PCA and going through the same pipeline

In [47]:
# using PCA for the ML - bays classifier

from sklearn.decomposition import PCA

pca = PCA(n_components=50) # reducing the dimension from 784 to 50
# pca.fit(X_train)
X_pca_train = pca.fit_transform(X_train)
print("original train shape:   ", X_train.shape)
print("transformed train shape:", X_pca_train.shape)
print(np.sum(pca.explained_variance_ratio_))
# pca.fit(X_test)
X_pca_test = pca.transform(X_test)
print("original test shape:   ", X_test.shape)
print("transformed test shape:", X_pca_test.shape)

original train shape:    (60000, 784)
transformed train shape: (60000, 50)
0.8626437338081597
original test shape:    (10000, 784)
transformed test shape: (10000, 50)


In [48]:
#id = 0.5*np.identity(50)
train_pca = []
m_pca = []
c_pca = []
mean_for_all_class_pca = []
cov_for_all_class_pca = []
i = 0
for lab_pca in range(0,10):
    while (i != X_pca_train.shape[0]):
        if Y_train[i] == lab_pca:
            train_pca.append(X_pca_train[i])
        i = i+1
    train_pca = np.array(train_pca)
    m_pca, c_pca = mean(train_pca.T)
    #c_pca= c_pca+ id
    mean_for_all_class_pca.append(m_pca)
    cov_for_all_class_pca.append(c_pca)
    #print(train.shape)
    train_pca = []
    i = 0
mean_for_all_class_pca = np.array(mean_for_all_class_pca) #10,784,1
cov_for_all_class_pca = np.array(cov_for_all_class_pca) #10,784,784

In [49]:
#print(mean_for_all_class_pca.shape)
#print(np.sum(np.linalg.det(cov_for_all_class_pca)))

In [50]:
comp_pca = 0
baye_pca = []
for limit_pca in range(0,X_pca_test.shape[0]):
    for ind_pca in range(0,10):
        X_test2_pca = X_pca_test[limit_pca].reshape(X_pca_test[limit_pca].shape[0],1)
        #print(X_test2.shape)
        #print(mean_for_all_class[ind].shape)
        #print(cov_for_all_class[ind].shape)
        p_pca = pdf(X_test2_pca,mean_for_all_class_pca[ind_pca],cov_for_all_class_pca[ind_pca])
        #print(p_pca[0]+500)
        if p_pca[0]+500>comp_pca:
            comp_pca = p_pca[0]+500
            label_pca = ind_pca
    baye_pca.append(label_pca)
    #print(baye_pca)
    comp_pca = 0
    #print(limit_pca)
#print(len(baye_pca))

In [51]:
# accuracy
eff = 0
for loop_pca in range(0,Y_test.shape[0]):
    if baye_pca[loop_pca]==Y_test[loop_pca]:
        eff = eff + 1
print('The accuracy is- ',(eff*100)/Y_test.shape[0])

The accuracy is-  79.93


### Reducing the dimension from 50 to 9 usning LDA and giving the PCA data as input and going through the same pipeline

In [62]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
X_lda_train = lda.fit_transform(X_pca_train,y_train)
X_lda_test = lda.fit_transform(X_pca_test,y_test)

print("original train shape:   ", X_pca_train.shape)
print("transformed train shape:", X_lda_train.shape)

print("original test shape:   ", X_pca_test.shape)
print("transformed test shape:", X_lda_test.shape)

original train shape:    (60000, 50)
transformed train shape: (60000, 9)
original test shape:    (10000, 50)
transformed test shape: (10000, 9)


In [63]:
#id = 0.5*np.identity(784)
train_lda = []
m_lda = []
c_lda = []
mean_for_all_class_lda = []
cov_for_all_class_lda = []
i = 0
for lab_lda in range(0,10):
    while (i != X_lda_train.shape[0]):
        if Y_train[i] == lab_lda:
            train_lda.append(X_lda_train[i])
        i = i+1
    train_lda = np.array(train_lda)
    m_lda, c_lda = mean(train_lda.T)
    #c= c+ id
    mean_for_all_class_lda.append(m_lda)
    cov_for_all_class_lda.append(c_lda)
    #print(train.shape)
    train_lda = []
    i = 0
mean_for_all_class_lda = np.array(mean_for_all_class_lda) #10,784,1
cov_for_all_class_lda = np.array(cov_for_all_class_lda) #10,784,784

In [64]:
# printing some datas to confirm the right path
#print(mean_for_all_class_lda.shape)
#print(np.linalg.det(cov_for_all_class_lda[3]))

In [65]:
comp_lda = 0
baye_lda = []
for limit_lda in range(0,X_lda_test.shape[0]):
    for ind_lda in range(0,10):
        X_test2_lda = X_lda_test[limit_lda].reshape(X_lda_test[limit_lda].shape[0],1)
        #print(X_test2.shape)
        #print(mean_for_all_class[ind].shape)
        #print(cov_for_all_class[ind].shape)
        p_lda = pdf(X_test2_lda,mean_for_all_class_lda[ind_lda],cov_for_all_class_lda[ind_lda])
        #print(p_pca[0]+1000)
        if p_lda[0]+500>comp_lda:
            comp_lda = p_lda[0]+500
            label_lda = ind_lda
    baye_lda.append(label_lda)
    #print(baye_lda)
    comp_lda = 0
    #print(limit_lda)
#print(len(baye_lda))

In [66]:
# accuracy
eff = 0
for loop_lda in range(0,Y_test.shape[0]):
    if baye_lda[loop_lda]==Y_test[loop_lda]:
        eff = eff + 1
print('The accuracy is- ',(eff*100)/Y_test.shape[0])

The accuracy is-  80.0
