## Nearest Neighbour rule for classification

#### Importing the datasets

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import math
from utils import mnist_reader
# Read the 'train' and 't10k' splits from data/fashion through the project's
# mnist_reader helper. Each call yields an (images, labels) pair, unpacked
# here into the four arrays the rest of the notebook works with.
(X_train, y_train), (X_test, y_test) = (
    mnist_reader.load_mnist('data/fashion', kind=split)
    for split in ('train', 't10k')
)

#### Fixing the shapes of the label arrays (the images are already flattened to 784 values per sample) and rescaling the pixel values for use with NumPy

In [15]:
# Shapes of the arrays exactly as they were loaded.
print('Number of training images:', X_train.shape)
print('Number of training labels:', y_train.shape)

print('Number of testing images:', X_test.shape)
print('Number of testing labels:', y_test.shape)

# Column-vector (n_samples, 1) versions of the labels. The 1-D y_train / y_test
# are left untouched because the scikit-learn cells below still use them.
Y_train = y_train.reshape(X_train.shape[0], 1)
Y_test = y_test.reshape(X_test.shape[0], 1)

# Shapes after the label reshape (the image arrays are unchanged).
print('Number of training images:', X_train.shape)
print('Number of training labels:', Y_train.shape)

print('Number of testing images:', X_test.shape)
print('Number of testing labels:', Y_test.shape)

# Rescale the pixel intensities by 255.
# NOTE(review): assumes the raw pixels span 0-255 (suggested by the divisor) -
# confirm against mnist_reader.
# The guard keeps this cell idempotent: an unconditional `X = X / 255` divides
# again every time the cell is re-run, silently shrinking the features.
if X_train.max() > 1:
    X_train = X_train / 255
if X_test.max() > 1:
    X_test = X_test / 255


Number of training images: (60000, 784)
Number of training labels: (60000,)
Number of testing images: (10000, 784)
Number of testing labels: (10000,)
Number of training images: (60000, 784)
Number of training labels: (60000, 1)
Number of testing images: (10000, 784)
Number of testing labels: (10000, 1)


#### Plain 1-nearest-neighbour classifier (hand-written)

In [3]:
# Brute-force 1-nearest-neighbour classification on the raw pixel features.
# For each test sample, find the training sample with the smallest squared
# Euclidean distance and copy its label. Squaring preserves the ordering of
# distances, so no square root is needed for the argmin.
# Each stored label is the matching row of Y_train (a length-1 array), which
# is the form the accuracy cell below compares against Y_test.
all_lables = []
for test_idx in range(Y_test.shape[0]):
    # Broadcasting subtracts this one test row from every training row.
    # This gives the same differences as tiling the row into a full-size copy,
    # without hard-coding the training-set size or allocating that copy.
    diff = X_train - X_test[test_idx]
    sq_dist = np.sum(np.square(diff), axis=1)
    nearest = np.argmin(sq_dist)
    all_lables.append(Y_train[nearest])
       
            

In [4]:
# accuracy: percentage of test samples whose predicted label matches the
# ground-truth label in Y_test
eff = sum(
    1
    for idx in range(Y_test.shape[0])
    if all_lables[idx] == Y_test[idx]
)
print('The accuracy is- ', (eff*100)/Y_test.shape[0])

The accuracy is-  84.97


#### k-nearest-neighbour classifier (scikit-learn built-in). All the experiments with different values of k (the number of neighbours) were run here

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# scikit-learn k-NN baseline: majority vote among the 5 nearest training
# samples under the Euclidean metric. fit() returns the fitted estimator, so
# construction and fitting are chained. The 1-D y_train is used for fitting,
# so y_pred comes back as a 1-D array of labels.
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean').fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [6]:
# accuracy: percentage of test samples for which the scikit-learn prediction
# equals the ground-truth label in Y_test
eff = sum(
    1
    for idx in range(Y_test.shape[0])
    if y_pred[idx] == Y_test[idx]
)
print('The accuracy is- ', (eff*100)/Y_test.shape[0])

The accuracy is-  85.54


### Applying PCA to reduce the dimension from 784 to 100 and going through the same pipeline

In [19]:
from sklearn.decomposition import PCA

# Project the 784-dimensional pixel vectors onto the first 100 principal
# components. random_state is pinned so the projection is reproducible when
# scikit-learn selects a randomized SVD solver for an input of this size.
pca = PCA(n_components=100, random_state=0)

# The components are learned from the training images only.
X_pca_train = pca.fit_transform(X_train)
print("original train shape:   ", X_train.shape)
print("transformed train shape:", X_pca_train.shape)
# Fraction of the total training variance retained by the kept components.
print(np.sum(pca.explained_variance_ratio_))

# The test images are projected with the already-fitted PCA (no refit), so
# the two splits share one feature space and no test information leaks in.
X_pca_test = pca.transform(X_test)
print("original test shape:   ", X_test.shape)
print("transformed test shape:", X_pca_test.shape)

original train shape:    (60000, 784)
transformed train shape: (60000, 100)
0.9118044250550499
original test shape:    (10000, 784)
transformed test shape: (10000, 100)


#### Printing after every 500 test samples have been labeled

In [20]:
# Brute-force 1-nearest-neighbour classification in the PCA feature space.
# For each projected test sample, the label of the training sample with the
# smallest squared Euclidean distance is stored (as the matching row of
# Y_train, a length-1 array, which is what the accuracy cell expects).
all_lables_pca = []
for test_idx in range(Y_test.shape[0]):
    # Broadcasting subtracts this one test row from every training row.
    # This gives the same differences as tiling the row into a full-size copy,
    # without hard-coding the training-set size or allocating that copy.
    diff = X_pca_train - X_pca_test[test_idx]
    sq_dist = np.sum(np.square(diff), axis=1)
    nearest = np.argmin(sq_dist)
    all_lables_pca.append(Y_train[nearest])
    # Progress indicator: print the index of every 500th test sample.
    if test_idx % 500 == 0:
        print(test_idx)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


In [21]:
# accuracy: percentage of test samples whose PCA-space 1-NN label matches the
# ground-truth label in Y_test
eff = sum(
    1
    for idx in range(Y_test.shape[0])
    if all_lables_pca[idx] == Y_test[idx]
)
print('The accuracy is- ', (eff*100)/Y_test.shape[0])

The accuracy is-  84.82


### Using LDA to decrease the dimension from 100 to 9, feeding it the PCA-transformed features instead of the original data, and going through the same pipeline

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Supervised dimensionality reduction on top of the PCA features. LDA is
# fitted on the training projection with the 1-D training labels, and the
# test projection is mapped through the same fitted transform.
lda = LinearDiscriminantAnalysis()
X_lda_train = lda.fit_transform(X_pca_train, y_train)
X_lda_test = lda.transform(X_pca_test)

# Report the shapes before and after the LDA projection for both splits.
for caption, matrix in (
    ("original train shape:   ", X_pca_train),
    ("transformed train shape:", X_lda_train),
    ("original test shape:   ", X_pca_test),
    ("transformed test shape:", X_lda_test),
):
    print(caption, matrix.shape)

original train shape:    (60000, 100)
transformed train shape: (60000, 9)
original test shape:    (10000, 100)
transformed test shape: (10000, 9)


#### Printing after every 500 test samples have been labeled

In [23]:
# Brute-force 1-nearest-neighbour classification in the LDA feature space.
# For each projected test sample, the label of the training sample with the
# smallest squared Euclidean distance is stored (as the matching row of
# Y_train, a length-1 array, which is what the accuracy cell expects).
all_lables_lda = []
for test_idx in range(Y_test.shape[0]):
    # Broadcasting subtracts this one test row from every training row.
    # This gives the same differences as tiling the row into a full-size copy,
    # without hard-coding the training-set size or allocating that copy.
    diff = X_lda_train - X_lda_test[test_idx]
    sq_dist = np.sum(np.square(diff), axis=1)
    nearest = np.argmin(sq_dist)
    all_lables_lda.append(Y_train[nearest])
    # Progress indicator: print the index of every 500th test sample.
    if test_idx % 500 == 0:
        print(test_idx)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


In [24]:
# accuracy: percentage of test samples whose LDA-space 1-NN label matches the
# ground-truth label in Y_test
eff = sum(
    1
    for idx in range(Y_test.shape[0])
    if all_lables_lda[idx] == Y_test[idx]
)
print('The accuracy is- ', (eff*100)/Y_test.shape[0])

The accuracy is-  79.06
