# K-Nearest Neighbors classifier on MNIST using scikit-learn's library classifier (`KNeighborsClassifier`)

In [128]:
from keras.datasets import mnist 
from sklearn.metrics import confusion_matrix as cm
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.utils import shuffle
from sklearn.metrics import classification_report as cr
import timeit as tt
import numpy as np
import pandas as pd
import random

Calculations and definitions of variables

In [129]:
n = 10  # number of classes; sizes the confusion matrix and bounds the for-loops
k = 44  # our USNs end with 17 and 44, so the 'k' value is 44
dp = (17 + 44) * 100 + 1000  # number of data points: (17 + 44) x 100 + 1000 = 7100
trp = dp * 80 // 100  # training points: 80% of 7100 = 5680
tp = dp - trp  # testing points: the remaining 20% of 7100 = 1420

Defining the training and testing data

In [130]:
# Load the train/test split provided by keras.datasets.mnist
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Keep only the first `dp` samples of each split and flatten every sample
# into a single row of 28*28 values so it can be fed to the classifier.
train_X = train_X[:dp].reshape((dp, 28*28))
train_y = train_y[:dp]  # labels matching the kept training samples
test_X = test_X[:dp].reshape((dp, 28*28))
test_y = test_y[:dp]  # labels matching the kept testing samples

Custom shuffle method for training data

In [152]:
# implementing mnist from keras in sklearn's train_test_split was complicated
# Source: Sai Nishwanth, USN: ENG21AM3031
def shufTrainData(random_state=None):
    """Return a consistently shuffled copy of the training data.

    Reads the module-level ``train_X`` and ``train_y`` and shuffles them
    together, so every sample keeps its own label.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to make the
        shuffle reproducible; the default ``None`` keeps the previous
        behaviour (a different permutation on every call).

    Returns
    -------
    tuple
        ``(X_shuf, y_shuf)`` -- the shuffled samples and their labels.
    """
    X_shuf, y_shuf = shuffle(train_X, train_y, random_state=random_state)
    return (X_shuf, y_shuf)

Custom shuffle method for testing data

In [147]:
def shufTestData(random_state=None):
    """Return a consistently shuffled copy of the testing data.

    Reads the module-level ``test_X`` and ``test_y`` and shuffles them
    together, so every sample keeps its own label.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to make the
        shuffle reproducible; the default ``None`` keeps the previous
        behaviour (a different permutation on every call).

    Returns
    -------
    tuple
        ``(X_shuf, y_shuf)`` -- the shuffled samples and their labels.
    """
    X_shuf, y_shuf = shuffle(test_X, test_y, random_state=random_state)
    return (X_shuf, y_shuf)

Method to calculate the average runtime

In [148]:
def calcAvg(tAvg, total_1, iteration=None):
    """Fold the runtime of one more iteration into the running mean runtime.

    Parameters
    ----------
    tAvg : float
        Mean runtime of the iterations processed so far. Its value is
        irrelevant for the very first iteration (``iteration == 0``).
    total_1 : float
        Runtime of the iteration that has just finished.
    iteration : int, optional
        Zero-based index of the iteration that has just finished, i.e. the
        number of runtimes already folded into ``tAvg``. When omitted, the
        module-level loop variable ``i`` is used, so existing two-argument
        calls made from inside a ``for i in ...`` loop keep working.

    Returns
    -------
    float
        Arithmetic mean of all runtimes up to and including ``total_1``.
    """
    if iteration is None:
        # Backward compatibility: fall back to the notebook's global loop counter.
        iteration = i
    # Incremental mean: the previous mean stands for `iteration` samples and
    # the new runtime for one. Simply halving (tAvg + total_1) would give the
    # latest iteration as much weight as all earlier iterations combined.
    return (tAvg * iteration + total_1) / (iteration + 1)

Cross Validation

In [149]:
# Running totals over all iterations
cm_0 = np.zeros((n, n)) # Accumulated confusion matrix (rows: true label, columns: predicted label)
prec = np.zeros(n) # Sum of the per-iteration precision of every class

# To calculate total runtime for k iterations
# Source: https://stackoverflow.com/questions/5622976/how-do-you-calculate-program-run-time-in-python
start_0 = tt.default_timer()
t_avg = 0

# NOTE: the loop variable has to stay named `i` -- calcAvg() reads it as a global
for i in range(k):
    start_1 = tt.default_timer() # To calculate runtime for each iteration
    print(f'Starting iteration - {i}') #To keep a track of how many iterations have finished
    knn = kNN(n_neighbors = k)

    # Draw a fresh random train/test sample for this iteration
    X_train, y_train = shufTrainData()
    X_test, y_test = shufTestData()
    X_train = X_train[:trp]
    y_train = y_train[:trp]
    X_test = X_test[:tp]
    y_test = y_test[:tp]

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Pin the label set so the matrix is always n x n, even if some class
    # happens to be absent from both y_test and y_pred in this sample.
    cm_2 = cm(y_test, y_pred, labels = np.arange(n))
    cm_0 = cm_0 + cm_2

    # Precision of this iteration only: correct predictions of a class divided
    # by how often that class was predicted (column sum). Classes that were
    # never predicted contribute 0 instead of a division by zero.
    predicted = cm_2.sum(axis = 0)
    prec += np.divide(cm_2.diagonal(), predicted, out = np.zeros(n), where = predicted > 0)

    print(f'Ending iteration - {i}')
    stop_1 = tt.default_timer() # To calculate runtime for each iteration
    total_1 = (stop_1 - start_1)
    t_avg = calcAvg(t_avg, total_1)
    print(f'Total runtime for iteration \'{i}\': {total_1} seconds.\n')

cm_1 = cm_0 # Same accumulated matrix under its previous name, kept for any cell that refers to cm_1

# To calculate total runtime for k iterations
stop_0 = tt.default_timer()
total_0 = (stop_0 - start_0) # Total runtime, in seconds
for i in range(n):
    print(f'Precision of number {i} - {(prec[i]/k)*100} %')

cm_avg = cm_0/k # Creating a confusion matrix to store the average value
print(cm_avg)
print(f'Total runtime for {k} iterations: {total_0} seconds.') 
print(f'Average runtime per iteration: {t_avg} seconds.') 

Starting iteration - 0
[[128   0   0   0   0   1   1   0   0   0]
 [  0 164   0   0   0   0   1   0   0   0]
 [  2  25 126   1   1   0   2   5   5   0]
 [  0   6   0 128   1   0   0   5   3   2]
 [  0   4   0   0 107   0   2   0   0  21]
 [  1   3   0   4   3  90   2   0   1   6]
 [  2   1   0   0   3   1 131   0   0   0]
 [  0   7   0   0   2   0   0 109   0   9]
 [  5   4   1   7   2   1   3   3 117   3]
 [  2   3   0   3   0   0   0   2   0 148]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Ending iteration - 0
Total runtime for iteration '0': 0.2215804580009717 seconds.

Starting iteration - 1
[[157   0   0   0   0   4   2   0   0   0]
 [  0 157   0   0   0   0   0   0   0   0]
 [  5  18 119   1   0  

Creating a Pandas Data Frame for better viewing of the result

In [150]:
# Wrap the averaged confusion matrix in a DataFrame so the notebook renders it as a table
df = pd.DataFrame(data = cm_avg)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,131.590909,0.0,0.022727,0.0,0.090909,1.318182,1.613636,0.181818,0.068182,0.0
1,0.0,161.840909,0.409091,0.340909,0.0,0.0,0.340909,0.0,0.0,0.0
2,2.431818,20.931818,112.659091,2.318182,0.863636,0.090909,1.227273,5.818182,2.772727,0.181818
3,0.0,5.431818,0.681818,130.431818,0.159091,1.136364,0.727273,2.409091,1.522727,1.045455
4,0.159091,6.204545,0.0,0.0,121.795455,0.0,1.636364,0.545455,0.068182,11.931818
5,0.886364,3.840909,0.022727,5.318182,1.659091,108.795455,2.613636,0.795455,0.431818,4.045455
6,1.886364,1.681818,0.0,0.045455,2.681818,0.909091,123.136364,0.045455,0.0,0.0
7,0.0,13.704545,0.0,0.0,0.727273,0.0,0.0,122.181818,0.0,6.772727
8,4.045455,6.272727,0.295455,6.477273,2.25,2.636364,1.363636,2.0,107.886364,6.636364
9,2.0,3.25,0.204545,1.454545,2.840909,0.318182,0.159091,2.136364,0.181818,132.409091


To calculate the sum of all the elements in the matrix

In [151]:
# Sanity check: each iteration classifies `tp` test points, so the entries of
# the averaged confusion matrix should add up to that number.
# Summing with NumPy also avoids overwriting the global loop variables i and j.
s = cm_avg.sum()
s

1420.0