# K-Nearest Neighbors classifier implementation using scikit-learn's KNeighborsClassifier

In [128]:
from keras.datasets import mnist 
from sklearn.metrics import confusion_matrix as cm
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.utils import shuffle
from sklearn.metrics import classification_report as cr
import timeit as tt
import numpy as np
import pandas as pd
import random

Calculations and definitions of variables

In [129]:
# Experiment constants. Derived values are computed from the formulas they
# come from so the relationship between them stays visible.
k = 44  # neighbours for kNN and number of evaluation rounds; our USNs end with 17 and 44, so 'k' is 44
dp = (17 + 44) * 100 + 1000  # number of data points; (17 + 44) x 100 + 1000 = 7100
trp = dp * 80 // 100  # training points; 80% of 7100 = 5680
tp = dp - trp  # testing points; the remaining 20% of 7100 = 1420
n = 10  # number of digit classes; sizes the confusion matrix and bounds the per-class loops

Defining the training and testing data

In [130]:
# Load the MNIST train/test splits through keras.
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Keep only the first `dp` samples of each split, and flatten every 28x28
# image into a single row of 28*28 values so each sample is one feature vector.
train_X, train_y = np.reshape(train_X[:dp], (dp, 28*28)), train_y[:dp]
test_X, test_y = np.reshape(test_X[:dp], (dp, 28*28)), test_y[:dp]

Custom shuffle method for training data

In [152]:
# Using sklearn's train_test_split with the keras MNIST arrays was complicated,
# so the training data is shuffled by this small helper instead.
def shufTrainData(random_state=None):
    """Return the training images and labels shuffled in unison.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to get the same
        permutation on every call (reproducible runs); the default ``None``
        keeps the original behaviour of a different permutation each call.

    Returns
    -------
    tuple
        ``(X_shuf, y_shuf)`` -- the module-level ``train_X`` and ``train_y``
        reordered with one shared permutation, so row ``r`` of ``X_shuf``
        still corresponds to entry ``r`` of ``y_shuf``.
    """
    X_shuf, y_shuf = shuffle(train_X, train_y, random_state=random_state)
    return (X_shuf, y_shuf)

Custom shuffle method for testing data

In [147]:
def shufTestData(random_state=None):
    """Return the testing images and labels shuffled in unison.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to get the same
        permutation on every call (reproducible runs); the default ``None``
        keeps the original behaviour of a different permutation each call.

    Returns
    -------
    tuple
        ``(X_shuf, y_shuf)`` -- the module-level ``test_X`` and ``test_y``
        reordered with one shared permutation, so row ``r`` of ``X_shuf``
        still corresponds to entry ``r`` of ``y_shuf``.
    """
    X_shuf, y_shuf = shuffle(test_X, test_y, random_state=random_state)
    return (X_shuf, y_shuf)

Method to calculate the average runtime

In [148]:
def calcAvg(tAvg, total_1, iteration=None):
    """Fold one more runtime sample into a running arithmetic mean.

    The previous formula, ``(tAvg + total_1) / 2``, is not a mean: every call
    halves the weight of all earlier samples, so the result is dominated by
    the last few iterations. This version weights every sample equally.

    Parameters
    ----------
    tAvg : float
        Mean of the samples seen so far (ignored for the first sample).
    total_1 : float
        The new sample to include.
    iteration : int, optional
        Zero-based index of the new sample, i.e. how many samples ``tAvg``
        already covers. When omitted, the notebook-level loop counter ``i``
        is used, which is what the original implementation read implicitly.

    Returns
    -------
    float
        Mean of the ``iteration + 1`` samples seen so far.
    """
    if iteration is None:
        # Backward-compatible fallback: read the enclosing loop's counter.
        iteration = i
    if iteration == 0:
        # First sample: the mean is the sample itself.
        return total_1
    # Incremental mean: re-expand the old mean to a sum, add the new sample,
    # and divide by the new count.
    return (tAvg * iteration + total_1) / (iteration + 1)

Cross Validation

In [164]:
# Repeated random-subset evaluation: k rounds, each fitting kNN on `trp`
# shuffled training points and scoring it on `tp` shuffled testing points.
cm_total = np.zeros((n, n))  # running sum of the per-iteration confusion matrices
prec = np.zeros(n)  # running sum of the per-iteration, per-class precision

# To calculate total runtime for k iterations
# Source: https://stackoverflow.com/questions/5622976/how-do-you-calculate-program-run-time-in-python
start_0 = tt.default_timer()
t_avg = 0

for i in range(k):
    start_1 = tt.default_timer() # To calculate runtime for each iteration
    print(f'Starting iteration - {i}') #To keep a track of how many iterations have finished

    # Draw a fresh shuffle of each split and keep the first trp / tp samples.
    X_train, y_train = shufTrainData()
    X_test, y_test = shufTestData()
    X_train, y_train = X_train[:trp], y_train[:trp]
    X_test, y_test = X_test[:tp], y_test[:tp]

    knn = kNN(n_neighbors = k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Pin the label set so the matrix is always n x n, even if some class is
    # absent from this split; otherwise the accumulation below would fail.
    cm_2 = cm(y_test, y_pred, labels = np.arange(n))
    cm_total = cm_total + cm_2

    # Per-class precision of THIS iteration: correct predictions (diagonal)
    # over everything predicted as that class (column sums). Previously the
    # cumulative matrix was used here, so prec/k was not a per-iteration mean.
    # Classes that were never predicted contribute 0 instead of a NaN.
    predicted = cm_2.sum(axis = 0)
    prec += np.divide(cm_2.diagonal(), predicted, out = np.zeros(n), where = predicted > 0)

    print(f'Ending iteration - {i}')
    stop_1 = tt.default_timer() # To calculate runtime for each iteration
    total_1 = (stop_1 - start_1)
    # Fold this iteration's runtime into the running value kept by calcAvg
    # (calcAvg reads the loop counter `i` from this cell).
    t_avg = calcAvg(t_avg, total_1)
    print(f'Total runtime for iteration \'{i}\': {total_1} seconds.\n')

# To calculate total runtime for k iterations
stop_0 = tt.default_timer()
total_0 = (stop_0 - start_0) # Total wall-clock time, in seconds
for i in range(n):
    print(f'Precision of number {i} - {(prec[i]/k)*100} %')

cm_avg = cm_total/k # Confusion matrix averaged over the k iterations
cm_0 = cm_1 = cm_total # Old accumulator names, kept in case other cells still reference them
print(cm_avg)
print(f'Total runtime for {k} iterations: {total_0} seconds.')
print(f'Average runtime per iteration: {t_avg} seconds.')

Starting iteration - 0
Ending iteration - 0
Total runtime for iteration '0': 0.23373991600237787 seconds.

Starting iteration - 1
Ending iteration - 1
Total runtime for iteration '1': 0.21537570800137473 seconds.

Starting iteration - 2
Ending iteration - 2
Total runtime for iteration '2': 0.2824932919975254 seconds.

Starting iteration - 3
Ending iteration - 3
Total runtime for iteration '3': 0.2945945000028587 seconds.

Starting iteration - 4
Ending iteration - 4
Total runtime for iteration '4': 0.1813179169985233 seconds.

Starting iteration - 5
Ending iteration - 5
Total runtime for iteration '5': 0.20791841600293992 seconds.

Starting iteration - 6
Ending iteration - 6
Total runtime for iteration '6': 0.21190950000163866 seconds.

Starting iteration - 7
Ending iteration - 7
Total runtime for iteration '7': 0.199680083002022 seconds.

Starting iteration - 8
Ending iteration - 8
Total runtime for iteration '8': 0.2043159589957213 seconds.

Starting iteration - 9
Ending iteration - 9

Creating a Pandas Data Frame for better viewing of the result

In [165]:
# Wrap the averaged confusion matrix in a DataFrame so the notebook renders it
# as a table; the bare `df` on the last line is what gets displayed.
df = pd.DataFrame(data=cm_avg)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,131.5,0.0,0.0,0.0,0.090909,1.090909,1.818182,0.204545,0.204545,0.0
1,0.0,160.977273,0.318182,0.295455,0.0,0.0,0.227273,0.0,0.0,0.0
2,1.954545,22.136364,109.545455,2.386364,0.75,0.022727,1.568182,5.340909,2.431818,0.25
3,0.068182,5.363636,0.568182,129.954545,0.181818,1.090909,0.590909,2.590909,1.5,1.0
4,0.113636,6.477273,0.0,0.0,121.25,0.0,1.863636,0.477273,0.022727,11.113636
5,0.954545,3.909091,0.045455,5.272727,1.590909,108.840909,2.613636,0.931818,0.659091,4.522727
6,2.090909,1.772727,0.022727,0.272727,2.863636,0.977273,126.25,0.181818,0.0,0.0
7,0.0,13.295455,0.0,0.0,0.886364,0.0,0.0,122.090909,0.0,6.704545
8,3.795455,5.863636,0.454545,5.659091,2.409091,2.659091,0.954545,1.954545,108.931818,6.363636
9,2.090909,3.454545,0.181818,1.545455,2.863636,0.409091,0.227273,2.0,0.113636,133.977273


To calculate and verify the sum of all the elements in the matrix

In [166]:
# Add up the n x n entries of the averaged confusion matrix, row by row.
# The total is displayed so it can be compared against the number of test points.
s = 0
for row in cm_avg[:n]:
    for value in row[:n]:
        s += value

s

1420.0