In [17]:
#import libraries
import sys
from os.path import dirname, join as pjoin
import random
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.linalg as la
np.set_printoptions(threshold=1000)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from collections import Counter
import time

In [18]:
#load usps data into arr
usps_data_dir= "usps_all.mat"
usps_data = sio.loadmat(usps_data_dir)
arr = usps_data['fea']
labels = usps_data['gnd']
arr.shape

(9298, 256)

In [19]:
# Split-out dataset into training and validation
X = arr[:,0:255]
y = labels[:,0]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=2007, random_state=1)

In [20]:
def euclidean_distance(x1, x2):
        return np.sqrt(np.sum((x1 - x2)**2))

In [21]:
class KNN:

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[:self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]  
        # return the most common class label
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]

In [23]:
#start the timer
start = time.perf_counter()
#Train the test data using K-nearest neighbor
start = time.perf_counter()
neigh = KNN(k=3)
neigh.fit(X_train,Y_train)
#Predicting labels on the test set.
y_pred =  neigh.predict(X_validation)
#stop the timer
stop = time.perf_counter()
print(f"Time taken for training and prediction is {stop - start:0.4f} seconds")

In [27]:
acc = np.sum(y_pred == Y_validation)/len(Y_validation)
print(acc)

0.9750871948181365


In [26]:
#Evaluate Predictions using Accuracy Score and Confusion Matrix
print('Confusion Matrix: \n', confusion_matrix(Y_validation, y_pred))
print('\nConfusion matrix for each of the digits from 0 to 9 printed below in order')
#displays the confusion matrix for each of the digits
print(multilabel_confusion_matrix(Y_validation, y_pred))
print('\nAccuracy for each of the digits from 0 to 9 printed below ')
#the confusion matrix obtained for each of the digits can be used to calculate the accuracy for each digit
mcm = multilabel_confusion_matrix(Y_validation, y_pred)
worst_digit =-1
worst_accuracy = -1
for i in range(0,10):
    # each of the digits accuracy can be calculated from their respective matrix using the formula
    #((True positive + True negative) / All))
    digit_accuracy = (mcm[i][0][0] +mcm[i][1][1])/(mcm[i][0][0]+ mcm[i][0][1]+mcm[i][1][0] +mcm[i][1][1])
    if worst_accuracy ==-1 or digit_accuracy< worst_accuracy:
        worst_digit=i
        worst_accuracy = digit_accuracy
    print('Accuracy for digit '  , i , ': ' , digit_accuracy)
    
print('\nDigit with worst accuracy is', worst_digit, '. It has an accuracy score of: ', worst_accuracy)

print('\nAccuracy Score on the entire Test Data: ',accuracy_score(Y_validation, y_pred))
print('\nPrecision Score on the entire Test Data: ',precision_score(Y_validation, y_pred, average='macro'))
print('\nRecall Score on the entire Test Data: ',recall_score(Y_validation, y_pred, average='macro'))

Confusion Matrix: 
 [[335   0   0   2   0   0   0   0   0   1]
 [  0 273   0   0   1   0   1   0   0   0]
 [  2   0 198   0   0   0   1   1   0   0]
 [  0   0   1 183   0   5   0   0   1   1]
 [  1   0   1   0 160   0   1   1   0   7]
 [  1   0   1   0   1 162   0   0   2   1]
 [  1   0   1   0   0   0 154   0   0   0]
 [  0   2   0   0   1   0   0 176   2   2]
 [  0   1   0   1   0   2   0   1 150   1]
 [  0   0   0   0   1   0   0   0   0 166]]

Accuracy Score on the Test Data:  0.9750871948181365

Precision Score on the Test Data:  0.9731931358096707

Recall Score on the Test Data:  0.9726601523681587
