In [1]:
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
from typing import List, Tuple 

In [2]:
def load_and_shuffle_data(path: str = 'MNIST_digit_data.mat', seed: int = 531) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load the train/test splits from a .mat file and shuffle each split.

    Args:
        path: Location of the .mat file. It must contain the variables
            'images_train', 'images_test', 'labels_train' and 'labels_test'.
        seed: Value passed to ``np.random.seed`` before shuffling, so the
            permutations are reproducible across machines.

    Returns:
        ``(images_train, labels_train, images_test, labels_test)``, where the
        images and labels of each split are reordered with the same
        permutation so every image stays paired with its label.

    Note:
        This reseeds NumPy's *global* random state as a deliberate side
        effect; random draws made after this call are reproducible as well.
    """
    # Loading the data
    M = loadmat(path)
    images_train, labels_train = M['images_train'], M['labels_train']
    images_test, labels_test = M['images_test'], M['labels_test']

    # Just to make all random sequences on all computers the same.
    np.random.seed(seed)

    # Randomly permute the data points. The train permutation is drawn before
    # the test permutation; keeping this order keeps the shuffle identical to
    # earlier runs that used the same seed.
    train_order = np.random.permutation(images_train.shape[0])
    images_train = images_train[train_order]
    labels_train = labels_train[train_order]

    test_order = np.random.permutation(images_test.shape[0])
    images_test = images_test[test_order]
    labels_test = labels_test[test_order]

    return images_train, labels_train, images_test, labels_test

def show_image(i: int, images=None, labels=None):
    """Plot the i-th image as a 28x28 picture, titled with its class label.

    Args:
        i: Row index of the image to display.
        images: 2-D array with one flattened 28x28 image per row. When
            omitted, the notebook-level ``images_train`` is used, so that
            variable must already exist.
        labels: 2-D array whose row ``i`` holds the label of image ``i`` in
            column 0. When omitted, the notebook-level ``labels_train`` is
            used.
    """
    # Fall back to the notebook globals so existing show_image(i) calls keep working.
    if images is None:
        images = images_train
    if labels is None:
        labels = labels_train

    # order='F' fills the 28x28 grid column by column
    # (presumably because the pixels were flattened MATLAB-style — TODO confirm).
    im = images[i, :].reshape((28, 28), order='F')
    plt.imshow(im)
    plt.title('Class Label:'+str(labels[i][0]))
    plt.show()

In [9]:
def kNN(images_train: np.ndarray, labels_train: np.ndarray, images_test: np.ndarray, labels_test: np.ndarray, k: int, num_classes: int = 10) -> List:
    """Classify test images with a brute-force k-nearest-neighbour vote.

    Each test image is compared against every training image using squared
    Euclidean distance; the predicted label is the most frequent label among
    the k closest training images (ties go to the smallest label, because
    ``np.argmax`` returns the first maximum of the vote histogram).

    Args:
        images_train: 2-D array, one flattened training image per row.
        labels_train: 2-D array; column 0 holds the integer training labels.
        images_test: 2-D array, one flattened test image per row.
        labels_test: 2-D array; column 0 holds the integer test labels.
        k: Number of neighbours that vote; must be at least 1.
        num_classes: Number of distinct labels; labels are expected to lie in
            ``0 .. num_classes - 1``.

    Returns:
        A two-element list ``[acc, avg_acc]``: ``acc`` is a list of
        per-class accuracies (0.0 for classes absent from the test set) and
        ``avg_acc`` is the overall fraction of correctly classified test
        images (0.0 when the test set is empty).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Do the distance arithmetic in float64. With unsigned-integer inputs the
    # subtraction and squaring would otherwise wrap around (e.g. 16**2 == 0 in
    # uint8), which silently ranks far-away images as nearest neighbours.
    # np.asarray does not copy when the data already is float64.
    train_features = np.asarray(images_train, dtype=np.float64)
    test_features = np.asarray(images_test, dtype=np.float64)
    # np.bincount and list/array indexing need integer labels.
    train_labels = np.asarray(labels_train)[:, 0].astype(np.int64)
    test_labels = np.asarray(labels_test)[:, 0].astype(np.int64)

    correct_label_counts = np.zeros(num_classes, dtype=np.int64)
    test_label_counts = np.zeros(num_classes, dtype=np.int64)

    # iterate over all test images
    for test_image, correct_label in zip(test_features, test_labels):
        # squared distances from test_image to all training images
        differences = train_features - test_image
        squared_distances = np.sum(differences ** 2, axis=1)
        # labels of the k nearest training images
        k_nearest_indices = np.argsort(squared_distances)[:k]
        k_nearest_labels = train_labels[k_nearest_indices]
        # majority vote among those labels
        predicted_label = np.argmax(np.bincount(k_nearest_labels))
        # tally the outcome under the true class
        correct_label_counts[correct_label] += int(predicted_label == correct_label)
        test_label_counts[correct_label] += 1

    # calculate accuracy, guarding against an empty test set / unseen classes
    n_test = len(test_features)
    avg_acc = correct_label_counts.sum() / n_test if n_test > 0 else 0.0
    acc = [
        correct_label_counts[c] / test_label_counts[c] if test_label_counts[c] > 0 else 0.0
        for c in range(num_classes)
    ]
    return [acc, avg_acc]

In [14]:
# Load the full, shuffled dataset.
images_train, labels_train, images_test, labels_test = load_and_shuffle_data()

# Work on the first 10000 training samples and the first 1000 test samples
# of the shuffled data.
images_train_10000, labels_train_10000 = images_train[:10000], labels_train[:10000]
images_test_1000, labels_test_1000 = images_test[:1000], labels_test[:1000]

# 1-nearest-neighbour evaluation; the result is [per-class accuracies, overall accuracy].
oneNN_acc = kNN(
    images_train_10000,
    labels_train_10000,
    images_test_1000,
    labels_test_1000,
    k=1,
)

In [13]:
# Display the 1-NN result: [per-class accuracy list, overall accuracy].
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# that assigns oneNN_acc (14), so the output shown for this cell presumably
# comes from an earlier run — restart the kernel and run all cells to confirm.
oneNN_acc



[[np.float64(0.8488372093023255),
  np.float64(0.9029126213592233),
  np.float64(0.29310344827586204),
  np.float64(0.40336134453781514),
  np.float64(0.5315315315315315),
  np.float64(0.3793103448275862),
  np.float64(0.8478260869565217),
  np.float64(0.7894736842105263),
  np.float64(0.3372093023255814),
  np.float64(0.7093023255813954)],
 np.float64(0.598)]

In [15]:
# Display the 1-NN result: [per-class accuracy list, overall accuracy].
oneNN_acc

[[np.float64(0.9883720930232558),
  np.float64(1.0),
  np.float64(0.9224137931034483),
  np.float64(0.9327731092436975),
  np.float64(0.9099099099099099),
  np.float64(0.9770114942528736),
  np.float64(0.9782608695652174),
  np.float64(0.9473684210526315),
  np.float64(0.8604651162790697),
  np.float64(0.9534883720930233)],
 np.float64(0.946)]