# Samuel Thelin, samthe-1 & Albin Mårtensson, albmrt-1

In [60]:
import numpy as np
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer


In [61]:
def kNN_predict(testSet, trainingSet, trainingLabel, k_nearest):
    numerable = testSet.shape[0]
    Lpred = np.zeros(numerable, dtype=trainingLabel.dtype)
    
    # cast as int to not get bugs like before
    testSet = testSet.astype(int)
    trainingSet = trainingSet.astype(int)

    for i in range(numerable):
        distances = np.sqrt(np.sum((trainingSet - testSet[i, :]) ** 2, axis=1)) # L2
        nearest_indices = np.argsort(distances)[:k_nearest]
        nearest_labels = trainingLabel[nearest_indices]
        label_counts = np.bincount(nearest_labels, minlength=10)
        
        # break ties if needed.
        if np.count_nonzero(label_counts == label_counts.max()) > 1:
            winner = trainingLabel[nearest_indices[0]]
        else:
            winner = np.argmax(label_counts)
        Lpred[i] = winner
    
    return Lpred

In [62]:
def find_optimal_k(training_set, training_labels, k_values, num_folds=3):
    best_k = None
    best_accuracy = 0
    accuracies_for_k = {k: [] for k in k_values}
    
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=1)
    
    for k in k_values:
        fold_accuracies = []
        
        for train_index, val_index in kf.split(training_set):
            X_train_fold, X_val_fold = training_set[train_index], training_set[val_index]
            y_train_fold, y_val_fold = training_labels[train_index], training_labels[val_index]
            
            predictions = kNN_predict(X_val_fold, X_train_fold, y_train_fold, k)
            
            accuracy = np.mean(predictions == y_val_fold)
            fold_accuracies.append(accuracy)
        
        avg_accuracy = np.mean(fold_accuracies)
        accuracies_for_k[k].append(avg_accuracy)
        
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_k = k
    
    return best_k, best_accuracy, accuracies_for_k

In [63]:
def test_datasets(dataset, dataset_label):
    X = dataset.data
    y = dataset.target

    X = X.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    k_values = list(range(1, 11)) 

    best_k, best_accuracy, accuracies_for_k = find_optimal_k(X_train, y_train, k_values, num_folds=5)
    print('\nDataset Label: ',dataset_label,'\n')
    print(f"\noptimal k: {best_k} with cross-validated accuracy: {best_accuracy:.4f}\n")
    for k, accuracy in accuracies_for_k.items():
        print(f"With k = {k}: {accuracy[0]:.4f}")
    #print("accuracies for each k:", accuracies_for_k,'\n')

    y_pred = kNN_predict(X_test, X_train, y_train, best_k)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"\ntest set accuracy with k={best_k}: {test_accuracy:.4f}\n===================================================")

In [64]:
test_datasets(load_iris(),'Iris')
test_datasets(load_wine(),'Wine')
test_datasets(load_breast_cancer(),'Breast Cancer')
test_datasets(load_digits(),'Digits')


Dataset Label:  Iris 


optimal k: 4 with cross-validated accuracy: 0.9417

With k = 1: 0.8833
With k = 2: 0.8833
With k = 3: 0.9333
With k = 4: 0.9417
With k = 5: 0.9250
With k = 6: 0.9000
With k = 7: 0.9250
With k = 8: 0.9333
With k = 9: 0.9250
With k = 10: 0.8917

test set accuracy with k=4: 0.9667

Dataset Label:  Wine 


optimal k: 1 with cross-validated accuracy: 0.7468

With k = 1: 0.7468
With k = 2: 0.7468
With k = 3: 0.6975
With k = 4: 0.7251
With k = 5: 0.7039
With k = 6: 0.6970
With k = 7: 0.7116
With k = 8: 0.7185
With k = 9: 0.6973
With k = 10: 0.6904

test set accuracy with k=1: 0.7778

Dataset Label:  Breast Cancer 


optimal k: 7 with cross-validated accuracy: 0.9209

With k = 1: 0.8945
With k = 2: 0.8945
With k = 3: 0.9165
With k = 4: 0.9121
With k = 5: 0.9165
With k = 6: 0.9165
With k = 7: 0.9209
With k = 8: 0.9209
With k = 9: 0.9187
With k = 10: 0.9165

test set accuracy with k=7: 0.9561

Dataset Label:  Digits 


optimal k: 3 with cross-validated accuracy: 0.9889



# MNIST dataset

In [None]:
from tensorflow.keras.datasets import mnist
from sklearn.metrics import accuracy_score

(X_train, y_train), (X_test, y_test) = mnist.load_data()

num_sample = 500  
Tr_set = X_train[:num_sample].reshape(num_sample, -1).astype(int)
Ltr_set = y_train[:num_sample]

X_test_flat = X_test[:num_sample].reshape(num_sample, -1).astype(int)
y_test_subset = y_test[:num_sample]

k_values = list(range(1, 11))

best_k, best_accuracy, accuracies_for_k = find_optimal_k(Tr_set, Ltr_set, k_values)

print(f"\nOptimal k: {best_k} with cross-validated accuracy: {best_accuracy:.4f}\n")
for k, accuracy in accuracies_for_k.items():
    print(f"With k = {k}: {accuracy[0]:.4f}")

y_pred = kNN_predict(X_test_flat, Tr_set, Ltr_set, best_k)
test_accuracy = accuracy_score(y_test_subset, y_pred)

print(f"\nTest set accuracy with k={best_k}: {test_accuracy:.4f}")
print("=" * 50)



Optimal k: 4 with cross-validated accuracy: 0.8680

With k = 1: 0.8561
With k = 2: 0.8561
With k = 3: 0.8620
With k = 4: 0.8680
With k = 5: 0.8280
With k = 6: 0.8380
With k = 7: 0.8220
With k = 8: 0.8200
With k = 9: 0.8040
With k = 10: 0.8060

Test set accuracy with k=4: 0.7740
