In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

# One seed for the whole notebook: it seeds NumPy's global RNG here and is also
# passed as random_state to train_test_split when the data is split.
seed = 1234
np.random.seed(seed) 

In [2]:
# Fetch the Fashion-MNIST features and labels from OpenML as arrays
# (as_frame=False) instead of a pandas DataFrame.
mnist_X, mnist_y = fetch_openml(
    'Fashion-MNIST',
    return_X_y=True,
    as_frame=False,
)
# Cast the labels to integers so they can be used as keys into the
# integer-keyed class-name lookup.
mnist_y = mnist_y.astype(int)

In [3]:
# Shuffle and hold out 25% of the samples; seeding random_state makes the
# partition reproducible across runs.
mnist_X_tr, mnist_X_te, mnist_y_tr, mnist_y_te = train_test_split(
    mnist_X,
    mnist_y,
    test_size=0.25,
    random_state=seed,
    shuffle=True,
)

In [4]:
# Class names keyed by integer label: the position of each name in the tuple
# below is the label it describes (0 through 9).
label_description = dict(enumerate((
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
)))

In [5]:
# Carve the hold-out split into a validation set (rows 12500..17499) and a
# test set (rows 0..12499).
#
# This cell rebinds mnist_X_te / mnist_y_te to the truncated test set, so it is
# not idempotent: running it a second time would slice the already-truncated
# arrays and silently produce an EMPTY validation set. Fail loudly instead.
n_test, n_holdout = 12500, 17500
if len(mnist_X_te) <= n_test or len(mnist_y_te) <= n_test:
    raise RuntimeError(
        "Hold-out split has already been truncated (or is too small): "
        "re-run the train_test_split cell before re-running this one."
    )

mnist_X_val = mnist_X_te[n_test:n_holdout]
mnist_y_val = mnist_y_te[n_test:n_holdout]

mnist_X_te = mnist_X_te[:n_test]
mnist_y_te = mnist_y_te[:n_test]

In [None]:
# Sweep the number of neighbours (k) and the number of training samples (n),
# recording training and validation accuracy for every (k, n) combination.
ks = [1, 5, 10, 15, 20, 25, 30, 40, 50]
n_tr = [100, 1000, 5000, 10000, 20000, 52500]

# Each recorded entry is [k, n, accuracy].
train_accuracies = []
val_accuracies = []

for k in ks:
    for n in n_tr:
        # Fit on the first n training samples only.
        X_sub, y_sub = mnist_X_tr[:n], mnist_y_tr[:n]
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_sub, y_sub)

        # score(X, y) predicts on X itself and compares the result with y, so y
        # must be the TRUE labels. Passing the model's own predictions (as the
        # previous version did) either fails on a length mismatch or reports a
        # meaningless 100% accuracy.
        train_acc = knn.score(X_sub, y_sub)
        train_accuracies.append([k, n, train_acc])

        val_acc = knn.score(mnist_X_val, mnist_y_val)
        val_accuracies.append([k, n, val_acc])


In [None]:
# Plot training vs. validation accuracy against k for the largest training-set
# size in the sweep. The accuracy lists hold [k, n, accuracy] entries.
# (The previous version plotted Cs / Cs_train_accuracy / Cs_val_accuracy, which
# are not defined by any cell shown in this notebook.)
n_plot = max(n for _, n, _ in train_accuracies)
ks_plot = [k for k, n, _ in train_accuracies if n == n_plot]
train_curve = [acc for _, n, acc in train_accuracies if n == n_plot]
val_curve = [acc for _, n, acc in val_accuracies if n == n_plot]

fig, axes = plt.subplots()

axes.set_title(f'kNN accuracy vs. k (training samples = {n_plot})')
axes.set_xlabel('Number of neighbors (k)')
axes.set_ylabel('Accuracy')

axes.plot(ks_plot, train_curve, marker='o', color='blue', label='training dataset')
axes.plot(ks_plot, val_curve, marker='o', color='red', label='validation dataset')

# Trailing semicolon suppresses the Legend repr in the cell output.
axes.legend();

In [None]:
def find_best_acc(acc_list):
    """Print the entry of ``acc_list`` whose accuracy (index 2) is highest.

    Ties are resolved in favour of the earliest entry. ``[0, 0, 0]`` is
    printed when ``acc_list`` is empty or no accuracy is greater than zero.
    Nothing is returned.
    """
    # Placing the sentinel first lets max(), which keeps the first maximal
    # item, reproduce a strict ">" scan that starts from [0, 0, 0].
    candidates = [[0, 0, 0], *acc_list]
    print(max(candidates, key=lambda entry: entry[2]))
    
# Report the best entry for each split: training first, then validation.
for acc_history in (train_accuracies, val_accuracies):
    find_best_acc(acc_history)