In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

%load_ext autoreload
%autoreload 2

In [None]:
from dataset import load_svhn
from knn import KNN
from metrics import binary_classification_metrics, multiclass_accuracy

In [None]:
# Load the train/test images and labels through the project's load_svhn helper.
# NOTE(review): "data" is presumably the dataset folder and max_train/max_test
# presumably cap the number of loaded samples — confirm in dataset.py.
train_X, train_y, test_X, test_y = load_svhn("data", max_train=35000, max_test=5000)

In [None]:
# Visualize a grid of training images: one column per class label,
# one row per example of that class.
samples_per_class = 5  # Number of samples per class to visualize
num_classes = 10       # Class labels 0..9 are compared against train_y below

for class_index in range(num_classes):
    # Select the images of this class once instead of once per plotted example.
    class_images = train_X[train_y == class_index]
    for example_index in range(samples_per_class):
        # Subplots are numbered row by row starting at 1; the grid size follows
        # samples_per_class / num_classes, so changing either value keeps the
        # layout valid (the grid used to be hard-coded as 5 x 10).
        plot_index = example_index * num_classes + class_index + 1
        plt.subplot(samples_per_class, num_classes, plot_index)
        plt.imshow(class_images[example_index].astype(np.uint8))
        plt.axis('off')

In [None]:
# Build a binary "0 vs 9" problem from the 10-class data.
# Labels become booleans: True for digit 0, False for digit 9.
# Each image is flattened into one row: [num_samples, 32*32*3].

# Training split
binary_train_mask = np.logical_or(train_y == 0, train_y == 9)
binary_train_y = (train_y[binary_train_mask] == 0)
binary_train_X = train_X[binary_train_mask]
binary_train_X = binary_train_X.reshape(len(binary_train_X), -1)

# Test split (same treatment)
binary_test_mask = np.logical_or(test_y == 0, test_y == 9)
binary_test_y = (test_y[binary_test_mask] == 0)
binary_test_X = test_X[binary_test_mask]
binary_test_X = binary_test_X.reshape(len(binary_test_X), -1)

In [None]:
# Create a KNN classifier constructed with k=6 and fit it on the binary
# (0 vs 9) training set prepared earlier in this notebook.
# Per the original note, KNN "training" just remembers all the data.
knn_classifier = KNN(k=6)
knn_classifier.fit(binary_train_X, binary_train_y)

In [None]:
# TODO: implement compute_distances_no_loops in knn.py
# Compute the distance matrix between test samples (first index) and training
# samples (second index), then spot-check one entry.
dists = knn_classifier.compute_distances_no_loops(binary_test_X)
# The expected value is sqrt(sum(a^2 + b^2 - 2ab)) for test sample 15 (a) and
# train sample 120 (b), i.e. the expanded form of sqrt(sum((a - b)^2)).
# The expanded form is kept as written so the floating-point rounding of the
# check is unchanged.
assert np.isclose(dists[15, 120], np.sqrt(np.sum((binary_test_X[15] ** 2) + (binary_train_X[120] ** 2)
                                        - (2 * binary_test_X[15] * binary_train_X[120]))))

In [None]:
# Predict labels for the binary test set with knn_classifier (built with k=6).
prediction = knn_classifier.predict(binary_test_X)

In [None]:
# Compare the predictions with the true binary test labels and report
# the four metrics returned by binary_classification_metrics.
precision, recall, f1, accuracy = binary_classification_metrics(
    prediction, binary_test_y
)
print("KNN with k = %s" % knn_classifier.k)
print(
    "Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
    % (accuracy, precision, recall, f1)
)

In [None]:
# Repeat the binary experiment with k=3: fit, predict on the binary test set,
# then print the same metrics as for the k=6 classifier.
knn_classifier_3 = KNN(k=3)
knn_classifier_3.fit(binary_train_X, binary_train_y)
prediction = knn_classifier_3.predict(binary_test_X)

precision, recall, f1, accuracy = binary_classification_metrics(
    prediction, binary_test_y
)
print("KNN with k = %s" % knn_classifier_3.k)
print(
    "Accuracy: %4.2f, Precision: %4.2f, Recall: %4.2f, F1: %4.2f"
    % (accuracy, precision, recall, f1)
)

In [None]:
# Find the best k using cross-validation based on F1 score
num_folds = 5
k_choices = [1,2,3,25]
k_to_f1 = {}  # dict mapping k values to mean F1 scores (int -> float)

# Split the binary training data into num_folds consecutive folds.
# np.array_split returns a list of sub-arrays and distributes any remainder
# over the first folds, so no sample is dropped when the number of samples is
# not divisible by num_folds (floor-division slicing discarded the tail).
train_folds_X = np.array_split(binary_train_X, num_folds)
train_folds_y = np.array_split(binary_train_y, num_folds)






In [None]:
# Display the candidate k values used for the binary cross-validation.
k_choices

In [None]:
# Display the k -> mean F1 mapping; it is filled in by the cross-validation loop.
k_to_f1

In [None]:
for k in k_choices:
    # Cross-validation for one value of k:
    # go through every fold, use it for validation and all other folds for
    # training, compute the F1 score on the validation fold, then store the
    # mean F1 over all folds in k_to_f1.
    fold_f1_scores = []
    for val_index in range(num_folds):
        # Build the training set from every fold except the validation fold.
        # The list is rebuilt for each fold so the held-out fold never leaks
        # into the training data and no fold is included more than once.
        bin_train_cv_X = np.concatenate(
            [fold for idx, fold in enumerate(train_folds_X) if idx != val_index]
        )
        bin_train_cv_y = np.concatenate(
            [fold for idx, fold in enumerate(train_folds_y) if idx != val_index]
        )

        # Use a fresh classifier per fold so no state is shared between folds.
        fold_classifier = KNN(k)
        fold_classifier.fit(bin_train_cv_X, bin_train_cv_y)
        fold_prediction = fold_classifier.predict(train_folds_X[val_index])
        # binary_classification_metrics returns (precision, recall, f1, accuracy);
        # only the F1 score is needed here.
        _, _, fold_f1, _ = binary_classification_metrics(
            fold_prediction, train_folds_y[val_index]
        )
        fold_f1_scores.append(fold_f1)
    k_to_f1[k] = np.mean(fold_f1_scores)

k_to_f1




In [None]:
# Switch to the full 10-class problem.
# Flatten every image into a single row so each array has shape
# [num_samples, num_features].
train_X = train_X.reshape(len(train_X), -1)
test_X = test_X.reshape(len(test_X), -1)

# Fit a KNN classifier constructed with k=1 on the full training set.
knn_classifier = KNN(k=1)
knn_classifier.fit(train_X, train_y)

In [None]:
# Predict a class label for every test sample with the k=1 classifier.
predict = knn_classifier.predict(test_X)

In [None]:
# Score the 10-class predictions against the true test labels and print the accuracy.
accuracy = multiclass_accuracy(predict, test_y)
print("Accuracy: %4.2f" % accuracy)

In [None]:

# Repeat the 10-class experiment with k=25: fit on the full training set,
# predict on the test set and print the resulting accuracy.
# Note: this rebinds knn_classifier and predict from the k=1 run.
knn_classifier = KNN(k=25)
knn_classifier.fit(train_X, train_y)
predict = knn_classifier.predict(test_X)
accuracy = multiclass_accuracy(predict, test_y)
print("Accuracy: %4.5f" % accuracy)

In [None]:
# Cross-validation setup for the 10-class problem.
num_folds = 5

k_choices = [1, 25, 35]
k_to_accuracy = {}  # dict mapping k values to mean accuracy over the folds

# Split the training data into num_folds consecutive folds.
# np.array_split returns a list of sub-arrays and distributes any remainder
# over the first folds, so no sample is dropped when the number of samples is
# not divisible by num_folds.
train_folds_X = np.array_split(train_X, num_folds)
train_folds_y = np.array_split(train_y, num_folds)


In [None]:
for k in k_choices:
    # Go through every fold and use it for validation and all other folds for training
    # Perform training and produce accuracy metric on the validation dataset
    # Average accuracy from all the folds and write it into k_to_accuracy
    fold_accuracies = []
    for val_index in range(num_folds):
        # Training set = every fold except the validation fold, each exactly once.
        # Duplicated training samples would distort the k-nearest-neighbour vote,
        # so the fold list is rebuilt from scratch for each validation fold.
        tr_X = np.concatenate(
            [fold for idx, fold in enumerate(train_folds_X) if idx != val_index]
        )
        tr_y = np.concatenate(
            [fold for idx, fold in enumerate(train_folds_y) if idx != val_index]
        )

        # Use a fresh classifier per fold so no state is shared between folds.
        fold_classifier = KNN(k)
        fold_classifier.fit(tr_X, tr_y)
        predict_cv = fold_classifier.predict(train_folds_X[val_index])
        accuracy_cv = multiclass_accuracy(predict_cv, train_folds_y[val_index])
        fold_accuracies.append(accuracy_cv)
    k_to_accuracy[k] = np.mean(fold_accuracies)

In [None]:
# Display the k -> mean cross-validation accuracy mapping.
k_to_accuracy

In [None]:

# Pick the k with the highest mean cross-validation accuracy instead of a
# hard-coded value, so the final model always follows the computed results.
# Requires k_to_accuracy to have been filled by the cross-validation loop.
best_k = max(k_to_accuracy, key=k_to_accuracy.get)

# Refit on the full training set with the selected k and evaluate on the test set.
best_knn_classifier = KNN(k=best_k)
best_knn_classifier.fit(train_X, train_y)
prediction = best_knn_classifier.predict(test_X)

# Accuracy should be around 20%!
accuracy = multiclass_accuracy(prediction, test_y)
print("Accuracy: %4.10f" % accuracy)
