# Load The Data

In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [11]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
import tqdm
import itertools

# load the SPAM email training dataset

X,y = utils.load_mat('data/spamTrain.mat')
yy = np.ones(y.shape)
yy[y==0] = -1

# load the SPAM email test dataset
test_data = scipy.io.loadmat('data/spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest'].flatten()

#getting the validation data 
val_idx = np.random.choice(range(len(X_test)), int(len(X_test) / 5), replace=False)
X_val = X_test[val_idx]
y_val = yy[val_idx]


#getting the training data 
train_idx = []
for i in range(len(X_test)):
    if i not in val_idx:
        train_idx.append(i)
X_train = X_test[train_idx]
y_train = yy[train_idx]

# KNN Implementation

In [62]:
import statistics

class KNN(object):
    def __init__(self, k):
        self.X_train = None 
        self.y_train = None 
        self.k = k
    
    def train(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        predictions = np.zeros((X_test.shape[0], 1))
        ##################################################################################
        #  YOUR CODE HERE. The goal here is to iterate accross inputs, and predict each 
        #  input. For each input, we want to calculate the k closest training inputs 
        #  and find the most common label among them. Assign these predictions to the 
        #  variable 'predictions'.
        ##################################################################################
        for i in range(len(X_test)):
            dists = np.linalg.norm(X_test[i] - self.X_train, axis = 1)
            closest_k = y_train[np.argsort(dists)[:self.k]]
            predictions[i] = statistics.mode(closest_k)
        ##################################################################################
        #  END OF YOUR CODE
        ##################################################################################
        return predictions 

# Training & Searching For The Best K

In [63]:
best_K = 0
best_val = 0
best_KNN = None

Ks = np.arange(1, 12, 2)

for K in tqdm.tqdm(itertools.product(Ks), unit="pair"):
    K = K[0]

    # set up the KNN and learn the parameters
    knn = KNN(k = K)
    knn.train(X_train,y_train)
    

    # calculate accuracy and update
    val_acc = np.mean(y_val == knn.predict(X_val))
    if val_acc > best_val:
        best_K = K
        best_KNN = knn
        best_val = val_acc
    
print('Best K: %e, Best Val: %e' % (best_K, best_val))

6pair [00:11,  1.98s/pair]

Best K: 7.000000e+00, Best Val: 6.487500e-01





# Model Evaluation

In [64]:
##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# what is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

print('Train Accuracy:', np.mean(y_train == best_KNN.predict(X_train)))

yy_test = np.ones(y_test.shape)
yy_test[y_test == 0] = -1

print('Test Accuracy:', np.mean(yy_test == best_KNN.predict(X_test)))

Train Accuracy: 0.6486875
Test Accuracy: 0.673184
