In [None]:
PA1 - Pete Sheurpukdi

In [1]:
import numpy as np
import pandas as pd

In [122]:
train = pd.read_csv('pa1train.txt',sep=" ", header = None)
X_train = np.asarray(train.loc[:,:783].values)
y_train = np.asarray(train.loc[:,784].values)

valid = pd.read_csv('pa1validate.txt',sep=" ", header = None)
X_valid = np.asarray(valid.loc[:,:783].values)
y_valid = np.asarray(valid.loc[:,784].values)


test = pd.read_csv('pa1test.txt',sep=" ", header = None)
X_test = np.asarray(test.loc[:,:783].values)
y_test = np.asarray(test.loc[:,784].values)

In [125]:
from collections import Counter
from sklearn.metrics.pairwise import euclidean_distances

# prediction for one point
def predict(X_train, y_train, X_test, k):
    distances = []
    labels = []

    # compute all train points distance to test point and add to list with index of distance's corresponding point
    for i in range(len(X_train)):
        distance = np.sqrt(np.sum(np.power(np.subtract(X_train[i], X_test), 2)))
        distances.append(np.asarray([distance, i]))

    # sort distances list by distance
    distances = np.asarray(distances)
    distances = distances[distances[:,0].argsort()]
    
    # get k labels of closest neighbors according to saved index in distances
    for i in range(k):
        index = int(distances[i][1])
        labels.append(y_train[index])

    # return most common target
    label = Counter(labels).most_common(1)[0][0]
    return label

def kNN(X_train, y_train, X_test, k):
    # loop over all X_test
    preds = []
    for i in range(len(X_test)):
        preds.append(predict(X_train, y_train, X_test[i], k))
    return preds

def error(preds, y):
    err=0
    for a, b in zip(preds, y):
        if a != b:
            err += 1
    err = err / len(preds)
    return err

In [126]:
%%timeit -r 1
# each k-value to try
for k in [1,5,9,15]: 
    preds = kNN(X_train, y_train, X_train, k)
    print("Training Error with k=" + str(k) + ": " + str(error(preds, y_train)))
    preds = kNN(X_train, y_train, X_valid, k)
    print("Validation Error with k=" + str(k) + ": " + str(error(preds, y_valid)))


Training Error with k=1: 0.0
Validation Error with k=1: 0.082
Training Error with k=5: 0.0485
Validation Error with k=5: 0.097
Training Error with k=9: 0.0625
Validation Error with k=9: 0.1
Training Error with k=15: 0.0845
Validation Error with k=15: 0.102
Training Error with k=1: 0.0
Validation Error with k=1: 0.082
Training Error with k=5: 0.0485
Validation Error with k=5: 0.097
Training Error with k=9: 0.0625
Validation Error with k=9: 0.1
Training Error with k=15: 0.0845
Validation Error with k=15: 0.102
16min 20s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


The best classifier is k=1

In [127]:
preds = kNN(X_train, y_train, X_test, 1)
print("Test Error with k=" + str(1) + ": " + str(error(preds, y_test)))

Test Error with k=1: 0.094


In [128]:
project = pd.read_csv('projection.txt',sep=" ", header = None)
project.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.015626,-0.019702,-0.005087,0.027078,0.015511,-0.048531,-0.005318,-0.007703,0.029581,0.018437,0.052615,-0.033708,0.064847,-0.033585,-0.017796,-0.061265,-0.025245,-0.019012,-0.053294,-0.059311
1,-0.043534,0.038514,0.061698,-0.026509,-0.014271,-0.010595,0.016936,-0.008244,0.04312,0.020635,-0.015361,0.023042,0.035583,0.056888,-0.041535,0.041684,0.02807,0.03185,0.002475,0.000526
2,-0.016051,-0.016189,0.030413,-0.041727,-0.035487,0.006792,-0.043063,-0.016762,-0.07631,-0.005724,-0.030372,-0.010438,0.044302,-0.003053,-0.026926,-0.019525,0.023908,-0.008592,0.036698,0.006946
3,-0.007883,0.040432,-0.013534,-0.004234,-0.028055,0.018167,-0.043165,-0.02702,0.051053,-0.01405,-0.047476,0.007069,-0.001732,-0.038721,0.048611,-0.018313,-0.009919,0.042606,-0.018157,-0.019787
4,0.039536,0.003373,0.048432,-0.016864,-0.036708,-0.032352,-0.004686,-0.044178,-0.058599,-0.049298,0.039729,0.004514,-0.007552,-0.068464,-0.007477,0.00458,-0.025271,0.050935,-0.056193,0.023619


In [129]:
p_X_test = np.dot(X_test, project)
p_X_train = np.dot(X_train, project)
p_X_valid = np.dot(X_valid, project)

In [130]:
preds = kNN(p_X_train, y_train, p_X_train, 3)
print("Training Error with k=" + str(3) + ": " + str(error(preds, y_train)))

Training Error with k=3: 0.0785


In [136]:
%%timeit -n1 -r1
# each k-value to try
for k in [1,5,9,15]: 
    preds = kNN(p_X_train, y_train, p_X_train, k)
    print("Training Error with k=" + str(k) + ": " + str(error(preds, y_train)))
    preds = kNN(p_X_train, y_train, p_X_valid, k)
    print("Validation Error with k=" + str(k) + ": " + str(error(preds, y_valid)))


Training Error with k=1: 0.0
Validation Error with k=1: 0.32
Training Error with k=5: 0.1495
Validation Error with k=5: 0.285
Training Error with k=9: 0.198
Validation Error with k=9: 0.281
Training Error with k=15: 0.2325
Validation Error with k=15: 0.285
9min 43s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [137]:
preds = kNN(p_X_train, y_train, p_X_test, 9)
print("Test Error with k=" + str(1) + ": " + str(error(preds, y_test)))

Test Error with k=1: 0.282


There is about 2-3x higher error rate by projecting the data onto the projection matrix, however the computational time is 1/2 of the original.