In [1]:
import numpy as np
import matplotlib as mpl 
import matplotlib.pyplot as plt
import scipy
import sklearn.neighbors
import time

from data_utils import load_dataset

In [2]:
def knn_classification(x_train, y_train, x_test, k, l=2):
    y_star = np.empty((x_test.shape[0], 1))
    if (l==1):
        tree = sklearn.neighbors.KDTree(x_train, metric='cityblock')
    elif (l==2):
        tree = sklearn.neighbors.KDTree(x_train, metric='euclidean')
    else:
        pass
    k_nearest_neighbours = tree.query(x_test, k=k, return_distance=False, sort_results=False)
    #print(k_nearest_neighbours)
    for i, x_test_i in enumerate(x_test):
        vals, counts = np.unique(y_train[k_nearest_neighbours[i]], return_counts=True)
        #print(vals)
        #print(counts)
        if np.sum(counts==np.max(counts)) != 1:
            #print(k_nearest_neighbours[0])
            #print(y_train[k_nearest_neighbours[0]])
            y_star[i, 0] = y_train[k_nearest_neighbours[0,0]]   # pick closest neighbour if tie
        else:
            y_star[i, 0] = vals[np.argmax(counts)]
    #print(y_star)
    return y_star

In [3]:
np.random.seed(1000)

x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')

# convert onehot encoding to integer class
# np.tile, np.arange faster than np.argmax method
y_train = np.tile(np.arange(y_train.shape[1]).reshape((1,-1)), (y_train.shape[0], 1))[y_train].reshape((-1, 1))
y_valid = np.tile(np.arange(y_valid.shape[1]).reshape((1,-1)), (y_valid.shape[0], 1))[y_valid].reshape((-1, 1))
y_test = np.tile(np.arange(y_test.shape[1]).reshape((1,-1)), (y_test.shape[0], 1))[y_test].reshape((-1, 1))

n = int(x_train.shape[0])
t0 = time.time()
accuracy = np.empty((int(np.sqrt(n)), 2))
for a in range(1, int(np.sqrt(n))+1, 1):
    for b in range(1, 3, 1):
        y_star = knn_classification(x_train, y_train, x_valid, a, b)
        accuracy[a-1, b-1] = np.mean(y_star == y_valid)
        print("k={a}, l={b}, accuracy={accuracy}".format(a=a, b=b, accuracy=round(accuracy[a-1, b-1], 6)))

best = np.unravel_index(np.argmax(accuracy), accuracy.shape)
print("best params at k={a}, l={b} with accuracy={accuracy}".format(a=best[0]+1, b=best[1]+1, accuracy=round(accuracy[best], 6)))
print("took {t}s".format(t=round(time.time()-t0, 2)))
y_star = knn_classification(x_train, y_train, x_test, best[0]+1, best[1]+1)
print('test a ccuracy with best model: {accuracy}'.format(accuracy=round(np.mean(y_star == y_test), 6)))


k=1, l=1, accuracy=0.774194
k=1, l=2, accuracy=0.774194
k=2, l=1, accuracy=0.806452
k=2, l=2, accuracy=0.806452
k=3, l=1, accuracy=0.774194
k=3, l=2, accuracy=0.806452
k=4, l=1, accuracy=0.806452
k=4, l=2, accuracy=0.806452
k=5, l=1, accuracy=0.806452
k=5, l=2, accuracy=0.83871
k=6, l=1, accuracy=0.774194
k=6, l=2, accuracy=0.806452
k=7, l=1, accuracy=0.806452
k=7, l=2, accuracy=0.870968
k=8, l=1, accuracy=0.806452
k=8, l=2, accuracy=0.83871
k=9, l=1, accuracy=0.83871
k=9, l=2, accuracy=0.870968
k=10, l=1, accuracy=0.83871
k=10, l=2, accuracy=0.870968
best params at k=7, l=2 with accuracy=0.870968
took 0.02s
test a ccuracy with best model: 1.0


In [None]:
np.random.seed(1000)

x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mnist_small')

# convert onehot encoding to integer class
# np.tile, np.arange faster than np.argmax method
y_train = np.tile(np.arange(y_train.shape[1]).reshape((1,-1)), (y_train.shape[0], 1))[y_train].reshape((-1, 1))
y_valid = np.tile(np.arange(y_valid.shape[1]).reshape((1,-1)), (y_valid.shape[0], 1))[y_valid].reshape((-1, 1))
y_test = np.tile(np.arange(y_test.shape[1]).reshape((1,-1)), (y_test.shape[0], 1))[y_test].reshape((-1, 1))

print('dataset mnist_small')
n = int(x_train.shape[0])
print(n)
t0 = time.time()
accuracy = np.empty((int(np.sqrt(n)), 2))
for a in range(1, int(np.sqrt(n))+1, 1):
    for b in range(1, 3, 1):
        predictions = knn_classification(x_train, y_train, x_valid, a, b)
        accuracy[a-1, b-1] = np.mean(predictions == y_valid)
        print("k={a}, l={b}, accuracy={accuracy}".format(a=a, b=b, accuracy=round(accuracy[a-1, b-1], 6)))
best = np.unravel_index(np.argmax(accuracy), accuracy.shape)
print("best params at k={a}, l={b} with accuracy={accuracy}".format(a=best[0]+1, b=best[1]+1, accuracy=round(accuracy[best], 6)))
print("took {t}s".format(t=round(time.time()-t0, 2)))
y_star = knn_classification(x_train, y_train, x_test, best[0]+1, best[1]+1)
print('test accuracy with best model: {accuracy}'.format(accuracy=round(np.mean(y_star == y_test), 6)))
