In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as scp
import sklearn.neighbors
import time
from data_utils import load_dataset

# test

Below are the 3 regression datasets

In [None]:
# Load the 'mauna_loa' dataset as six arrays: inputs and targets for train/valid/test.
# NOTE: every dataset cell in this notebook rebinds these same six names, so later
# cells operate on whichever dataset was loaded most recently.
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mauna_loa')

In [None]:
# Load the 'rosenbrock' dataset, requesting n_train=1000 and d=2 from load_dataset
# (presumably 1000 training points in 2 input dimensions — confirm in data_utils).
# NOTE: overwrites the x_*/y_* arrays left by any previously executed load cell.
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('rosenbrock', n_train=1000, d=2)

In [None]:
# Load the 'pumadyn32nm' dataset into the shared x_*/y_* train/valid/test names
# (replaces whatever dataset those names held before).
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('pumadyn32nm')

Below are the 2 classification datasets

In [None]:
# Load the 'iris' dataset into the shared x_*/y_* train/valid/test names
# (replaces whatever dataset those names held before).
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')

In [None]:
# Load the 'mnist_small' dataset into the shared x_*/y_* train/valid/test names.
# When the notebook is run top to bottom this is the last of the five load cells,
# so it is the dataset the exploration cells that follow will see.
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mnist_small')

The cells below explore how the loaded datasets are structured.

In [None]:
# Show the dimensions of the test inputs, then the raw values.
print(f"x_test shape: {x_test.shape}\n")
print(x_test)

In [None]:
# Show the dimensions of the test targets, then the raw values.
print(f"y_test shape: {y_test.shape}\n")
print(y_test)

In [None]:
# Hand-rolled 3-nearest-neighbour lookup (L1 distance) for the first test point,
# leaving the point in its native shape and relying on broadcasting.
x = x_test[0]
a = np.abs(x_train - x)                          # per-feature absolute differences
s = a.sum(axis=1).reshape((a.shape[0], -1))      # one L1 distance per training row, as a column
partition = np.argpartition(s, kth=3, axis=0)    # rows before position 3 hold the 3 smallest
kNN = partition[:3]

print(f"x_train shape: {x_train.shape}")
print(f"x_test point shape: {x.shape}")
print(f"abs diff shape: {a.shape}")
print(f"sum shape: {s.shape}\n")

print("x_train:\n", x_train)
print("x_test point:", x)
print("abs diff:\n", a)
print("sum:\n", s)
print("3 kNN indices:\n", kNN)
print("3 kNN values:\n", s[kNN, 0])

In [None]:
# Same 3-nearest-neighbour lookup as before, but with the test point explicitly
# reshaped to a single row so the broadcast against x_train is spelled out.
x = x_test[0].reshape((1, -1))
a = np.abs(x_train - x)                          # per-feature absolute differences
s = a.sum(axis=1).reshape((a.shape[0], -1))      # one L1 distance per training row, as a column
partition = np.argpartition(s, kth=3, axis=0)    # rows before position 3 hold the 3 smallest
kNN = partition[:3]

print(f"x_train shape: {x_train.shape}")
print(f"x_test point shape: {x.shape}")
print(f"abs diff shape: {a.shape}")
print(f"sum shape: {s.shape}\n")

print("x_train:\n", x_train)
print("x_test point:", x)
print("abs diff:\n", a)
print("sum:\n", s)
print("3 kNN indices:\n", kNN)
print("3 kNN values:\n", s[kNN, 0])

Below is an implementation of the kNN algorithm for regression.

In [None]:
# first, implement distance metric as separate function
def minkowski_dists(x_train, x_test_i, l=2):
    """
    Returns the Minkowski distance of order l between every row of
    x_train and the single point x_test_i.

    x_train is a (N,n) array (N points, n features each)
    x_test_i is a (n,) array (anything reshapeable to (1,n) is accepted)
    l is the order of the norm and must be > 0:
        l=1 Manhattan, l=2 Euclidean, l=np.inf Chebyshev (largest
        per-feature difference)

    dists is a (N,1) array ((0,1) when x_train has no rows)

    Raises ValueError if l is not a positive number
    """
    # `not l > 0` also rejects NaN, which would otherwise propagate silently
    if not l > 0:
        raise ValueError("l must be a positive number (or np.inf), got {}".format(l))

    x_test_i = np.reshape(x_test_i, (1, -1))  # (1,n) row so it broadcasts against every training row
    diff = np.abs(x_train - x_test_i)

    if np.isinf(l):
        # limit of the l-norm as l -> inf is the largest coordinate difference
        return np.max(diff, axis=1, keepdims=True)

    # keepdims preserves the (N,1) column shape and, unlike reshape((N,-1)),
    # still works when N == 0
    sigma = np.sum(np.power(diff, l), axis=1, keepdims=True)  # axis = 1 --> sum along rows
    return np.power(sigma, 1.0 / l)
    

In [None]:
def kNN_regress(x_train, y_train, x_test, k=1, l=2):
    """
    Returns y_test predictions for the x_test inputs using the kNN algorithm:
    each prediction is the mean target of the k training points closest to
    the test point under the Minkowski distance of order l.

    x_train is a (N,n) array
    y_train is a (N,1) array (regression: single output; only column 0 is read)
    x_test is a (T,n) array
    k is the number of neighbours, 1 <= k <= N
    l is the order of the Minkowski distance

    y_test is a (T,1) array (regression: single output)

    Raises ValueError if k is outside [1, N]
    """
    num_train_points = x_train.shape[0]
    if not 1 <= k <= num_train_points:
        raise ValueError(
            "k must satisfy 1 <= k <= number of training points ({}), got {}".format(num_train_points, k)
        )

    num_test_points = x_test.shape[0]
    y_test = np.empty((num_test_points, 1))

    # compute distances (for each test point)
    # assume distance metric is the Minkowski distance
    for i, x_test_i in enumerate(x_test):
        dists = minkowski_dists(x_train, x_test_i, l)  # (N,1) column of distances
        # kth=k-1 puts the k-th smallest distance at position k-1 with everything
        # before it no larger, so the first k indices are the k nearest neighbours.
        # (kth=k would be out of bounds when k == N.)
        nearest = np.argpartition(dists[:, 0], kth=k - 1)[:k]
        y_test[i, 0] = np.mean(y_train[nearest, 0])

    return y_test
    

Testing regression algorithm:

In [None]:
# Inspect the training inputs currently in scope: dimensions first, then the full array.
print(x_train.shape)
print(x_train)

In [None]:
# First test point; the distance/kNN smoke tests that follow use it as the query.
print(x_test[0])

In [None]:
# Smoke-test minkowski_dists: distances from the first test point to every
# training row under l=1 and l=2.
x_test_i = x_test[0]
dists_one = minkowski_dists(x_train,x_test_i,l=1)
dists_two = minkowski_dists(x_train,x_test_i,l=2)
print(dists_one.shape)
print(dists_two.shape)
print("l=1:",dists_one)
print("l=2:",dists_two)
# Sanity check: the l=1 norm is never smaller than the l=2 norm, so every
# entry of this difference should be >= 0.
print(dists_one - dists_two)

In [None]:
# Smoke-test kNN_regress: predict every test point with k=3 neighbours under
# l=1 and l=2, then print the element-wise difference between the two
# prediction columns to see how much the metric choice matters.
k = 3
y_test_one = kNN_regress(x_train, y_train, x_test, k, l=1)
y_test_two = kNN_regress(x_train, y_train, x_test, k, l=2)

print(y_test_one)
print("\n\n\n")
print(y_test_two)
print("\n\n\n")
print(y_test_one - y_test_two)

Implementation of v-fold cross-validation with v = 5:

In [None]:
def v_fold_cross_validation_RMSE(x_train, y_train, N, k=1, max_l=2, v=5, testing=False):
    """
    Conducts v-fold cross validation of kNN regression on x_train / y_train.

    x_train is a (N,n) array, y_train is a (N,1) array
    N is the number of rows to use (indices 0..N-1 are shuffled into folds)
    k is the number of neighbours passed to kNN_regress
    max_l: the model is evaluated with Minkowski order l = 1, 2, ..., max_l
    v is the number of folds, 2 <= v <= N; when N is not a multiple of v the
        first N % v folds receive one extra point
    testing=True suppresses the per-l report printed at the end

    Fold membership is drawn from NumPy's global RNG, so results vary between
    calls unless the caller seeds it.

    Returns the RMSE averaged across the folds for each l
    RMSE_avg is a (max_l,) array; RMSE_avg[l-1] belongs to order l

    Raises ValueError if v is outside [2, N] or max_l < 1
    """
    if not 2 <= v <= N:
        raise ValueError("v must satisfy 2 <= v <= N ({}), got {}".format(N, v))
    if max_l < 1:
        raise ValueError("max_l must be at least 1, got {}".format(max_l))

    # shuffle the row indices and cut them into v nearly equal folds;
    # array_split copes with N % v != 0 without any padding
    folds = np.array_split(np.random.permutation(N), v)

    RMSE = np.empty((v, max_l))

    # train and test for each fold
    for fold_no, val_idx in enumerate(folds):
        # every fold except the held-out one forms the training set
        train_idx = np.concatenate(folds[:fold_no] + folds[fold_no + 1:])

        x_tr, y_tr = x_train[train_idx], y_train[train_idx]
        x_val, y_val = x_train[val_idx], y_train[val_idx]

        for l in range(1, max_l + 1):
            y_pred = kNN_regress(x_tr, y_tr, x_val, k=k, l=l)
            RMSE[fold_no, l - 1] = np.sqrt(np.mean(np.square(y_val - y_pred)))

    RMSE_avg = np.mean(RMSE, axis=0)

    if not testing:
        # report every evaluated order (not just l=1 and l=2)
        for l, rmse in enumerate(RMSE_avg, start=1):
            print("k={a}, l={b}, RMSE={e_avg}".format(a=k, b=l, e_avg=round(rmse, 6)))

    return RMSE_avg

def estimate_best_param(x_train, y_train, v=5, max_l=2):
    """
    Estimates the best k and l values for the kNN regressor (i.e., tunes the model parameters)
    Uses cross validation with v folds for tuning
    Repeats training with Minkowski order l = 1, ..., max_l

    x_train is a (N,n) array, y_train is a (N,1) array; they are assumed to
    already contain both the training and the validation sets

    k is searched over 1..k_max, where k_max = int(sqrt(N)) (rule of thumb:
    k < sqrt(N)), capped at the size of the smallest training fold so that k
    never exceeds the number of available neighbours

    Returns RMSE_avg, best_k, best_l
    RMSE_avg is a (k_max,max_l) array; RMSE_avg[k-1, l-1] is the fold-averaged RMSE for (k, l)
    best_k and best_l are Python ints (1-based) minimising RMSE_avg

    Raises ValueError if v < 2 or there are too few points to tune over
    """
    if v < 2:
        raise ValueError("v must be at least 2, got {}".format(v))

    print("Running...")

    N = x_train.shape[0]  # number of training points
    largest_fold = -(-N // v)  # ceil(N / v): size of the biggest held-out fold
    k_max = min(int(np.sqrt(N)), N - largest_fold)
    if k_max < 1:
        raise ValueError("not enough training points ({}) to tune k with v={} folds".format(N, v))
    RMSE_avg = np.empty((k_max, max_l))  # RMSE_avg[k-1][l-1] stores the average RMSE value for k, l distance metric

    # timing estimate: time one mid-range k and scale by the number of k values
    t0 = time.time()
    print("Estimating running time...")
    probe_k = max(1, k_max // 2)  # k_max // 2 is 0 when k_max == 1
    v_fold_cross_validation_RMSE(x_train, y_train, N=N, k=probe_k, max_l=max_l, v=v, testing=True)
    print("Estimated running time: {}s".format(round((time.time()-t0)*k_max,2)))

    # start parameter tuning
    t0 = time.time()
    print("Beginning parameter tuning...")

    for k in range(1, k_max + 1):
        RMSE_avg[k-1] = v_fold_cross_validation_RMSE(x_train, y_train, N=N, k=k, max_l=max_l, v=v)

    # argmin over the flattened grid -> (row, col), shifted to 1-based plain ints
    best_k, best_l = (int(ix) + 1 for ix in np.unravel_index(np.argmin(RMSE_avg), RMSE_avg.shape))
    print("Best: k={}, l={}; with min avg RMSE={}".format(best_k, best_l, round(RMSE_avg[best_k-1,best_l-1],6)))
    print("took {}s".format(round(time.time()-t0,2)))
    return RMSE_avg, best_k, best_l

In [None]:
# Tune k and l on 'pumadyn32nm'. Cross validation makes its own validation
# splits, so the provided validation set is stacked onto the training set first.
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('pumadyn32nm')
x_train = np.vstack([x_valid, x_train])
y_train = np.vstack([y_valid, y_train])

# NOTE: fold assignment inside the cross validation is random and no seed is set
# in this notebook, so the reported best (k, l) can differ from run to run.
RMSE_avg, best_k, best_l = estimate_best_param(x_train, y_train, v=5, max_l=2)

In [None]:
# Scratch: indexing the first axis of a 3-D array drops that axis.
R = np.ones(shape=(4, 5, 2), dtype=int)  # (4,5,2) array
print(R)
print(R[0].shape)  # (5,2) array

In [None]:
# Scratch: range(start, stop, step) — prints 0, 2, 4, 6 (stop is exclusive).
for i in range(0,8,2):
    print(i)

In [None]:
# Scratch: allocate an int(sqrt(26)) x 2 = 5 x 2 array the way the tuning code
# sizes its RMSE table. np.empty does not initialise memory, so the printed
# values are arbitrary.
e = np.empty((int(np.sqrt(26)),2))
print(e)
print(e.shape)

In [None]:
# Scratch: check what type a shape entry has (it is used as N in the tuning code).
type(x_train.shape[0])

In [None]:
# Scratch: pad 7 indices with -1 sentinels up to the next multiple of 5
# (7 + 5 - 7 % 5 = 10 slots), shuffle them, and fold into 5 rows.
a = np.full(7 + 5 - 7 % 5, -1, dtype=int)
print(a.shape)
print(a)

a[:7] = np.arange(7, dtype=int)  # real indices occupy the first 7 slots
print(a)
a = np.random.permutation(a)
print(a)
a = a.reshape((5, -1))
print(a)

In [None]:
# Scratch: take fold i out of the padded index grid and strip the -1 sentinels.
i = 1
v = a[i]  # row i, sentinels still included
# (array-returning alternative: a[i, a[i] >= 0])
print(v)
v = [entry for entry in v if entry >= 0]  # keep real indices only, as a list
print(v)

In [None]:
# Scratch: remove fold i from the grid; what remains is the training portion,
# first kept as rows, then as a flat vector.
t_ = np.delete(a, i, axis=0)
print(t_)

t = np.delete(a, i, axis=0).ravel()  # flattens the array
print(t)

In [None]:
# Scratch: boolean-mask away the negative sentinels. Masking the 2-D t_ returns
# a flat 1-D array, so both results end up holding the same values.
t_ = t_[t_>=0]
print(t_)

t = t[t>=0]
print(t)

In [None]:
# Scratch: small 3x2 grid holding 0..5 (row-major) for trying out axis arguments.
data = np.arange(6).reshape((3, 2))
print(data)

In [None]:
# Scratch: axis=0 averages down each column, giving one value per column.
print(np.average(data,axis=0))

In [None]:
# Scratch: step-by-step trace of the fold construction for N=12 points and
# v=5 folds, where N is not a multiple of v.
N = 12
v = 5
idx = np.arange(N,dtype=int)
print(idx)
if (N%v != 0):
    # set aside N % v randomly chosen indices so the remainder divides evenly by v
    extra = np.random.choice(idx, size=N%v, replace=False)
    print(extra)
    # np.delete removes by position; idx is arange(N) here, so position == value
    idx = np.delete(idx,extra)
    idx = np.random.permutation(idx).reshape((v,-1))
    print(idx)
    # extra column: one set-aside index for each of the first rows, -1 sentinel elsewhere
    temp = -1 * np.ones(idx.shape[0], dtype=int)
    temp[:extra.shape[0]] = extra
    temp = temp.reshape(idx.shape[0],1)
    print(temp)
    # each row is now one fold; the -1 entries are padding to be filtered out later
    idx = np.hstack((idx,temp))
    print(idx)