In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.neighbors import KDTree
from scipy.stats import mode
from scipy.stats import rankdata

# Question 1

## 1.1

In [2]:
# Load the training split from an ASCII text file.
train = np.loadtxt(fname='pa1train.txt', encoding='ascii')

In [3]:
# Load the test split from an ASCII text file.
test = np.loadtxt(fname='pa1test.txt', encoding='ascii')

In [4]:
# Load the validation split from an ASCII text file.
validate = np.loadtxt(fname='pa1validate.txt', encoding='ascii')

In [5]:
# Sanity-check the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, validate.shape, sep='\n')

(2000, 785)
(1000, 785)
(1000, 785)


In [6]:
# Split each data set into X (features: every column except the last)
# and y (target: the last column).
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]
X_valid, y_valid = validate[:, :-1], validate[:, -1]

In [10]:
def knn_classifier(X_train, y_train, X_test, y_test, k):
    '''Return the k-nearest-neighbour classification error rate on a test set.

    Every row of X_test is labelled by majority vote among the labels of its
    k closest rows of X_train (Euclidean distance, the cdist default). When
    several labels tie for the most votes, the smallest label wins.

    Parameters
    ----------
    X_train : 2-D array, one training sample per row.
    y_train : 1-D array of training labels (cast to int before voting).
    X_test : 2-D array, one test sample per row.
    y_test : 1-D array of true labels for the rows of X_test.
    k : int, number of neighbours that vote; 1 <= k <= len(X_train).

    Returns
    -------
    float
        Fraction of test rows whose predicted label differs from y_test.

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of training rows.
    '''
    n_train = len(X_train)
    if not 1 <= k <= n_train:
        raise ValueError(
            'k must be between 1 and the number of training rows ({}), got {}'.format(n_train, k)
        )

    # distance_mat[i, j] is the distance from training row i to test row j.
    distance_mat = cdist(X_train, X_test)

    # After partitioning each column around position kth, its first k entries
    # are the indices of the k smallest distances. argpartition requires
    # kth < n_train; when k == n_train every row is a neighbour anyway, so the
    # pivot is clamped to the last valid position.
    kth = min(k, n_train - 1)
    ind = np.argpartition(distance_mat, kth, axis=0)[:k]
    neighbor_labels = np.asarray(y_train)[ind].astype(int)  # shape (k, n_test)

    # Majority vote without scipy.stats.mode, whose output shape depends on the
    # SciPy version. np.unique returns the labels sorted ascending and argmax
    # returns the first maximum, so ties go to the smallest label.
    classes = np.unique(neighbor_labels)
    votes = (
        neighbor_labels[np.newaxis, :, :] == classes[:, np.newaxis, np.newaxis]
    ).sum(axis=1)  # shape (n_classes, n_test)
    prediction = classes[votes.argmax(axis=0)]

    return 1 - (prediction == y_test).sum()/len(y_test)  # error rate

In [11]:
# KDTree version of knn_classifier (alternative implementation, kept commented out for reference)
# def knn_classifier(X_train, y_train, X_test, y_test, n):
#     '''get the accuracy error rate with training set and testing set
#     '''
#     tree = KDTree(X_train) # initialize a kdtree with training set
#     ind = tree.query(X_test, k = n)[1] # get the index of the nearest k points' indecies
#     prediction = np.concatenate(mode(y_train[ind].astype(int), axis = 1)[0]) 
#     #get prediction set by taking the most frequent element
#     return 1 - (prediction == y_test).sum()/len(y_test) # calculating training error

In [12]:
# Training error of the 3-NN classifier (the training set is scored against itself).
knn_classifier(X_train, y_train, X_test=X_train, y_test=y_train, k=3)

0.04349999999999998

In [13]:
# Training and validation error of the k-NN classifier for each candidate k.
train_err, valid_err = [], []
for k in (1, 5, 9, 15):
    train_err.append(knn_classifier(X_train, y_train, X_train, y_train, k))
    valid_err.append(knn_classifier(X_train, y_train, X_valid, y_valid, k))

In [14]:
# Collect the full-dimension errors in one table, one row per k.
output1 = pd.DataFrame(
    data={'Training Error': train_err, 'Validation Error': valid_err},
    index=pd.Series([1, 5, 9, 15], name='k'),
)

In [15]:
# Render the full-dimension error table (rows indexed by k).
display(output1)

Unnamed: 0_level_0,Training Error,Validation Error
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.082
5,0.0565,0.095
9,0.0685,0.104
15,0.0925,0.108


According to the table above, k = 1 gives both the lowest training error and the lowest validation error, so the 1-NN classifier is the best choice.

In [16]:
# computing test error for k = 1

In [17]:
# Test error of the classifier selected on the validation set (k = 1).
print(
    'Test Error for the k = 1 knn classifier is {}'.format(
        knn_classifier(X_train, y_train, X_test, y_test, k=1)
    )
)

Test Error for the k = 1 knn classifier is 0.09399999999999997


## 1.2

In [18]:
# Load the projection matrix from an ASCII text file.
projection = np.loadtxt(fname='projection.txt', encoding='ascii')

In [19]:
# Check the dimensions of the projection matrix.
print(projection.shape)

(784, 20)


In [20]:
# Project the features of every split with the same projection matrix.
X_train_proj, X_test_proj, X_valid_proj = (
    features.dot(projection) for features in (X_train, X_test, X_valid)
)

In [21]:
# Training and validation error on the projected features for each candidate k.
train_err, valid_err = [], []
for k in (1, 5, 9, 15):
    train_err.append(knn_classifier(X_train_proj, y_train, X_train_proj, y_train, k))
    valid_err.append(knn_classifier(X_train_proj, y_train, X_valid_proj, y_valid, k))

In [22]:
# Collect the projected-feature errors in one table, one row per k.
output2 = pd.DataFrame(
    data={'Training Error': train_err, 'Validation Error': valid_err},
    index=pd.Series([1, 5, 9, 15], name='k'),
)

In [23]:
# Render the projected-feature error table (rows indexed by k).
display(output2)

Unnamed: 0_level_0,Training Error,Validation Error
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,0.32
5,0.1945,0.299
9,0.2305,0.302
15,0.257,0.289


Based on the validation error, the best classifier is the 15-NN classifier (validation error 0.289, the lowest in the table above).

In [24]:
# Test error of the classifier selected on the validation set (k = 15, projected features).
print(
    'Test Error for the 15-knn classifier is {}'.format(
        knn_classifier(X_train_proj, y_train, X_test_proj, y_test, k=15)
    )
)

Test Error for the 15-knn classifier is 0.29600000000000004


In [25]:
# Compare the best (lowest) validation error before and after the projection.
print('After we done project the original data, the accuracy of the prediction was siginicantly reduced.')
print(
    'With full dimentioan dataset, the best validation error is {}'.format(
        output1['Validation Error'].min()
    )
)
print(
    'Wihile, with 20 dimentional projection dataset, the best validation error is {}'.format(
        output2['Validation Error'].min()
    )
)

After we done project the original data, the accuracy of the prediction was siginicantly reduced.
With full dimentioan dataset, the best validation error is 0.08199999999999996
Wihile, with 20 dimentional projection dataset, the best validation error is 0.28900000000000003


In [26]:
# Time one full knn_classifier call with k = 15, first on the original
# features and then on the projected features, using the %timeit line magic.
print('the time cost for training and predicting a 15-nn classfier with full dimention training dataset')
%timeit knn_classifier(X_train, y_train, X_test, y_test, 15)
print('the time cost for training and predicting a 15-nn classfier with 20 projected dimention training dataset')
%timeit knn_classifier(X_train_proj, y_train, X_test_proj, y_test, 15)

the time cost for training and predicting a 15-nn classfier with full dimention training dataset
1.99 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
the time cost for training and predicting a 15-nn classfier with 20 projected dimention training dataset
82.7 ms ± 437 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


According to the timings in the previous cell, using the projected dataset significantly reduced the running time (about 83 ms versus about 2 s per call).

In conclusion, projecting the data onto a lower dimension improves the running time at the cost of prediction accuracy.