In [1]:
import numpy as np
import glob
import pandas as pd
import stats
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import random

In [2]:
# Collect every CSV in the working directory, then split the names into
# feature files and label files by the marker substring each one contains.
filenames = glob.glob("*.csv")
train_data_files = [name for name in filenames if 'trainData' in name]
train_label_files = [name for name in filenames if 'trainLabels' in name]

In [3]:
# Sort both lists the same way so that position i in one list refers to the
# same fold as position i in the other. The order is plain string order.
train_data_files.sort()
train_label_files.sort()
print(train_data_files, train_label_files, sep='\n')

['trainData1.csv', 'trainData10.csv', 'trainData2.csv', 'trainData3.csv', 'trainData4.csv', 'trainData5.csv', 'trainData6.csv', 'trainData7.csv', 'trainData8.csv', 'trainData9.csv']
['trainLabels1.csv', 'trainLabels10.csv', 'trainLabels2.csv', 'trainLabels3.csv', 'trainLabels4.csv', 'trainLabels5.csv', 'trainLabels6.csv', 'trainLabels7.csv', 'trainLabels8.csv', 'trainLabels9.csv']


In [4]:
# Load every fold, keeping features and labels at the same list position.
# The two filename lists are paired purely by position, so a missing or extra
# file would attach labels to the wrong fold (or fail half-way through the
# load with an IndexError). Fail early with a clear message instead.
if len(train_data_files) != len(train_label_files):
    raise ValueError(
        'Found %d data files but %d label files; every fold needs both.'
        % (len(train_data_files), len(train_label_files))
    )

train_data_folds = []
train_labels_folds = []

for data_file, label_file in zip(train_data_files, train_label_files):
    print('Reading ' + data_file)
    train_data_folds.append(np.genfromtxt(data_file, delimiter=','))
    print('Reading ' + label_file)
    train_labels_folds.append(np.genfromtxt(label_file, delimiter=','))

Reading trainData1.csv
Reading trainLabels1.csv
Reading trainData10.csv
Reading trainLabels10.csv
Reading trainData2.csv
Reading trainLabels2.csv
Reading trainData3.csv
Reading trainLabels3.csv
Reading trainData4.csv
Reading trainLabels4.csv
Reading trainData5.csv
Reading trainLabels5.csv
Reading trainData6.csv
Reading trainLabels6.csv
Reading trainData7.csv
Reading trainLabels7.csv
Reading trainData8.csv
Reading trainLabels8.csv
Reading trainData9.csv
Reading trainLabels9.csv


In [5]:
# Held-out test set: features first, then the matching labels.
test_data, test_labels = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ('testData.csv', 'testLabels.csv')
)

In [6]:
def cross_validation(train_data_list, train_labels_list, k, k_fold):
    """Score k-NN on one held-out fold, training on all remaining folds.

    Parameters
    ----------
    train_data_list : sequence of array-like
        One feature array per fold. The training folds are stacked along
        axis 0.
    train_labels_list : sequence of array-like
        One label array per fold, in the same order as ``train_data_list``.
    k : int
        Number of neighbours; passed through unchanged to ``evaluate``.
    k_fold : int
        Index of the fold to hold out for validation. Negative values count
        from the end, like ordinary sequence indexing.

    Returns
    -------
    The value returned by ``evaluate`` for the held-out fold (its accuracy).

    Raises
    ------
    ValueError
        If the two lists differ in length, or if there are fewer than two
        folds (nothing would be left to train on).
    IndexError
        If ``k_fold`` does not refer to an existing fold.
    """
    n_folds = len(train_data_list)
    if len(train_labels_list) != n_folds:
        raise ValueError(
            'Got %d data folds but %d label folds.'
            % (n_folds, len(train_labels_list))
        )
    if n_folds < 2:
        raise ValueError('Cross-validation needs at least two folds.')
    if not -n_folds <= k_fold < n_folds:
        raise IndexError(
            'k_fold=%d is out of range for %d folds.' % (k_fold, n_folds)
        )

    # Normalise negative indices first. Slicing with a negative k_fold, as in
    # list[:k_fold] + list[k_fold+1:], would put the validation fold back into
    # the training set and inflate the score.
    held_out = k_fold % n_folds

    train_data = np.concatenate(
        [fold for i, fold in enumerate(train_data_list) if i != held_out],
        axis=0,
    )
    train_labels = np.concatenate(
        [fold for i, fold in enumerate(train_labels_list) if i != held_out],
        axis=0,
    )

    # Reuse the shared k-NN scoring routine instead of keeping a second copy
    # of the neighbour search and voting loop here. `evaluate` is defined in
    # the following cell; it only has to exist when this function is called.
    return evaluate(
        train_data,
        train_labels,
        train_data_list[held_out],
        train_labels_list[held_out],
        k,
    )

In [7]:
def evaluate(train_X, train_y, test_X, test_y, k):
    """Classify ``test_X`` with brute-force k-NN and return the accuracy.

    Each test row is assigned the most frequent label among its ``k`` nearest
    training rows under Euclidean distance.

    Parameters
    ----------
    train_X : array-like, 2-D
        Training features, one row per sample.
    train_y : array-like, 1-D
        Training labels, aligned with the rows of ``train_X``.
    test_X : array-like, 2-D
        Rows to classify.
    test_y : array-like, 1-D
        True labels for ``test_X``.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= len(train_X)``.

    Returns
    -------
    The result of ``accuracy_score(test_y, predictions)``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training rows.

    Notes
    -----
    Ties in the vote are resolved deterministically in favour of the smallest
    label, so repeated runs give identical results.
    """
    train_X = np.asarray(train_X)
    train_y = np.asarray(train_y)
    test_X = np.asarray(test_X)

    n_train = len(train_X)
    if not 1 <= k <= n_train:
        raise ValueError(
            'k must be between 1 and the number of training rows (%d), got %r.'
            % (n_train, k)
        )

    preds = []
    for row in test_X:
        instance = row.reshape(1, -1)
        # Squared distances rank neighbours the same way as true distances
        # (sqrt is monotonic), so the square root is not needed.
        sq_dists = np.sum((train_X - instance) ** 2, axis=1)
        # Partition at position k - 1: the first k indices are then the k
        # nearest rows, and k == n_train stays within bounds.
        nearest = np.argpartition(sq_dists, k - 1)[:k]
        # np.unique returns labels in sorted order and argmax picks the first
        # maximum, so a tied vote goes to the smallest label.
        labels, counts = np.unique(train_y[nearest], return_counts=True)
        preds.append(labels[np.argmax(counts)])

    return accuracy_score(test_y, preds)

In [8]:
# Try every neighbour count from 1 to k_max. For each one, average the
# validation accuracy over all folds, then keep the count with the best mean.
k_max = 30
k_folds = 10

accuracy_list = [
    np.mean([
        cross_validation(train_data_folds, train_labels_folds, n_neighbors, fold)
        for fold in range(k_folds)
    ])
    for n_neighbors in range(1, k_max + 1)
]

# accuracy_list[0] belongs to k = 1, hence the + 1 to get a neighbour count.
k_optimized = accuracy_list.index(max(accuracy_list)) + 1
print(k_optimized)

19


In [9]:
# k_optimized is a 1-based neighbour count, while accuracy_list is 0-based
# (index 0 holds the score for k = 1). Subtract one to read the score that
# belongs to the selected k rather than the one for k + 1.
accuracy_list[k_optimized - 1]

0.8109999999999999

In [10]:
# Stack all folds back into one training set for the final fit.
final_train_x, final_train_y = (
    np.concatenate(folds, axis=0)
    for folds in (train_data_folds, train_labels_folds)
)

In [11]:
# Test-set accuracy of the hand-written k-NN with the selected k.
evaluate(
    train_X=final_train_x,
    train_y=final_train_y,
    test_X=test_data,
    test_y=test_labels,
    k=k_optimized,
)

0.7272727272727273

In [12]:
# Run scikit-learn's k-NN with the same k. Passing cv=10 asks scikit-learn to
# make its own 10 splits of the concatenated training data.
neigh = KNeighborsClassifier(n_neighbors=k_optimized)
cv_results = cross_validate(
    estimator=neigh,
    X=final_train_x,
    y=final_train_y,
    cv=10,
)

In [13]:
# Mean validation accuracy across the scikit-learn splits.
cv_results['test_score'].mean()

0.8099999999999999

In [14]:
# Fit on the full training set, predict the test rows, and score them.
sk_pred = neigh.fit(final_train_x, final_train_y).predict(test_data)
accuracy_score(test_labels, sk_pred)

0.7272727272727273

In [15]:
# Search the same range of neighbour counts (1 to 30) with scikit-learn.
# NOTE(review): no `cv` argument is passed, so GridSearchCV uses its default
# splitting rather than the 10 folds used above - confirm this is intended.
parameters = {'n_neighbors': list(np.arange(1, 31))}
neigh = KNeighborsClassifier()
clf = GridSearchCV(estimator=neigh, param_grid=parameters).fit(
    final_train_x, final_train_y
)

clf.best_params_

{'n_neighbors': 29}

In [16]:
# Refit with the neighbour count chosen by the grid search. Reading it from
# clf.best_params_ (instead of a number copied by hand from the printed
# output) keeps this cell correct when the data or the search range changes.
neigh = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'])
neigh.fit(final_train_x,final_train_y)
sk_pred = neigh.predict(test_data)
accuracy_score(test_labels, sk_pred)

0.7636363636363637

In [17]:
# Score the hand-written k-NN with the neighbour count selected by the grid
# search, read from clf.best_params_ rather than hard-coded, so both
# implementations are always compared at the same k.
evaluate(final_train_x, final_train_y, test_data, test_labels, clf.best_params_['n_neighbors'])

0.7636363636363637