In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import seaborn as sns
import time
from IPython.display import display, clear_output


# K-Fold Cross Validation

Crea los k folds (Por cada uno crea 3 csv's: Train, Test y Expected)

In [None]:
def create_k_folds(train, k):
    """Shuffle ``train`` and write k cross-validation folds as CSV files.

    For every fold index ``i`` in ``range(k)`` three files are written under
    ``./k-fold``:

    * ``expected_{i}.csv`` - only the ``label`` column of the held-out fold.
    * ``test_{i}.csv``     - the held-out fold without its ``label`` column.
    * ``train_{i}.csv``    - the concatenation of all the other folds.

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled data; must contain a ``label`` column.
    k : int
        Number of folds. With ``k == 1`` there are no remaining folds to
        concatenate for the training file and ``pd.concat`` raises.

    Returns
    -------
    None
        The function is used only for the files it writes.

    Notes
    -----
    NOTE(review): ``run_PCA``, ``run_kNN`` and ``confusion_matrix`` in this
    notebook read from ``./k-fold/pca`` and ``./k-fold/knn``; this function
    writes directly into ``./k-fold`` and does not populate those
    sub-directories.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    # The output directory must exist before to_csv is called, otherwise
    # pandas refuses to write on a fresh checkout.
    os.makedirs('./k-fold', exist_ok=True)

    # frac=1 returns every row in random order.
    train = train.sample(frac=1)
    n_rows = len(train)
    # Integer arithmetic spreads any remainder over the folds.
    folds = [train[i*n_rows//k : (i+1)*n_rows//k] for i in range(k)]

    for i in range(k):
        held_out = folds[i]

        expected = held_out[['label']]
        expected.to_csv('./k-fold/expected_' + str(i) + '.csv', index=False)

        held_out.drop(['label'], axis=1).to_csv('./k-fold/test_' + str(i) + '.csv', index=False)

        new_train = pd.concat(folds[:i] + folds[i+1:])
        new_train.to_csv('./k-fold/train_' + str(i) + '.csv', index=False)

Corre el algoritmo de PCA para cada uno de los folds, por cada fold crea un csv: Out

In [None]:
def run_PCA(folds, alpha, k):
    """Invoke the external ``./PCA`` executable once for every fold.

    For fold ``i`` the command receives, in order, the paths
    ``./k-fold/pca/train_{i}.csv``, ``./k-fold/pca/test_{i}.csv`` and
    ``./k-fold/pca/out_{i}.csv`` followed by ``alpha`` and ``k`` as strings.
    The program's stdout is captured and discarded; its exit status is not
    checked.

    Parameters
    ----------
    folds : int
        Number of folds; indices ``0 .. folds-1`` are processed.
    alpha, k
        Forwarded unchanged (stringified) as the 4th and 5th command-line
        arguments. Presumably the number of principal components and of
        neighbours - confirm against the binary.
    """
    tuning_args = [str(alpha), str(k)]
    for i in range(folds):
        fold_paths = [
            f'./k-fold/pca/train_{i}.csv',
            f'./k-fold/pca/test_{i}.csv',
            f'./k-fold/pca/out_{i}.csv',
        ]
        command = ['./PCA'] + fold_paths + tuning_args
        subprocess.run(command, stdout=subprocess.PIPE, encoding='ascii')

In [None]:
def run_kNN(folds, k):
    """Invoke the external ``./kNN`` executable once for every fold.

    For fold ``i`` the command receives, in order, the paths
    ``./k-fold/knn/train_{i}.csv``, ``./k-fold/knn/test_{i}.csv`` and
    ``./k-fold/knn/out_{i}.csv`` followed by ``k`` as a string. The program's
    stdout is captured and discarded; its exit status is not checked.

    Parameters
    ----------
    folds : int
        Number of folds; indices ``0 .. folds-1`` are processed.
    k
        Forwarded unchanged (stringified) as the 4th command-line argument.
        Presumably the number of neighbours - confirm against the binary.
    """
    neighbours_arg = str(k)
    for i in range(folds):
        fold_paths = [
            f'./k-fold/knn/train_{i}.csv',
            f'./k-fold/knn/test_{i}.csv',
            f'./k-fold/knn/out_{i}.csv',
        ]
        command = ['./kNN'] + fold_paths + [neighbours_arg]
        subprocess.run(command, stdout=subprocess.PIPE, encoding='ascii')

In [None]:
# Load the two input CSVs from the working directory.
# `train` is later split into folds (it must contain a `label` column for that);
# `test` is only loaded here and not referenced again in the visible cells.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Number of cross-validation folds; reused by the run_* and confusion_matrix calls below.
folds = 10
# Writes the train/test/expected CSVs for every fold to disk.
create_k_folds(train, folds)

### Precision, Recall, Confusion Matrix

In [None]:
# Parameters forwarded to the external ./PCA binary for every fold.
alpha = 4
k_for_pca = 5
run_PCA(folds, alpha, k_for_pca)
# NOTE(review): -1 looks like a placeholder value, see the TODO.
k_for_knn = -1 # TODO: the optimal k for kNN still has to be found
run_kNN(folds, k_for_knn)

In [None]:
def confusion_matrix(folds, method, n_classes=10):
    """Accumulate a confusion matrix over all folds of one method.

    For every fold ``i`` the predictions are read from
    ``./k-fold/{method}/out_{i}.csv`` (column ``Label``) and the ground truth
    from ``./k-fold/{method}/expected_{i}.csv`` (column ``label``). Rows are
    paired by position.

    Parameters
    ----------
    folds : int
        Number of folds; indices ``0 .. folds-1`` are read.
    method : str
        Sub-directory of ``./k-fold`` holding the files (the notebook uses
        ``'pca'`` and ``'knn'``).
    n_classes : int, optional
        Number of distinct labels; the matrix is ``n_classes x n_classes``.
        Defaults to 10, the size that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer matrix where ``confusion[e][p]`` counts the samples whose
        expected label is ``e`` and whose predicted label is ``p``.

    Raises
    ------
    IndexError
        If an expected file has fewer rows than its prediction file, or a
        label falls outside ``n_classes``.
    """
    confusion = np.zeros((n_classes, n_classes), dtype=int)
    for i in range(folds):
        out = pd.read_csv(f'./k-fold/{method}/out_{i}.csv')
        expected = pd.read_csv(f'./k-fold/{method}/expected_{i}.csv')

        predicted_labels = out['Label'].to_numpy().astype(int)
        # Only the first len(out) expected rows are paired with predictions.
        expected_labels = expected['label'].to_numpy()[:len(predicted_labels)].astype(int)
        if len(expected_labels) < len(predicted_labels):
            raise IndexError(
                f'fold {i}: {len(predicted_labels)} predictions but only '
                f'{len(expected_labels)} expected labels'
            )

        # np.add.at accumulates correctly when an (expected, predicted) pair
        # repeats; plain fancy-index "+=" would count each pair only once.
        np.add.at(confusion, (expected_labels, predicted_labels), 1)
    return confusion

In [None]:
def get_accuracy(confusion):
    """Return, per class, TP / (TP + FP + FN) from a square confusion matrix.

    ``confusion[e][p]`` is read as "expected ``e``, predicted ``p``": the
    diagonal holds the true positives, column totals minus the diagonal are
    the false positives and row totals minus the diagonal are the false
    negatives.

    NOTE(review): this ratio ignores true negatives, so it is the per-class
    Jaccard index rather than the conventional accuracy - confirm that this
    is the intended metric.

    Returns a 1-D array with one value per class.
    """
    hits = np.diag(confusion)
    column_totals = np.sum(confusion, axis=0)
    row_totals = np.sum(confusion, axis=1)
    false_pos = column_totals - hits
    false_neg = row_totals - hits
    denominator = hits + false_pos + false_neg
    return hits / denominator

def get_precision(confusion):
    """Return the per-class precision, TP / (TP + FP).

    True positives are the diagonal of ``confusion``; false positives are the
    column totals (everything predicted as the class) minus the diagonal.

    Returns a 1-D array with one value per class.
    """
    hits = np.diag(confusion)
    predicted_totals = np.sum(confusion, axis=0)
    false_pos = predicted_totals - hits
    return hits / (hits + false_pos)

def get_recall(confusion):
    """Return the per-class recall, TP / (TP + FN).

    True positives are the diagonal of ``confusion``; false negatives are the
    row totals (everything expected to be the class) minus the diagonal.

    Returns a 1-D array with one value per class.
    """
    hits = np.diag(confusion)
    expected_totals = np.sum(confusion, axis=1)
    false_neg = expected_totals - hits
    return hits / (hits + false_neg)


In [None]:
# Aggregate the PCA predictions of every fold and report the per-class metrics.
confusion_pca = confusion_matrix(folds, 'pca')
for metric_label, metric_fn in (
    ("Accuracy:", get_accuracy),
    ("Precision:", get_precision),
    ("Recall:", get_recall),
):
    print(metric_label, metric_fn(confusion_pca))

In [None]:
# Aggregate the kNN predictions of every fold and report the per-class metrics.
confusion_knn = confusion_matrix(folds, 'knn')
for metric_label, metric_fn in (
    ("Accuracy:", get_accuracy),
    ("Precision:", get_precision),
    ("Recall:", get_recall),
):
    print(metric_label, metric_fn(confusion_knn))

In [None]:
# Heat map of the PCA confusion matrix. The matrix is indexed as
# [expected][predicted], so rows (y axis) are expected labels and
# columns (x axis) are predicted labels.
plt.imshow(confusion_pca, cmap='gray')
plt.xticks(np.arange(0,10))
plt.yticks(np.arange(0,10))
plt.xlabel('Predicted label')
plt.ylabel('Expected label')
plt.title('PCA confusion matrix');

In [None]:
# Heat map of the kNN confusion matrix. The matrix is indexed as
# [expected][predicted], so rows (y axis) are expected labels and
# columns (x axis) are predicted labels.
plt.imshow(confusion_knn, cmap='gray')
plt.xticks(np.arange(0,10))
plt.yticks(np.arange(0,10))
plt.xlabel('Predicted label')
plt.ylabel('Expected label')
plt.title('kNN confusion matrix');

# PCA metrics

In [None]:
def get_kaggle_accuracy(sub, correct_answers):
    """Return the fraction of rows of ``sub`` whose label matches the reference.

    Rows are paired by position (not by index), and only the first
    ``len(sub)`` rows of ``correct_answers`` are considered.

    Parameters
    ----------
    sub : pandas.DataFrame
        Predictions; must contain a ``Label`` column.
    correct_answers : pandas.DataFrame
        Reference labels; must contain a ``Label`` column with at least as
        many rows as ``sub``.

    Returns
    -------
    float
        Number of matching labels divided by ``len(sub)``.

    Raises
    ------
    IndexError
        If ``correct_answers`` has fewer rows than ``sub``.
    ZeroDivisionError
        If ``sub`` is empty.
    """
    n_rows = len(sub)
    # to_numpy() drops the index, so the comparison is purely positional.
    predicted = sub['Label'].to_numpy().astype(int)
    reference = correct_answers['Label'].to_numpy()[:n_rows].astype(int)
    if len(reference) < n_rows:
        raise IndexError(
            f'{n_rows} predictions but only {len(reference)} reference labels'
        )
    matches = int(np.count_nonzero(predicted == reference))
    return matches / n_rows

In [None]:
# Grid search over (alpha, k) for the external ./PCA binary: each run writes
# its predictions to out.csv, which is scored against the reference labels in
# kaggle.csv, and the first line of the binary's stdout is recorded as its time.
kaggle = pd.read_csv('kaggle.csv')

alphas = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 35, 40, 45, 50]
ks = [1, 3, 5, 10, 25, 50]

# Collect one record per experiment and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
records = []
for alpha in alphas:
    for k in ks:
        # Deliberately not named `time`: that would rebind the `time` module
        # imported at the top of the notebook, which a later cell still calls.
        elapsed = subprocess.run(['./PCA', 'train.csv', 'test.csv', 'out.csv', str(alpha), str(k)], stdout=subprocess.PIPE, encoding='ascii').stdout.split('\n')[0]
        score = get_kaggle_accuracy(pd.read_csv('out.csv'), kaggle)
        records.append({'alpha': alpha, 'k': k, 'accuracy': score, 'time': float(elapsed)})

data = pd.DataFrame(records, columns=['alpha', 'k', 'accuracy', 'time'])


In [None]:
# Preview the first rows of the PCA experiment results.
data.head()

In [None]:
# Accuracy against alpha, one colour per value of k.
sns.relplot(data=data, x="alpha", y="accuracy", hue="k", aspect=1.5)

In [None]:
# Reported run time against alpha as a line plot, one line per value of k.
p = sns.relplot(data=data, x="alpha", y="time", hue="k", kind="line", aspect=1.5)
# Override the axis captions of the resulting grid.
p.set(xlabel='alpha', ylabel='time [seg]')

In [None]:
# El mejor es:
column = data["accuracy"]
max_index = column.idxmax()
data.iloc[[max_index]]

# kNN Metrics

In [None]:
# Sweep k for the external ./kNN_tiempos binary: each run writes its
# predictions to out.csv, which is scored against the reference labels in
# kaggle.csv, and the first line of the binary's stdout is recorded as its time.
kaggle = pd.read_csv('kaggle.csv')

ks = [1, 3, 5, 7, 9, 11, 13, 15, 20, 25, 30, 40, 50, 100]

# Collect one record per experiment and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0.
records = []
start = time.time()
time_previous = 0
for i, k in enumerate(ks):
    # Replace the previous progress line instead of piling up output.
    clear_output(wait=True)
    display(f'kNN tiempos - Experimento: {i+1} / {len(ks)} - Tiempo (dataset): {time.time() - start} segs - Tiempo (último): {time_previous}')
    time_experiment = subprocess.run(['./kNN_tiempos', 'train.csv', 'test.csv', 'out.csv', str(k)], stdout=subprocess.PIPE, encoding='ascii').stdout.split('\n')[0]
    score = get_kaggle_accuracy(pd.read_csv('out.csv'), kaggle)
    # Store the time reported by this run; the previous code converted the
    # `time` name instead of `time_experiment`.
    records.append({'k': k, 'accuracy': score, 'time': float(time_experiment)})
    time_previous = time_experiment

data = pd.DataFrame(records, columns=['k', 'accuracy', 'time'])
data.to_csv('knn_accuracy_time.csv')