In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import seaborn as sns
import time
from IPython.display import display, clear_output
from sklearn.metrics import cohen_kappa_score

# K-Fold Cross Validation

Crea los k folds (por cada uno crea 3 archivos CSV: Train, Test y Expected)

In [None]:
def create_k_folds(train, k, random_state=None):
    """Shuffle ``train`` and write k train/test/expected CSV triples to ./k-fold/.

    For every fold index ``i`` three files are written:

    * ``expected_i.csv`` -- only the ``label`` column of the held-out fold,
    * ``test_i.csv``     -- the held-out fold without its ``label`` column,
    * ``train_i.csv``    -- all the remaining folds concatenated (labels kept).

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled data; must contain a ``label`` column.
    k : int
        Number of folds.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. The default (``None``) keeps
        the original unseeded shuffle; pass an int for a reproducible split.
    """
    import os

    out_dir = './k-fold'
    # The original crashed in to_csv when the output directory did not exist.
    os.makedirs(out_dir, exist_ok=True)

    shuffled = train.sample(frac=1, random_state=random_state)
    total = len(shuffled)
    # Integer boundaries i*total//k spread the remainder rows across folds.
    folds = [shuffled[i * total // k:(i + 1) * total // k] for i in range(k)]

    for i, held_out in enumerate(folds):
        held_out[['label']].to_csv(f'{out_dir}/expected_{i}.csv', index=False)
        held_out.drop(['label'], axis=1).to_csv(f'{out_dir}/test_{i}.csv', index=False)

        new_train = pd.concat(folds[:i] + folds[i + 1:])
        new_train.to_csv(f'{out_dir}/train_{i}.csv', index=False)

Corre el algoritmo de PCA para cada uno de los folds, por cada fold crea un csv: Out

In [None]:
def run_PCA(folds, alpha, k):
    """Run the external ``./PCA`` executable once for each fold index.

    Each invocation receives, in order: the fold's train CSV, the fold's test
    CSV, an output path under ``./k-fold/pca/`` tagged with ``alpha``, ``k``
    and the fold index, then ``alpha`` and ``k`` as strings. The process's
    stdout is captured and discarded; its exit status is not checked.
    """
    for fold in range(folds):
        command = [
            './PCA',
            f'./k-fold/train_{fold}.csv',
            f'./k-fold/test_{fold}.csv',
            f'./k-fold/pca/out_a{alpha}_k{k}_{fold}.csv',
            str(alpha),
            str(k),
        ]
        subprocess.run(command, stdout=subprocess.PIPE, encoding='ascii')

In [None]:
def run_kNN(folds, k, modo):
    """Run the external ``./kNN`` executable once for each fold index.

    Each invocation receives, in order: the fold's train CSV, the fold's test
    CSV, the output path ``./k-fold/knn/out_<fold>.csv``, ``k`` as a string
    and ``modo`` unchanged (so it must already be a string). The process's
    stdout is captured and discarded; its exit status is not checked.
    """
    for fold in range(folds):
        command = [
            './kNN',
            f'./k-fold/train_{fold}.csv',
            f'./k-fold/test_{fold}.csv',
            f'./k-fold/knn/out_{fold}.csv',
            str(k),
            modo,
        ]
        subprocess.run(command, stdout=subprocess.PIPE, encoding='ascii')

In [None]:
# Load the training set and the test set from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Number of cross-validation folds; later cells read this global.
folds = 10
create_k_folds(train=train, k=folds)

### Precision, Recall, Confusion Matrix

In [None]:
def confusion_matrix(folds, output_file_path, n_classes=10):
    """Accumulate one confusion matrix over all folds.

    For each fold ``i`` the predictions are read from
    ``output_file_path + str(i) + '.csv'`` (column ``Label``) and the ground
    truth from ``./k-fold/expected_{i}.csv`` (column ``label``). Rows are
    paired by position.

    Parameters
    ----------
    folds : int
        Number of folds to read.
    output_file_path : str
        Path prefix of the prediction files (fold index and ``.csv`` are appended).
    n_classes : int, optional
        Side of the square matrix; defaults to 10, the size originally hard-coded.

    Returns
    -------
    numpy.ndarray
        Integer matrix where entry ``[expected, predicted]`` counts samples.

    Raises
    ------
    IndexError
        If an expected file has fewer rows than its prediction file.
    """
    confusion = np.zeros((n_classes, n_classes), dtype=int)
    for i in range(folds):
        out = pd.read_csv(output_file_path + str(i) + '.csv')
        expected = pd.read_csv(f'./k-fold/expected_{i}.csv')

        predicted_labels = out['Label'].to_numpy().astype(int)
        expected_labels = expected['label'].to_numpy()
        if len(expected_labels) < len(predicted_labels):
            raise IndexError(
                f'fold {i}: {len(predicted_labels)} predictions but only '
                f'{len(expected_labels)} expected labels')
        expected_labels = expected_labels[:len(predicted_labels)].astype(int)

        # np.add.at is unbuffered, so repeated (expected, predicted) pairs are
        # each counted; a plain fancy-indexed += would count them only once.
        np.add.at(confusion, (expected_labels, predicted_labels), 1)
    return confusion


In [None]:
def get_accuracy(confusion):
    """Return, per class, TP / (TP + FP + FN) from a confusion matrix.

    The diagonal holds the true positives; column sums minus the diagonal are
    taken as false positives and row sums minus the diagonal as false
    negatives. The result has one entry per class.
    """
    true_pos = np.diag(confusion)
    false_pos = np.sum(confusion, axis=0) - true_pos
    false_neg = np.sum(confusion, axis=1) - true_pos
    denominator = true_pos + false_pos + false_neg
    return true_pos / denominator


def get_precision(confusion):
    """Return, per class, TP / (TP + FP) from a confusion matrix.

    True positives are the diagonal; false positives are the column sums
    minus the diagonal.
    """
    true_pos = np.diag(confusion)
    false_pos = np.sum(confusion, axis=0) - true_pos
    denominator = true_pos + false_pos
    return true_pos / denominator


def get_recall(confusion):
    """Return, per class, TP / (TP + FN) from a confusion matrix.

    True positives are the diagonal; false negatives are the row sums minus
    the diagonal.
    """
    true_pos = np.diag(confusion)
    false_neg = np.sum(confusion, axis=1) - true_pos
    denominator = true_pos + false_neg
    return true_pos / denominator


In [None]:
# Alpha values swept in the first PCA experiment.
alphas = (1, 2, 3, 5, 8, 13, 21, 34, 55, 89)
# k passed to ./PCA while alpha is being swept.
k_for_pca = 3

In [None]:
# Run ./PCA over every fold for each alpha, with k fixed at k_for_pca.
for alpha in alphas:
    run_PCA(folds=folds, alpha=alpha, k=k_for_pca)


In [None]:
# Class-averaged metrics: one scalar per alpha.
mean_accuracies, mean_precisions, mean_recalls, mean_f1_scores = [], [], [], []
# Per-class metric vectors: one array per alpha (transposed after the loop).
accuracies, precisions, recalls, f1_scores = [], [], [], []

for alpha in alphas:
    confusion = confusion_matrix(folds, f'./k-fold/pca/out_a{alpha}_k{k_for_pca}_')

    accuracy = get_accuracy(confusion)
    precision = get_precision(confusion)
    recall = get_recall(confusion)
    f1_score = 2 * (precision * recall) / (precision + recall)

    for history, per_class in (
        (accuracies, accuracy),
        (precisions, precision),
        (recalls, recall),
        (f1_scores, f1_score),
    ):
        history.append(per_class)

    # The mean F1 is derived from the mean precision and mean recall,
    # not from the mean of the per-class F1 values.
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1_score = 2 * (mean_precision * mean_recall) / (mean_precision + mean_recall)

    mean_accuracies.append(np.mean(accuracy))
    mean_precisions.append(mean_precision)
    mean_recalls.append(mean_recall)
    mean_f1_scores.append(mean_f1_score)

# Transpose so that row i holds class i across all alphas; easier to plot.
accuracies, precisions, recalls, f1_scores = (
    np.transpose(per_alpha)
    for per_alpha in (accuracies, precisions, recalls, f1_scores)
)


In [None]:
# One curve per class-averaged metric, plotted against alpha.
for metric_values, metric_label in (
    (mean_accuracies, 'Accuracy'),
    (mean_precisions, 'Precision'),
    (mean_recalls, 'Recall'),
    (mean_f1_scores, 'F1-Score'),
):
    plt.plot(alphas, metric_values, label=metric_label)
plt.legend(loc="lower right")
plt.xlabel("alpha")
plt.title("Efectividad media de PCA | k = 3")

In [None]:
# One figure per class index i, showing its four metrics against alpha.
for i in range(len(accuracies)):
    plt.figure()
    for per_class_metric, metric_label in (
        (accuracies, 'Accuracy'),
        (precisions, 'Precision'),
        (recalls, 'Recall'),
        (f1_scores, 'F1-Score'),
    ):
        plt.plot(alphas, per_class_metric[i], label=metric_label)
    plt.legend(loc="lower right")
    plt.xlabel("alpha")
    plt.title(f"Efectividad de PCA para dígito '{i}' | k = 3")
    plt.show()

In [None]:
# Same per-class figures as before, zoomed to alpha in [8, 90] and y in [0.6, 1].
for i in range(len(accuracies)):
    plt.figure()
    for per_class_metric, metric_label in (
        (accuracies, 'Accuracy'),
        (precisions, 'Precision'),
        (recalls, 'Recall'),
        (f1_scores, 'F1-Score'),
    ):
        plt.plot(alphas, per_class_metric[i], label=metric_label)
    plt.xlim(8, 90)
    plt.ylim(0.6, 1)
    plt.legend(loc="lower right")
    plt.xlabel("alpha")
    plt.title(f"Efectividad de PCA para dígito '{i}' | k = 3")
    plt.show()

Hacemos lo mismo pero variando el k en vez del alpha

In [None]:
# Second experiment: alpha is fixed and k is swept.
alpha = 50
ks = (1, 2, 3, 5, 8, 13, 21, 34, 55, 89)
# Use a dedicated loop variable: iterating with `k_for_pca` itself left that
# global at 89, so re-running the alpha-sweep cells afterwards would silently
# read or write files for the wrong k.
for k in ks:
    run_PCA(folds, alpha, k)

In [None]:
# Class-averaged metrics: one scalar per k.
mean_accuracies, mean_precisions, mean_recalls, mean_f1_scores = [], [], [], []
# Per-class metric vectors: one array per k (transposed after the loop).
accuracies, precisions, recalls, f1_scores = [], [], [], []

for k in ks:
    confusion = confusion_matrix(folds, f'./k-fold/pca/out_a{alpha}_k{k}_')

    accuracy = get_accuracy(confusion)
    precision = get_precision(confusion)
    recall = get_recall(confusion)
    f1_score = 2 * (precision * recall) / (precision + recall)

    for history, per_class in (
        (accuracies, accuracy),
        (precisions, precision),
        (recalls, recall),
        (f1_scores, f1_score),
    ):
        history.append(per_class)

    # The mean F1 is derived from the mean precision and mean recall,
    # not from the mean of the per-class F1 values.
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1_score = 2 * (mean_precision * mean_recall) / (mean_precision + mean_recall)

    mean_accuracies.append(np.mean(accuracy))
    mean_precisions.append(mean_precision)
    mean_recalls.append(mean_recall)
    mean_f1_scores.append(mean_f1_score)

# Transpose so that row i holds class i across all k values; easier to plot.
accuracies, precisions, recalls, f1_scores = (
    np.transpose(per_k)
    for per_k in (accuracies, precisions, recalls, f1_scores)
)

In [None]:
# These metrics were computed while sweeping k, so the x axis must be `ks`.
# The original plotted against `alphas`, which only looked right because the
# two tuples happen to hold the same values at this point.
for metric_values, metric_label in (
    (mean_accuracies, 'Accuracy'),
    (mean_precisions, 'Precision'),
    (mean_recalls, 'Recall'),
    (mean_f1_scores, 'F1-Score'),
):
    plt.plot(ks, metric_values, label=metric_label)
plt.legend(loc="lower right")
plt.xlabel("k")
plt.title("Efectividad media de PCA | alpha = 50")

In [None]:
# One figure per class index i, showing its four metrics against k.
# The x axis is `ks` (the swept parameter); the original used `alphas`, which
# only matched because both tuples currently hold identical values.
for i in range(len(accuracies)):
    plt.figure()
    for per_class_metric, metric_label in (
        (accuracies, 'Accuracy'),
        (precisions, 'Precision'),
        (recalls, 'Recall'),
        (f1_scores, 'F1-Score'),
    ):
        plt.plot(ks, per_class_metric[i], label=metric_label)
    plt.ylim(0.8, 1.0)
    plt.legend(loc="lower right")
    plt.xlabel("k")
    plt.title(f"Efectividad de PCA para dígito '{i}' | alpha = 50")
    plt.show()

In [None]:
# Per-class metrics for the PCA run with alpha=50, k=3.
confusion_pca = confusion_matrix(folds, './k-fold/pca/out_a50_k3_')
accuracy = get_accuracy(confusion_pca)
precision = get_precision(confusion_pca)
recall = get_recall(confusion_pca)
f1_score = 2 * (precision * recall) / (precision + recall)
for heading, per_class in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 score:", f1_score),
):
    print(heading, per_class)

In [None]:
# Per-class metrics for the kNN predictions stored under ./k-fold/knn/.
confusion_knn = confusion_matrix(folds, './k-fold/knn/out_')
accuracy = get_accuracy(confusion_knn)
precision = get_precision(confusion_knn)
recall = get_recall(confusion_knn)
f1_score = 2 * (precision * recall) / (precision + recall)
for heading, per_class in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 score:", f1_score),
):
    print(heading, per_class)

In [None]:
# Grayscale heat map of the PCA confusion matrix with a tick on every class.
class_ticks = np.arange(0, 10)
plt.imshow(confusion_pca, cmap='gray')
plt.xticks(class_ticks)
plt.yticks(class_ticks);

In [None]:
# Grayscale heat map of the kNN confusion matrix with a tick on every class.
class_ticks = np.arange(0, 10)
plt.imshow(confusion_knn, cmap='gray')
plt.xticks(class_ticks)
plt.yticks(class_ticks);

# Kappa de Cohen

In [None]:
def output_labels(path, n_folds=None):
    """Concatenate the ``Label`` column of every per-fold prediction file.

    Parameters
    ----------
    path : str
        Path prefix; the fold index and ``.csv`` are appended to it.
    n_folds : int or None, optional
        Number of fold files to read. When omitted, the notebook-level
        ``folds`` global is used, which was the original (implicit) behaviour.

    Returns
    -------
    list
        Predicted labels of fold 0, then fold 1, and so on.
    """
    if n_folds is None:
        n_folds = folds
    labels = []
    for i in range(n_folds):
        labels.extend(pd.read_csv(f'{path}{i}.csv')['Label'].tolist())
    return labels

In [None]:
# Agreement between the PCA (alpha=50, k=3) and kNN predictions over all folds.
pca_pred, knn_pred = (
    output_labels(prefix)
    for prefix in ('./k-fold/pca/out_a50_k3_', './k-fold/knn/out_')
)
cohen_kappa_score(pca_pred, knn_pred)

# Accuracy

### PCA

In [None]:
# NOTE(review): this redefines the confusion-matrix-based get_accuracy(confusion)
# declared earlier in the notebook; once this cell runs, the earlier version is
# no longer reachable by name, so re-running the earlier metric cells will fail.
def get_accuracy(sub, correct_answers):
    """Return the fraction of rows of ``sub`` whose label matches the reference.

    Rows are compared by position using the ``Label`` column of both frames,
    after casting the values to int. Only the first ``len(sub)`` rows of
    ``correct_answers`` are used.

    Parameters
    ----------
    sub : pandas.DataFrame
        Predictions, with a ``Label`` column.
    correct_answers : pandas.DataFrame
        Reference labels, with a ``Label`` column and at least ``len(sub)`` rows.

    Returns
    -------
    float
        Number of matching rows divided by ``len(sub)``.

    Raises
    ------
    IndexError
        If ``correct_answers`` has fewer rows than ``sub``.
    ZeroDivisionError
        If ``sub`` is empty.
    """
    total = len(sub)
    predicted = sub['Label'].to_numpy().astype(int)
    reference = correct_answers['Label'].to_numpy()
    if len(reference) < total:
        raise IndexError(
            f'{total} predictions but only {len(reference)} reference labels')
    reference = reference[:total].astype(int)
    # Vectorized comparison replaces the per-row int(Series) conversion, which
    # is deprecated in recent pandas and slow for large frames.
    count = int(np.count_nonzero(predicted == reference))
    return count / total

In [None]:
kaggle = pd.read_csv('kaggle.csv')

alphas = list(range(1, 21)) + [25, 30, 35, 40, 45, 50]
ks = [1, 3, 5, 7, 9, 11, 13, 15, 20, 25, 30, 40, 50, 100]

# Collect one record per (alpha, k) and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop anyway.
records = []
for alpha in alphas:
    for k in ks:
        completed = subprocess.run(
            ['./PCA', 'train.csv', 'test.csv', 'out.csv', str(alpha), str(k)],
            stdout=subprocess.PIPE, encoding='ascii')
        # The first stdout line is parsed as the run time. It is kept in
        # `elapsed` so the imported `time` module is not shadowed.
        elapsed = completed.stdout.split('\n')[0]
        score = get_accuracy(pd.read_csv('out.csv'), kaggle)
        records.append({'alpha': alpha, 'k': k, 'accuracy': score, 'time': float(elapsed)})

data = pd.DataFrame(records, columns=['alpha', 'k', 'accuracy', 'time'])

In [None]:
# Load the saved accuracy grid. The path is relative to the notebook, like the
# './results/PCA_time.csv' read further down; the original pointed at the
# filesystem root ('/results/...').
data = pd.read_csv('./results/PCA_accuracy.csv').astype({'alpha': 'int32', 'k': 'int32'})
sns.relplot(data=data, x="alpha", y="accuracy", hue="k", aspect=1.5, legend="full")
plt.xlabel("Alpha")
plt.ylabel("Accuracy")
plt.title("α y k vs. Accuracy");

### kNN

In [None]:
# Neighbor table; the kNN accuracy cell reads its ImageId, Class and Distance columns.
neighbors = pd.read_csv('neighbors.csv')
neighbors.head(5)

In [None]:
# Inverse-distance votes per (test image, class). The matrix is carried over
# between k values: for each k only the neighbor rows [neighbor_count:k] that
# were not yet counted are added on top of the previous totals.
distance_matrix = np.zeros((len(test), 10))

# Split the neighbor table by image once, instead of scanning the whole frame
# with a boolean mask for every image and every k. Row order inside each
# group is the original row order.
neighbors_by_image = {image_id: rows for image_id, rows in neighbors.groupby("ImageId")}
true_labels = kaggle['Label'].to_numpy()

neighbor_count = 0
accuracies = []
for k in ks:
    accuracy = 0
    for i in range(len(test)):
        # ImageId values are matched against i + 1 (1-based ids).
        image_neighbors = neighbors_by_image.get(i + 1)
        if image_neighbors is not None:
            curr_neighbors = image_neighbors[neighbor_count:k]
            neighbor_classes = curr_neighbors["Class"].to_numpy().astype(int)
            inverse_distances = 1 / curr_neighbors["Distance"].to_numpy()
            # Unbuffered add so several new neighbors of the same class all count.
            np.add.at(distance_matrix[i], neighbor_classes, inverse_distances)
        if distance_matrix[i].argmax() == int(true_labels[i]):
            accuracy += 1
    neighbor_count = k
    accuracies.append(accuracy / len(test))

In [None]:
# Keep PCA results with k in [0, 20) and alpha in [16, 50), then overlay the kNN curve.
keep_rows = data["k"].isin(range(20)) & data["alpha"].isin(range(16, 50))
data3 = data[keep_rows]
sns.relplot(data=data3, x="k", y="accuracy", hue="alpha", kind="line", aspect=1.5, legend="full")
plt.plot(ks[:8], accuracies[:8], marker="o", label="kNN")
plt.legend()
plt.title("Accuracy de kNN vs PCA")

# Accuracy vs Training Size

In [None]:
training_sizes = [100, 500, 1000, 2500, 5000, 10000, 25000, 40000]

# Row indices of each label 0..9, in original order. They do not depend on
# the subset size, so they are computed once rather than once per size.
indices_by_label = [train.index[train['label'] == label] for label in range(10)]

for size in training_sizes:
    per_label = size // 10
    # Take the first `per_label` rows of every label, label 0 first.
    new_train_index = []
    for label_indices in indices_by_label:
        new_train_index += label_indices[:per_label].tolist()
    new_train = train.loc[new_train_index]
    new_train.to_csv(f'./train_subsets/train_subset_{size}.csv', index=False)

In [None]:
pca_accuracy = []
knn_accuracy = []
for size in training_sizes:
    subset_path = f'./train_subsets/train_subset_{size}.csv'

    # Both runs write to the same out2.csv, so each is scored before the next starts.
    subprocess.run(['./PCA', subset_path, 'test.csv', 'out2.csv', "50", "3"], stdout=subprocess.PIPE, encoding='ascii')
    sub = pd.read_csv('./out2.csv')
    pca_accuracy.append(get_accuracy(sub, kaggle))

    # The executable is spelled './kNN' everywhere else in the notebook; the
    # lowercase './knn' used here is not found on case-sensitive filesystems.
    subprocess.run(['./kNN', subset_path, 'test.csv', 'out2.csv', "3", '1'], stdout=subprocess.PIPE, encoding='ascii')
    sub = pd.read_csv('./out2.csv')
    knn_accuracy.append(get_accuracy(sub, kaggle))

In [None]:
# The first two training sizes are left out of the plot.
first_shown = 2
plt.figure(figsize=(7,4))
for method_accuracy in (pca_accuracy, knn_accuracy):
    plt.plot(training_sizes[first_shown:], method_accuracy[first_shown:], marker='.')
plt.xlabel('Imágenes de entrenamiento')
plt.ylabel('Accuracy')
plt.xticks([1000, 5000, 10000, 25000, 40000])
plt.legend(['PCA', 'kNN'])
plt.title('Cantidad de Imágenes de entrenamiento vs. Accuracy')
plt.show()

# Time Complexity

### kNN

In [None]:
ks = [1, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
knn_time = []
for k in ks:
    print(k)  # progress indicator
    curr_knn = []
    # Ten repetitions per k; the first stdout line of each run is stored as text.
    for _ in range(10):
        completed = subprocess.run(
            ['./kNN', './train_subsets/train_subset_5000.csv', 'test_subset_3000.csv', 'out2.csv', str(k), '0'],
            stdout=subprocess.PIPE, encoding='ascii')
        # Kept in `elapsed` so the imported `time` module is not shadowed.
        elapsed = completed.stdout.split('\n')[0]
        curr_knn.append(elapsed)
    knn_time.append(curr_knn)

In [None]:
# Convert the collected timing strings to floats and average the repetitions of each k.
knn_time = np.array(knn_time).astype(float)
average_times = knn_time.mean(axis=1)
plt.plot(ks, average_times, marker='.')
plt.xlabel("k")
plt.ylabel("Tiempo [s]")
plt.title("k vs. Tiempo")

### PCA

In [None]:
alphas = [1, 10, 25, 50, 100]
ks = [1, 50, 100, 150, 200, 250, 300, 350, 400]
pca_times = pd.DataFrame(columns=['alpha', 'k', 'time'])
for k in ks:
    for alpha in alphas:
        # Ten repetitions per (alpha, k); the first stdout line of each run is stored as text.
        for _ in range(10):
            completed = subprocess.run(
                ['./PCA', './train_subsets/train_subset_5000.csv', 'test_subset_3000.csv', 'out2.csv', str(alpha), str(k)],
                stdout=subprocess.PIPE, encoding='ascii')
            # Kept in `elapsed` so the imported `time` module is not shadowed.
            elapsed = completed.stdout.split('\n')[0]
            pca_times.loc[len(pca_times)] = [alpha, k, elapsed]

In [None]:
# Average the recorded times for every (alpha, k) pair, one output row per pair.
pca_time_data = pd.read_csv("./results/PCA_time.csv")
pca_graph_data = pd.DataFrame(columns=['alpha', 'k', 'time'])
for k in ks:
    matches_k = pca_time_data['k'] == k
    for alpha in alphas:
        matches_alpha = pca_time_data['alpha'] == alpha
        mean_time = pca_time_data.loc[matches_alpha & matches_k, 'time'].mean()
        pca_graph_data.loc[len(pca_graph_data)] = [alpha, k, mean_time]

In [None]:
# Mean PCA time against k, restricted to the rows with alpha == 50.
alpha_50_times = pca_graph_data.loc[pca_graph_data["alpha"] == 50, "time"]
plt.plot(ks, alpha_50_times, marker="o", color="orange")
plt.xlabel("k")
plt.ylabel("Tiempo [s]")
plt.title("k vs. Tiempo")

In [None]:
# Cast alpha and k to int32 before plotting one line per k.
pca_graph_typed = pca_graph_data.astype({'alpha': 'int32', 'k': 'int32'})
sns.relplot(
    data=pca_graph_typed,
    x="alpha",
    y="time",
    hue="k",
    aspect=1.25,
    kind="line",
    legend="full",
)
plt.xlabel("Alpha")
plt.ylabel("Tiempo [s]")
plt.title("α y  k vs. Tiempo")