In [3]:
import pandas as pd
import random as rnd
import numpy as np
from collections import Counter

In [4]:
def data_split(data_arr):
    num_rows, num_cols = data_arr.shape
    pred_data = data_arr[:, :-1].copy()
    tar_data = data_arr[:, -1]
    tar_data = tar_data.reshape(len(tar_data), 1)
    return pred_data, tar_data

In [5]:
def confusion_matrix(test_tar, predictions):
    test_tar, predictions = test_tar.flatten(), predictions.flatten()
    uni_class = np.unique(test_tar)
    matrix = np.zeros((uni_class.shape[0], uni_class.shape[0]))
    for i, actual in enumerate(uni_class):
        for j, pred in enumerate(uni_class):
            matrix[i, j] = np.sum(np.logical_and(test_tar == actual, predictions == pred))
    return matrix

In [6]:
def calculate_metrics(confusion_mtx):
    tp = np.diag(confusion_mtx)
    fn = np.sum(confusion_mtx, axis=1) - tp
    fp = np.sum(confusion_mtx, axis=0) - tp
    tn = np.sum(confusion_mtx) - (tp + fn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = np.sum(np.diag(confusion_mtx)) / np.sum(confusion_mtx)
    return precision, recall, accuracy

In [7]:
def feature_normalizer(arr):
    arr_min = arr.min(axis=1, keepdims=True)  # Taking advantage of numpy broadcasting
    arr_max = arr.max(axis=1, keepdims=True)  # Smaller array is "broadcast" across the larger array
    arr_norm = (arr - arr_min) / (arr_max - arr_min)
    return arr_norm

In [8]:
def kNN_alg(train_pred, test_pred, train_tar, test_tar):
    distances = np.zeros(shape=(len(test_tar), 9))
    mydict = dict()
    for i in range(len(test_pred)):
        point, train_pred = test_pred[i].astype(float), train_pred.astype(float)
        diff_arr = train_pred - point  # Numpy broadcast applies
        dist_arr = np.sqrt(np.sum(diff_arr * diff_arr, axis=1))
        distances[i] = np.argsort(dist_arr)[:9]
    for k in range(1, 10, 2):
        guess_class = np.zeros(shape=(len(test_tar), k))
        temp = distances[:, :k].astype(int)
        for j in range(len(temp)):
            guess_class[j] = train_tar[temp[j]].flatten()
        guess_class = guess_class.astype(int)
        predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=guess_class).reshape(
            len(test_tar), 1)
        conf_mtx = confusion_matrix(test_tar.flatten(), predictions.flatten())
        precisions, recalls, accuracy = calculate_metrics(conf_mtx)
        recall_macro_avg = np.sum(recalls) / len(recalls)
        prec_macro_avg = np.sum(precisions) / len(precisions)
        mydict[k] = {"accuracy": "{:.2%}".format(accuracy), "Recall Macro Average": "{:.2%}".format(recall_macro_avg),
                     "Precision Macro Average": "{:.2%}".format(prec_macro_avg)}
    print(mydict)
        
        

In [9]:
df = pd.read_csv("16P.csv", encoding="Latin1" )
df = df.drop("Response Id", axis=1)

df.loc[df["Personality"] == "ESTJ", "Personality"] = 0
df.loc[df["Personality"] == "ENTJ", "Personality"] = 1
df.loc[df["Personality"] == "ESFJ", "Personality"] = 2
df.loc[df["Personality"] == "ENFJ", "Personality"] = 3
df.loc[df["Personality"] == "ISTJ", "Personality"] = 4
df.loc[df["Personality"] == "ISFJ", "Personality"] = 5
df.loc[df["Personality"] == "INTJ", "Personality"] = 6
df.loc[df["Personality"] == "INFJ", "Personality"] = 7
df.loc[df["Personality"] == "ESTP", "Personality"] = 8
df.loc[df["Personality"] == "ESFP", "Personality"] = 9
df.loc[df["Personality"] == "ENTP", "Personality"] = 10
df.loc[df["Personality"] == "ENFP", "Personality"] = 11
df.loc[df["Personality"] == "ISTP", "Personality"] = 12 
df.loc[df["Personality"] == "ISFP", "Personality"] = 13
df.loc[df["Personality"] == "INTP", "Personality"] = 14
df.loc[df["Personality"] == "INFP", "Personality"] = 15

data_arr = np.array(df)
pred_data, tar_data = data_split(data_arr)
pred1, pred2, pred3, pred4, pred5 = np.array_split(pred_data, 5)
tar1, tar2, tar3, tar4, tar5 = np.array_split(tar_data, 5)

<b style='color:blue !important;'>Without Feature Normalization</b>


<b style='color:red !important;'>First Fold</b>

In [10]:
kNN_alg(np.concatenate((pred2, pred3, pred4, pred5), axis=0), pred1,
        np.concatenate((tar2, tar3, tar4, tar5), axis=0), tar1)

{1: {'accuracy': '97.87%', 'Recall Macro Average': '97.86%', 'Precision Macro Average': '97.87%'}, 3: {'accuracy': '98.86%', 'Recall Macro Average': '98.85%', 'Precision Macro Average': '98.87%'}, 5: {'accuracy': '98.92%', 'Recall Macro Average': '98.92%', 'Precision Macro Average': '98.93%'}, 7: {'accuracy': '98.94%', 'Recall Macro Average': '98.94%', 'Precision Macro Average': '98.95%'}, 9: {'accuracy': '98.97%', 'Recall Macro Average': '98.96%', 'Precision Macro Average': '98.97%'}}


<b style='color:red !important;'>Second Fold</b>

In [11]:
kNN_alg(np.concatenate((pred1, pred3, pred4, pred5), axis=0), pred2,
        np.concatenate((tar1, tar3, tar4, tar5), axis=0), tar2)

{1: {'accuracy': '97.68%', 'Recall Macro Average': '97.68%', 'Precision Macro Average': '97.69%'}, 3: {'accuracy': '98.88%', 'Recall Macro Average': '98.88%', 'Precision Macro Average': '98.88%'}, 5: {'accuracy': '98.92%', 'Recall Macro Average': '98.92%', 'Precision Macro Average': '98.92%'}, 7: {'accuracy': '98.94%', 'Recall Macro Average': '98.94%', 'Precision Macro Average': '98.94%'}, 9: {'accuracy': '98.95%', 'Recall Macro Average': '98.95%', 'Precision Macro Average': '98.95%'}}


<b style='color:red !important;'>Third Fold</b>

In [12]:
kNN_alg(np.concatenate((pred1, pred2, pred4, pred5), axis=0), pred3,
        np.concatenate((tar1, tar2, tar4, tar5), axis=0), tar3)

{1: {'accuracy': '97.68%', 'Recall Macro Average': '97.69%', 'Precision Macro Average': '97.69%'}, 3: {'accuracy': '98.88%', 'Recall Macro Average': '98.89%', 'Precision Macro Average': '98.89%'}, 5: {'accuracy': '98.95%', 'Recall Macro Average': '98.95%', 'Precision Macro Average': '98.95%'}, 7: {'accuracy': '98.98%', 'Recall Macro Average': '98.98%', 'Precision Macro Average': '98.98%'}, 9: {'accuracy': '98.98%', 'Recall Macro Average': '98.98%', 'Precision Macro Average': '98.98%'}}


<b style='color:red !important;'>Fourth Fold</b>

In [13]:
kNN_alg(np.concatenate((pred1, pred2, pred3, pred5), axis=0), pred4,
        np.concatenate((tar1, tar2, tar3, tar5), axis=0), tar4)

{1: {'accuracy': '97.88%', 'Recall Macro Average': '97.88%', 'Precision Macro Average': '97.89%'}, 3: {'accuracy': '98.73%', 'Recall Macro Average': '98.73%', 'Precision Macro Average': '98.74%'}, 5: {'accuracy': '98.88%', 'Recall Macro Average': '98.87%', 'Precision Macro Average': '98.88%'}, 7: {'accuracy': '98.88%', 'Recall Macro Average': '98.87%', 'Precision Macro Average': '98.88%'}, 9: {'accuracy': '98.87%', 'Recall Macro Average': '98.87%', 'Precision Macro Average': '98.87%'}}


<b style='color:red !important;'>Fifth Fold</b>

In [14]:
kNN_alg(np.concatenate((pred1, pred2, pred3, pred4), axis=0), pred5,
        np.concatenate((tar1, tar2, tar3, tar4), axis=0), tar5)

{1: {'accuracy': '97.76%', 'Recall Macro Average': '97.75%', 'Precision Macro Average': '97.75%'}, 3: {'accuracy': '98.83%', 'Recall Macro Average': '98.83%', 'Precision Macro Average': '98.83%'}, 5: {'accuracy': '98.89%', 'Recall Macro Average': '98.89%', 'Precision Macro Average': '98.89%'}, 7: {'accuracy': '98.88%', 'Recall Macro Average': '98.88%', 'Precision Macro Average': '98.88%'}, 9: {'accuracy': '98.90%', 'Recall Macro Average': '98.90%', 'Precision Macro Average': '98.90%'}}


<b style='color:blue !important;'>With feature normalization</b>

In [18]:
pred_normal = feature_normalizer(pred_data)

In [20]:
predn1, predn2, predn3, predn4, predn5 = np.array_split(pred_normal, 5)


<b style='color:red !important;'>First Fold</b>

In [22]:
kNN_alg(np.concatenate((predn2, predn3, predn4, predn5), axis=0), predn1,
        np.concatenate((tar2, tar3, tar4, tar5), axis=0), tar1)

{1: {'accuracy': '97.72%', 'Recall Macro Average': '97.72%', 'Precision Macro Average': '97.72%'}, 3: {'accuracy': '98.83%', 'Recall Macro Average': '98.83%', 'Precision Macro Average': '98.84%'}, 5: {'accuracy': '98.88%', 'Recall Macro Average': '98.88%', 'Precision Macro Average': '98.89%'}, 7: {'accuracy': '98.89%', 'Recall Macro Average': '98.89%', 'Precision Macro Average': '98.90%'}, 9: {'accuracy': '98.91%', 'Recall Macro Average': '98.90%', 'Precision Macro Average': '98.91%'}}


In [23]:
kNN_alg(np.concatenate((predn1, predn3, predn4, predn5), axis=0), predn2,
        np.concatenate((tar1, tar3, tar4, tar5), axis=0), tar2)

{1: {'accuracy': '97.69%', 'Recall Macro Average': '97.69%', 'Precision Macro Average': '97.69%'}, 3: {'accuracy': '98.76%', 'Recall Macro Average': '98.76%', 'Precision Macro Average': '98.76%'}, 5: {'accuracy': '98.82%', 'Recall Macro Average': '98.82%', 'Precision Macro Average': '98.82%'}, 7: {'accuracy': '98.83%', 'Recall Macro Average': '98.82%', 'Precision Macro Average': '98.82%'}, 9: {'accuracy': '98.86%', 'Recall Macro Average': '98.86%', 'Precision Macro Average': '98.86%'}}


In [24]:
kNN_alg(np.concatenate((predn1, predn2, predn4, predn5), axis=0), predn3,
        np.concatenate((tar1, tar2, tar4, tar5), axis=0), tar3)

{1: {'accuracy': '97.70%', 'Recall Macro Average': '97.70%', 'Precision Macro Average': '97.70%'}, 3: {'accuracy': '98.83%', 'Recall Macro Average': '98.84%', 'Precision Macro Average': '98.84%'}, 5: {'accuracy': '98.97%', 'Recall Macro Average': '98.97%', 'Precision Macro Average': '98.97%'}, 7: {'accuracy': '98.93%', 'Recall Macro Average': '98.94%', 'Precision Macro Average': '98.94%'}, 9: {'accuracy': '98.96%', 'Recall Macro Average': '98.96%', 'Precision Macro Average': '98.96%'}}


<b style='color:red !important;'>Error Analysis for Classification</b>

For k values in the first fold <font color='red'>without</font> feature normalization:

1: {'accuracy': '97.87%'

3: {'accuracy': '98.86%'

5: {'accuracy': '98.92%'

7: {'accuracy': '98.94%'

9: {'accuracy': '98.97%'

We can see that as k value increases, the accuracy of the algorithm increases as well.

For k values in the first fold <font color='red'>with</font> feature normalization:

1: {'accuracy': '97.72%'

3: {'accuracy': '98.83%'

5: {'accuracy': '98.88%'

7: {'accuracy': '98.89%'

9: {'accuracy': '98.91%'




We can see that Feature Normalization yields lesser accuracy.