# Question: 2

In [209]:
# Read and load the data
import numpy as np

def d_cleaning(line):
    """Turn one raw text record into a list of string fields.

    Every '(' , ')' and space character is removed, surrounding whitespace
    is stripped, and the remainder is split on commas.
    """
    unwanted = str.maketrans('', '', '() ')
    stripped = line.translate(unwanted).strip()
    return stripped.split(',')

def d_fetching(filename):
    """Read a text file and return its records as lists of string fields.

    Each line is parsed with ``d_cleaning``. Lines that contain no data
    (for example a trailing blank line at the end of the file) are skipped,
    because they would otherwise become ``['']`` rows and make the result
    ragged when it is later converted to a NumPy array.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of strings per non-empty line, in file order.
    """
    clean_input = []
    # The ``with`` block closes the file on exit; no explicit close() is needed.
    with open(filename, 'r') as f:
        for line in f:
            record = d_cleaning(line)
            # d_cleaning returns [''] for a line that held no fields.
            if record != ['']:
                clean_input.append(record)
    return clean_input

def f_reading(dataset_path):
    """Load the file at ``dataset_path`` and return its parsed rows as a NumPy array."""
    return np.array(d_fetching(dataset_path))

# Paths of the three dataset files used in this question.
d_training, d_testing, d_large_120 = (
    './dataset/1a_2a-training.txt',
    './dataset/1a_2a-test.txt',
    './dataset/1cd_2cd-data.txt',
)

# Load each file; the training and test arrays are echoed for inspection.
tp_train = f_reading(d_training)
print(tp_train)

tp_test = f_reading(d_testing)
print(tp_test)

tp_large = f_reading(d_large_120)

[['1.6530190426733' '72.871146648479' '24' 'W']
 ['1.6471384909498' '72.612785314988' '34' 'W']
 ['1.6472055785348' '73.53968351051' '33' 'M']
 ['1.7323008914951' '76.067870338779' '30' 'M']
 ['1.6750702657911' '81.05582111533' '30' 'M']
 ['1.5780970716644' '64.926084680188' '30' 'W']
 ['1.6587629355524' '69.38092449041' '30' 'M']
 ['1.6763295980234' '77.062295990149' '31' 'M']
 ['1.7187224085504' '62.112923317057' '37' 'W']
 ['1.5202218226439' '66.151444019603' '27' 'W']
 ['1.5552689261884' '66.076386143769' '31' 'W']
 ['1.6969333189258' '77.45386244568' '34' 'M']
 ['1.6887980792886' '76.489640732464' '37' 'M']
 ['1.5213552893624' '63.952944947832' '35' 'W']]
[['1.62065758929' '59.376557437583' '32']
 ['1.7793983848363' '72.071775670801' '36']
 ['1.7004576585974' '66.267508112786' '31']
 ['1.6591086215159' '61.751621901787' '29']]


In [210]:
# Preparing the data
# Feature matrix of the training set: every field except the last, as float32.
train_X = np.array([np.array(record[:-1], dtype=np.float32) for record in tp_train])

# The test file has no label field, so the whole array is numeric.
test_X = np.array(tp_test, dtype=np.float32)

# Feature matrix of the large set, built the same way as for the training set.
large_X = np.array([np.array(record[:-1], dtype=np.float32) for record in tp_large])

# Label columns: the last field of each record, kept as a one-column object array.
train_Y = np.array([[record[-1]] for record in tp_train], dtype=object)
large_Y = np.array([[record[-1]] for record in tp_large], dtype=object)

# Re-attach the labels so every row is [features..., label].
data_train = np.concatenate((train_X, train_Y), axis=1)
data_large = np.concatenate((large_X, large_Y), axis=1)

# The test rows get an empty object column so they have the same width as the others.
label_placeholder = np.empty((test_X.shape[0], 1), dtype=object)
data_test = np.concatenate((test_X, label_placeholder), axis=1)

# Part a, b

In [211]:
# Splitting the data by classes
def separating_class(data):
    """Group the rows of ``data`` by their last element (the class label).

    Returns a dict mapping each label to the list of rows that carry it.
    Labels appear in first-seen order and rows keep their input order.
    """
    grouped = {}
    for record in data:
        grouped.setdefault(record[-1], []).append(record)
    return grouped

In [212]:
# Calculating the mean, stdev and count of each column
def summary_d(data):
    """Summarise every feature column of ``data``.

    The last column (the label) is dropped and the rest is converted to
    float32. Returns one ``(mean, std, count)`` tuple per remaining column,
    where ``std`` comes from ``np.std`` with its default settings and
    ``count`` is the number of rows.
    """
    label_index = len(data[0]) - 1
    features = np.delete(data, label_index, axis=1)
    features = np.array(features, dtype=np.float32)

    stats = []
    for column in zip(*features):
        stats.append((np.mean(column), np.std(column), len(column)))
    return stats

def summary_d_by_class(data):
    """Compute the per-column summaries of ``data`` separately for each class.

    Returns a dict mapping each class label to the list produced by
    ``summary_d`` for the rows of that class.
    """
    return {
        label: summary_d(members)
        for label, members in separating_class(data).items()
    }

In [213]:
import math

# Calculating the Gaussian Probability Distribution
def cal_probablities(x, mean, stdev, min_stdev=1e-9):
    """Evaluate the Gaussian probability density with the given mean and stdev at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.
        min_stdev: Lower bound applied to ``stdev``. A feature that is
            constant within a class has a standard deviation of zero, which
            would otherwise raise ``ZeroDivisionError`` for Python floats or
            produce NaN/inf for NumPy scalars. Values of ``stdev`` at or
            above this bound are used unchanged.

    Returns:
        The density value. With a floored ``stdev`` this is a very large
        finite number when ``x`` equals ``mean``, and typically 0.0 otherwise.
    """
    # Floor the spread so the divisions below never see a zero denominator.
    stdev = max(stdev, min_stdev)
    exp = math.exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exp

In [214]:
# Calculating the probabilities of each class
def calculate_class_probabilities(data, row):
    """Score ``row`` against every class found in ``data``.

    For each class the score starts at the class prior (rows of that class
    divided by all rows) and is multiplied by the Gaussian density of each
    feature value of ``row`` under that class's mean and standard deviation.

    Returns a dict mapping each class label to its score.
    """
    summaries = summary_d_by_class(data)
    # Every column summary of a class stores that class's row count; read it from the first.
    total_rows = sum(stats[0][2] for stats in summaries.values())

    scores = {}
    for label, stats in summaries.items():
        score = stats[0][2] / float(total_rows)
        for idx, (mean, std, _count) in enumerate(stats):
            score *= cal_probablities(row[idx], mean, std)
        scores[label] = score
    return scores

In [215]:
# Predicting the class label with the highest probability
def predict_nb(data, row):
    """Return the class label whose score for ``row`` is highest.

    Scores come from ``calculate_class_probabilities``. Candidates are
    compared as ``(score, label)`` pairs, so equal scores are ordered by label.
    """
    scores = calculate_class_probabilities(data, row)
    _best_score, best_label = max((score, label) for label, score in scores.items())
    return best_label

In [216]:
# Prediction on the test data
for sample in data_test:
    # The last column of each test row is the empty label placeholder, so it is left out of the printout.
    label = predict_nb(data_train, sample)
    print(f"For {sample[:-1]} the predicted label is: {label}")

For [1.6206575632095337 59.376556396484375 32.0] the predicted label is: W
For [1.7793984413146973 72.07177734375 36.0] the predicted label is: W
For [1.7004576921463013 66.26750946044922 31.0] the predicted label is: W
For [1.6591086387634277 61.75162124633789 29.0] the predicted label is: W


In [217]:
# Implementing the Leave One Out Evaluation function
def leave_one_out_evaluation_nb(data):
    """Estimate accuracy with leave-one-out evaluation.

    Each row of ``data`` is predicted by ``predict_nb`` using all the other
    rows as training data. Returns the fraction of rows whose predicted label
    equals the label stored in the row's last element.
    """
    guesses = np.array(
        [predict_nb(np.delete(data, idx, axis=0), data[idx]) for idx in range(len(data))],
        dtype=object,
    )

    hits = 0
    for sample, guess in zip(data, guesses):
        if sample[-1] == guess:
            hits += 1

    return hits / len(data)

# Part c

In [218]:
# Accuracy of the Gaussian Naive Bayes Model
accuracy_all_features = leave_one_out_evaluation_nb(data_large)
print(f"Accuracy: {round(accuracy_all_features*100, 2)}%")

Accuracy: 70.0%


# Part d

In [219]:
# Accuracy of the Gaussian Naive Bayes Model after removing the age data
age_column = 2  # column index that is dropped as the age feature
n_large_data = np.delete(data_large, age_column, axis=1)
accuracy_without_age = leave_one_out_evaluation_nb(n_large_data)

print(f"Accuracy: {round(accuracy_without_age*100, 2)}%")

Accuracy: 70.83%


# Part e

Comparing the results of the two models discussed above, we can conclude the following:
In case 1, where all three features are taken into account, Gaussian Naive Bayes outperforms KNN.
In case 2, with only two features (age removed), both algorithms perform better overall, and Gaussian Naive Bayes and KNN (with K=3) perform similarly.
We can therefore conclude that, on this dataset, the Gaussian Naive Bayes algorithm performs better than KNN overall.