# Question: 1

In [243]:
# Read and load the data
import numpy as np

def d_cleaning(line):
    """Turn one raw text record into a list of field strings.

    Every '(', ')' and space character is deleted, surrounding whitespace
    (such as a trailing newline) is stripped, and what remains is split on
    commas.
    """
    unwanted_chars = str.maketrans('', '', '() ')
    compact = line.translate(unwanted_chars).strip()
    return compact.split(',')

def d_fetching(filename):
    """Read a text dataset and return its records as lists of field strings.

    Each line of the file is parsed with ``d_cleaning``. Lines that contain
    only whitespace are skipped, so a trailing blank line does not produce a
    one-field ``['']`` record whose length differs from the real rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One list of field strings per non-blank line, in file order.
    """
    # The ``with`` block closes the file on exit; no explicit close() is needed.
    with open(filename, 'r') as f:
        return [d_cleaning(line) for line in f if line.strip()]

def f_reading(dataset_path):
    """Load the dataset stored at ``dataset_path`` as a NumPy array.

    The file is parsed by ``d_fetching`` and the resulting list of rows is
    handed to ``np.array`` unchanged (no dtype conversion happens here).
    """
    return np.array(d_fetching(dataset_path))

# Dataset locations, relative to the notebook's working directory.
d_training = './dataset/1a_2a-training.txt'
d_testing = './dataset/1a_2a-test.txt'
d_large_120 = './dataset/1cd_2cd-data.txt'

# Load the three files first ...
tp_train = f_reading(d_training)
tp_test = f_reading(d_testing)
tp_large = f_reading(d_large_120)

# ... then echo the two small sets for a visual sanity check.
print(tp_train)
print(tp_test)

[['1.6530190426733' '72.871146648479' '24' 'W']
 ['1.6471384909498' '72.612785314988' '34' 'W']
 ['1.6472055785348' '73.53968351051' '33' 'M']
 ['1.7323008914951' '76.067870338779' '30' 'M']
 ['1.6750702657911' '81.05582111533' '30' 'M']
 ['1.5780970716644' '64.926084680188' '30' 'W']
 ['1.6587629355524' '69.38092449041' '30' 'M']
 ['1.6763295980234' '77.062295990149' '31' 'M']
 ['1.7187224085504' '62.112923317057' '37' 'W']
 ['1.5202218226439' '66.151444019603' '27' 'W']
 ['1.5552689261884' '66.076386143769' '31' 'W']
 ['1.6969333189258' '77.45386244568' '34' 'M']
 ['1.6887980792886' '76.489640732464' '37' 'M']
 ['1.5213552893624' '63.952944947832' '35' 'W']]
[['1.62065758929' '59.376557437583' '32']
 ['1.7793983848363' '72.071775670801' '36']
 ['1.7004576585974' '66.267508112786' '31']
 ['1.6591086215159' '61.751621901787' '29']]


In [244]:
# Preparing the data
def _split_features_labels(raw):
    """Split a 2-D array of strings into numeric features and a label column.

    Parameters
    ----------
    raw : numpy.ndarray
        2-D array whose last column is the label and whose other columns
        are numeric values stored as strings.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``features`` with dtype float32 and shape (rows, columns - 1), and
        ``labels`` with dtype object and shape (rows, 1).
    """
    features = raw[:, :-1].astype(np.float32)
    # Slice with -1: (not -1) so the labels stay a column and can be
    # concatenated next to the features along axis=1.
    labels = raw[:, -1:].astype(object)
    return features, labels


train_X, train_Y = _split_features_labels(tp_train)
large_X, large_Y = _split_features_labels(tp_large)

# The test file carries no label column, so every column is a feature.
test_X = np.array(tp_test, dtype=np.float32)

# Rows used by the classifier are [feature..., label].
data_train = np.concatenate((train_X, train_Y), axis=1)
data_large = np.concatenate((large_X, large_Y), axis=1)
# Append an empty object column to the test rows so they have the same width
# as the training rows (the distance functions skip the last position).
data_test = np.concatenate((test_X, np.empty((test_X.shape[0], 1), dtype=object)), axis=1)

# Part a, b

In [245]:
# Cartesian distance
import math

def cartesian_distance(x, y):
    """Return the Euclidean (L2) distance between two rows.

    Only the first ``len(x) - 1`` positions are compared; the last element
    of ``x`` (the label slot of this notebook's rows) is ignored.
    """
    squared_total = 0
    # Accumulate squared per-feature differences, skipping the final slot.
    for idx, x_val in enumerate(x[:-1]):
        squared_total += (x_val - y[idx])**2
    return math.sqrt(squared_total)

In [246]:
# Manhattan distance
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between two rows.

    Only the first ``len(x) - 1`` positions are compared; the last element
    of ``x`` (the label slot of this notebook's rows) is ignored.
    """
    total = 0
    # Sum absolute per-feature differences, skipping the final slot.
    for idx, x_val in enumerate(x[:-1]):
        total += abs(x_val - y[idx])
    return total

In [247]:
# Minkowski distance
def minkowski_distance(x, y, order=3):
    """Return the Minkowski distance of the given ``order`` between two rows.

    Only the first ``len(x) - 1`` positions are compared; the last element
    of ``x`` (the label slot of this notebook's rows) is ignored.
    """
    powered_total = 0
    # Sum |difference| ** order over the features, skipping the final slot.
    for idx, x_val in enumerate(x[:-1]):
        powered_total += abs(x_val - y[idx])**order
    # Take the order-th root of the accumulated sum.
    return powered_total**(1/order)

In [248]:
# Locate the most similar neighbors
def select_neighbors(train, row_testing, num_neighbors, dist_type):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``row_testing``.

    ``dist_type`` is called as ``dist_type(row_testing, row_training)`` for
    every training row. Rows are returned nearest first; rows at equal
    distance keep their order from ``train`` because the sort is stable.
    """
    scored_rows = [(candidate, dist_type(row_testing, candidate)) for candidate in train]
    ranked = sorted(scored_rows, key=lambda pair: pair[1])
    return [ranked[position][0] for position in range(num_neighbors)]

In [249]:
# Classification prediction with neighbors
def predict(train, test, num_neighbors, dist_type):
    """Classify every row of ``test`` by majority vote of its nearest neighbors.

    Parameters
    ----------
    train : sequence of rows
        Training rows; the last element of each row is its label.
    test : sequence of rows
        Rows to classify.
    num_neighbors : int
        Number of nearest training rows that vote.
    dist_type : callable
        Distance function forwarded to ``select_neighbors``.

    Returns
    -------
    list
        One predicted label per row of ``test``, in order.

    Notes
    -----
    Votes are tallied in the order ``select_neighbors`` returns the rows, and
    a tie goes to the tied label that was seen first. This makes the result
    deterministic; picking the winner from a ``set`` of labels would depend
    on set iteration order instead.
    """
    l_predictions = []
    for row_testing in test:
        neighbors = select_neighbors(train, row_testing, num_neighbors, dist_type)
        # dicts keep insertion order, and max() returns the first maximal
        # key, so ties fall to the earliest-seen label.
        votes = {}
        for row in neighbors:
            label = row[-1]
            votes[label] = votes.get(label, 0) + 1
        l_predictions.append(max(votes, key=votes.get))
    return l_predictions

In [250]:
# Predictions on the test data
# Run every distance metric with every K and report the predicted labels.
for dist_type in [cartesian_distance, manhattan_distance, minkowski_distance]:
    for num_neighbors in [1, 3, 7]:
        predicted_labels = predict(data_train, data_test, num_neighbors, dist_type)
        # Print the metric's name (not the function object's repr) together
        # with K, so each output line identifies the configuration it is for.
        print(f"With {dist_type.__name__} and K={num_neighbors}, predicted labels are: {predicted_labels}")

With <function cartesian_distance at 0x7fd7c0723b80> and predicted labels are: ['W', 'W', 'W', 'W']
With <function cartesian_distance at 0x7fd7c0723b80> and predicted labels are: ['W', 'M', 'W', 'W']
With <function cartesian_distance at 0x7fd7c0723b80> and predicted labels are: ['W', 'M', 'W', 'W']
With <function manhattan_distance at 0x7fd7a051b040> and predicted labels are: ['W', 'W', 'W', 'W']
With <function manhattan_distance at 0x7fd7a051b040> and predicted labels are: ['W', 'M', 'W', 'W']
With <function manhattan_distance at 0x7fd7a051b040> and predicted labels are: ['W', 'M', 'W', 'W']
With <function minkowski_distance at 0x7fd7903f38b0> and predicted labels are: ['W', 'W', 'W', 'W']
With <function minkowski_distance at 0x7fd7903f38b0> and predicted labels are: ['W', 'M', 'W', 'W']
With <function minkowski_distance at 0x7fd7903f38b0> and predicted labels are: ['W', 'M', 'W', 'W']


# Part c

In [251]:
# Implementing the Leave-One-Out Evaluation function
def leave_one_out_evaluation(data, K, dist_type=None):
    """Estimate KNN accuracy on ``data`` with leave-one-out evaluation.

    Each row is held out in turn, classified by ``predict`` using all the
    remaining rows as training data, and compared with its own label (the
    last element of the row).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows whose last column is the label.
    K : int
        Number of neighbors passed to ``predict``.
    dist_type : callable, optional
        Distance function passed to ``predict``. Defaults to
        ``cartesian_distance``, which keeps existing two-argument calls
        unchanged.

    Returns
    -------
    float
        Fraction of rows whose predicted label equals the true label.

    Raises
    ------
    ZeroDivisionError
        If ``data`` contains no rows.
    """
    if dist_type is None:
        # Resolved at call time so the default follows the current definition.
        dist_type = cartesian_distance

    correct = 0
    for i in range(len(data)):
        held_out_row = data[i]
        remaining_rows = np.delete(data, i, axis=0)
        # predict() returns one label per test row; exactly one row is passed.
        predicted_label = predict(remaining_rows, [held_out_row], K, dist_type)[0]
        if predicted_label == held_out_row[-1]:
            correct += 1

    return correct / len(data)

In [252]:
# Check the performance of the KNN algorithm (with cartesian distance) with the given values of K
# Leave-one-out accuracy on the large dataset for each candidate K.
for k_val in (1, 3, 5, 7, 9, 11):
    loo_accuracy = leave_one_out_evaluation(data_large, k_val)
    print(f"For K= {k_val}, Accuracy: {round(loo_accuracy*100,2)}%")

For K= 1, accuracy= 55.0%
For K= 3, accuracy= 61.67%
For K= 5, accuracy= 61.67%
For K= 7, accuracy= 60.83%
For K= 9, accuracy= 63.33%
For K= 11, accuracy= 59.17%


# Part d

In [253]:
# Check the performance of the KNN algorithm (with cartesian distance) with the given values of K, after removing the feature 'age'
# Column index 2 is the 'age' feature; remove it and keep the remaining
# feature columns plus the label column.
n_data_large = np.delete(data_large, 2, axis=1)

# Leave-one-out accuracy without 'age' for each candidate K.
for K in (1, 3, 5, 7, 9, 11):
    accuracy_without_age = leave_one_out_evaluation(n_data_large, K)
    print(f"For K= {K}, Accuracy: {round(accuracy_without_age*100,2)}%")

For K= 1, accuracy= 62.5%
For K= 3, accuracy= 70.83%
For K= 5, accuracy= 65.0%
For K= 7, accuracy= 63.33%
For K= 9, accuracy= 60.0%
For K= 11, accuracy= 57.5%


# Comparing the results

The 'age' feature was dropped and the algorithm (using the Cartesian distance) was evaluated again with leave-one-out evaluation. According to the results shown above, the minimum (55.0% → 57.5%), average (about 60.3% → 63.2%), and maximum (63.33% → 70.83%) accuracies over the tested values of K all increased when age was removed from the dataset: accuracy improved for K = 1, 3, 5, and 7 and decreased slightly for K = 9 and 11. This leads to the conclusion that, in this situation, age is not a crucial feature for predicting the label; removing it helps the predictions for most values of K. The small decrease at K = 9 and 11 suggests that age may still carry a little useful information, but the effect is not very significant.