In [28]:
import numpy as np
import pandas as pd
import math

def calculate_measure(test_dataset, predictions, fold_value):
    """Compute and print binary classification metrics for one fold.

    The true label of every row is taken from its last element and is
    compared with the prediction at the same position. Only the label
    values 0 and 1 are counted; any other combination is ignored.

    Args:
        test_dataset: Sequence of rows whose last element is the true label.
        predictions: Predicted labels, aligned with ``test_dataset``.
        fold_value: Zero-based fold index; it is printed as ``fold_value+1``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. A metric whose
        denominator is zero (for example precision when nothing was
        predicted positive) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    def safe_ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 so that a single degenerate
        # fold cannot abort the whole cross-validation run.
        return numerator / denominator if denominator else 0.0

    TP = 0
    TN = 0
    FP = 0
    FN = 0

    test_label = [row[-1] for row in test_dataset]

    for i, actual in enumerate(test_label):
        predicted = predictions[i]
        if actual == 1 and predicted == 1:
            TP += 1
        elif actual == 1 and predicted == 0:
            FN += 1
        elif actual == 0 and predicted == 1:
            FP += 1
        elif actual == 0 and predicted == 0:
            TN += 1

    accuracy = safe_ratio(TP + TN, TP + FP + TN + FN)
    precision = safe_ratio(TP, TP + FP)
    recall = safe_ratio(TP, TP + FN)
    f1 = safe_ratio(2*precision*recall, precision + recall)

    print("----------------------------------------------------------------")
    print("For Test Dataset from fold",fold_value+1)
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    print("Recall: ",recall)
    print("F-1: ",f1)
    return accuracy,precision,recall,f1

def split_data_n_folds(dataset, fold_value, k_value):
    """Run k-NN with n-fold cross-validation and print per-fold and mean metrics.

    The dataset is cut into ``fold_value`` contiguous folds of
    ``len(dataset) // fold_value`` rows; the last fold also absorbs the
    remainder. Each fold is used once as the test set while all remaining
    rows form the training set.

    Args:
        dataset: Indexable collection of rows; the last element of each row
            is the class label.
        fold_value: Number of folds.
        k_value: Number of neighbours passed to ``kNN``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` holding the metrics
        averaged over all folds.

    Raises:
        ValueError: If ``fold_value`` is not positive or exceeds the number
            of rows (which would produce empty test folds).
    """
    total_rows = len(dataset)
    if fold_value <= 0:
        raise ValueError("fold_value must be a positive integer")
    if fold_value > total_rows:
        raise ValueError("fold_value cannot exceed the number of rows")

    split_length = total_rows // fold_value
    A = []
    P = []
    R = []
    F = []
    for i in range(fold_value):
        start_index = i * split_length
        # The final fold takes every remaining row so that none is dropped
        # when the dataset size is not a multiple of fold_value.
        if i != fold_value - 1:
            end_index = start_index + split_length
        else:
            end_index = total_rows

        test_dataset = [dataset[index] for index in range(start_index, end_index)]
        train_dataset = [
            dataset[index]
            for index in range(total_rows)
            if not start_index <= index < end_index
        ]

        predictions = [
            kNN(train_dataset, test_dataset_row, k_value)
            for test_dataset_row in test_dataset
        ]

        accuracy,precision,recall,f1 = calculate_measure(test_dataset, predictions, i)
        A.append(accuracy)
        P.append(precision)
        R.append(recall)
        F.append(f1)

    accuracy = sum(A) / fold_value
    precision = sum(P) / fold_value
    recall = sum(R) / fold_value
    f1 = sum(F) / fold_value
    print("----------------------------------------------------------------")
    print("Average values for Dataset")
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    print("Recall: ",recall)
    print("F-1: ",f1)
    return accuracy,precision,recall,f1
        
        
    

def find_distance(d, s):
    """Return the Euclidean distance between two rows, ignoring the last entry.

    Only the first ``len(d) - 1`` positions of ``d`` and ``s`` take part in
    the computation; the final element of ``d`` is skipped.
    """
    squared_sum = 0.0
    for position in range(len(d) - 1):
        delta = d[position] - s[position]
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)

def sort_list(distance_list):
    """Return a new list of the given pairs ordered by their second element.

    The input is left untouched; pairs with equal second elements keep
    their original relative order.
    """
    ordered = list(distance_list)
    ordered.sort(key=lambda pair: pair[1])
    return ordered
    
def find_neighbors(train_dataset, test_dataset_row, k_value):
    """Return the training rows closest to ``test_dataset_row``.

    Every training row is scored with ``find_distance`` and the rows are
    ranked with ``sort_list`` (ascending distance).

    Args:
        train_dataset: Iterable of training rows.
        test_dataset_row: The row whose neighbours are wanted.
        k_value: Number of neighbours requested.

    Returns:
        A list with the ``k_value`` nearest training rows, nearest first.
        When ``k_value`` exceeds the number of training rows, all rows are
        returned instead of raising ``IndexError``; a non-positive
        ``k_value`` yields an empty list.
    """
    scored_rows = [
        (train_dataset_row, find_distance(test_dataset_row, train_dataset_row))
        for train_dataset_row in train_dataset
    ]
    ranked_rows = sort_list(scored_rows)
    # Clamp the request to what is available; the lower bound keeps a
    # negative k from being read as a "drop from the end" slice.
    neighbor_count = max(0, min(k_value, len(ranked_rows)))
    return [row for row, _ in ranked_rows[:neighbor_count]]

def calculate_maximum_prediction(neighbor_list):
    """Return the most frequent label among the neighbours.

    The label of a neighbour is its last element. When several labels are
    equally frequent, the one that appears first in ``neighbor_list`` wins.
    An empty ``neighbor_list`` yields -1.
    """
    labels = [row[len(row) - 1] for row in neighbor_list]
    # max() keeps the first item that reaches the highest count, which
    # matches a strict "greater than" scan from the front of the list.
    return max(labels, key=labels.count, default=-1)
    
def kNN(train_dataset, test_dataset_row, k_value):
    """Predict the label of ``test_dataset_row`` by majority vote.

    The vote is taken over the ``k_value`` nearest rows of
    ``train_dataset`` as selected by ``find_neighbors``.
    """
    return calculate_maximum_prediction(
        find_neighbors(train_dataset, test_dataset_row, k_value)
    )


# Script entry: ask for k, load the tab-separated dataset, encode text
# columns as integers and run 10-fold cross-validation.
print('Enter Value of K:')
k_value = int(input())

dataset = pd.read_csv("project3_dataset1.txt", delimiter="\t", header=None)
for column in dataset.columns:
    column_values = dataset[column]
    # A column is treated as categorical when its first value is a string.
    if isinstance(column_values.iloc[0], str):
        # Codes are assigned in sorted category order so the encoding is the
        # same on every run, and the whole column is replaced at once rather
        # than through chained element assignment, which pandas may apply to
        # a temporary copy.
        dataset[column] = pd.factorize(column_values, sort=True)[0]
d = dataset.to_numpy()
split_data_n_folds(d, 10, k_value)


Enter Value of K:
4
----------------------------------------------------------------
For Test Dataset from fold 1
Accuracy:  0.9285714285714286
Precision:  0.9090909090909091
Recall:  0.9090909090909091
F-1:  0.9090909090909091
----------------------------------------------------------------
For Test Dataset from fold 2
Accuracy:  0.8571428571428571
Precision:  0.8571428571428571
Recall:  0.6666666666666666
F-1:  0.75
----------------------------------------------------------------
For Test Dataset from fold 3
Accuracy:  0.9464285714285714
Precision:  0.9090909090909091
Recall:  0.8333333333333334
F-1:  0.8695652173913043
----------------------------------------------------------------
For Test Dataset from fold 4
Accuracy:  0.9285714285714286
Precision:  0.9473684210526315
Recall:  0.8571428571428571
F-1:  0.9
----------------------------------------------------------------
For Test Dataset from fold 5
Accuracy:  0.9285714285714286
Precision:  0.95
Recall:  0.8636363636363636
F-1:  0.