In [7]:
import numpy as np
import math

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def calculate_measure(test_dataset, predictions, fold_value):
    """Print and return accuracy, precision, recall and F1 for one fold.

    The true label of every row is read from the last column, whose position
    is taken from the first row. Labels and predictions are compared against
    1 (positive) and 0 (negative); any other combination is not counted.

    Args:
        test_dataset: Sequence of rows; the last element of each row is the
            true label.
        predictions: Predicted labels, indexed in step with ``test_dataset``.
        fold_value: Zero-based fold index; reported as ``fold_value + 1``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. A metric whose
        denominator is zero (for example precision when nothing was predicted
        positive, or every metric for an empty test set) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Guard the column lookup so an empty fold yields all-zero metrics.
    label_index = len(test_dataset[0]) - 1 if len(test_dataset) else 0
    test_label = [row[label_index] for row in test_dataset]

    for i, actual in enumerate(test_label):
        predicted = predictions[i]
        if actual == 1 and predicted == 1:
            TP += 1
        elif actual == 1 and predicted == 0:
            FN += 1
        elif actual == 0 and predicted == 1:
            FP += 1
        elif actual == 0 and predicted == 0:
            TN += 1

    accuracy = _safe_ratio(TP + TN, TP + FP + TN + FN)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    print("----------------------------------------------------------------")
    print("For Test Dataset from fold",fold_value+1)
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    print("Recall: ",recall)
    print("F-1: ",f1)
    return accuracy,precision,recall,f1

def split_data_n_folds(dataset, fold_value):
    """Run k-fold cross-validation over ``dataset`` and print averaged metrics.

    The rows are cut into ``fold_value`` contiguous folds of
    ``int(len(dataset) / fold_value)`` rows each; the final fold also takes
    any leftover rows. For every fold a per-class summary is built from the
    remaining rows, each held-out row is classified, and the fold's metrics
    are printed by ``calculate_measure``. The per-fold metrics are then
    averaged over ``fold_value`` and printed.

    Args:
        dataset: Indexable sequence of rows.
        fold_value: Number of folds.
    """
    row_count = len(dataset)
    fold_size = int(row_count / fold_value)
    # One bucket per metric, in the order calculate_measure returns them.
    buckets = ([], [], [], [])

    for fold in range(fold_value):
        first = fold * fold_size
        # The last fold absorbs the remainder of an uneven split.
        stop = row_count if fold == fold_value - 1 else first + fold_size

        held_out = [dataset[k] for k in range(first, stop)]
        training = [dataset[k] for k in range(row_count)
                    if not first <= k < stop]

        # Call algorithm
        model = create_summary_by_class(training)
        guesses = [predict_for_row(model, sample) for sample in held_out]

        scores = calculate_measure(held_out, guesses, fold)
        for bucket, score in zip(buckets, scores):
            bucket.append(score)

    accuracy, precision, recall, f1 = (
        sum(bucket) / fold_value for bucket in buckets
    )
    print("----------------------------------------------------------------")
    print("Average values for Dataset")
    print("Accuracy: ",accuracy)
    print("Precision: ",precision)
    print("Recall: ",recall)
    print("F-1: ",f1)
    
    
def create_class_seperated_data(dataset):
    """Group the rows of ``dataset`` by the value in their last column.

    The label position is derived from the length of the first row.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        Dict mapping each class label to the list of rows carrying it, with
        labels in order of first appearance. Empty input gives an empty dict.
    """
    grouped = {}
    if len(dataset) == 0:
        return grouped

    label_position = len(dataset[0]) - 1
    for row in dataset:
        grouped.setdefault(row[label_position], []).append(row)
    return grouped

def create_summery_list(dataset):
    """Summarise every feature column of ``dataset``.

    The last column is treated as the class label and is skipped, so no
    statistics are computed for it.

    Args:
        dataset: Sequence of equally long rows; the last element of each row
            is the class label.

    Returns:
        List with one ``(mean, sample_std, row_count)`` tuple per feature
        column. The standard deviation uses the ``n - 1`` denominator; when
        there is only one row it is undefined and is reported as 0.0 rather
        than raising ``ZeroDivisionError``. Empty input gives an empty list.
    """
    summary_list = []
    feature_columns = list(zip(*dataset))[:-1]
    for col in feature_columns:
        length = len(col)
        avg = sum(col)/float(length)
        if length > 1:
            variance = sum([(x-avg)**2 for x in col]) / float(length-1)
        else:
            # A single observation has no spread to estimate.
            variance = 0.0
        std = math.sqrt(variance)
        summary_list.append((avg, std, length))
    return summary_list

def create_summary_by_class(dataset):
    """Build the per-class feature summaries for ``dataset``.

    Rows are grouped by class label with ``create_class_seperated_data`` and
    each group is summarised with ``create_summery_list``.

    Returns:
        Dict mapping each class label to that class's summary list.
    """
    return {
        label: create_summery_list(members)
        for label, members in create_class_seperated_data(dataset).items()
    }

 
def class_probabilities_calculation(summary, row, *, min_std=1e-9):
    """Score ``row`` against every class with a Gaussian Naive Bayes model.

    For each class the score is the class prior (the class's row count over
    the total row count) multiplied by the Gaussian density of every feature
    value under that class's mean and standard deviation. The scores are not
    normalised to sum to one.

    Args:
        summary: Dict mapping class label to a list of
            ``(mean, std, row_count)`` tuples, one per feature.
        row: Sequence of feature values; ``row[i]`` is scored against the
            i-th summary tuple. Trailing elements beyond the features (such
            as the label) are ignored.
        min_std: Lower bound applied to each standard deviation. A feature
            that is constant within a class has ``std == 0``, which would
            otherwise divide by zero in the density.

    Returns:
        Dict mapping class label to its unnormalised score.
    """
    probability = dict()
    # The row count is stored in every feature tuple, so read it from the first.
    total_rows = sum(class_summaries[0][2] for class_summaries in summary.values())
    gaussian_norm = math.sqrt(2 * math.pi)
    for class_value, class_summaries in summary.items():
        score = class_summaries[0][2] / float(total_rows)
        for i, (mean, std, _rows_for_class) in enumerate(class_summaries):
            std = max(std, min_std)
            exponent = -((row[i] - mean) ** 2 / (2 * std ** 2))
            score *= (1 / (gaussian_norm * std)) * math.exp(exponent)
        probability[class_value] = score
    return probability
 
def predict_for_row(summary, row):
    """Return the class label with the highest score for ``row``.

    Scores come from ``class_probabilities_calculation``. When several
    classes tie, the one encountered first wins; when there are no classes
    at all the result is ``None``.
    """
    scores = class_probabilities_calculation(summary, row)
    # max() keeps the earliest entry and only replaces it on a strictly
    # greater score, which gives the first-wins tie-breaking described above.
    return max(scores, key=scores.get, default=None)

import pandas as pd
# Load the tab-separated dataset. The file has no header row, so pandas
# labels the columns 0..n-1.
dataset = pd.read_csv("project3_dataset1.txt", delimiter="\t", header=None)

# Replace every string-valued column with integer codes so the numeric
# summaries can be computed on it. A column counts as string-valued when its
# first row holds a str. Codes follow order of first appearance, which keeps
# the encoding reproducible between runs, and the whole column is assigned at
# once instead of writing cell by cell through a chained indexer.
for position in range(dataset.shape[1]):
    if isinstance(dataset.iloc[0, position], str):
        column = dataset.iloc[:, position]
        codes = {value: code for code, value in enumerate(column.unique())}
        dataset[dataset.columns[position]] = column.map(codes)

d = dataset.to_numpy()
split_data_n_folds(d, 10)

----------------------------------------------------------------
For Test Dataset from fold 1
Accuracy:  0.9464285714285714
Precision:  0.9130434782608695
Recall:  0.9545454545454546
F-1:  0.9333333333333332
----------------------------------------------------------------
For Test Dataset from fold 2
Accuracy:  0.9285714285714286
Precision:  0.8888888888888888
Recall:  0.8888888888888888
F-1:  0.8888888888888888
----------------------------------------------------------------
For Test Dataset from fold 3
Accuracy:  0.9642857142857143
Precision:  0.8571428571428571
Recall:  1.0
F-1:  0.923076923076923
----------------------------------------------------------------
For Test Dataset from fold 4
Accuracy:  0.9107142857142857
Precision:  0.9
Recall:  0.8571428571428571
F-1:  0.8780487804878048
----------------------------------------------------------------
For Test Dataset from fold 5
Accuracy:  0.9107142857142857
Precision:  0.9473684210526315
Recall:  0.8181818181818182
F-1:  0.87804878