In [1]:
import pandas as pd
from math import sqrt
from math import exp
from math import pi
from math import log
import numpy as np
from datetime import datetime


# Wall-clock start of the run; the script reports the elapsed time at the end.
startTime = datetime.now()


def mean(val):
    """Return the arithmetic mean of the values in ``val``.

    ``val`` must be iterable and support ``len()``; an empty input raises
    ``ZeroDivisionError``.
    """
    total = sum(val)
    n_items = len(val)
    return total / n_items


def class_stats(df,col,col_class):
    """Compute the per-class mean of every feature column.

    Args:
        df: DataFrame holding the feature columns and the class column.
        col: Iterable of feature column names, in the order the means
            should be returned.
        col_class: Name of the column holding the class label.

    Returns:
        A dict mapping each distinct class label (exactly as it appears in
        ``df[col_class]``) to a list with the mean of each column in ``col``
        over the rows of that class.
    """
    labels = df[col_class]
    values = dict()

    # Key by the raw label on both the write and the read side.  Coercing the
    # key with int() would break string labels and collapse labels such as
    # 0.5 and 0.9 onto the same key.
    for class_val in set(labels):
        class_rows = df.loc[labels == class_val]
        # The mean is computed inline so this function does not depend on the
        # module-level name ``mean``, which is easy to rebind by accident in
        # a notebook session.
        values[class_val] = [
            sum(class_rows[name]) / len(class_rows[name]) for name in col
        ]

    return values

def prob(att_value,mean):
    """Evaluate an exponential density at ``att_value``.

    The distribution's rate is the reciprocal of ``mean``, so the result is
    ``(1 / mean) * exp(-att_value / mean)``.
    """
    rate = 1/mean
    return rate * exp(-(rate * att_value))
    

def cal_row_prob_for_class(val,row,prior_prob):
    """Compute the log-posterior score of one sample for every class.

    Each feature is modelled with an exponential density whose rate is the
    reciprocal of the class mean, so its log-density is
    ``log(rate) - value * rate``.  The score of a class is the log of its
    prior plus the sum of the per-feature log-densities.

    Args:
        val: Dict mapping class label -> list of per-feature means.
        row: Indexable sample; ``row[i]`` is matched with the i-th mean.
        prior_prob: Dict mapping class label -> prior probability.

    Returns:
        Dict mapping class label -> log-posterior score (unnormalised).

    Raises:
        KeyError: If a class in ``val`` has no entry in ``prior_prob``.
        ValueError: If a prior or a mean is not strictly positive
            (``math.log`` domain error).
        ZeroDivisionError: If a mean is a plain Python zero.
    """
    probability = dict()

    for class_selected, class_att_values in val.items():
        score = log(prior_prob[class_selected])

        for i, feature_mean in enumerate(class_att_values):
            rate = 1/feature_mean
            # Stay in log space: taking log() of the density itself fails
            # with a math domain error once exp() underflows to 0.0.
            score += log(rate) - row[i] * rate

        probability[class_selected] = score

    return probability

def predict_class(val,row,prior_prob):
    """Return the class label with the highest log-posterior score for ``row``.

    Args:
        val: Dict mapping class label -> list of per-feature means.
        row: Indexable sample of feature values.
        prior_prob: Dict mapping class label -> prior probability.

    Returns:
        The label whose score is largest; on ties the first label in the
        score dict wins.  Falls back to ``0`` when there are no classes.
    """
    class_probs = cal_row_prob_for_class(val,row,prior_prob)

    label = 0
    # Scores are log-values and are frequently all negative, so the running
    # maximum must start below every finite score rather than at 0.
    best_score = float("-inf")

    for candidate, candidate_score in class_probs.items():
        if candidate_score > best_score:
            label = candidate
            best_score = candidate_score

    return label

def split_data(k_folds,df):
    """Cut ``df`` into ``k_folds`` consecutive slices.

    Every fold holds ``int(len(df) / k_folds)`` items except the last one,
    which also absorbs whatever remains up to the end of ``df``.

    Returns:
        A list with one slice of ``df`` per fold, in original order.
    """
    total = len(df)
    fold_size = int(total / k_folds)
    final_fold = k_folds - 1

    def bounds(fold):
        # The final fold runs to the end so no trailing rows are dropped.
        stop = total if fold == final_fold else (fold + 1) * fold_size
        return fold * fold_size, stop

    folds = []
    for fold in range(k_folds):
        start, stop = bounds(fold)
        folds.append(df[start:stop])
    return folds

def accuracy(o,y):
    """Return the fraction of positions where ``o`` and ``y`` agree.

    ``o`` holds the predicted labels and ``y`` the expected ones; ``y`` is
    indexed at every position of ``o``.  An empty ``o`` raises
    ``ZeroDivisionError``.
    """
    hits = sum(1 for idx, predicted in enumerate(o) if predicted == y[idx])
    return hits/len(o)

def naive_bayes(train_df,test_df,col,col_class,prior_prob):
    """Fit per-class feature means on ``train_df`` and score ``test_df``.

    Args:
        train_df: DataFrame used to compute the per-class statistics.
        test_df: DataFrame whose rows are classified.
        col: Feature column names.
        col_class: Name of the class-label column.
        prior_prob: Dict mapping class label -> prior probability.

    Returns:
        The accuracy of the predictions on ``test_df``.
    """
    class_means = class_stats(train_df,col,col_class)
    samples = np.array(test_df[col])
    expected = np.array(test_df[col_class])

    predicted = [predict_class(class_means,sample,prior_prob) for sample in samples]

    return accuracy(predicted,expected)

def evaluate(df,k_folds,col,col_class,prior_prob):
    """Run k-fold cross-validation and return the accuracy of every fold.

    Args:
        df: Full data set.
        k_folds: Number of folds to split ``df`` into.
        col: Feature column names.
        col_class: Name of the class-label column.
        prior_prob: Dict mapping class label -> prior probability.

    Returns:
        A list with one accuracy value per fold, in fold order.
    """
    df_list = split_data(k_folds,df)
    score = []

    for i in range(k_folds):
        # Every fold except the i-th one is training data.
        train_parts = [fold for j, fold in enumerate(df_list) if j != i]

        # Concatenate once instead of growing a frame fold by fold: this
        # avoids re-copying the accumulated rows and avoids seeding concat
        # with an empty DataFrame.
        if train_parts:
            train_df = pd.concat(train_parts, ignore_index=True)
        else:
            # Single-fold case: there is nothing left to train on.
            train_df = pd.DataFrame()

        test_df = df_list[i]

        score.append(naive_bayes(train_df,test_df,col,col_class,prior_prob))

    return score

def normalize(x):
    """Return a min-max scaled copy of ``x``.

    Each column is rescaled independently as
    ``(value - column_min) / (column_max - column_min)``; ``x`` itself is
    left unmodified.
    """
    scaled = x.copy()
    for name in x.columns:
        column = x[name]
        highest = column.max()
        lowest = column.min()
        scaled[name] = (column - lowest) / (highest - lowest)
    return scaled
        
        


# Load the data set and min-max scale every column.  The class column
# 'Selector' is scaled as well, which is why its two labels are addressed as
# 0 and 1 below.
df = pd.read_csv('ILPD.csv')
df = normalize(df)

# Shuffle the rows so the consecutive folds are not tied to the file order.
df = df.sample(frac=1)

# Class priors, estimated as the relative frequency of each label.
prior_prob = dict()

df_liver = df.loc[df['Selector'] == 0]
df_noliver = df.loc[df['Selector'] == 1]

prior_prob[0] = len(df_liver)/len(df)
prior_prob[1] = len(df_noliver)/len(df)

col = ['Age','Gender','TB','DB','Alkphos','Sgpt','Sgot','TP','ALB','A/G']
col_class = 'Selector'

score = evaluate(df,5,col,col_class,prior_prob)


for fold_number, fold_score in enumerate(score, start=1):
    print("Accuracy during testing of fold ",fold_number," is :","{:5.4f}".format(fold_score))


# Use a dedicated name here: binding the result to ``mean`` would replace the
# module-level mean() helper and break any later call that relies on it.
average_accuracy = sum(score)/len(score)

print("\n" + "Average accuracy is : ","{:5.4f}".format(average_accuracy))


print("\n")
# Report a number of seconds, as the label promises, rather than a timedelta.
print("Execution time in seconds = ", (datetime.now() - startTime).total_seconds())

Accuracy during testing of fold  1  is : 0.6873
Accuracy during testing of fold  2  is : 0.6827
Accuracy during testing of fold  3  is : 0.6981
Accuracy during testing of fold  4  is : 0.7090
Accuracy during testing of fold  5  is : 0.6909

Average accuracy is :  0.6936


Execution time in seconds =  0:00:00.167880
