In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
#Check the categorical data
def check_categorical(data):
    """Encode the categorical columns of a DataFrame as numbers.

    A column is treated as categorical when its dtype is not numeric (plain
    ``object`` columns as well as pandas string/category columns).  Every such
    column is replaced by integer codes ``1..n_levels`` assigned in the sorted
    order of the column's distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; numeric columns are passed through unchanged.

    Returns
    -------
    tuple
        ``(encoded, categorical_index)`` where ``encoded`` is a float ndarray
        with the same shape as ``data`` and ``categorical_index`` is the list
        of positional indexes of the columns that were encoded.
    """
    categorical_index = [
        position
        for position, dtype in enumerate(data.dtypes)
        if not pd.api.types.is_numeric_dtype(dtype)
    ]
    if not categorical_index:
        return data.to_numpy().astype(float), categorical_index

    # An explicit object copy is always writable and can hold the raw labels
    # next to the numeric columns until every label has been replaced.
    encoded = data.to_numpy(dtype=object, copy=True)
    for column in categorical_index:
        # np.unique yields, for every row, the position of its value in the
        # sorted list of distinct values; +1 keeps the codes 1-based.
        _, level_position = np.unique(encoded[:, column], return_inverse=True)
        encoded[:, column] = level_position + 1

    return encoded.astype(float), categorical_index

In [3]:
#Calculating Mean and Standard Deviation
def mean_std_helper(data,value):
    """Return the column-wise mean and standard deviation for one class.

    Rows of ``data`` whose last element equals ``value`` are selected, their
    last element is dropped, and ``np.mean`` / ``np.std`` are taken over the
    remaining columns (axis 0).

    Returns
    -------
    tuple
        ``(mean, std)`` as computed by NumPy over the selected rows.
    """
    class_features = []
    for sample in data:
        if sample[-1] == value:
            class_features.append(sample[:-1])
    return np.mean(class_features, axis=0), np.std(class_features, axis=0)

def mean_std(train,categorical_index):
    """Compute per-class mean and standard deviation of the non-categorical columns.

    Parameters
    ----------
    train : numpy.ndarray
        2-D training array whose last column holds the class label (0 or 1).
    categorical_index : list of int
        Column positions to drop before the statistics are computed.

    Returns
    -------
    dict
        Keys ``'mean_0'``, ``'std_0'``, ``'mean_1'`` and ``'std_1'`` holding the
        statistics returned by ``mean_std_helper`` for class 0 and class 1.
    """
    # The categorical columns are removed once; the label stays in the last
    # column, which is where mean_std_helper looks for it.
    numeric_train = np.delete(train, categorical_index, axis=1)
    mean_0, std_0 = mean_std_helper(numeric_train, 0)
    mean_1, std_1 = mean_std_helper(numeric_train, 1)
    return {'mean_0': mean_0, 'std_0': std_0, 'mean_1': mean_1, 'std_1': std_1}

In [4]:
#Calculating the prior probabilities 
def _level_frequency(class_rows, column, level):
    """Return the fraction of ``class_rows`` whose ``column`` equals ``level``.

    A class without any rows yields 0.0 instead of failing on the division.
    """
    class_rows = np.asarray(class_rows)
    if class_rows.size == 0:
        return 0.0
    matches = np.count_nonzero(class_rows[:, column] == level)
    return float(matches) / len(class_rows)


def calc_prior_prob(train,categorical_index,class_0,class_1):
    """Estimate class priors and per-class frequencies of categorical values.

    Parameters
    ----------
    train : numpy.ndarray
        2-D training array; the last column is compared against 0 and 1 to
        obtain the class priors.
    categorical_index : list of int
        Positions of the categorical columns.
    class_0, class_1 : numpy.ndarray
        The rows of ``train`` belonging to class 0 and class 1.  An empty
        array is accepted and produces frequencies of 0.0.

    Returns
    -------
    dict
        ``'prior_0'`` / ``'prior_1'``: fraction of training rows labelled 0 / 1.
        ``'prior_class0'`` / ``'prior_class1'``: ``{column: {value: frequency}}``
        where ``frequency`` is the share of that class's rows holding ``value``
        in ``column``; every distinct value of the column in ``train`` gets an
        entry.  Both are empty dicts when there are no categorical columns.
    """
    prior_prob_class_0 = {}
    prior_prob_class_1 = {}

    for j in categorical_index:
        # Levels are taken from the whole training set so that both classes
        # share the same keys, even for values only one of them contains.
        levels = np.unique(train[:, j])
        prior_prob_class_0[j] = {k: _level_frequency(class_0, j, k) for k in levels}
        prior_prob_class_1[j] = {k: _level_frequency(class_1, j, k) for k in levels}

    labels = train[:, -1]
    prior_prob_0 = float(np.count_nonzero(labels == 0)) / len(train)
    prior_prob_1 = float(np.count_nonzero(labels == 1)) / len(train)

    return {'prior_0' : prior_prob_0,'prior_1':prior_prob_1,'prior_class0':prior_prob_class_0,'prior_class1':prior_prob_class_1}

In [5]:
#Calculating the probabilities
def calc_probabilty(mean, std_dev, test, min_std=1e-9):
    """Multiply the Gaussian densities of every feature for each test row.

    Parameters
    ----------
    mean, std_dev : array-like
        Per-feature mean and standard deviation of one class.
    test : numpy.ndarray
        2-D array; the last column is ignored and the remaining columns are
        evaluated against ``mean`` / ``std_dev``.
    min_std : float, optional
        Replacement used for standard deviations that are exactly zero.
        Without it a constant feature produces 0/0 = NaN, and a NaN score
        loses every later comparison.  Non-zero deviations are used as given.

    Returns
    -------
    numpy.ndarray
        One value per test row: the product over features of the normal
        density of that feature.
    """
    std_dev = np.asarray(std_dev, dtype=float)
    safe_std = np.where(std_dev == 0, min_std, std_dev)

    deviation = test[:, 0:-1] - mean
    exponent = -np.square(deviation) / (2 * np.square(safe_std))
    density = np.exp(exponent) / (math.sqrt(math.pi * 2) * safe_std)
    return np.prod(density, axis=1)

def get_probability(test,prior_prob,meanStd,categorical_index):
    """Score every test row for class 0 and class 1.

    Parameters
    ----------
    test : numpy.ndarray
        2-D test array; the last column is ignored by ``calc_probabilty``.
    prior_prob : dict
        Must provide ``'prior_0'``, ``'prior_1'`` and the per-column value
        frequency tables ``'prior_class0'`` / ``'prior_class1'``.
    meanStd : dict
        Must provide ``'mean_0'``, ``'std_0'``, ``'mean_1'`` and ``'std_1'``.
    categorical_index : list of int
        Positions of the categorical columns in ``test``.

    Returns
    -------
    dict
        ``'prob0'`` and ``'prob1'``: one score per test row, each the product of
        the class prior, the categorical value frequencies and the value
        returned by ``calc_probabilty`` for the remaining columns.
    """
    test_data = np.delete(test, categorical_index, axis=1)

    row_count = test.shape[0]
    cat_data_prob0 = np.ones(row_count)
    cat_data_prob1 = np.ones(row_count)

    for i in categorical_index:
        frequency_0 = prior_prob['prior_class0'][i]
        frequency_1 = prior_prob['prior_class1'][i]
        for t in range(row_count):
            level = test[t][i]
            # A value that never occurred in the training rows has no table
            # entry; its observed frequency is 0, so use that instead of
            # failing with KeyError.
            cat_data_prob0[t] *= frequency_0.get(level, 0.0)
            cat_data_prob1[t] *= frequency_1.get(level, 0.0)

    t0 = np.multiply(calc_probabilty(meanStd['mean_0'], meanStd['std_0'], test_data), cat_data_prob0)
    t1 = np.multiply(calc_probabilty(meanStd['mean_1'], meanStd['std_1'], test_data), cat_data_prob1)

    prob_0 = prior_prob['prior_0'] * t0
    prob_1 = prior_prob['prior_1'] * t1

    return {'prob0':prob_0,'prob1':prob_1}

In [6]:
#Calculation of Confusion Matrix
def confusion_matrix(original, predicted):
    """Compute accuracy, precision, recall and F-measure for binary labels.

    Parameters
    ----------
    original : sequence
        True labels; 1 is the positive class.
    predicted : sequence
        Predicted labels, indexed in parallel with ``original``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f_measure)``.  A metric whose
        denominator is zero is reported as 0.
    """
    accuracy = precision = recall = f_measure = 0
    TP = FN = FP = TN = 0
    for i, actual in enumerate(original):
        guess = predicted[i]
        if actual == 1 and guess == 1:
            TP += 1
        elif actual == 1 and guess == 0:
            FN += 1
        elif actual == 0 and guess == 1:
            FP += 1
        else:
            # Every remaining combination is counted as a true negative.
            TN += 1

    total = TP + FN + FP + TN
    if total != 0:
        accuracy = float(TP + TN) / total
    if (TP + FP) != 0:
        precision = float(TP) / (TP + FP)
    if (TP + FN) != 0:
        recall = float(TP) / (TP + FN)
    # Guarded like the other metrics: with no positives at all (neither true
    # nor predicted) the denominator is zero.
    if ((2 * TP) + FN + FP) != 0:
        f_measure = float(2 * TP) / ((2 * TP) + FN + FP)

    return accuracy, precision, recall, f_measure

In [7]:
#Main Function
def __main__(file='project3_dataset1.txt', n_folds=10):
    """Evaluate the Naive Bayes classifier with k-fold cross-validation.

    The tab-separated, header-less ``file`` is loaded, its categorical columns
    are encoded, and the rows are split in order into ``n_folds`` folds.  Each
    fold is used once as the test set while the remaining folds form the
    training set.  The metrics averaged over the folds are printed as
    percentages.

    Parameters
    ----------
    file : str, optional
        Path of the dataset; the last column is used as the 0/1 class label.
    n_folds : int, optional
        Number of folds; at least 2 so that a training set always exists.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2.
    """
    if n_folds < 2:
        raise ValueError('n_folds must be at least 2')

    accuracy = precision = recall = f_measure = 0

    #Load the file in a Data Frame
    data = pd.read_csv(file,sep="\t", header=None)
    data, categorical_index = check_categorical(data)

    #Number of Folds
    folds = np.array_split(data, n_folds)

    for i, fold in enumerate(folds):
        #test_set
        test = np.asarray(fold)

        #train_set: every fold except the one held out for testing
        train = np.vstack([other for index, other in enumerate(folds) if index != i])
        meanStd = mean_std(train,categorical_index)

        #Split the training rows by class label
        train_labels = train[:, -1].astype(int)
        class_0 = train[train_labels == 0]
        class_1 = train[train_labels == 1]

        #Calculation of probabilities
        prior_probilities = calc_prior_prob(train,categorical_index,class_0,class_1)
        probabilities = get_probability(test,prior_probilities,meanStd,categorical_index)

        prob0 = probabilities['prob0']
        prob1 = probabilities['prob1']

        #Estimated Predicted Labels: class 1 only when it scores strictly higher
        predicted_labels = np.where(prob1 > prob0, 1, 0)

        #Accuracy, Precision, Recall, F-1 Measure calculations
        acc, prec, rec, f_meas = confusion_matrix(test[:,-1], predicted_labels)
        accuracy += acc
        precision += prec
        recall += rec
        f_measure += f_meas

    # Turns the per-fold sums into an average percentage (a factor of 10 for
    # the default ten folds).
    to_percent = 100.0 / len(folds)

    #Outputs
    print('Naive Bayes Results for the file {} --------------'.format(file))
    print('Accuracy : {}'.format(str(accuracy * to_percent)))
    print('Precision: {}'.format(str(precision * to_percent)))
    print('Recall : {}'.format(str(recall * to_percent)))
    print('F-1 Measure: {}'.format(str(f_measure * to_percent)))

# Run the cross-validated evaluation defined above and print its metrics.
__main__()

Naive Bayes Results for the file project3_dataset1.txt --------------
Accuracy : 93.48997493734333
Precision: 91.78709362532891
Recall : 90.44426356826324
F-1 Measure: 91.00318381263209
