In [1]:
import pandas as pd
import numpy as np
import random 
pd.set_option('display.max_rows', 200)

In [2]:
def classpriorprob(df):
    classes = list(df.iloc[:,-1].unique())
    cls_prior_prob = {}
    for cls in classes:
        cnt = df[df.iloc[:,-1] == cls].shape[0]
        cls_prior_prob[cls] = cnt/df.shape[0]
    return cls_prior_prob

In [3]:
def descriptorposprob(df):
    features = df.iloc[:,:-1]
    columns = list(features.columns)
    classes = list(df.iloc[:,-1].unique())
    des_pos_prob = []
    for col in columns:
        df1 = pd.DataFrame(columns = classes)
        df1["label"] = df.iloc[:,col].unique()
        for i in range(len(df1)):
            for j in classes:
                lab = df1.loc[i, "label"]
                # Laplacian Correction
                den = df[df.iloc[:,-1] == j].shape[0]+len(classes)
                num = df.loc[(df.iloc[:,-1] == j) & (df[col] == lab)].shape[0]+1
                df1.loc[i, j] = num / den
        des_pos_prob.append(df1)
    return des_pos_prob

In [4]:
def naivebayes(df, test):
    cls_prior_prob = classpriorprob(df)
    des_pos_prob = descriptorposprob(df)
    classes = list(df.iloc[:,-1].unique())
    c = {}
    for i,j in test.iterrows():
        cls_post_prob = {}
        for cls in classes:
            temp = []
            for k in range(len(j)):
                prob = des_pos_prob[k][des_pos_prob[k]["label"] == j[k]].iloc[0][cls]
                temp.append(prob)
            cls_post_prob[cls] = cls_prior_prob[cls] * np.prod(temp)
        c[i] = max(cls_post_prob, key=cls_post_prob.get)  
    test["Class"] = c.values()
    return test

In [5]:
def metrics(ts_lb,answer):
    TN = 0
    TP = 0
    FN = 0
    FP = 0
    for i,j in zip(ts_lb,answer):
        if j==1 and i==1:
            TP += 1
        elif(j==1 and i==0):
            FN += 1
        elif(j==0 and i==1):
            FP += 1
        elif(j==0 and i==0):
            TN += 1
    Accuracy = (TP + TN)/(TP + FP + TN + FN)
    Precision = TP/(TP + FP)
    Recall = TP/(TP + FN)
    f1_score = (2*Precision*Recall)/(Precision + Recall)
    return Accuracy, Precision, Recall, f1_score

In [6]:
def k_fold(df):
    k = int(input("Enter k value: "))
    metrics_list = []
    for i in range(k):
        splitdfs = np.array_split(df, k)
        test = splitdfs[i]
        #del(splitdfs[i])
        train = pd.concat(splitdfs)
        test.reset_index(inplace = True, drop = True)
        train.reset_index(inplace = True, drop = True) 
        actual = test.iloc[:,-1]
        test = test.iloc[:,:-1]
        results = naivebayes(train, test)
        Accuracy, Precision, Recall, f1_score = metrics(actual, results["Class"])
        metrics_list.append([Accuracy, Precision, Recall, f1_score])
    metrics_list = np.array(metrics_list)
    metrics_list = np.mean(metrics_list, axis = 0)
    print("Accuracy: ",metrics_list[0])
    print("Precision: ",metrics_list[1])
    print("Recall: ",metrics_list[2])
    print("f1_score: ",metrics_list[3])
    return metrics_list

In [7]:
df1 = pd.read_csv("project3_dataset1.txt", sep = '\t', header=None)

In [8]:
k_fold(df1)

Enter k value: 10
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
f1_score:  1.0


array([1., 1., 1., 1.])

In [9]:
df2 =  pd.read_csv("project3_dataset2.txt", sep = '\t', header=None)

In [10]:
k_fold(df2)

Enter k value: 10
Accuracy:  0.8982886216466234
Precision:  0.99375
Recall:  0.7748962148962149
f1_score:  0.8687764504133757


array([0.89828862, 0.99375   , 0.77489621, 0.86877645])

# Demo 


In [11]:
def naivebayes_demo(df, test):
    cls_prior_prob = classpriorprob(df)
    des_pos_prob = descriptorposprob(df)
    classes = list(df.iloc[:,-1].unique())
    c = {}
    for i,j in test.iterrows():
        cls_post_prob = {}
        for cls in classes:
            temp = []
            for k in range(len(j)):
                prob = des_pos_prob[k][des_pos_prob[k]["label"] == j[k]].iloc[0][cls]
                temp.append(prob)
            cls_post_prob[cls] = cls_prior_prob[cls] * np.prod(temp)
        c[i] = max(cls_post_prob, key=cls_post_prob.get)  
        print("Class Posterior Probability ",cls_post_prob)
    test["Class"] = c.values()
    return test

In [12]:
demo = pd.read_csv("project3_dataset4.txt", sep = '\t', header=None)

In [13]:
d = {'0': ["sunny"], '1': ["cool"], '2': ["high"], '3': ["weak"]}
test = pd.DataFrame(data=d)
test

Unnamed: 0,0,1,2,3
0,sunny,cool,high,weak


In [14]:
naivebayes_demo(demo, test)

Class Posterior Probability  {0: 0.01784970547985958, 1: 0.014753090635885528}


Unnamed: 0,0,1,2,3,Class
0,sunny,cool,high,weak,0
