In [27]:
import pandas as pd
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import math

In [28]:
# Load the heart-failure records and cast the flag columns and the target
# before carving out a single hold-out split.
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Built-in bool/str are used instead of np.bool/np.str: those aliases were
# deprecated in NumPy 1.20 and removed in 1.24, where they raise AttributeError.
# NOTE(review): 'high_blood_pressure' is the only feature cast to str while the
# other flags are cast to bool -- confirm this asymmetry is intentional.
col_type = {'anaemia': bool, 'diabetes': bool,
            'high_blood_pressure': str, 'sex': bool,
            'smoking': bool, 'DEATH_EVENT': str}

df = df.astype(col_type)
x_train, x_test, y_train, y_test = train_test_split(
    df.drop("DEATH_EVENT", axis=1),
    df['DEATH_EVENT'],
    train_size=0.67,
    random_state=42,
)


In [31]:
def prior_prob_categorical(nums:pd.core.series.Series):
    """Return the relative frequency of each distinct value in ``nums``.

    The result is a Series indexed by the distinct values; the frequencies
    are normalised so that they sum to 1.
    """
    relative_freq = nums.value_counts(normalize=True)
    return relative_freq
    
def get_std_dev(nums: pd.core.series.Series):
    """Return the sample standard deviation of ``nums`` (n - 1 denominator).

    The squared deviations from the mean are summed, divided by
    ``len(nums) - 1`` and square-rooted.
    """
    deviations = nums - nums.mean()
    # Series.sum() is called directly rather than .aggregate(np.sum): newer
    # pandas releases deprecate passing NumPy reducers as aggregate callables,
    # and the direct call yields the same value without the warning.
    sum_of_squares = (deviations ** 2).sum()
    return (sum_of_squares / (len(nums) - 1)) ** 0.5
# prior_prob_categorical(df['sex'])["1"]

def calculate_prob(x, mean, std):
    """Evaluate the normal density with the given ``mean`` and ``std`` at ``x``."""
    two_variance = 2 * (std ** 2)
    exponent = math.exp((-1 * (x - mean) ** 2) / two_variance)
    scale = 1 / (std * (2 * math.pi) ** 0.5)
    return scale * exponent

# Sanity checks for calculate_prob against the standard normal density at 0 and
# 1 standard deviation. A tolerance is used because exact float equality can
# differ in the last bit between math-library builds.
assert math.isclose(calculate_prob(1, 1, 1), 0.3989422804014327, rel_tol=1e-12)
assert math.isclose(calculate_prob(2, 1, 1), 0.24197072451914337, rel_tol=1e-12)

def train_naive_bayes(x_train, y_train):
    """Estimate the parameters of a mixed categorical / Gaussian naive Bayes model.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature columns. Columns with a numeric dtype (bool included) are
        modelled with one Gaussian per class; every other column is treated
        as categorical.
    y_train : pandas.Series
        Class labels, index-aligned with ``x_train``. Its ``name`` is used as
        a temporary column label, so a feature with the same label is skipped.

    Returns
    -------
    dict
        ``prob['classes'][cls]``   -> prior probability of ``cls``;
        ``prob[col][cls][value]``  -> relative frequency of ``value`` within
        ``cls`` for a categorical column (0.0 when the value occurs in the
        training data but never in that class);
        ``prob[col][cls]``         -> ``{'mean': ..., 'std': ...}`` for a
        numeric column.
    """
    # dictionary that stores all probabilities for categorical or mean/std for continuous data
    prob = dict()
    col_y = y_train.name
    classes = np.unique(y_train)

    # Attach the labels so that per-class slices stay aligned on the index.
    labelled = x_train.copy()
    labelled[col_y] = y_train

    class_counts = y_train.value_counts(normalize=True)
    # 'classes' is inserted first; it holds the class priors.
    prob['classes'] = {cls: class_counts[cls] for cls in classes}

    for col in x_train.columns:
        if col == col_y:
            continue

        column = x_train[col]
        is_continuous = pd.api.types.is_numeric_dtype(column.dtype)
        # Every value seen anywhere in the training column gets an entry for
        # every class, so a value missing from one class maps to 0.0 instead
        # of being absent from the table.
        all_values = None if is_continuous else column.dropna().unique()

        prob[col] = dict()
        for cls in classes:
            in_class = labelled.loc[labelled[col_y] == cls, col]

            if is_continuous:
                # Store the statistics of *this* class only; writing them to
                # every class would leave all classes with identical Gaussians.
                prob[col][cls] = {
                    'mean': in_class.mean(),
                    'std': get_std_dev(in_class)}
            else:
                observed = prior_prob_categorical(in_class).to_dict()
                prob[col][cls] = {val: observed.get(val, 0.0)
                                  for val in all_values}

    return prob

def predict_naive_bayes(x_test, prob):
    """Predict a class label for every row of ``x_test``.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Rows to classify; every column must have an entry in ``prob``.
    prob : dict
        Parameter table as produced by ``train_naive_bayes``: priors under
        ``prob['classes']`` and, per column and class, either a
        value -> probability table or a ``{'mean', 'std'}`` pair.

    Returns
    -------
    numpy.ndarray
        The class with the largest unnormalised posterior for each row.
    """
    # Read the class labels from the priors directly rather than from
    # whichever key happens to be first in the dictionary.
    classes = list(prob['classes'].keys())
    y_pred = list()

    # calculate posterior probabilities for each row in test dataset
    for i in range(len(x_test)):
        row = x_test.iloc[i]
        cls_prob = list()
        for cls in classes:
            posterior = prob['classes'][cls]
            for col in x_test.columns:
                params = prob[col][cls]
                value = row[col]
                if value in params:
                    # categorical value with a stored probability
                    posterior *= params[value]
                elif 'mean' in params and 'std' in params:
                    # continuous column: Gaussian likelihood
                    posterior *= calculate_prob(value, params['mean'], params['std'])
                else:
                    # Categorical value never stored for this class: it has
                    # zero estimated probability (no smoothing is applied).
                    posterior *= 0

            cls_prob.append((cls, posterior))

        # The posteriors are left unnormalised; normalising would not change
        # which class attains the maximum.
        y_pred.append(max(cls_prob, key=lambda pair: pair[1])[0])

    return np.array(y_pred)

def nb_predict(x_train, x_test, y_train, y_test):
    """Score the hand-written naive Bayes against scikit-learn on one split.

    The categorical features are scored with ``CategoricalNB`` and with
    ``train_naive_bayes`` / ``predict_naive_bayes``; the numerical features
    are scored with ``GaussianNB`` and with the same hand-written pair.

    NOTE(review): the column selectors ``categorical`` and ``numerical`` are
    read from the notebook's global namespace and are not defined in the cells
    shown here -- confirm they are created before this function is called.

    Prints the two categorical accuracies (hand-written first, scikit-learn
    second) and returns all four accuracies in a dict so that callers can
    aggregate them across folds; previously the numerical scores were
    computed and then discarded.
    """
    clf = CategoricalNB()
    clf.fit(x_train[categorical], y_train)
    score_clf = clf.score(x_test[categorical], y_test)

    prob_categorical = train_naive_bayes(x_train[categorical], y_train)
    score_categorical = accuracy_score(
        y_test, predict_naive_bayes(x_test[categorical], prob_categorical))

    gnb = GaussianNB()
    gnb.fit(x_train[numerical], y_train)
    score_gnb = gnb.score(x_test[numerical], y_test)

    prob_numerical = train_naive_bayes(x_train[numerical], y_train)
    score_numerical = accuracy_score(
        y_test, predict_naive_bayes(x_test[numerical], prob_numerical))

    print('Naive Bayes:', score_categorical, score_clf)

    return {
        'categorical': score_categorical,
        'categorical_sklearn': score_clf,
        'numerical': score_numerical,
        'numerical_sklearn': score_gnb,
    }

In [32]:
# 10-fold cross-validation of the naive Bayes comparison.
# NOTE(review): X and Y are not defined in the cells shown here -- confirm they
# are created (features / target) before this cell runs on a fresh kernel.
kf = KFold(10, shuffle=True, random_state=1)

for train_index, test_index in kf.split(X):
    # kf.split yields integer positions, so both the features and the target
    # are sliced positionally; plain Y[...] would do a label lookup instead
    # and only matches when Y carries a default 0..n-1 index.
    x_train = X.iloc[train_index]
    y_train = Y.iloc[train_index]

    x_test = X.iloc[test_index]
    y_test = Y.iloc[test_index]

    nb_predict(x_train, x_test, y_train, y_test)

Naive Bayes: 0.7333333333333333 0.7333333333333333
Naive Bayes: 0.8 0.8
Naive Bayes: 0.6 0.6
Naive Bayes: 0.6666666666666666 0.6666666666666666
Naive Bayes: 0.4666666666666667 0.4666666666666667
Naive Bayes: 0.7666666666666667 0.7666666666666667
Naive Bayes: 0.6666666666666666 0.6666666666666666
Naive Bayes: 0.6666666666666666 0.6666666666666666
Naive Bayes: 0.7 0.7
Naive Bayes: 0.7241379310344828 0.7241379310344828
