In [1]:
import pandas as pd
from pprint import pprint
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import math
import time

In [2]:
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')
col_type = {'anaemia':np.bool, 'diabetes':np.bool, 
            'high_blood_pressure':np.str, 'sex':np.bool, 
            'smoking':np.bool, 'DEATH_EVENT':np.str}

df = df.astype(col_type)
# x_train, x_test, y_train, y_test = train_test_split(df.drop("DEATH_EVENT", axis=1), 
#                                                     df['DEATH_EVENT'], train_size=0.67, random_state=42)


In [3]:
def prior_prob_categorical(nums:pd.core.series.Series):
    return nums.value_counts(normalize=True)
    
def get_std_dev(nums: pd.core.series.Series):
    mean = nums.mean()
    return (((nums - mean)**2).aggregate(np.sum)/(len(nums)-1))**0.5
# prior_prob_categorical(df['sex'])["1"]

def calculate_prob(x, mean, std):
    exponent = math.exp((-1 * (x - mean)**2)/(2*(std**2)))
    return (1 / (std * (2*math.pi)**0.5)) * exponent

assert(calculate_prob(1, 1, 1) == 0.3989422804014327)
assert(calculate_prob(2, 1, 1) == 0.24197072451914337)

def train_naive_bayes(x_train, y_train):
    # dictionary that stores all probabiltiies for categorical or mean/std for continuous data
    prob = dict()
    col_y = y_train.name
    classes = np.unique(y_train)
    df = x_train.copy()
    df[col_y] = y_train
    
    class_counts = y_train.value_counts(normalize=True)
    prob['classes'] = {cls: class_counts[cls] for cls in classes}
    
    # go through each class and calculate piori probabilities for each unique column value
    for cls in classes:
        tmp = df[df[col_y] == cls]

        for col in x_train.columns:
            col = str(col)
            if col != col_y:
                if col not in prob.keys():
                    prob[col] = dict()
                
                prob[col][cls] = dict()

                if x_train.dtypes[col] == object:
                    val_probs = prior_prob_categorical(tmp[col])
                    for val in np.unique(tmp[col]):
                        try:
                            prob[col][cls][val] = val_probs[val]
                        
                        except IndexError:
                            prob[col][cls][val] = 0

                else:
                    prob[col][cls] = {
                        'mean': tmp[col].mean(),
                        'std': get_std_dev(tmp[col])}

    pprint(prob)
    return prob

def predict_naive_bayes(x_test, prob):
    classes = list(prob[list(prob.keys())[0]].keys())
    y_pred = list()
    
    # calculate posteri probabilities for each row in test dataset
    for i in range(len(x_test)):
        row = x_test.iloc[i]
        cls_prob = list()
        for cls in classes:
#             cls = str(cls)
            tmp_prob = prob['classes'][cls]
            for col in x_test.columns:
                try:
                    # for categorical data
                    tmp_prob *= prob[col][cls][row[col]]
                
                # for continuous data
                except KeyError:
                    tmp_prob *= calculate_prob(row[col], prob[col][cls]['mean'], prob[col][cls]['std'])

            cls_prob += [(cls, tmp_prob)]

        # normalize
        s = 0
        for c in cls_prob:
            s += c[1]
            
#         print([c[1]/s for c in cls_prob])
    
        y_pred += [max(cls_prob, key=lambda x: x[1])[0]]
        
    return np.array(y_pred)

def nb_predict(x_train, x_test, y_train, y_test):
#     print('clf')
    start = int(round(time.time() * 1000))
    clf = CategoricalNB()
    clf.fit(x_train[categorical], y_train)
    print(int(round(time.time() * 1000)) - start)
    score_clf = clf.score(x_test[categorical], y_test)
    time_clf = int(round(time.time() * 1000)) - start
    
#     print('our model')
    start = int(round(time.time() * 1000))
    prob_categorical = train_naive_bayes(x_train[categorical], y_train)
#     print(int(round(time.time() * 1000)) - start)
    score_categorical = accuracy_score(y_test, predict_naive_bayes(x_test[categorical], prob_categorical))
    time_clf_dm = int(round(time.time() * 1000)) - start
    
    start = int(round(time.time() * 1000))
    gnb = GaussianNB(var_smoothing=0)
    gnb.fit(x_train[numerical], y_train)
#     print('gnb priori prob', gnb.class_prior_)
#     print('sigma', gnb.sigma_)
#     print('theta', gnb.theta_)
    score_gnb = gnb.score(x_test[numerical], y_test)
    time_gnb = int(round(time.time() * 1000)) - start

    start = int(round(time.time() * 1000))
    prob_numerical = train_naive_bayes(x_train[numerical], y_train)
    score_numerical = accuracy_score(y_test, predict_naive_bayes(x_test[numerical], prob_numerical))
#     pprint(prob_numerical)
    time_numerical = int(round(time.time() * 1000)) - start


    print('Naive Bayes:', score_categorical, score_clf, score_numerical, score_gnb)
#     print('Naive Bayes:', score_categorical, score_clf)
    
#     return (score_clf, score_categorical, score_gnb, score_numerical)
    return (score_clf, score_categorical, 100 - (score_clf-score_categorical)/score_clf*100, time_clf, time_clf_dm,
           100*(time_clf-time_clf_dm)/time_clf_dm, \
            score_gnb, score_numerical, 100 - (score_gnb-score_numerical)/score_gnb*100, time_gnb, time_numerical,
           100*(time_gnb-time_numerical)/time_gnb)


In [4]:
columns = list(df.columns)
categorical = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
numerical = list(filter(lambda x: x not in categorical and x != 'DEATH_EVENT', columns))

X = df
Y = df['DEATH_EVENT']
kf = KFold(10, shuffle=True, random_state=1)
# result_columns = ['clf_scikit', 'clf_dm', 'gnb_scikit', 'gnb_dm']
result_columns = ['clf_scikit', 'clf_dm', 'clf_accuracy', 'clf_sklearn_time', 'clf_dm_time', '%diff', \
                  'gnb_scikit', 'gnb_dm', 'gnb_accuracy', 'gnb_sklearn_time', 'gnb_dm_time', '%diff']

df_result = pd.DataFrame(columns=result_columns)

for train_index, test_index in kf.split(X):

    x_train = X.iloc[train_index]
    y_train = Y[train_index]
    
    x_test = X.iloc[test_index]
    y_test = Y[test_index]
    
    result = nb_predict(x_train, x_test, y_train, y_test)
    df_result = df_result.append({c: result[i] for i,c in enumerate(result_columns)}, ignore_index=True)
    
df_result

5
{'anaemia': {'0': {'mean': 0.4198895027624309, 'std': 0.49490963142586414},
             '1': {'mean': 0.4772727272727273, 'std': 0.5023455953610855}},
 'classes': {'0': 0.6728624535315985, '1': 0.3271375464684015},
 'diabetes': {'0': {'mean': 0.4198895027624309, 'std': 0.4949096314258641},
              '1': {'mean': 0.4431818181818182, 'std': 0.4996079968037136}},
 'high_blood_pressure': {'0': {'0': 0.6685082872928176,
                               '1': 0.3314917127071823},
                         '1': {'0': 0.6136363636363636,
                               '1': 0.38636363636363635}},
 'sex': {'0': {'mean': 0.6353591160220995, 'std': 0.4826644892760428},
         '1': {'mean': 0.6590909090909091, 'std': 0.4767312946227962}},
 'smoking': {'0': {'mean': 0.30939226519337015, 'std': 0.4635253388085901},
             '1': {'mean': 0.29545454545454547, 'std': 0.45886143224517123}}}
{'age': {'0': {'mean': 59.30939226519337, 'std': 10.79523403923879},
         '1': {'mean': 64.973488636

Naive Bayes: 0.7666666666666667 0.7666666666666667 0.8 0.8
5
{'anaemia': {'0': {'mean': 0.39344262295081966, 'std': 0.48985381956960233},
             '1': {'mean': 0.46511627906976744, 'std': 0.5017070722245297}},
 'classes': {'0': 0.6802973977695167, '1': 0.31970260223048325},
 'diabetes': {'0': {'mean': 0.41530054644808745, 'std': 0.4941257040200663},
              '1': {'mean': 0.4186046511627907, 'std': 0.49622377018413694}},
 'high_blood_pressure': {'0': {'0': 0.6721311475409836,
                               '1': 0.32786885245901637},
                         '1': {'0': 0.5697674418604651,
                               '1': 0.43023255813953487}},
 'sex': {'0': {'mean': 0.644808743169399, 'std': 0.47988419107761193},
         '1': {'mean': 0.6162790697674418, 'std': 0.4891434205932988}},
 'smoking': {'0': {'mean': 0.32786885245901637, 'std': 0.4707246509282609},
             '1': {'mean': 0.313953488372093, 'std': 0.46681972605573024}}}
{'age': {'0': {'mean': 58.94353551912568,

Unnamed: 0,clf_scikit,clf_dm,clf_accuracy,clf_sklearn_time,clf_dm_time,%diff,gnb_scikit,gnb_dm,gnb_accuracy,gnb_sklearn_time,gnb_dm_time,%diff.1
0,0.733333,0.733333,100.0,9.0,28.0,-520.0,0.8,0.8,100.0,5.0,31.0,-520.0
1,0.8,0.8,100.0,7.0,24.0,-600.0,0.9,0.9,100.0,5.0,35.0,-600.0
2,0.6,0.6,100.0,8.0,27.0,-480.0,0.666667,0.666667,100.0,5.0,29.0,-480.0
3,0.666667,0.666667,100.0,7.0,30.0,-400.0,0.733333,0.733333,100.0,7.0,35.0,-400.0
4,0.466667,0.466667,100.0,8.0,26.0,-371.428571,0.6,0.6,100.0,7.0,33.0,-371.428571
5,0.766667,0.766667,100.0,8.0,24.0,-520.0,0.8,0.8,100.0,5.0,31.0,-520.0
6,0.666667,0.666667,100.0,9.0,33.0,-275.0,0.733333,0.733333,100.0,8.0,30.0,-275.0
7,0.666667,0.666667,100.0,7.0,27.0,-387.5,0.833333,0.833333,100.0,8.0,39.0,-387.5
8,0.7,0.7,100.0,10.0,33.0,-985.714286,0.933333,0.933333,100.0,7.0,76.0,-985.714286
9,0.724138,0.724138,100.0,9.0,27.0,-357.142857,0.724138,0.724138,100.0,7.0,32.0,-357.142857
