In [1]:
import pandas as pd
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [130]:
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')
col_type = {'anaemia':np.bool, 'diabetes':np.bool, 
            'high_blood_pressure':np.str, 'sex':np.bool, 
            'smoking':np.bool, 'DEATH_EVENT':np.str}

df = df.astype(col_type)
x_train, x_test, y_train, y_test = train_test_split(df.drop("DEATH_EVENT", axis=1), 
                                                    df['DEATH_EVENT'], train_size=0.67, random_state=42)


In [132]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
print('Gaussian NB', gnb.fit(x_train, y_train).score(x_test, y_test))
# gnb.predict_proba(x_test)


from sklearn.naive_bayes import CategoricalNB

clf = CategoricalNB()
categorical = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
print('Categorical NB', clf.fit(x_train[categorical], y_train).score(x_test[categorical], y_test))

# clf.predict_proba(x_test[categorical])

Gaussian NB 0.7575757575757576
Categorical NB 0.5757575757575758


In [127]:
def prior_prob_categorical(nums:pd.core.series.Series):
    return nums.value_counts(normalize=True)
    
def get_std_dev(nums: pd.core.series.Series):
    mean = nums.mean()
    return (((nums - mean)**2).aggregate(np.sum)/(len(nums)-1))**0.5
# prior_prob_categorical(df['sex'])["1"]

def calculate_prob(x, mean, std):
    exponent = math.exp((-1 * (x - mean)**2)/(2*(std**2)))
    return (1 / (std * (2*math.pi)**0.5)) * exponent

assert(calculate_prob(1, 1, 1) == 0.3989422804014327)
assert(calculate_prob(2, 1, 1) == 0.24197072451914337)


In [125]:
def train_naive_bayes(x_train, y_train):
    # dictionary that stores all probabiltiies for categorical or mean/std for continuous data
    prob = dict()
    col_y = y_train.name
    classes = np.unique(y_train)
    df = x_train.copy()
    df[col_y] = y_train
    
    class_counts = y_train.value_counts(normalize=True)
    prob['classes'] = {cls: class_counts[cls] for cls in classes}
    
    # go through each class and calculate piori probabilities for each unique column value
    for cls in classes:
        tmp = df[df[col_y] == cls]

        for col in x_train.columns:
            col = str(col)
            if col != col_y:
                if col not in prob.keys():
                    prob[col] = dict()
                
                prob[col][cls] = dict()

                if x_train.dtypes[col] == object:
                    val_probs = prior_prob_categorical(tmp[col])
                    for val in np.unique(tmp[col]):
                        try:
                            prob[col][cls][val] = val_probs[val]
                        
                        except IndexError:
                            prob[col][cls][val] = 0

                else:
                    for cls_2 in classes:
                        prob[col][cls_2] = {
                            'mean': tmp[col].mean(),
                            'std': get_std_dev(tmp[col])}

    return prob

import pprint
prob = train_naive_bayes(x_train, y_train)
pprint.pprint(prob)

{'age': {'0': {'mean': 64.94444444444444, 'std': 11.832823976990456},
         '1': {'mean': 64.94444444444444, 'std': 11.832823976990456}},
 'anaemia': {'0': {'mean': 0.42592592592592593, 'std': 0.49912572061109167},
             '1': {'mean': 0.42592592592592593, 'std': 0.49912572061109167}},
 'classes': {'0': 0.73, '1': 0.27},
 'creatinine_phosphokinase': {'0': {'mean': 629.3888888888889,
                                    'std': 1212.318306428072},
                              '1': {'mean': 629.3888888888889,
                                    'std': 1212.318306428072}},
 'diabetes': {'0': {'mean': 0.4074074074074074, 'std': 0.4959655541340459},
              '1': {'mean': 0.4074074074074074, 'std': 0.4959655541340459}},
 'ejection_fraction': {'0': {'mean': 33.75925925925926,
                             'std': 13.091744749002167},
                       '1': {'mean': 33.75925925925926,
                             'std': 13.091744749002167}},
 'high_blood_pressure': {'0': {'0':

In [126]:
def predict_naive_bayes(x_test, prob):
    classes = list(prob[list(prob.keys())[0]].keys())
    y_pred = list()
    
    # calculate posteri probabilities for each row in test dataset
    for i in range(len(x_test)):
        row = x_test.iloc[i]
        cls_prob = list()
        for cls in classes:
#             cls = str(cls)
            tmp_prob = prob['classes'][cls]
            for col in x_test.columns:
                try:
                    # for categorical data
                    tmp_prob *= prob[col][cls][row[col]]
                
                # for continuous data
                except KeyError:
                    tmp_prob *= calculate_prob(row[col], prob[col][cls]['mean'], prob[col][cls]['std'])

            cls_prob += [(cls, tmp_prob)]

        # normalize
        s = 0
        for c in cls_prob:
            s += c[1]
            
#         print([c[1]/s for c in cls_prob])

        y_pred += [max(cls_prob, key=lambda x: x[1])[0]]
        
    return np.array(y_pred)
                    
# print(predict_naive_bayes(x_test, prob))
accuracy_score(y_test, predict_naive_bayes(x_test, prob))

0.5757575757575758

In [41]:
assert(calculate_prob(1, 1, 1)==0.3989422804014327)

In [26]:
{k: x for k, x in enumerate(y_train.value_counts(normalize=True))}

{0: 0.73, 1: 0.27}

In [23]:
y_train.value_counts(normalize=True).index


Index(['0', '1'], dtype='object')

In [14]:
y_train

101    0
69     1
31     1
172    0
148    1
      ..
188    0
71     0
106    0
270    0
102    0
Name: DEATH_EVENT, Length: 200, dtype: object

In [28]:
4**0.5

2.0