In [3]:
from sklearn.datasets import load_diabetes
import numpy as np
from itertools import product
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
d=pd.read_csv('heart_failure_clinical_records_dataset.csv')
d.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
d.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [6]:
features=['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
        'serum_sodium', 'sex', 'smoking', 'time']

data = np.array(d[features])
pd.DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.00,130.0,1.0,0.0,4.0
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,136.0,1.0,0.0,6.0
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.00,129.0,1.0,1.0,7.0
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.00,137.0,1.0,0.0,7.0
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.00,116.0,0.0,0.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0.0,61.0,1.0,38.0,1.0,155000.00,143.0,1.0,1.0,270.0
295,55.0,0.0,1820.0,0.0,38.0,0.0,270000.00,139.0,0.0,0.0,271.0
296,45.0,0.0,2060.0,1.0,60.0,0.0,742000.00,138.0,0.0,0.0,278.0
297,45.0,0.0,2413.0,0.0,38.0,0.0,140000.00,140.0,1.0,1.0,280.0


In [7]:
target_1=np.array(d['DEATH_EVENT'])
target_2=np.array(d['serum_creatinine'])

data = (data-np.mean(data,axis=0))/(np.std(data,axis=0))
pd.DataFrame(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.192945,-0.871105,0.000166,-0.847579,-1.530560,1.359272,1.681648e-02,-1.504036,0.735688,-0.687682,-1.629502
1,-0.491279,-0.871105,7.514640,-0.847579,-0.007077,-0.735688,7.535660e-09,-0.141976,0.735688,-0.687682,-1.603691
2,0.350833,-0.871105,-0.449939,-0.847579,-1.530560,-0.735688,-1.038073e+00,-1.731046,0.735688,1.454161,-1.590785
3,-0.912335,1.147968,-0.486071,-0.847579,-1.530560,-0.735688,-5.464741e-01,0.085034,0.735688,-0.687682,-1.590785
4,0.350833,1.147968,-0.435486,1.179830,-1.530560,-0.735688,6.517986e-01,-4.682176,-1.359272,-0.687682,-1.577879
...,...,...,...,...,...,...,...,...,...,...,...
294,0.098199,-0.871105,-0.537688,1.179830,-0.007077,1.359272,-1.109765e+00,1.447094,0.735688,1.454161,1.803451
295,-0.491279,-0.871105,1.278215,-0.847579,-0.007077,-0.735688,6.802472e-02,0.539054,-1.359272,-0.687682,1.816357
296,-1.333392,-0.871105,1.525979,1.179830,1.854958,-0.735688,4.902082e+00,0.312044,-1.359272,-0.687682,1.906697
297,-1.333392,-0.871105,1.890398,-0.847579,-0.007077,-0.735688,-1.263389e+00,0.766064,0.735688,1.454161,1.932509


In [8]:
x_train_clf,x_test_clf,y_train_clf,y_test_clf=train_test_split(data,target_1,test_size=0.33,random_state=1)
x_train_reg,x_test_reg,y_train_reg,y_test_reg=train_test_split(data,target_2,test_size=0.33,random_state=1)

In [9]:
def calculate_mean(X):
    return np.mean(X, axis=0)


def calculate_variance(X):
    return np.var(X, axis=0)


In [10]:
def calculate_priors(y):
    classes,counts = np.unique(y,return_counts=True)
    priors = {cls:count/len(y) for cls,count in zip(classes,counts)}
    return priors

In [11]:
def gaussian_probability(x, mean, var):
    
    coeff = 1.0 / np.sqrt(2 * np.pi * var)
    exponent = np.exp(-((x - mean) ** 2) / (2 * var))
    return coeff * exponent

In [30]:
def train_gnb(x,y):
    classes=np.unique(y)
    summaries={}
    for cls in classes:
        x_cls = x[y==cls]
        summaries[cls]={
            "mean":calculate_mean(x_cls),
            "var":calculate_variance(x_cls)
        }
        
    priors = calculate_priors(y)

    return summaries,priors

summ,priors= train_gnb(x_train_clf,y_train_clf)
summ




{np.int64(0): {'mean': array([-0.16709772,  0.00939292, -0.03313337, -0.03966417,  0.2169274 ,
         -0.02686724, -0.01557204,  0.10039542, -0.00463592, -0.01131056,
          0.36857371]),
  'var': array([0.80731436, 1.00251232, 0.71558412, 0.98524829, 0.8516791 ,
         0.98252419, 0.88863242, 0.92155565, 1.00286939, 0.99120277,
         0.73345264])},
 np.int64(1): {'mean': array([ 0.4199616 ,  0.12336368,  0.1323219 , -0.06082331, -0.38099955,
          0.1085493 , -0.10533973, -0.35543331,  0.04779097, -0.08029371,
         -0.76481122]),
  'var': array([1.23687413, 1.01893621, 2.39782019, 0.97609191, 1.12011638,
         1.05590658, 0.92738331, 1.01510393, 0.96791437, 0.9320095 ,
         0.64126356])}}

In [35]:
''' def calculate_posterior(X, summaries, priors):'''
posteriors = []
for x in x_train_clf:
    class_probs = {}
    for cls, params in summ.items():
        print(cls)
        print(params)
        mean = params["mean"]
        var = params["var"]
        
        likelihood = np.prod(gaussian_probability(x, mean, var))
        posterior = priors[cls] * likelihood
        class_probs[cls] = posterior
        posteriors.append(class_probs)
        break


0
{'mean': array([-0.16709772,  0.00939292, -0.03313337, -0.03966417,  0.2169274 ,
       -0.02686724, -0.01557204,  0.10039542, -0.00463592, -0.01131056,
        0.36857371]), 'var': array([0.80731436, 1.00251232, 0.71558412, 0.98524829, 0.8516791 ,
       0.98252419, 0.88863242, 0.92155565, 1.00286939, 0.99120277,
       0.73345264])}
0
{'mean': array([-0.16709772,  0.00939292, -0.03313337, -0.03966417,  0.2169274 ,
       -0.02686724, -0.01557204,  0.10039542, -0.00463592, -0.01131056,
        0.36857371]), 'var': array([0.80731436, 1.00251232, 0.71558412, 0.98524829, 0.8516791 ,
       0.98252419, 0.88863242, 0.92155565, 1.00286939, 0.99120277,
       0.73345264])}
0
{'mean': array([-0.16709772,  0.00939292, -0.03313337, -0.03966417,  0.2169274 ,
       -0.02686724, -0.01557204,  0.10039542, -0.00463592, -0.01131056,
        0.36857371]), 'var': array([0.80731436, 1.00251232, 0.71558412, 0.98524829, 0.8516791 ,
       0.98252419, 0.88863242, 0.92155565, 1.00286939, 0.99120277,
    