In [None]:
import math
import numpy as np


def naive_bayes(data, classes, sample):
    """Classify ``sample`` with a Gaussian naive Bayes model fitted on ``data``.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and (population) variance are estimated from the
    training points of that class. The class with the highest posterior is
    returned.

    Parameters
    ----------
    data : dict
        Training set mapping a feature tuple to its class label.
    classes : sequence
        Candidate class labels. Ties are resolved in favour of the label
        that appears first in this sequence.
    sample : array-like
        Feature vector to classify; same length as the keys of ``data``.

    Returns
    -------
    object
        The element of ``classes`` with the largest posterior probability.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    sample = np.asarray(sample, dtype=float)

    total = len(data)
    if total == 0:
        raise ValueError("naive_bayes needs at least one training point")

    # Variance floor: a feature that is constant within a class has zero
    # variance, which would turn the Gaussian density into inf/nan. A tiny
    # fraction of the largest overall feature variance keeps it finite
    # without noticeably changing well-conditioned features.
    all_points = np.array(list(data.keys()), dtype=float)
    epsilon = 1e-9 * float(np.max(np.var(all_points, axis=0)))
    if not epsilon > 0.0:
        epsilon = 1e-9

    log_posteriors = []
    for c in classes:
        class_points = np.array(
            [key for key, val in data.items() if val == c], dtype=float
        )
        n_points = class_points.shape[0]

        # A class without training points has prior 0 and can never win.
        if n_points == 0:
            log_posteriors.append(-np.inf)
            continue

        log_prior = np.log(n_points / total)
        mean_vec = np.mean(class_points, axis=0)
        var_vec = np.var(class_points, axis=0) + epsilon

        # Sum of per-feature log densities instead of a product of densities:
        # the product underflows to 0 once there are many features.
        log_likelihood = -0.5 * np.sum(
            np.log(2 * np.pi * var_vec) + (sample - mean_vec) ** 2 / var_vec
        )
        log_posteriors.append(log_prior + log_likelihood)

    # Dividing by the evidence rescales every class by the same factor, so
    # the arg-max of the unnormalised log posteriors is already the answer.
    return classes[int(np.argmax(log_posteriors))]


In [None]:
import pandas as pd
import numpy as np                                                                                  

# Load the dataset, then separate the "Outcome" label column from the features.
data = pd.read_csv("diabetes.csv")

y = data["Outcome"].to_numpy()
X = data.drop(columns="Outcome").to_numpy()


In [9]:
# Display the feature matrix (every column except "Outcome") as a sanity check.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]],
      shape=(768, 8))

In [None]:

# k-fold cross-validation of naive_bayes on contiguous, unshuffled segments.
k = 2
fold_size = len(data) // k

if k < 2 or fold_size == 0:
    raise ValueError("k must be at least 2 and at most the number of rows")

accuracies = []

# Segments are held out from last to first, so with k = 2 the first fold
# trains on the first half of the rows and is scored on the second half.
for fold, held_out in enumerate(reversed(range(k))):

    print(f"\n=========== FOLD {fold+1} ===========")

    start = held_out * fold_size
    # The final segment absorbs the remainder when the row count is not a
    # multiple of k, so every row is tested exactly once.
    stop = len(X) if held_out == k - 1 else start + fold_size

    X_test = X[start:stop]
    y_test = y[start:stop]
    # Everything outside the held-out segment is used for training.
    X_train = np.concatenate([X[:start], X[stop:]])
    y_train = np.concatenate([y[:start], y[stop:]])

    # naive_bayes expects a mapping from feature tuple to class label.
    train_data = {}
    for i in range(len(X_train)):
        train_data[tuple(X_train[i])] = y_train[i]

    # classify
    correct = 0

    for i in range(len(X_test)):
        pred = naive_bayes(train_data, [0, 1], X_test[i])
        if pred == y_test[i]:
            correct += 1

    accuracy = correct / len(X_test)
    accuracies.append(accuracy)

    print(f"Fold Accuracy = {accuracy:.4f}")

print("===============================")
print(f"Final Accuracy = {np.mean(accuracies):.4f}")



Fold Accuracy = 0.7708

Fold Accuracy = 0.7370
Final Accuracy = 0.7539


In [17]:
# Ordered 80/20 hold-out: the leading 80% of the rows train, the rest test.
split = int(0.8 * len(data))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# naive_bayes expects a mapping from feature tuple to class label.
train_data = {tuple(row): label for row, label in zip(X_train, y_train)}

# Count the test rows whose predicted class matches the true label.
correct = sum(
    1
    for row, label in zip(X_test, y_test)
    if naive_bayes(train_data, [0, 1], row) == label
)
accuracy = correct / len(X_test)

print(f"Final Accuracy = {accuracy:.4f}")


Final Accuracy = 0.7468


In [4]:
# Preview the first rows of the loaded DataFrame.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
# Count missing (null) entries per column.
# NOTE(review): describe() reports a minimum of 0 for columns such as Glucose,
# BloodPressure and BMI; those zeros may be placeholders for missing
# measurements, which this check would not count - confirm against the source.
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
