In [1]:
# NAIVE BAYES CLASSIFICATION - WITH DIABETES DATASET

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.naive_bayes import GaussianNB

In [3]:
# 1. LOAD DATASET
# Read "diabetes.csv" (a path relative to the current working directory)
# into a DataFrame named `df`.

df = pd.read_csv("diabetes.csv")

In [5]:

# Separate the target column ("Outcome") from the remaining feature columns.
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [7]:
# 2. IMPLEMENT NAIVE BAYES FROM SCRATCH (GAUSSIAN NB)


class MyNaiveBayes:
    """Gaussian Naive Bayes classifier written from scratch with NumPy.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-posterior,
    i.e. the log prior plus the sum of the per-feature log-likelihoods.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Constant added to every per-class variance whenever the Gaussian
        density is evaluated, so zero-variance features never divide by zero.

    Attributes
    ----------
    classes : ndarray
        Distinct labels seen in ``y`` (set by ``fit``).
    mean, var : dict
        Class label -> 1-D array of per-feature mean / population variance
        (``ddof=0``, stored without smoothing).
    priors : dict
        Class label -> fraction of training rows carrying that label.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features (DataFrame or ndarray).
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : MyNaiveBayes
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        # Work on plain arrays so the statistics do not depend on whether the
        # caller passed pandas (sample variance) or NumPy (population variance).
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of rows")
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")

        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            # Population variance stays finite (0.0) even for a class that has
            # a single training row; sample variance would be NaN there.
            self.var[c] = X_c.var(axis=0)
            self.priors[c] = len(X_c) / len(X)

        return self

    def _log_gaussian_pdf(self, class_idx, x):
        """Return the per-feature log density of ``x`` under class ``class_idx``."""
        mean = self.mean[class_idx]
        # The same smoothed variance feeds both the normaliser and the exponent.
        var = self.var[class_idx] + self.var_smoothing
        return -0.5 * (np.log(2.0 * np.pi * var) + (x - mean) ** 2 / var)

    def gaussian_pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` for one class.

        Parameters
        ----------
        class_idx : hashable
            A label present in ``self.classes``.
        x : array-like
            One sample of shape (n_features,) or a batch of shape
            (n_samples, n_features).

        Returns
        -------
        Density values with the same shape as ``x``. Values can underflow to
        0.0 for points far from the mean; ``predict`` works in log space to
        avoid that.
        """
        return np.exp(self._log_gaussian_pdf(class_idx, x))

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric features (DataFrame or ndarray).

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels drawn from ``self.classes``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        # One column of log-posteriors per class. The log density is computed
        # directly instead of log(pdf), which turns into -inf once the pdf
        # underflows and makes every class tie.
        log_posteriors = np.column_stack([
            np.log(self.priors[c]) + self._log_gaussian_pdf(c, X).sum(axis=1)
            for c in self.classes
        ])
        return self.classes[np.argmax(log_posteriors, axis=1)]

In [9]:
# Fit the from-scratch classifier on the training split, then label the test split.
my_nb = MyNaiveBayes()
my_nb.fit(X=X_train, y=y_train)
y_pred_custom = my_nb.predict(X=X_test)

In [11]:
# ============================================================
# 3. BUILT-IN GAUSSIAN NAIVE BAYES
# ============================================================

# scikit-learn's fit() returns the estimator, so construction and training chain.
nb = GaussianNB().fit(X_train, y_train)
y_pred_builtin = nb.predict(X_test)

In [13]:

# ============================================================
# 4. PERFORMANCE METRICS FUNCTION
# ============================================================

def evaluate_model(y_test, y_pred, model_name):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels. Assumed to be encoded as 0 (negative) / 1 (positive),
        which is also what the scikit-learn scorers below expect by default.
    y_pred : array-like
        Predicted labels, same encoding and length as ``y_test``.
    model_name : str
        Text shown in the banner above the report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"\n========== {model_name} ==========")

    # Pin the label order so the matrix is always 2x2, even when one class is
    # missing from both y_test and y_pred; otherwise the unpack below fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, _, _ = cm.ravel()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Specificity = TN / (TN + FP). Report 0.0 instead of NaN when the sample
    # contains no true negatives at all.
    negatives = tn + fp
    specificity = tn / negatives if negatives else 0.0

    print("Confusion Matrix:")
    print(cm)

    print("\nMetrics:")
    print(f"Accuracy:    {accuracy:.4f}")
    print(f"Precision:   {precision:.4f}")
    print(f"Recall:      {recall:.4f}")
    print(f"F1 Score:    {f1:.4f}")
    print(f"Specificity: {specificity:.4f}")

# Report the same metrics for the hand-written and the scikit-learn classifier.
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_custom,
    model_name="Custom Naive Bayes",
)
evaluate_model(
    y_test=y_test,
    y_pred=y_pred_builtin,
    model_name="Built-in GaussianNB",
)



Confusion Matrix:
[[119  32]
 [ 27  53]]

Metrics:
Accuracy:    0.7446
Precision:   0.6235
Recall:      0.6625
F1 Score:    0.6424
Specificity: 0.7881

Confusion Matrix:
[[119  32]
 [ 27  53]]

Metrics:
Accuracy:    0.7446
Precision:   0.6235
Recall:      0.6625
F1 Score:    0.6424
Specificity: 0.7881
