### Objective
Implement and compare different Na√Øve Bayes models (Bernoulli,
Multinomial, and Gaussian) to understand how probability-based
classifiers behave with different types of data.


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

### Dataset Loading


In [16]:
import kagglehub

path = kagglehub.dataset_download("shashankvichare/diabetes-prediction")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/diabetes-prediction


### Gaussian Naive Bayes Classifier


In [20]:
import os
print(os.listdir(path))

['Diabetespred.csv']


In [21]:
col_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(path + '/Diabetespred.csv', names=col_names)
df = df.iloc[1:]

In [None]:
print(type(df))
df.head()
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = df[feature_cols]
y = df['Outcome']

# Split dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train Gaussian Naive Bayes model

gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict test data


y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

<class 'pandas.core.frame.DataFrame'>
Accuracy: 0.7466666666666667


### Multivariate Bernoulli Naive Bayes Classifier


In [26]:
from sklearn.preprocessing import Binarizer


feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X_bernoulli = df[feature_cols]
y_bernoulli = df['Outcome']


binarizer = Binarizer()
X_bernoulli = binarizer.fit_transform(X_bernoulli)

display(X_bernoulli[:5])

array([[1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 1., 0., 1., 1., 1.],
       [1., 1., 1., 0., 0., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1.]])

In [1]:
X_train_bernoulli, X_test_bernoulli, y_train_bernoulli, y_test_bernoulli = train_test_split(X_bernoulli, y_bernoulli, test_size=0.25, random_state=0)

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

bnb.fit(X_train_bernoulli, y_train_bernoulli)

y_pred_bernoulli = bnb.predict(X_test_bernoulli)

accuracy_bernoulli = accuracy_score(y_test_bernoulli, y_pred_bernoulli)
print("Bernoulli Naive Bayes Accuracy: %.3f%%"%(accuracy_bernoulli*100))

NameError: name 'train_test_split' is not defined

### Multinomial Naive Bayes Classifier

In [28]:
import kagglehub

spam_dataset_path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to spam dataset files:", spam_dataset_path)

Path to spam dataset files: /kaggle/input/sms-spam-collection-dataset


In [29]:
import os
import pandas as pd
print(os.listdir(spam_dataset_path))
try:
    spam_df = pd.read_csv(spam_dataset_path + '/spam.csv', encoding='latin-1')
    spam_df = spam_df[['v1', 'v2']]
    spam_df.columns = ['label', 'message']
    display(spam_df.head())
except FileNotFoundError:
    print("Error: 'spam.csv' not found in the downloaded dataset path.")
except Exception as e:
    print(f"An error occurred: {e}")

['spam.csv']


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
X_multinomial = count_vectorizer.fit_transform(spam_df['message'])
y_multinomial = spam_df['label']
print("Shape of the feature matrix:", X_multinomial.shape)

Shape of the feature matrix: (5572, 8672)


In [None]:
from sklearn.model_selection import train_test_split
# Split dataset into training and testing sets

X_train_multinomial, X_test_multinomial, y_train_multinomial, y_test_multinomial = train_test_split(X_multinomial, y_multinomial, test_size=0.25, random_state=0)
print("Shape of X_train_multinomial:", X_train_multinomial.shape)
print("Shape of X_test_multinomial:", X_test_multinomial.shape)
print("Shape of y_train_multinomial:", y_train_multinomial.shape)
print("Shape of y_test_multinomial:", y_test_multinomial.shape)
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_multinomial, y_train_multinomial)
y_pred_multinomial = mnb.predict(X_test_multinomial)
from sklearn.metrics import accuracy_score
accuracy_multinomial = accuracy_score(y_test_multinomial, y_pred_multinomial)
print("Multinomial Naive Bayes Accuracy: %.2f%%" % (accuracy_multinomial * 100))

Shape of X_train_multinomial: (4179, 8672)
Shape of X_test_multinomial: (1393, 8672)
Shape of y_train_multinomial: (4179,)
Shape of y_test_multinomial: (1393,)
Multinomial Naive Bayes Accuracy: 98.49%


## Observations

- Gaussian Naive Bayes performed better on continuous features compared to Bernoulli NB.
- Multinomial NB worked well when feature counts were non-negative.
- Naive Bayes is fast and simple but assumes feature independence, which is not always true.
- Model performance is sensitive to data distribution.
