In [None]:
#Q1. A company conducted a survey of its employees and found that 70% of the employees use the company's health insurance plan, while 40% of the 
#    employees who use the plan are smokers. What is the probability that an employee is a smoker given that he/she uses the health insurance plan?
"""
I: Event of employee who use company insurance plan 
S: Event of employee who smoke

pr(I)=0.7
pr(I/S)=0.4

By conditional probability:
pr(S and I) = pr(I)*pr(I/S)
            = 0.7 * 0.4
            = 0.28
            
We know that:          
pr(S) + pr(I) - pr(S and I)= 1
pr(S) = 1 + 0.28 - 0.7
      = 0.58
      
pr(S/I)= ( pr(S) * pr(I/S) ) / pr(I)
       = 0.58 * 0.4 / 0.7
       = 0.33
The probability that an employee is a smoker given that he/she uses the health insurance plan is 33 %
"""

In [None]:
#Q2. What is the difference between Bernoulli Naive Bayes and Multinomial Naive Bayes?
"""
Bernoulli Naive Bayes: 
* It is designed for binary or boolean data, where features are represented as 0s and 1s, indicating the absence or presence of a particular feature.
* It assumes that features are independent binary variables and their probabilities follow a Bernoulli distribution. 
    
Multinomial Naive Bayes:  
* It is commonly used for text classification, where features are word frequencies or counts.
* It assumes that features represent the frequencies or counts of events.
"""

In [None]:
#Q3. How does Bernoulli Naive Bayes handle missing values?
"""
Bernoulli Naive Bayes handles missing values by considering them as absent (0) during classification. This means that if a feature's value is missing
for a particular instance, it is treated as if the feature is not present or "off" (assigned a value of 0).
"""

In [None]:
#Q4. Can Gaussian Naive Bayes be used for multi-class classification?
"""

Yes, Gaussian Naive Bayes can be used for multi-class classification. It is a variant of Naive Bayes that assumes the features follow a Gaussian 
distribution. Although it is often used for binary classification problems, it can be extended to handle multi-class classification tasks as well.
"""

In [1]:
#Q5. Download the "Spambase Data Set" from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/Spambase). 
#    This dataset contains email messages, where the goal is to predict whether a message is spam or not based on several input features.

#    Implementation: Implement Bernoulli Naive Bayes, Multinomial Naive Bayes, and Gaussian Naive Bayes classifiers using the scikit-learn library in 
#       Python. Use 10-fold cross-validation to evaluate the performance of each classifier on the dataset. You should use the default hyperparameters
#       for each classifier.

#    Results:Report the following performance metrics for each classifier: Accuracy, Precision, Recall, F1 score

#    Discussion:Discuss the results you obtained. Which variant of Naive Bayes performed the best? Why do you think that is the case? Are there any 
#       limitations of Naive Bayes that you observed?

#    Conclusion: Summarise your findings and provide some suggestions for future work.

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Import dataset
import pandas as pd
df=pd.read_csv("spambase.csv")

# Independent & dependent Feature
x=df.iloc[:,:-1]
y=df.iloc[:,-1:]

# Split into train & test data 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)

# Model Training
from sklearn.naive_bayes import BernoulliNB
bnb=BernoulliNB()
bnb.fit(x_train,y_train)

from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(x_train,y_train)

from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()
gnb.fit(x_train,y_train)

# prediction
bnb_y_pred=bnb.predict(x_test)
mnb_y_pred=mnb.predict(x_test)
gnb_y_pred=gnb.predict(x_test)

In [59]:
# Performance metrics for each classifier: Accuracy, Precision, Recall, F1 score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
classifiers = [bnb, mnb, gnb]
pred=[bnb_y_pred,mnb_y_pred,gnb_y_pred]

for i,j in enumerate(pred):
    print(f"Classifier: {classifiers[i].__class__.__name__}")
    print(f" Accuracy: {accuracy_score(j,y_test)}")
    print(f" Precision: {precision_score(j,y_test)}")
    print(f" Recall: {recall_score(j,y_test)}")
    print(f" F1 Score: {f1_score(j,y_test)}")
    print()

Classifier: BernoulliNB
 Accuracy: 0.8790731354091238
 Precision: 0.8128249566724437
 Recall: 0.8882575757575758
 F1 Score: 0.848868778280543

Classifier: MultinomialNB
 Accuracy: 0.782041998551774
 Precision: 0.6949740034662045
 Recall: 0.7623574144486692
 F1 Score: 0.7271078875793291

Classifier: GaussianNB
 Accuracy: 0.8247646632874729
 Precision: 0.9480069324090121
 Recall: 0.7206851119894598
 F1 Score: 0.8188622754491017



In [60]:
#10-fold cross-validation to evaluate the performance of each classifier
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # Initialize KFold with 10 splits


from sklearn.model_selection import cross_val_score
classifiers = [bnb, mnb, gnb]
results = []

for clf in classifiers:
    cv_scores = cross_val_score(clf, x, y, cv=kf)  # Perform cross-validation
    results.append(cv_scores)

import numpy as np
for i, clf in enumerate(classifiers):
    print(f"Classifier: {clf.__class__.__name__}")
    print(f" Mean Accuracy: {np.mean(results[i])}")
    print()

Classifier: BernoulliNB
 Mean Accuracy: 0.8852461567480902

Classifier: MultinomialNB
 Mean Accuracy: 0.7913543336791474

Classifier: GaussianNB
 Mean Accuracy: 0.8209054041309063

