In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [39]:
# load the dataset
df = pd.read_csv("D:\\advertising_ef.csv")
print(df.head())

   Daily Time Spent on Site   Age  Area Income  Daily Internet Usage  \
0                     68.95  35.0     61833.90                256.09   
1                       NaN  31.0     68441.85                193.77   
2                     69.47  26.0     59785.94                236.50   
3                     74.15  29.0     54806.18                245.89   
4                     68.37  35.0     73889.99                225.58   

             City  Gender     Country  Clicked on Ad  
0     Wrightburgh  Female     Tunisia              0  
1       West Jodi    Male       Nauru              0  
2        Davidton  Female  San Marino              0  
3  West Terrifurt    Male       Italy              0  
4    South Manuel  Female     Iceland              0  


In [41]:
from sklearn.preprocessing import LabelEncoder
X = df.drop('Clicked on Ad', axis=1)
y = df['Clicked on Ad']

# encoding categorical features
label_enc = LabelEncoder()
X['City'] = label_enc.fit_transform(X['City'])
X['Gender'] = label_enc.fit_transform(X['Gender'])
X['Country'] = label_enc.fit_transform(X['Country'])
# Replace NaN values with the median
X = X.fillna(X.median())

# Print the updated dataframe
print(X)

      Daily Time Spent on Site   Age  Area Income  Daily Internet Usage  City  \
0                        68.95  35.0     61833.90                256.09   953   
1                        68.37  31.0     68441.85                193.77   895   
2                        69.47  26.0     59785.94                236.50   112   
3                        74.15  29.0     54806.18                245.89   931   
4                        68.37  35.0     73889.99                225.58   799   
...                        ...   ...          ...                   ...   ...   
1004                     72.97  30.0     71384.57                208.58   127   
1005                     51.30  45.0     67782.17                134.42   485   
1006                     51.63  51.0     42415.72                120.37   792   
1007                     55.55  19.0     41920.79                187.95   927   
1008                     45.01  26.0     29875.80                178.35   739   

      Gender  Country  
0  

In [43]:
# train test split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_gnb = scaler.fit_transform(X_train[['Daily Time Spent on Site', 'Age', 'Area Income']])
X_test_gnb = scaler.transform(X_test[['Daily Time Spent on Site', 'Age', 'Area Income']])

# model initialize (for numerical data: Daily Time Spent on Site, Age  , Area Income)
gnb = GaussianNB()
# train the model
gnb.fit(X_train_gnb, y_train)


# predict the test data
y_pred_gnb = gnb.predict_proba(X_test_gnb)

In [45]:
# Multinomial Naïve Bayes (for categorical data: City,  Gender,     Country)
mnb = MultinomialNB()
mnb.fit(X_train[['City', 'Gender', 'Country']], y_train)
probs_mnb = mnb.predict_proba(X_test[['City', 'Gender', 'Country']])
# **Ensemble Using Probability Multiplication**
ensemble_probs = y_pred_gnb * probs_mnb  # Multiply probabilities
ensemble_probs = ensemble_probs / ensemble_probs.sum(axis=1, keepdims=True) # Normalize

In [47]:
# Final predictions
final_predictions = np.argmax(ensemble_probs, axis=1)
# Accuracy of the ensemble model
ensemble_accuracy = accuracy_score(y_test, final_predictions)
# Print accuracies
print("Gaussian Naïve Bayes Accuracy:", accuracy_score(y_test, np.argmax(y_pred_gnb, axis=1)))
print("Multinomial Naïve Bayes Accuracy:", accuracy_score(y_test, np.argmax(probs_mnb, axis=1)))
print("Ensemble Model Accuracy:", ensemble_accuracy)

Gaussian Naïve Bayes Accuracy: 0.9130434782608695
Multinomial Naïve Bayes Accuracy: 0.4308300395256917
Ensemble Model Accuracy: 0.7154150197628458


In [49]:
# Convert probabilities to DataFrame for better understanding in the form of table
prob_df = pd.DataFrame({
'Actual Target': y_test.values,
'GNB - P(No)': y_pred_gnb[:, 0], 'GNB - P(Yes)': y_pred_gnb[:, 1],
'MNB - P(No)': probs_mnb[:, 0], 'MNB - P(Yes)': probs_mnb[:, 1],
'Ensemble - P(No)': ensemble_probs[:, 0], 'Ensemble - P(Yes)': ensemble_probs[:, 1],
'Final Prediction': final_predictions
})
# Map 0 -> "No", 1 -> "Yes" for better readability
prob_df['Actual Target'] = prob_df['Actual Target'].map({0: "No", 1: "Yes"})
prob_df['Final Prediction'] = prob_df['Final Prediction'].map({0: "No", 1: "Yes"})

In [51]:
prob_df

Unnamed: 0,Actual Target,GNB - P(No),GNB - P(Yes),MNB - P(No),MNB - P(Yes),Ensemble - P(No),Ensemble - P(Yes),Final Prediction
0,Yes,0.900216,0.099784,0.053610,0.946390,0.338209,6.617913e-01,Yes
1,Yes,0.005873,0.994127,0.009204,0.990796,0.000055,9.999451e-01,Yes
2,Yes,0.008337,0.991663,0.000149,0.999851,0.000001,9.999987e-01,Yes
3,No,0.962734,0.037266,0.684478,0.315522,0.982470,1.753037e-02,No
4,Yes,0.951123,0.048877,0.999323,0.000677,0.999965,3.481540e-05,No
...,...,...,...,...,...,...,...,...
248,No,0.985328,0.014672,0.792038,0.207962,0.996105,3.894627e-03,No
249,No,0.977888,0.022112,0.000257,0.999743,0.011252,9.887484e-01,Yes
250,No,0.954828,0.045172,0.999990,0.000010,1.000000,4.681769e-07,No
251,No,0.987859,0.012141,0.000166,0.999834,0.013317,9.866827e-01,Yes
