In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Loading the Dataset

In [8]:
# Read the advertising CSV into a DataFrame; the bare name on the last line
# renders the (truncated) frame as the cell output.
data = pd.read_csv(filepath_or_buffer='/content/advertising_ef.csv')
data

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Gender,Country,Clicked on Ad
0,68.95,35.0,61833.90,256.09,Wrightburgh,Female,Tunisia,0
1,,31.0,68441.85,193.77,West Jodi,Male,Nauru,0
2,69.47,26.0,59785.94,236.50,Davidton,Female,San Marino,0
3,74.15,29.0,54806.18,245.89,West Terrifurt,Male,Italy,0
4,68.37,35.0,73889.99,225.58,South Manuel,Female,Iceland,0
...,...,...,...,...,...,...,...,...
1004,72.97,30.0,71384.57,208.58,Duffystad,Male,Lebanon,1
1005,51.30,45.0,67782.17,134.42,New Darlene,Male,Bosnia and Herzegovina,1
1006,51.63,51.0,42415.72,120.37,South Jessica,Male,Mongolia,1
1007,55.55,19.0,41920.79,187.95,West Steven,Female,Guatemala,0


In [9]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Gender,Country,Clicked on Ad
0,68.95,35.0,61833.9,256.09,Wrightburgh,Female,Tunisia,0
1,,31.0,68441.85,193.77,West Jodi,Male,Nauru,0
2,69.47,26.0,59785.94,236.5,Davidton,Female,San Marino,0
3,74.15,29.0,54806.18,245.89,West Terrifurt,Male,Italy,0
4,68.37,35.0,73889.99,225.58,South Manuel,Female,Iceland,0


# Handling Missing Values

Drop unnecessary columns

In [10]:
# Remove the City and Country columns, which the steps below do not use.
data = data.drop(['City', 'Country'], axis='columns')

In [16]:
# Fill missing values in the numeric feature columns with each column's mean.
# The label column 'Clicked on Ad' is deliberately excluded: averaging a 0/1
# label would invent fractional class values instead of exposing the gap.
numeric_columns = (
    data.select_dtypes(include='number')
    .columns
    .drop('Clicked on Ad', errors='ignore')
)
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down; consider deriving them from the training
# rows only so test-set statistics do not influence the features.

# Count the remaining missing values per column (all zeros expected)
data.isnull().sum()

Unnamed: 0,0
Daily Time Spent on Site,0
Age,0
Area Income,0
Daily Internet Usage,0
Gender,0
Clicked on Ad,0


# Encoding Categorical Variables

In [18]:
# Encode the 'Gender' column as integers. LabelEncoder numbers the classes in
# sorted order, so with the two labels present here the mapping is
# Female: 0, Male: 1.
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# Display the updated dataset after encoding Gender
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Gender,Clicked on Ad
0,68.95,35.0,61833.9,256.09,0,0
1,65.009463,31.0,68441.85,193.77,1,0
2,69.47,26.0,59785.94,236.5,0,0
3,74.15,29.0,54806.18,245.89,1,0
4,68.37,35.0,73889.99,225.58,0,0


# Splitting the Dataset

In [19]:
# Separate the label from the predictors: y is the 'Clicked on Ad' column,
# X is every other column of the prepared frame.
y = data['Clicked on Ad']
X = data.drop(columns=y.name)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the dimensions of each partition
print("\nTraining Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)


Training Set Shape: (706, 5) (706,)
Testing Set Shape: (303, 5) (303,)


# Applying the Naive Bayes Algorithm

In [20]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and training chain).
gnb = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred = gnb.predict(X_test)

# Evaluating the Model

In [23]:
# Score the test-set predictions: overall accuracy, confusion matrix,
# and the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Emit the three results, each under its own heading
print("\nModel Accuracy: {:.2f}%".format(100 * accuracy))
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


Model Accuracy: 95.38%

Confusion Matrix:
[[150   6]
 [  8 139]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       156
           1       0.96      0.95      0.95       147

    accuracy                           0.95       303
   macro avg       0.95      0.95      0.95       303
weighted avg       0.95      0.95      0.95       303

