# Q7: Naive Bayes — Email Spam Detection
Dataset: `naive_bayes_spam.csv`

In [None]:
# Common imports used across notebooks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Single seed reused by every stochastic step so reruns give the same result
RANDOM_STATE = 42
# Switch on seaborn's default plot styling for the figures below
sns.set()


In [None]:
# Read the spam dataset into a DataFrame and preview its first rows.
# NOTE(review): absolute path — adjust to wherever the CSV lives locally.
csv_path = '/mnt/data/aiml/naive_bayes_spam.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Encode the target as integers: spam -> 1, ham -> 0.
# Matching ignores case and surrounding whitespace so variants such as
# 'Spam' or ' ham' are encoded too. The column is converted only when every
# non-missing label is recognised: a partial conversion would leave a mix of
# ints and strings in one column, which breaks stratified splitting and
# model fitting further down.
LABEL_MAP = {'spam': 1, 'ham': 0}
if not pd.api.types.is_numeric_dtype(df['label']):
    # The non-numeric check also covers pandas string dtypes, which a plain
    # `dtype == 'object'` comparison would skip.
    present = df['label'].notna()
    # astype(str) keeps the normalisation safe for non-string objects;
    # originally-missing labels become 'nan', stay unmapped and remain NaN.
    encoded = df['label'].astype(str).str.strip().str.lower().map(LABEL_MAP)
    if encoded[present].notna().all():
        df['label'] = encoded
    else:
        unknown = sorted(df.loc[present & encoded.isna(), 'label'].astype(str).unique())
        print(f'Label column left unchanged; unrecognised values: {unknown}')
print(df['label'].value_counts())

In [None]:
# Feature-based Naive Bayes: the target is the 'label' column and every
# other column is used as a model input.
y = df['label']
X = df.drop(columns='label')

In [None]:
# Encode categorical features and use GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Build the preprocessing + GaussianNB pipeline.
# Numeric columns are selected by kind ('number') rather than by the exact
# int64/float64 widths, so e.g. int32/float32 columns are not silently
# dropped by the ColumnTransformer (which discards unlisted columns).
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

# Numeric branch: fill gaps with the median, then standardise.
num_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: fill gaps with the mode, then one-hot encode.
# No drop='first': Naive Bayes needs no reference level, and together with
# handle_unknown='ignore' a dropped level would be encoded exactly like an
# unseen category (an all-zero row).
cat_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# sparse_threshold=0 forces a dense array out of the ColumnTransformer;
# GaussianNB requires dense input and raises on a sparse matrix.
pre = ColumnTransformer(
    [('num', num_transform, num_cols), ('cat', cat_transform, cat_cols)],
    sparse_threshold=0,
)
pipe = Pipeline([('pre', pre), ('nb', GaussianNB())])

# Hold out 20% of the rows (class proportions preserved via stratify),
# fit the pipeline on the rest and predict the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Evaluate on the held-out split: per-class precision/recall/F1 followed by
# a labelled confusion-matrix heatmap.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

# Sorted union of true and predicted labels, passed explicitly so the tick
# labels are guaranteed to line up with the matrix rows and columns.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set(xlabel='Predicted label', ylabel='True label', title='GaussianNB confusion matrix')
plt.show()