# Naive Bayes on Spam Detection Dataset

### Step 1: Import Libraries

In [6]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report


### Step 2: Load Dataset

In [7]:

# Load the spam dataset from a CSV file located in the working directory
file_path = "spam_or_not_spam.csv"
df = pd.read_csv(file_path)

# Column / dtype summary. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly -- wrapping it in print() emitted a stray "None".
df.info()
print("\nSample Data:")
print(df.head())
# Per-column count of missing entries, used to decide what needs cleaning
print("\nMissing Values:")
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB
None

Sample Data:
                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0

Missing Values:
email    1
label    0
dtype: int64


### Step 3: Data Cleaning

In [8]:

# Keep only the rows whose `email` field is present; rows with a missing email are discarded
df = df[df['email'].notna()]
print("After dropping missing values:", df.shape)


After dropping missing values: (2999, 2)


### Step 4: Split Data into Training and Testing Sets

In [9]:

# Model input is the email text; the target is the `label` column
X, y = df['email'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# No `stratify` argument is passed, so the class mix of each part is left to the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train size:", X_train.shape, "| Test size:", X_test.shape)


Train size: (2399,) | Test size: (600,)


### Step 5: TF-IDF Vectorization

In [10]:

# TF-IDF features with English stop words removed and the vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training emails only;
# the test emails are merely projected onto that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("TF-IDF matrix shape:", X_train_tfidf.shape)


TF-IDF matrix shape: (2399, 5000)


### Step 6: Train Multinomial, Bernoulli, and Gaussian Naive Bayes Models

In [12]:

# Each variant is fitted on the same TF-IDF training matrix and then asked to
# label the held-out test matrix. `fit` returns the estimator, so construction
# and fitting are chained.

# Multinomial NB: trained directly on the TF-IDF weights
mnb = MultinomialNB().fit(X_train_tfidf, y_train)
mnb_pred = mnb.predict(X_test_tfidf)

# Bernoulli NB: left at its default settings here
# (scikit-learn documents a default binarize threshold of 0.0 -- verify for the installed version)
bnb = BernoulliNB().fit(X_train_tfidf, y_train)
bnb_pred = bnb.predict(X_test_tfidf)

# Gaussian NB: the TF-IDF matrices are converted with .toarray() before being passed in
gnb = GaussianNB().fit(X_train_tfidf.toarray(), y_train)
gnb_pred = gnb.predict(X_test_tfidf.toarray())


### Step 7: Evaluate Models

In [13]:

# Overall accuracy of each model on the held-out test set
mnb_acc = accuracy_score(y_test, mnb_pred)
bnb_acc = accuracy_score(y_test, bnb_pred)
gnb_acc = accuracy_score(y_test, gnb_pred)

print("Multinomial NB Accuracy:", mnb_acc)
print("Bernoulli NB Accuracy:", bnb_acc)
# Previously computed but never reported; shown alongside the other two models
print("Gaussian NB Accuracy:", gnb_acc)

# Per-class precision / recall / F1 for each model
print("\n--- Multinomial Naive Bayes ---")
print(classification_report(y_test, mnb_pred))

print("\n--- Bernoulli Naive Bayes ---")
print(classification_report(y_test, bnb_pred))

print("\n--- Gaussian Naive Bayes ---")
print(classification_report(y_test, gnb_pred))

Multinomial NB Accuracy: 0.98
Bernoulli NB Accuracy: 0.95

--- Multinomial Naive Bayes ---
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       500
           1       0.99      0.89      0.94       100

    accuracy                           0.98       600
   macro avg       0.98      0.94      0.96       600
weighted avg       0.98      0.98      0.98       600


--- Bernoulli Naive Bayes ---
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       500
           1       0.85      0.85      0.85       100

    accuracy                           0.95       600
   macro avg       0.91      0.91      0.91       600
weighted avg       0.95      0.95      0.95       600


--- Guassian Naive Bayes ---
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       500
           1       0.96      0.79      0.87       100

    accuracy                  