In [23]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [24]:
# Load Dataset
from google.colab import files
uploaded = files.upload()
df=pd.read_csv('email.csv',encoding='latin-1')
df.head()


Saving email.csv to email (12).csv


In [25]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [26]:

# Check for missing values
df.isnull().sum()

# Encode labels: ham = 0, spam = 1
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})
df.head()




Unnamed: 0,Category,Message
0,0.0,"Go until jurong point, crazy.. Available only ..."
1,0.0,Ok lar... Joking wif u oni...
2,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
3,0.0,U dun say so early hor... U c already then say...
4,0.0,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [28]:
 # Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [29]:
# Apply text preprocessing
df['cleaned_message'] = df['Message']. apply(preprocess_text)
df.head()
df = df[df['cleaned_message'].str.strip() != '']  # remove empty strings
df = df.dropna(subset=['cleaned_message'])        # remove NaNs just in case



In [44]:
# Drop rows with NaNs before splitting
df = df.dropna(subset=['cleaned_message', 'Category'])
# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_message'])
y = df['Category'].astype(int)


In [48]:

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [51]:
# Train Naive Bayes Classifier

model = MultinomialNB()
model.fit(X_train, y_train)
# Predict and Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9542190305206463

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       951
           1       0.99      0.69      0.82       163

    accuracy                           0.95      1114
   macro avg       0.97      0.85      0.89      1114
weighted avg       0.96      0.95      0.95      1114


Confusion Matrix:
 [[950   1]
 [ 50 113]]


**Insights**<br>
he model achieves high accuracy (95.42%) and excels at predicting class 0 (Ham)(recall: 1.00). However, class 1 has a lower recall (0.69), with 50 false negatives, indicating the model misses some positive cases. The class imbalance (951 vs. 163) may be a factor. We will Consider addressing this with oversampling.

In [57]:
from imblearn.over_sampling import RandomOverSampler
# Oversample training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
#reapply the model on resampled data
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)



In [59]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9649910233393177

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       951
           1       0.86      0.91      0.88       163

    accuracy                           0.96      1114
   macro avg       0.92      0.94      0.93      1114
weighted avg       0.97      0.96      0.97      1114


Confusion Matrix:
 [[926  25]
 [ 14 149]]


-After oversampling, the accuracy improved from **95.42%** to **96.50%**.<br>
-*Class 0 (Ham label):* <br>The precision increased (99%), but recall dropped slightly (97%). This means it became even better at correctly identifying Class 0 but missed a few more true Class 0 cases.

-*Class 1 (Spam Label)*: <br>The recall for Class Spam increased significantly to 91%, meaning the model now identifies a higher proportion of true Class 1 cases (only missing 9% of them). Precision dropped to 86%, which means the model now makes more false positive predictions (incorrectly predicting  Spam  when it was actually Ham).


