Spam Email Classification

Objective: Build a model to classify emails as spam or not spam (ham) based on the email content.

Problem Type: Natural Language Processing (NLP)

In [26]:
import pandas as pd
import numpy as np
data = pd.read_csv(r"C:\Users\admin\Downloads\spam_email_data.csv")
print(data.head())

  Email ID             Sender                       Subject  \
0       E1  carol@example.com             Win a free iPhone   
1       E2  alice@example.com            Limited time offer   
2       E3    bob@example.com             Win a free iPhone   
3       E4   dave@example.com  Your account needs attention   
4       E5  alice@example.com             Win a free iPhone   

                          Email Content  Is Spam  
0                   Update your profile        0  
1                   Update your profile        0  
2         Exclusive offer just for you!        0  
3  Your bank account needs verification        0  
4                   Update your profile        0  


In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Email ID       100 non-null    object
 1   Sender         100 non-null    object
 2   Subject        100 non-null    object
 3   Email Content  100 non-null    object
 4   Is Spam        100 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [28]:
data.describe()

Unnamed: 0,Is Spam
count,100.0
mean,0.16
std,0.368453
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [29]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

1.Data Preprocessing

Clean email

In [30]:
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data['Cleaned Content'] = data['Sender'].apply(lambda text: ' '.join(
    [word for word in re.sub(r'[^a-z\s]', '', text.lower()).split() if word not in stop_words]
))
print(data[['Sender', 'Cleaned Content']].head())

              Sender  Cleaned Content
0  carol@example.com  carolexamplecom
1  alice@example.com  aliceexamplecom
2    bob@example.com    bobexamplecom
3   dave@example.com   daveexamplecom
4  alice@example.com  aliceexamplecom


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tokenize the text into words

In [31]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [32]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
data['Sentences'] = data['Sender'].apply(sent_tokenize)
data['Words'] = data['Sender'].apply(word_tokenize)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)
data['Cleaned Content'] = data['Sender'].apply(preprocess_text)
print(data[['Sender', 'Sentences', 'Words', 'Cleaned Content']].head())


              Sender            Sentences                    Words  \
0  carol@example.com  [carol@example.com]  [carol, @, example.com]   
1  alice@example.com  [alice@example.com]  [alice, @, example.com]   
2    bob@example.com    [bob@example.com]    [bob, @, example.com]   
3   dave@example.com   [dave@example.com]   [dave, @, example.com]   
4  alice@example.com  [alice@example.com]  [alice, @, example.com]   

   Cleaned Content  
0  carolexamplecom  
1  aliceexamplecom  
2    bobexamplecom  
3   daveexamplecom  
4  aliceexamplecom  


[nltk_data] Downloading package punkt to C:\Users\admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Feature Engineering:

TF-IDF (Term Frequency-Inverse Document Frequency)

In [33]:
vectorizer = TfidfVectorizer(max_features=5000) 
X = vectorizer.fit_transform(data['Cleaned Content']).toarray()
y = data['Is Spam'] 
print("Shape of the feature matrix:", X.shape)

Shape of the feature matrix: (100, 4)


Model Training:

Naive Bayes classifier

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
data.columns = data.columns.str.strip()
X = data['Email Content']
y = data['Is Spam']
stop_words = set(stopwords.words('english'))
X_cleaned = X.apply(lambda text: ' '.join(
    [word for word in re.sub(r'[^a-z\s]', '', text.lower()).split() if word not in stop_words]
))
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X_cleaned)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Model training complete. Predictions made on the test set.")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

Model training complete. Predictions made on the test set.
Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        16
           1       1.00      0.00      0.00         4

    accuracy                           0.80        20
   macro avg       0.90      0.50      0.44        20
weighted avg       0.84      0.80      0.71        20



Evaluation:

Accuracy

Precision

Recall

F1 Score

In [36]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1, zero_division=1)  
recall = recall_score(y_test, y_pred, pos_label=1, zero_division=1)
f1 = f1_score(y_test, y_pred, pos_label=1, zero_division=1)
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Evaluation Metrics:
Accuracy: 0.80
Precision: 1.00
Recall: 0.00
F1 Score: 0.00
