In [31]:
import os
import email
import re
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
data_dir = "../data/extracted_emails"

In [12]:
def extract_email_text(file_path):
    with open(file_path, "r", encoding="latin-1") as f:
        msg = email.message_from_file(f)
        subject = msg["Subject"] if msg["Subject"] else ""
        content = ""
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    content += part.get_payload(decode=True).decode("latin-1", errors="ignore")
        else:
            content = msg.get_payload(decode=True).decode("latin-1", errors="ignore")
        return subject + " " + content

In [13]:
def load_emails(folder, label):
    emails = []
    for root, _, files in os.walk(folder):
        for file in files:
            file_path = os.path.join(root, file)
            text = extract_email_text(file_path)
            emails.append((text, label))
    return emails

In [14]:
spam_emails = load_emails(os.path.join(data_dir, "spam"), "spam")
not_spam_emails = load_emails(os.path.join(data_dir, "easy_ham"), "not_spam") + \
                   load_emails(os.path.join(data_dir, "hard_ham"), "not_spam")



In [15]:
df = pd.DataFrame(spam_emails + not_spam_emails, columns=["text", "label"])


In [16]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df["text"] = df["text"].apply(preprocess_text)

In [17]:
print("Cleaned Data Sample:")
print(df.head())

Cleaned Data Sample:
                                                text label
0   mv 1 00001bfc8d64d12b325ff385cca8d07b84288\nm...  spam
1  life insurance  why pay more doctype html publ...  spam
2  ilug guaranteed to lose 1012 lbs in 30 days 10...  spam
3  guaranteed to lose 1012 lbs in 30 days        ...  spam
4  re fw user name  password to membership to 5 s...  spam


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)



In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [19]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9658
              precision    recall  f1-score   support

    not_spam       0.97      0.99      0.98       547
        spam       0.97      0.91      0.94       214

    accuracy                           0.97       761
   macro avg       0.97      0.95      0.96       761
weighted avg       0.97      0.97      0.97       761



In [25]:
def predict_email(text):
    cleaned_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_tfidf)[0]
    return "Spam" if prediction == "spam" else "Not-Spam"

email_text = """ 
Subject: 🎉 YOU JUST WON $5,000,000! CLAIM NOW 🎉  

From: lottery@mega-winner.com  
To: user@example.com  

Dear Winner,  

Congratulations! You have been randomly selected as the **GRAND PRIZE WINNER** of our **$5,000,000 Mega Jackpot**.  

To claim your prize:  
✅ Reply with your **full name, address, and phone number**  
✅ Pay a **small processing fee of $99**  
✅ Receive your winnings within 24 hours!  

Hurry! Your prize will be forfeited if unclaimed.  

Best regards,  
Jack Thompson  
Mega Lottery Promotions  

  


"""
print("Prediction:", predict_email(email_text))

Prediction: Spam


In [32]:
with open(os.path.join("../models", "spam_classifier_LogisticRegressi.pkl"), "wb") as model_file:
    pickle.dump((vectorizer, model), model_file)