In [None]:
import os
import re
from email import message_from_file
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Download NLTK stopwords (first time only)
nltk.download('stopwords')
from nltk.corpus import stopwords

# --------- Load Emails ---------
def load_emails_from_dir(dir_path, label):
    emails = []
    for filename in os.listdir(dir_path):
        filepath = os.path.join(dir_path, filename)
        if not os.path.isfile(filepath):  # skip directories
            continue
        try:
            with open(filepath, 'r', encoding='latin1') as f:
                msg = message_from_file(f)
                payload = msg.get_payload()
                # Some emails have multipart payloads
                if isinstance(payload, list):
                    body = ''.join([str(part.get_payload()) for part in payload])
                else:
                    body = str(payload)
                emails.append((body, label))
        except Exception as e:
            print(f"Error reading {filename}: {e}")
    return emails

# Load spam and ham
ham_emails = load_emails_from_dir('spam assasin/ham', 0)
spam_emails = load_emails_from_dir('spam assasin/spam', 1)
hard_ham_emails = load_emails_from_dir('spam assasin/hard_ham', 0)
all_emails = ham_emails + spam_emails + hard_ham_emails

# Separate texts and labels
texts, labels = zip(*all_emails)

# --------- Preprocessing ---------
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

cleaned_texts = [clean_text(email) for email in texts]

# --------- Vectorization ---------
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(cleaned_texts)
y = labels

# --------- Split Data ---------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------- Train Model ---------
clf = MultinomialNB()
clf.fit(X_train, y_train)

# --------- Evaluate Model ---------
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95       340
           1       0.95      0.98      0.96       471

    accuracy                           0.96       811
   macro avg       0.96      0.95      0.96       811
weighted avg       0.96      0.96      0.96       811

Confusion Matrix:
 [[316  24]
 [ 11 460]]


In [27]:
import joblib

# Save the TF-IDF Vectorizer
joblib.dump(vectorizer, r'C:\Users\rohit\OneDrive\Desktop\tfidf_vectorizer.joblib')

# Save the Multinomial Naive Bayes model
joblib.dump(clf, r'C:\Users\rohit\OneDrive\Desktop\naive_bayes_model.joblib')


['C:\\Users\\rohit\\OneDrive\\Desktop\\naive_bayes_model.joblib']