In [2]:
# ! pip install nltk

In [3]:
# Load Emails
import os

def load_emails_from_folder(folder_list):
    emails = []
    for folder_path in folder_list:
        for filename in os.listdir(folder_path):
            filepath = os.path.join(folder_path, filename)
            if os.path.isfile(filepath):
                with open(filepath, 'r', encoding='latin-1') as file:
                    emails.append(file.read())
    return emails
    
ham_folders = [ 'data/easy_ham_1', 'data/easy_ham_2', 'data/easy_ham_3', 'data/hard_ham_1', 'data/hard_ham_2' ]

spam_folders = [
    'data/spam_1',
    'data/spam_2',
    'data/spam_3',
    'data/spam_4'
]

ham_emails = load_emails_from_folder(ham_folders)
spam_emails = load_emails_from_folder(spam_folders)

x = ham_emails + spam_emails
y = [0] * len(ham_emails) + [1] * len(spam_emails)

print("✅ Total Ham:", len(ham_emails))
print("✅ Total Spam:", len(spam_emails))
print("✅ Total Emails:", len(x))

✅ Total Ham: 6954
✅ Total Spam: 3797
✅ Total Emails: 10751


In [4]:
# Train–Test Split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2, stratify=y)

print(f'Training samples: {len(x_train)}')
print(f'Testing samples: {len(x_test)}')

Training samples: 8600
Testing samples: 2151


In [5]:
# Data Preparation Pipeline
#     Remove headers
#     Lowercase
#     Remove punctuation
#     Replace URLs → "URL"
#     Replace numbers → "NUMBER"

import re
def clean_email(text):
    text = text.split('\n\n',1)[-1]

    text = text.lower()

    text = re.sub(r'http\S+', 'URL', text)

    text = re.sub(r'\d+', 'NUMBER', text)

    text = re.sub(r'[^\w\s]', '', text)

    return text

In [6]:
# Convert Emails → Sparse Feature Vectors
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    preprocessor=clean_email,
    stop_words='english',
    max_features=5000
)

x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

print('Feature vector shape: ', x_train_vec.shape)

Feature vector shape:  (8600, 5000)


In [7]:
# NAIVE BAYES
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(x_train_vec, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [8]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(x_train_vec, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [9]:
# SUPPORT VECTOR MACHINE
from sklearn.svm import LinearSVC
svm_model = LinearSVC()
svm_model.fit(x_train_vec, y_train)



0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [10]:
# EVALUATION FUNCTION
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def evaluate_model(model, X_test_vec, y_test):
    y_pred = model.predict(X_test_vec)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print('Classification Report:\n', classification_report(y_test, y_pred))
    print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))


In [11]:
print("\n\nNaive Bayes:")
evaluate_model(nb_model, x_test_vec, y_test)

print("\n\nLogistic Regression:")
evaluate_model(lr_model, x_test_vec, y_test)

print("\n\nSVM:")
evaluate_model(svm_model, x_test_vec, y_test)




Naive Bayes:
Accuracy: 0.9730357973035797
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1391
           1       0.98      0.94      0.96       760

    accuracy                           0.97      2151
   macro avg       0.98      0.97      0.97      2151
weighted avg       0.97      0.97      0.97      2151

Confusion matrix: 
 [[1379   12]
 [  46  714]]


Logistic Regression:
Accuracy: 0.9934913993491399
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1391
           1       1.00      0.99      0.99       760

    accuracy                           0.99      2151
   macro avg       0.99      0.99      0.99      2151
weighted avg       0.99      0.99      0.99      2151

Confusion matrix: 
 [[1388    3]
 [  11  749]]


SVM:
Accuracy: 0.9920966992096699
Classification Report:
               precision    recall  f1-score   su

USING TF-IDF VECTORIZER

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=8000,
    ngram_range=(1, 3),
    token_pattern=r'\b[a-zA-Z]{2,}\b'
)

x_train_tfvec = tfidf.fit_transform(x_train)
x_test_tfvec = tfidf.transform(x_test)

print("New feature shape:", x_train_tfvec.shape)



New feature shape: (8600, 8000)


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    "alpha": [0.001, 0.01, 0.1, 0.5, 1.0]
}

nb = MultinomialNB()

grid = GridSearchCV(
    nb,
    param_grid,
    scoring="recall",   # ✅ we are maximizing RECALL
    cv=5
)

grid.fit(x_train_tfvec, y_train)

print("✅ Best Alpha:", grid.best_params_)


✅ Best Alpha: {'alpha': 0.001}


In [14]:
best_nb = grid.best_estimator_

best_nb.fit(x_train_tfvec, y_train)


0,1,2
,alpha,0.001
,force_alpha,True
,fit_prior,True
,class_prior,


In [15]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

y_pred = best_nb.predict(x_test_tfvec)

print("✅ FINAL OPTIMIZED MODEL RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ FINAL OPTIMIZED MODEL RESULTS
Accuracy: 0.9665271966527197
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1391
           1       0.97      0.93      0.95       760

    accuracy                           0.97      2151
   macro avg       0.97      0.96      0.96      2151
weighted avg       0.97      0.97      0.97      2151

Confusion Matrix:
 [[1370   21]
 [  51  709]]


In [16]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(x_train_tfvec, y_train)

y_pred = lr_model.predict(x_test_tfvec)

print("✅ FINAL OPTIMIZED MODEL RESULTS")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ FINAL OPTIMIZED MODEL RESULTS
Accuracy: 0.9721059972105998
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1391
           1       0.98      0.94      0.96       760

    accuracy                           0.97      2151
   macro avg       0.97      0.97      0.97      2151
weighted avg       0.97      0.97      0.97      2151

Confusion Matrix:
 [[1375   16]
 [  44  716]]


In [17]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def clean_email(text):
    text = text.split('\n\n',1)[-1]
    text = text.lower()
    text = re.sub(r'http\S+', 'URL', text)
    text = re.sub(r'\d+', 'NUMBER', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Use the same TF-IDF vectorizer you trained
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=8000,
    ngram_range=(1, 3),
    token_pattern=r'\b[a-zA-Z]{2,}\b'
)

# Fit on your training data
tfidf.fit(x_train)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save the Logistic Regression model (or choose your best model)
with open('spam_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

print("✅ Model and vectorizer saved successfully!")

✅ Model and vectorizer saved successfully!
