<a href="https://colab.research.google.com/github/seanpaz478/AAI-510-Final-Project-Group7/blob/main/USD%2C_ML%2C_Final_Group_Project%2C_5_Model_Comparison_(YSternberg).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This code trains and compares the following 5 classifiers

* Logistic Regression
* Linear SVM
* Random Forest
* LightGBM
* Naive Bayes

# Final Results (See Full Code Below)

In [23]:
styled_results

Unnamed: 0,Model,Accuracy,Spam F1-Score,MCC,AUC
0,Random Forest (All Original Features),0.9869,0.9861,0.9736,0.9987
1,Random Forest (Streamlined Features),0.9857,0.9849,0.9714,0.9987
2,Linear SVM (Streamlined Features),0.977,0.9758,0.9539,0.9969
3,Linear SVM (All Original Features),0.9752,0.9739,0.9502,0.9963
4,LightGBM (All Original Features),0.9702,0.9688,0.9405,0.9961
5,Logistic Regression (Streamlined Features),0.9728,0.9714,0.9456,0.9959
6,LightGBM (Streamlined Features),0.9679,0.9664,0.9359,0.9957
7,Logistic Regression (All Original Features),0.9709,0.9694,0.9417,0.9951
8,Naive Bayes (All Original Features),0.9457,0.9426,0.891,0.9877
9,Naive Bayes (Streamlined Features),0.9334,0.9285,0.8665,0.9869


# Full Code

In [1]:
import re
import string
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay, PrecisionRecallDisplay,
                           RocCurveDisplay, classification_report, log_loss,
                           matthews_corrcoef, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.svm import LinearSVC
from tqdm import tqdm

# --- Global Settings ---
warnings.filterwarnings('ignore')
tqdm.pandas()

CORE HELPERS (PREPROCESSING, EVALUATION, FEATURE SETS)

In [None]:
PATTERNS = {
    "url": re.compile(r"http[s]?://\S+"),
    "email": re.compile(r"\S+@\S+"),
    "non_alnum": re.compile(r"[^a-zA-Z0-9\s]"),
    "hyperlink": re.compile(r"(http|www|\.com)"),
    "digit": re.compile(r"\d"),
    "upper_word": re.compile(r"\b[A-Z]{2,}\b"),
    "phone": re.compile(r"\b(?:\d{3}[-.\s]?)?\d{3}[-.\s]?\d{4}\b"),
    "repeat_char": re.compile(r"(.)\1{2,}"),
    "currency": re.compile(r"[$£€]"),
}
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = PATTERNS["url"].sub(" ", text)
    text = PATTERNS["email"].sub(" ", text)
    text = PATTERNS["non_alnum"].sub(" ", text)
    text = text.lower()
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 1]
    return " ".join(lemmatized)

def create_all_features(df, spam_vocab=None):
    """Creates features (added additional features in this run/version))."""
    features_df = df.copy()
    raw_text = features_df["text"].astype(str)
    processed_tokens = features_df["processed_text"].str.split().fillna("").apply(list)
    features_df["feat_char_count"] = raw_text.str.len()
    features_df["feat_word_count"] = processed_tokens.str.len()
    features_df["feat_sentence_count"] = raw_text.apply(lambda x: len(sent_tokenize(x))).clip(lower=1)
    features_df["feat_paragraph_count"] = raw_text.str.count(r'\n\n') + 1
    features_df["feat_word_diversity"] = processed_tokens.apply(lambda x: len(set(x)))
    if spam_vocab:
        features_df["feat_spam_word_count"] = processed_tokens.apply(lambda t: sum(1 for w in t if w in spam_vocab))
    features_df["feat_avg_word_len"] = raw_text.str.split().str.join('').str.len() / features_df["feat_word_count"].clip(lower=1)
    features_df["feat_word_diversity_ratio"] = features_df["feat_word_diversity"] / features_df["feat_word_count"].clip(lower=1)
    features_df["feat_uppercase_char_ratio"] = raw_text.str.count(r"[A-Z]") / features_df["feat_char_count"].clip(lower=1)
    features_df["feat_word_per_sentence"] = features_df["feat_word_count"] / features_df["feat_sentence_count"].clip(lower=1)
    features_df["feat_word_per_paragraph"] = features_df["feat_word_count"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_sentence_per_paragraph"] = features_df["feat_sentence_count"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_unique_word_per_sentence"] = features_df["feat_word_diversity"] / features_df["feat_sentence_count"].clip(lower=1)
    features_df["feat_unique_word_per_paragraph"] = features_df["feat_word_diversity"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_hyperlink_count"] = raw_text.str.lower().str.count(PATTERNS["hyperlink"])
    features_df["feat_exclamation_count"] = raw_text.str.count("!")
    features_df["feat_question_count"] = raw_text.str.count(r"\?")
    features_df["feat_digit_count"] = raw_text.str.count(PATTERNS["digit"])
    features_df["feat_uppercase_word_count"] = raw_text.str.count(PATTERNS["upper_word"])
    features_df["feat_special_char_count"] = raw_text.str.count(fr"[{re.escape(string.punctuation)}]")
    features_df["feat_currency_symbol_count"] = raw_text.str.count(PATTERNS["currency"])
    features_df["feat_phone_pattern_count"] = raw_text.str.count(PATTERNS["phone"])
    features_df["feat_repeat_char_count"] = raw_text.str.count(PATTERNS["repeat_char"])
    if spam_vocab:
        features_df["feat_spam_word_ratio"] = features_df["feat_spam_word_count"] / features_df["feat_word_count"].clip(lower=1)
        features_df["feat_spam_word_per_sentence"] = features_df["feat_spam_word_count"] / features_df["feat_sentence_count"].clip(lower=1)
        features_df["feat_spam_word_per_paragraph"] = features_df["feat_spam_word_count"] / features_df["feat_paragraph_count"].clip(lower=1)
        features_df["feat_spam_word_per_unique_word"] = features_df["feat_spam_word_count"] / features_df["feat_word_diversity"].clip(lower=1)
    feature_cols = [c for c in features_df.columns if c.startswith("feat_")]
    features_df[feature_cols] = features_df[feature_cols].fillna(0)
    return features_df

def create_streamlined_features(df, spam_vocab=None):
    """Creates a simplified set of features for baseline comparison."""
    features_df = df.copy()
    raw_text = features_df["text"].astype(str)
    processed_tokens = features_df["processed_text"].str.split().fillna("").apply(list)
    features_df["feat_char_count"] = raw_text.str.len()
    features_df["feat_word_count"] = processed_tokens.str.len()
    features_df["feat_avg_word_len"] = features_df["feat_char_count"] / features_df["feat_word_count"].clip(lower=1)
    features_df["feat_hyperlink_count"] = raw_text.str.lower().str.count(PATTERNS["hyperlink"])
    features_df["feat_digit_count"] = raw_text.str.count(PATTERNS["digit"])
    features_df["feat_uppercase_word_count"] = raw_text.str.count(PATTERNS["upper_word"])
    if spam_vocab:
        features_df["feat_spam_word_ratio"] = processed_tokens.apply(lambda t: sum(1 for w in t if w in spam_vocab)) / features_df["feat_word_count"].clip(lower=1)
    feature_cols = [c for c in features_df.columns if c.startswith("feat_")]
    features_df[feature_cols] = features_df[feature_cols].fillna(0)
    return features_df

def evaluate_model(model, model_name, X_test, y_test, results_list):
    """Evaluates a model and stores the key metrics in a results list."""
    print(f"\n--- Evaluating: {model_name} ---")
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'], output_dict=True)
    mcc = matthews_corrcoef(y_test, y_pred)
    result = {'Model': model_name, 'Accuracy': report['accuracy'], 'Spam F1-Score': report['Spam']['f1-score'], 'MCC': mcc}

    # Calculate AUC
    if hasattr(model, "predict_proba"):
        result['AUC'] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    elif hasattr(model, "decision_function"):
        result['AUC'] = roc_auc_score(y_test, model.decision_function(X_test))
    else:
        result['AUC'] = float('nan') # Use NaN for sorting purposes

    results_list.append(result)

    # Format AUC for printing separately to avoid f-string issues.
    auc_text = f"{result['AUC']:.4f}" if not np.isnan(result['AUC']) else 'N/A'
    print(f"Accuracy: {result['Accuracy']:.4f}, Spam F1: {result['Spam F1-Score']:.4f}, MCC: {result['MCC']:.4f}, AUC: {auc_text}")

EXPERIMENT RUNNER

In [None]:
def run_bakeoff_experiment(dataframe, feature_set_name, models_to_test, results_list):
    """Takes a dataframe, a set of models, and runs the full bake-off."""
    print(f"\n{'#'*80}\n# RUNNING EXPERIMENT FOR: {feature_set_name}\n{'#'*80}")
    numeric_features = [col for col in dataframe.columns if col.startswith('feat_')]
    X = dataframe[['processed_text'] + numeric_features]
    y = dataframe['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    for model_name, model_obj in models_to_test.items():
        numeric_scaler = MaxAbsScaler() if model_name == 'Naive Bayes' else StandardScaler()
        preprocessor = ColumnTransformer(
            transformers=[
                ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), 'processed_text'),
                ('numeric', numeric_scaler, numeric_features)
            ],
            remainder='drop'
        )
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model_obj)])
        pipeline.fit(X_train, y_train)
        full_model_name = f"{model_name} ({feature_set_name})"
        evaluate_model(pipeline, full_model_name, X_test, y_test, results_list)

Main

In [None]:
if __name__ == '__main__':
    # --- Step 1: Load and Preprocess Data ---
    # Path to file in Google Drive
    file_path = '/content/drive/My Drive/spam_Emails_data.csv'

    # Load CSV file into a pandas DataFrame (DF)
    df = pd.read_csv(file_path)
    df['processed_text'] = df['text'].progress_apply(preprocess_text)

    # Discover spam vocabulary
    temp_train_df, _ = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
    tfidf_vocab_gen = TfidfVectorizer(max_features=50, stop_words='english')
    tfidf_matrix_vocab = tfidf_vocab_gen.fit_transform(temp_train_df['processed_text'])
    DATA_DRIVEN_SPAM_VOCAB = set(tfidf_vocab_gen.get_feature_names_out())

    # --- Step 2: Define Models for Bake-Off ---
    models_to_test = {
        "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
        "Linear SVM": LinearSVC(random_state=42, dual='auto'),
        "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
        "LightGBM": lgb.LGBMClassifier(random_state=42, n_jobs=-1),
        "Naive Bayes": MultinomialNB()
    }

    # --- Step 3: Create Feature Sets and Run Experiments ---
    feature_sets = {
        "Streamlined Features": create_streamlined_features(df, spam_vocab=DATA_DRIVEN_SPAM_VOCAB),
        "All Original Features": create_all_features(df, spam_vocab=DATA_DRIVEN_SPAM_VOCAB)
    }

    all_results = []
    for name, dataframe in feature_sets.items():
        run_bakeoff_experiment(dataframe, name, models_to_test, all_results)

    # --- Step 4: Display Final Comparison Table ---
    print(f"\n{'#'*80}\n# FINAL RESULTS SUMMARY\n{'#'*80}")
    results_df = pd.DataFrame(all_results)
    results_df = results_df.sort_values(by='AUC', ascending=False, na_position='last').reset_index(drop=True)
    print(results_df.to_string())


100%|██████████| 193852/193852 [08:23<00:00, 384.75it/s]



--- Creating feature sets for comparison... ---

################################################################################
# RUNNING EXPERIMENT FOR: Streamlined Features
################################################################################

--- Evaluating: Logistic Regression (Streamlined Features) ---
Accuracy: 0.9728, Spam F1: 0.9714, MCC: 0.9456, AUC: 0.9959

--- Evaluating: Linear SVM (Streamlined Features) ---
Accuracy: 0.9770, Spam F1: 0.9758, MCC: 0.9539, AUC: 0.9969

--- Evaluating: Random Forest (Streamlined Features) ---
Accuracy: 0.9857, Spam F1: 0.9849, MCC: 0.9714, AUC: 0.9987
[LightGBM] [Info] Number of positive: 68769, number of negative: 76620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 31.887038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1167120
[LightGBM] [Info] Number of data points in the tr

In [None]:
# --- Create a Path for Saving ---
BASE_PROJECT_PATH = '/content/drive/My Drive/SpamClassifierProject_BakeOff'
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"run_{timestamp}")
os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

print(f"Artifacts for this run will be saved in: {RUN_SPECIFIC_PATH}")

# --- Define File Paths ---
results_path = os.path.join(RUN_SPECIFIC_PATH, 'model_bakeoff_summary.csv')
dataframe_path = os.path.join(RUN_SPECIFIC_PATH, 'best_featured_dataframe.csv')
vocab_path = os.path.join(RUN_SPECIFIC_PATH, 'spam_vocabulary.joblib')

# --- Save the Key Artifacts ---

# 1. Save the final results summary table
results_df.to_csv(results_path, index=False)
print(f"Bake-off summary table saved to: {results_path}")

# 2. Identify the best feature set from the top-performing model
# The 'Model' column is like 'LightGBM (All Original Features)'
top_model_name = results_df.iloc[0]['Model']
if "All Original Features" in top_model_name:
    best_df = feature_sets["All Original Features"]
    print("Identified 'All Original Features' as the best-performing feature set.")
else:
    best_df = feature_sets["Streamlined Features"]
    print("Identified 'Streamlined Features' as the best-performing feature set.")

# 3. Save the corresponding DataFrame
best_df.to_csv(dataframe_path, index=False)
print(f"Best-performing feature DataFrame saved to: {dataframe_path}")

# 4. Save the spam vocabulary set (essential for reproducibility)
joblib.dump(DATA_DRIVEN_SPAM_VOCAB, vocab_path)
print(f"Spam vocabulary set saved to: {vocab_path}")

Artifacts for this run will be saved in: /content/drive/My Drive/SpamClassifierProject_BakeOff/run_2025-06-20_11-50-42
Bake-off summary table saved to: /content/drive/My Drive/SpamClassifierProject_BakeOff/run_2025-06-20_11-50-42/model_bakeoff_summary.csv
Identified 'All Original Features' as the best-performing feature set.
Best-performing feature DataFrame saved to: /content/drive/My Drive/SpamClassifierProject_BakeOff/run_2025-06-20_11-50-42/best_featured_dataframe.csv
Spam vocabulary set saved to: /content/drive/My Drive/SpamClassifierProject_BakeOff/run_2025-06-20_11-50-42/spam_vocabulary.joblib


Load Data & Format Output Nicely

In [21]:
import pandas as pd

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Base path where results were saved
base_path = '/content/drive/My Drive/SpamClassifierProject_BakeOff'

# Get most recent run folder
import os
latest_run = max(
    [os.path.join(base_path, d) for d in os.listdir(base_path) if d.startswith('run_')],
    key=os.path.getmtime
)

# Load saved results summary
results_df = pd.read_csv(os.path.join(latest_run, 'model_bakeoff_summary.csv'))

# ---- Styled display of results_df ----
pd.set_option('display.precision', 4)

def highlight_best(s):
    return ['background-color: #d4edda; color: #155724; font-weight: bold;' if s.name == 0 else '' for _ in s]

metric_columns = ['Accuracy', 'Spam F1-Score', 'MCC', 'AUC']

styled_results = results_df.sort_values(by='AUC', ascending=False, na_position='last').reset_index(drop=True).style \
    .apply(highlight_best, axis=1) \
    .background_gradient(cmap='Greens', subset=metric_columns) \
    .format("{:.4f}", subset=metric_columns) \
    .set_caption(" Model Performance Comparison") \
    .set_properties(**{
        'border': '1px solid #ddd',
        'text-align': 'center',
        'width': '150px'
    }) \
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#343a40'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'caption', 'props': [('color', 'black'), ('font-size', '20px'), ('font-weight', 'bold'), ('margin', '15px')]},
    ])

styled_results


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Model,Accuracy,Spam F1-Score,MCC,AUC
0,Random Forest (All Original Features),0.9869,0.9861,0.9736,0.9987
1,Random Forest (Streamlined Features),0.9857,0.9849,0.9714,0.9987
2,Linear SVM (Streamlined Features),0.977,0.9758,0.9539,0.9969
3,Linear SVM (All Original Features),0.9752,0.9739,0.9502,0.9963
4,LightGBM (All Original Features),0.9702,0.9688,0.9405,0.9961
5,Logistic Regression (Streamlined Features),0.9728,0.9714,0.9456,0.9959
6,LightGBM (Streamlined Features),0.9679,0.9664,0.9359,0.9957
7,Logistic Regression (All Original Features),0.9709,0.9694,0.9417,0.9951
8,Naive Bayes (All Original Features),0.9457,0.9426,0.891,0.9877
9,Naive Bayes (Streamlined Features),0.9334,0.9285,0.8665,0.9869
