<a href="https://colab.research.google.com/github/seanpaz478/AAI-510-Final-Project-Group7/blob/main/USD%2C_ML%2C_Final_Project%2C_Stacking%2C_LGBM_%26_RFC_%3E_LogReg_(Yaakov_Sternberg).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Imports
import os
import re
import io
import csv
import sys
import time
import json
import joblib
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
from google.colab import files, drive
from IPython.display import display, HTML
from transformers import AutoTokenizer, AutoModel

# Scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    f1_score,
    precision_score,
    recall_score,
    log_loss,
    average_precision_score
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedShuffleSplit,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

GLOBAL CONFIGURATION & ENVIRONMENT SETUP

In [4]:
# Global Settings
warnings.filterwarnings('ignore')
tqdm.pandas()

# Mount Google Drive
drive.mount('/content/drive')

# Download NLTK stuff
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download('punkt_tab', quiet=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


True

PREPROCESSING

In [4]:
PATTERNS = {
    "url": re.compile(r"http[s]?://\S+"),
    "email": re.compile(r"\S+@\S+"),
    "phone": re.compile(r"\b(?:\d{3}[-.\s]?)?\d{3}[-.\s]?\d{4}\b"),
    "hyperlink": re.compile(r"(http|www|\.com)"),
    "currency": re.compile(r"[$£€]"),
    "non_alnum": re.compile(r"[^a-zA-Z0-9\s]"),
    "digit": re.compile(r"\d"),
    "upper_word": re.compile(r"\b[A-Z]{2,}\b"),
    "repeat_char": re.compile(r"(.)\1{2,}"),
}

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str): return ""
    text = PATTERNS["url"].sub(" ", text)
    text = PATTERNS["email"].sub(" ", text)
    text = PATTERNS["non_alnum"].sub(" ", text).lower()
    tokens = word_tokenize(text)
    return " ".join([lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 1])

FEATURE ENGINEERING FUNCTIONS

In [None]:
# Helper for safe division
def safe_division(numerator, denominator):
    """Performs division, returning 0 if the denominator is 0."""
    # Clip denominator at 1 to avoid division by zero
    denom_clipped = denominator.clip(lower=1)
    return numerator / denom_clipped

def create_features(df, spam_vocab=None, mode='full'):
    """
    Generates a set of features from the input text data.

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'processed_text' columns.
        spam_vocab (set, optional): A set of spam-related words. Defaults to None.
        mode (str, optional): 'full' or 'streamlined'. Determines the feature set size.
                            Defaults to 'full'.

    Returns:
        pd.DataFrame: The original DataFrame with new feature columns added.
    """
    features_df = df.copy()
    raw_text = features_df["text"].astype(str)
    processed_tokens = features_df["processed_text"].str.split().fillna("").apply(list)

    # --- Core features (used in both modes) ---
    features_df["feat_char_count"] = raw_text.str.len()
    features_df["feat_word_count"] = processed_tokens.str.len()
    features_df["feat_hyperlink_count"] = raw_text.str.lower().str.count(PATTERNS["hyperlink"])
    features_df["feat_digit_count"] = raw_text.str.count(PATTERNS["digit"])

    # Calculate spam word count if vocab is provided
    if spam_vocab:
        spam_word_count = processed_tokens.apply(lambda tokens: sum(1 for word in tokens if word in spam_vocab))
        features_df["feat_spam_word_count"] = spam_word_count

    # --- Mode-specific features ---
    if mode == 'full':
        # More detailed linguistic features
        features_df["feat_sentence_count"] = raw_text.apply(lambda x: len(sent_tokenize(x))).clip(lower=1)
        features_df["feat_paragraph_count"] = raw_text.str.count(r'\n\n') + 1
        features_df["feat_word_diversity"] = processed_tokens.apply(lambda x: len(set(x)))
        features_df["feat_uppercase_char_count"] = raw_text.str.count(r"[A-Z]")

        # Ratio-based features
        features_df["feat_avg_word_len"] = safe_division(raw_text.str.replace(" ", "").str.len(), features_df["feat_word_count"])
        features_df["feat_word_diversity_ratio"] = safe_division(features_df["feat_word_diversity"], features_df["feat_word_count"])
        features_df["feat_uppercase_char_ratio"] = safe_division(features_df["feat_uppercase_char_count"], features_df["feat_char_count"])
        features_df["feat_word_per_sentence"] = safe_division(features_df["feat_word_count"], features_df["feat_sentence_count"])
        features_df["feat_word_per_paragraph"] = safe_division(features_df["feat_word_count"], features_df["feat_paragraph_count"])
        features_df["feat_sentence_per_paragraph"] = safe_division(features_df["feat_sentence_count"], features_df["feat_paragraph_count"])
        if spam_vocab:
            features_df["feat_spam_word_ratio"] = safe_division(features_df["feat_spam_word_count"], features_df["feat_word_count"])

    elif mode == 'streamlined':
        # Lightweight, faster-to-compute features
        features_df["feat_avg_word_len"] = safe_division(features_df["feat_char_count"], features_df["feat_word_count"])
        features_df["feat_uppercase_word_count"] = raw_text.str.count(PATTERNS["upper_word"])
        if spam_vocab:
            features_df["feat_spam_word_ratio"] = safe_division(spam_word_count, features_df["feat_word_count"])

    # Final cleanup
    feature_cols = [c for c in features_df.columns if c.startswith("feat_")]
    features_df[feature_cols] = features_df[feature_cols].fillna(0)
    return features_df

STACKING EXPERIMENT RUNNER

In [None]:
def get_level_one_features(X_base_train, y_train, X_base_test, base_estimators):
    """
    Trains base models and generates out-of-fold predictions (Level 1 features).
    """
    print("--- Generating Level 1 features from base models... ---")
    oof_train_preds = []
    test_preds = []
    trained_base_models = {}

    for name, model_pipeline in base_estimators:
        # Generate out-of-fold predictions for the training set
        oof_preds = cross_val_predict(
            model_pipeline, X_base_train, y_train, cv=3, method='predict_proba', n_jobs=-1
        )[:, 1]
        oof_train_preds.append(pd.Series(oof_preds, name=f"pred_{name}", index=X_base_train.index))

        # Train model on full training data to predict on the test set
        model_pipeline.fit(X_base_train, y_train)
        trained_base_models[name] = model_pipeline  # Save the trained model
        test_p = model_pipeline.predict_proba(X_base_test)[:, 1]
        test_preds.append(pd.Series(test_p, name=f"pred_{name}", index=X_base_test.index))

    # Combine predictions into DataFrames
    X_meta_train_preds = pd.concat(oof_train_preds, axis=1)
    X_meta_test_preds = pd.concat(test_preds, axis=1)

    return X_meta_train_preds, X_meta_test_preds, trained_base_models


def train_and_evaluate_meta_model(X_meta_train, y_train, X_meta_test, y_test):
    """
    Trains and evaluates the final meta-model.
    """
    print("--- Training and evaluating final meta-model... ---")

    # Define the preprocessor for the meta-model
    numeric_features = [c for c in X_meta_train.columns if c.startswith('feat_') or c.startswith('pred_')]
    meta_preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
            ('numeric', StandardScaler(), numeric_features)
        ],
        remainder='drop'
    )

    # Define and train the meta-model
    meta_model = Pipeline([
        ('preprocessor', meta_preprocessor),
        ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
    ])
    meta_model.fit(X_meta_train, y_train)

    # Evaluate the model
    y_pred = meta_model.predict(X_meta_test)
    y_proba = meta_model.predict_proba(X_meta_test)[:, 1]

    # Calculate metrics more efficiently
    report = classification_report(y_test, y_pred, output_dict=True)
    results = {
        'Accuracy': report['accuracy'],
        'Spam F1-Score': report['1']['f1-score'],
        'MCC': matthews_corrcoef(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_proba)
    }

    return meta_model, results

# The main experiment function now orchestrates the calls
def run_manual_stacking_experiment(df_base, df_meta, y, experiment_name, results_list, saved_models_dict):
    """
    Performs a complete manual stacking run for a given configuration of features.
    """
    print(f"\n{'#'*80}\n# Running Stacking Experiment: {experiment_name}\n{'#'*80}")

    # 1. Split data (ensure consistent splits)
    X_base_train, X_base_test, y_train, y_test = train_test_split(df_base, y, test_size=0.25, random_state=42, stratify=y)
    X_meta_train, X_meta_test, _, _ = train_test_split(df_meta, y, test_size=0.25, random_state=42, stratify=y)

    # 2. Define base models
    base_numeric_feats = [c for c in X_base_train.columns if c.startswith('feat_')]
    base_preprocessor = ColumnTransformer(
        [('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
         ('numeric', StandardScaler(), base_numeric_feats)],
        remainder='drop'
    )
    base_estimators = [
        ('lgbm', Pipeline([('preprocessor', base_preprocessor), ('clf', lgb.LGBMClassifier(random_state=42))])),
        ('rf', Pipeline([('preprocessor', base_preprocessor), ('clf', RandomForestClassifier(random_state=42))])),
    ]

    # 3. Generate Level 1 features
    X_meta_train_preds, X_meta_test_preds, trained_base_models = get_level_one_features(
        X_base_train, y_train, X_base_test, base_estimators
    )

    # 4. Create final meta-feature sets by combining original meta features with predictions
    X_meta_train_final = pd.concat([X_meta_train, X_meta_train_preds], axis=1)
    X_meta_test_final = pd.concat([X_meta_test, X_meta_test_preds], axis=1)

    # 5. Train and evaluate the meta-model
    meta_model, result = train_and_evaluate_meta_model(
        X_meta_train_final, y_train, X_meta_test_final, y_test
    )

    # 6. Store results and models
    result['Experiment'] = experiment_name
    results_list.append(result)
    saved_models_dict[experiment_name] = {
        'base_models': trained_base_models,
        'meta_model': meta_model
    }

In [4]:
if __name__ == '__main__':
    # Path to file in Google Drive
    file_path = '/content/drive/My Drive/spam_Emails_data.csv'

    # Load CSV file into a pandas DataFrame (DF)
    df = pd.read_csv(file_path)
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].str.strip().str.capitalize().map({'Ham': 0, 'Spam': 1})
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].astype(int)
    df['processed_text'] = df['text'].progress_apply(preprocess_text)
    y = df['label']

    temp_train_df, _ = train_test_split(df, test_size=0.3, random_state=42, stratify=y)
    tfidf_vocab_gen = TfidfVectorizer(max_features=50, stop_words='english')
    # Fit the vectorizer to your training text to build the vocabulary
    tfidf_vocab_gen.fit(temp_train_df['processed_text'])
    # Now that the vectorizer is fitted, you can safely get the vocabulary
    DATA_DRIVEN_SPAM_VOCAB = set(tfidf_vocab_gen.get_feature_names_out())


    # Create Both Feature Sets
    print("\n--- Creating feature sets for comparison... ---")
    feature_sets = {
        "Streamlined": create_features(df.drop('label', axis=1), spam_vocab=DATA_DRIVEN_SPAM_VOCAB, mode='streamlined'),
        "Full": create_features(df.drop('label', axis=1), spam_vocab=DATA_DRIVEN_SPAM_VOCAB, mode='full')
    }


    # Run all 4 Experiments
    experiments_to_run = [
        {'base': "Streamlined", 'meta': "Streamlined"},
        {'base': "Streamlined", 'meta': "Full"},
        {'base': "Full", 'meta': "Streamlined"},
        {'base': "Full", 'meta': "Full"},
    ]

    all_results = []
    all_trained_models = {}

    for exp_config in experiments_to_run:
        base_name = exp_config['base']
        meta_name = exp_config['meta']
        exp_name = f"Base: {base_name} -> Meta: {meta_name}"

        run_manual_stacking_experiment(
            df_base=feature_sets[base_name],
            df_meta=feature_sets[meta_name],
            y=y,
            experiment_name=exp_name,
            results_list=all_results,
            saved_models_dict=all_trained_models
        )

    # Display Final Comparison Table
    print(f"\n{'#'*80}\n# FINAL STACKING EXPERIMENT SUMMARY\n{'#'*80}")
    results_df = pd.DataFrame(all_results).sort_values(by='AUC', ascending=False).reset_index(drop=True)
    print(results_df.to_string())

    # Save Artifacts for the BEST Performing Experiment
    BASE_PROJECT_PATH = '/content/drive/My Drive/SpamClassifierProject_AdvancedStacking'
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"run_{timestamp}")
    os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

    print(f"Artifacts will be saved in: {RUN_SPECIFIC_PATH}")

    # Identify best experiment
    best_experiment_name = results_df.iloc[0]['Experiment']
    best_model_stack = all_trained_models[best_experiment_name]

    print(f"Best performing experiment was: '{best_experiment_name}'")

    # Save each component of the best stack
    for name, model in best_model_stack['base_models'].items():
        joblib.dump(model, os.path.join(RUN_SPECIFIC_PATH, f'best_base_model_{name}.joblib'))
    joblib.dump(best_model_stack['meta_model'], os.path.join(RUN_SPECIFIC_PATH, 'best_meta_model.joblib'))

    # Save the summary & vocab
    results_df.to_csv(os.path.join(RUN_SPECIFIC_PATH, 'experiment_summary.csv'), index=False)
    joblib.dump(DATA_DRIVEN_SPAM_VOCAB, os.path.join(RUN_SPECIFIC_PATH, 'spam_vocabulary.joblib'))


--- Creating feature sets for comparison... ---

################################################################################
# Running Stacking Experiment: Base: Streamlined -> Meta: Streamlined
################################################################################
--- Generating Level 1 features from base models... ---
[LightGBM] [Info] Number of positive: 68769, number of negative: 76620
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.491067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 623916
[LightGBM] [Info] Number of data points in the train set: 145389, number of used features: 2506
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473000 -> initscore=-0.108105
[LightGBM] [Info] Start training from score -0.108105
--- Training and evaluating final meta-model... ---

################################################################################
# Running Stacking Experiment

SAVE ALL MODELS AND ARTIFACTS TO GOOGLE DRIVE

In [5]:
# --- Mount Google Drive ---
# This will prompt for authorization if not already mounted.
drive.mount('/content/drive', force_remount=True)

# --- Create a unique, timestamped directory for this entire run ---
BASE_PROJECT_PATH = '/content/drive/My Drive/SpamClassifierProject_AdvancedStacking'
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"run_{timestamp}")
os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

print(f"All artifacts for this run will be saved in: {RUN_SPECIFIC_PATH}")

# --- Save the overall experiment summary CSV ---
# This file compares all the models you just ran.
summary_csv_path = os.path.join(RUN_SPECIFIC_PATH, '1_full_experiment_summary.csv')
results_df.to_csv(summary_csv_path, index=False)
print(f"\nSaved overall results summary to: {summary_csv_path}")

# --- Save the data-driven spam vocabulary ---
# This is a key artifact used in feature engineering.
vocab_path = os.path.join(RUN_SPECIFIC_PATH, '2_spam_vocabulary.joblib')
joblib.dump(DATA_DRIVEN_SPAM_VOCAB, vocab_path)
print(f"Saved spam vocabulary to: {vocab_path}")

# --- Create a dedicated folder for all the model files ---
ALL_MODELS_PATH = os.path.join(RUN_SPECIFIC_PATH, '3_all_trained_model_stacks')
os.makedirs(ALL_MODELS_PATH, exist_ok=True)
print(f"\nSaving individual model stacks to: {ALL_MODELS_PATH}")

# --- Loop through each experiment and save its entire model stack ---
for experiment_name, model_stack in all_trained_models.items():
    # Sanitize the experiment name to create a valid folder name
    safe_folder_name = experiment_name.replace(" -> ", "_").replace(": ", "_").replace(" ", "")
    EXPERIMENT_FOLDER_PATH = os.path.join(ALL_MODELS_PATH, safe_folder_name)
    os.makedirs(EXPERIMENT_FOLDER_PATH, exist_ok=True)

    print(f"\n  Saving models for experiment: '{experiment_name}'")

    # Save each base model from the stack
    for name, model in model_stack['base_models'].items():
        model_filename = os.path.join(EXPERIMENT_FOLDER_PATH, f'base_model_{name}.joblib')
        joblib.dump(model, model_filename)
        print(f"    - Saved base model: {name}")

    # Save the meta model from the stack
    meta_model_filename = os.path.join(EXPERIMENT_FOLDER_PATH, 'meta_model.joblib')
    joblib.dump(model_stack['meta_model'], meta_model_filename)
    print(f"    - Saved meta model")

print("\n\n{'='*20} ALL ARTIFACTS SAVED SUCCESSFULLY! {'='*20}")
print(f"You can find everything in your Google Drive at: {RUN_SPECIFIC_PATH}")


--- Saving artifacts for ALL experiments... ---
Mounted at /content/drive
All artifacts for this run will be saved in: /content/drive/My Drive/SpamClassifierProject_AdvancedStacking/run_2025-06-20_13-06-14

Saved overall results summary to: /content/drive/My Drive/SpamClassifierProject_AdvancedStacking/run_2025-06-20_13-06-14/1_full_experiment_summary.csv
Saved spam vocabulary to: /content/drive/My Drive/SpamClassifierProject_AdvancedStacking/run_2025-06-20_13-06-14/2_spam_vocabulary.joblib

Saving individual model stacks to: /content/drive/My Drive/SpamClassifierProject_AdvancedStacking/run_2025-06-20_13-06-14/3_all_trained_model_stacks

  Saving models for experiment: 'Base: Streamlined -> Meta: Streamlined'
    - Saved base model: lgbm
    - Saved base model: rf
    - Saved meta model

  Saving models for experiment: 'Base: Streamlined -> Meta: Full'
    - Saved base model: lgbm
    - Saved base model: rf
    - Saved meta model

  Saving models for experiment: 'Base: Full -> Meta: 

# More readable output

In [6]:
# --- Styling the Results DataFrame ---

# Set a style for the entire notebook's pandas outputs (optional but nice)
pd.set_option('display.precision', 4)

# Define a function to highlight the top row (our best model)
def highlight_best(s):
    """Highlights the entire row of the best performing model in light green."""
    # Since the dataframe is already sorted, the best model is always at index 0
    is_max = s.index == 0
    return ['background-color: #d4edda; color: #155724; font-weight: bold;' if v else '' for v in is_max]

# Define the columns that contain numeric metrics we want to style
metric_columns = ['Accuracy', 'Spam F1-Score', 'MCC', 'AUC']

# Apply all the styling rules to the DataFrame
styled_results = results_df.style \
    .apply(highlight_best, axis=1) \
    .background_gradient(cmap='Greens', subset=metric_columns) \
    .format("{:.4f}", subset=metric_columns) \
    .set_caption(" Stacking Model Performance Comparison") \
    .set_properties(**{
        'border': '1px solid #ddd',
        'text-align': 'center',
        'width': '150px'
    }) \
    .set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#343a40'), ('color', 'white'), ('font-size', '14px')]},
        {'selector': 'caption', 'props': [('color', 'black'), ('font-size', '20px'), ('font-weight', 'bold'), ('margin', '15px')]}
    ])

# Display the styled DataFrame
# In a Colab/Jupyter environment, the last line being the object will render it.
styled_results


Unnamed: 0,Experiment,Accuracy,Spam F1-Score,MCC,AUC
0,Base: Full -> Meta: Full,0.9889,0.9882,0.9777,0.9992
1,Base: Full -> Meta: Streamlined,0.9888,0.9882,0.9776,0.9992
2,Base: Streamlined -> Meta: Full,0.9884,0.9878,0.9768,0.9992
3,Base: Streamlined -> Meta: Streamlined,0.9885,0.9878,0.9769,0.9991


HYPERPARAMETER TUNING WITH GRIDSEARCHCV (WITH CHECKPOINTS)

In [8]:
# --- Step 1: Mount Drive and Setup Checkpoint Directory ---
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    # This directory is for saving progress and resuming if the session fails
    CHECKPOINT_DIR = '/content/drive/My Drive/SpamClassifierProject_Checkpoints'
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    print(f"Checkpoint directory is ready at: {CHECKPOINT_DIR}")

except ImportError:
    print("\nCould not connect to Google Drive. Checkpointing is disabled.")
    CHECKPOINT_DIR = None
except Exception as e:
    print(f"\nAn error occurred during Google Drive setup: {e}")
    CHECKPOINT_DIR = None


# --- Step 2: Define Hyperparameter Grids & Prepare Data ---
# Grids are unchanged
lgbm_param_grid = {
    'clf__n_estimators': [100, 200], 'clf__learning_rate': [0.05, 0.1], 'clf__num_leaves': [31, 50]
}
rf_param_grid = {
    'clf__n_estimators': [100, 200], 'clf__max_depth': [10, 20, None], 'clf__min_samples_split': [2, 5]
}
logistic_param_grid = {
    'classifier__C': [0.1, 1.0, 10.0], 'classifier__penalty': ['l1', 'l2']
}

# Data setup is unchanged
df_base_best = feature_sets["Full"]
df_meta_best = feature_sets["Full"]
X_base_train, X_base_test, y_train, y_test = train_test_split(
    df_base_best, y, test_size=0.25, random_state=42, stratify=y
)
X_meta_train, X_meta_test, _, _ = train_test_split(
    df_meta_best, y, test_size=0.25, random_state=42, stratify=y
)


# --- Step 3: Tune Base Models with Checkpointing ---
print("\n--- Tuning Base Models ---")
base_preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
        ('numeric', StandardScaler(), [c for c in X_base_train.columns if c.startswith('feat_')])
    ], remainder='drop'
)
base_estimators_for_tuning = {
    'lgbm': (Pipeline([('preprocessor', base_preprocessor), ('clf', lgb.LGBMClassifier(random_state=42))]), lgbm_param_grid),
    'rf': (Pipeline([('preprocessor', base_preprocessor), ('clf', RandomForestClassifier(random_state=42))]), rf_param_grid)
}

best_base_estimators = {}
best_base_params = {}

for name, (pipeline, param_grid) in base_estimators_for_tuning.items():
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'base_model_{name}_checkpoint.joblib')

    if CHECKPOINT_DIR and os.path.exists(checkpoint_path):
        # If checkpoint exists, load it
        print(f"Found checkpoint for {name}. Loading pre-tuned model.")
        checkpoint_data = joblib.load(checkpoint_path)
        best_base_estimators[name] = checkpoint_data['estimator']
        best_base_params[name] = checkpoint_data['params']
        print(f"Loaded best parameters for {name}: {best_base_params[name]}\n")
    else:
        # Otherwise, run GridSearchCV and save checkpoint
        print(f"No checkpoint found for {name}. Running GridSearchCV...")
        grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
        grid_search.fit(X_base_train, y_train)

        best_base_estimators[name] = grid_search.best_estimator_
        best_base_params[name] = grid_search.best_params_

        print(f"Best parameters for {name}: {grid_search.best_params_}")
        print(f"Best AUC score for {name}: {grid_search.best_score_:.4f}\n")

        if CHECKPOINT_DIR:
            print(f"Saving checkpoint for {name} to Google Drive...")
            checkpoint_data = {'estimator': grid_search.best_estimator_, 'params': grid_search.best_params_}
            joblib.dump(checkpoint_data, checkpoint_path)
            print("Checkpoint saved.\n")


# --- Step 4: Generate Level 1 Features (No Checkpoint Needed) ---
print("--- Generating Level 1 features using tuned base models... ---")
# This step is fast and can be rerun without issue.
oof_train_preds_tuned = []
test_preds_tuned = []

for name, best_model in best_base_estimators.items():
    oof_preds = cross_val_predict(best_model, X_base_train, y_train, cv=3, method='predict_proba', n_jobs=-1)[:, 1]
    oof_train_preds_tuned.append(pd.Series(oof_preds, name=f"pred_{name}_tuned", index=X_base_train.index))

    test_p = best_model.predict_proba(X_base_test)[:, 1]
    test_preds_tuned.append(pd.Series(test_p, name=f"pred_{name}_tuned", index=X_base_test.index))

X_meta_train_final_tuned = pd.concat([X_meta_train] + oof_train_preds_tuned, axis=1)
X_meta_test_final_tuned = pd.concat([X_meta_test] + test_preds_tuned, axis=1)
print("Level 1 features generated.")


# --- Step 5: Tune the Meta-Model with Checkpointing ---
print("\n--- Tuning the Meta-Model ---")
meta_model_checkpoint_path = os.path.join(CHECKPOINT_DIR, 'meta_model_checkpoint.joblib')

best_meta_model = None
best_meta_params = None

if CHECKPOINT_DIR and os.path.exists(meta_model_checkpoint_path):
    print("Found checkpoint for Meta-Model. Loading pre-tuned model.")
    checkpoint_data = joblib.load(meta_model_checkpoint_path)
    best_meta_model = checkpoint_data['estimator']
    best_meta_params = checkpoint_data['params']
    print(f"Loaded best parameters for Meta-Model: {best_meta_params}\n")
else:
    print("No checkpoint found for Meta-Model. Running GridSearchCV...")
    meta_numeric_features_tuned = [c for c in X_meta_train_final_tuned.columns if c.startswith('feat_') or c.startswith('pred_')]
    meta_preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
            ('numeric', StandardScaler(), meta_numeric_features_tuned)
        ], remainder='drop'
    )
    meta_model_pipeline = Pipeline([
        ('preprocessor', meta_preprocessor),
        ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
    ])

    meta_grid_search = GridSearchCV(meta_model_pipeline, logistic_param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
    meta_grid_search.fit(X_meta_train_final_tuned, y_train)

    best_meta_model = meta_grid_search.best_estimator_
    best_meta_params = meta_grid_search.best_params_

    print(f"Best parameters for Meta-Model: {meta_grid_search.best_params_}")
    print(f"Best AUC score for Meta-Model on training data: {meta_grid_search.best_score_:.4f}\n")

    if CHECKPOINT_DIR:
        print("Saving checkpoint for Meta-Model to Google Drive...")
        checkpoint_data = {'estimator': best_meta_model, 'params': best_meta_params}
        joblib.dump(checkpoint_data, meta_model_checkpoint_path)
        print("Checkpoint saved.\n")


# --- Step 6: Evaluate the Final, Fully-Tuned Stacking Model ---
print(f"\n{'#'*80}\n# FINAL TUNED MODEL PERFORMANCE\n{'#'*80}")

y_pred_tuned = best_meta_model.predict(X_meta_test_final_tuned)
y_proba_tuned = best_meta_model.predict_proba(X_meta_test_final_tuned)[:, 1]

report_str = classification_report(y_test, y_pred_tuned, target_names=['Ham (0)', 'Spam (1)'])
report_dict = classification_report(y_test, y_pred_tuned, output_dict=True)

print("Classification Report for the Tuned Model:")
print(report_str)

final_metrics = {
    'Tuned Accuracy': f"{report_dict['accuracy']:.4f}",
    'Tuned Spam F1-Score': f"{report_dict['1']['f1-score']:.4f}",
    'Tuned MCC': f"{matthews_corrcoef(y_test, y_pred_tuned):.4f}",
    'Tuned AUC': f"{roc_auc_score(y_test, y_proba_tuned):.4f}"
}

print("\nFinal Performance Metrics:")
for metric, value in final_metrics.items():
    print(f"- {metric}: {value}")

print("\nHyperparameter tuning complete.")


# ==============================================================================
# 7. SAVE FINAL TUNED MODEL ARTIFACTS
# ==============================================================================
# This section saves the final results to a NEW, timestamped folder for archival.
if CHECKPOINT_DIR:
    print("\n--- Saving final artifacts for the tuned model stack... ---")

    BASE_PROJECT_PATH = '/content/drive/My Drive/SpamClassifierProject_TunedStacking'
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"tuned_run_{timestamp}")
    os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

    print(f"Final artifacts will be saved in: {RUN_SPECIFIC_PATH}")

    # Save each tuned base model from the 'best_base_estimators' dict
    for name, model in best_base_estimators.items():
        joblib.dump(model, os.path.join(RUN_SPECIFIC_PATH, f'tuned_base_model_{name}.joblib'))

    # Save the tuned meta model
    joblib.dump(best_meta_model, os.path.join(RUN_SPECIFIC_PATH, 'tuned_meta_model.joblib'))
    print("Tuned base and meta models saved successfully.")

    summary_report = {
        'Final Performance Metrics': final_metrics,
        'Best Base Model Parameters': best_base_params,
        'Best Meta Model Parameters': best_meta_params
    }
    with open(os.path.join(RUN_SPECIFIC_PATH, 'summary_report.json'), 'w') as f:
        json.dump(summary_report, f, indent=4)
    with open(os.path.join(RUN_SPECIFIC_PATH, 'classification_report.txt'), 'w') as f:
        f.write(report_str)
    print("Performance reports and parameters saved successfully.")

    joblib.dump(DATA_DRIVEN_SPAM_VOCAB, os.path.join(RUN_SPECIFIC_PATH, 'spam_vocabulary.joblib'))
    print("Spam vocabulary saved successfully.")

    print("\nAll final artifacts have been saved.")
else:
    print("\nGoogle Drive not connected. Skipping final artifact saving.")


--- Starting Hyperparameter Tuning for the Best Model Stack ---
Mounted at /content/drive
Checkpoint directory is ready at: /content/drive/My Drive/SpamClassifierProject_Checkpoints

--- Tuning Base Models ---
No checkpoint found for lgbm. Running GridSearchCV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 68769, number of negative: 76620
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.265260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 625695
[LightGBM] [Info] Number of data points in the train set: 145389, number of used features: 2513
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473000 -> initscore=-0.108105
[LightGBM] [Info] Start training from score -0.108105
Best parameters for lgbm: {'clf__learning_rate': 0.1, 'clf__n_estimators': 200, 'clf__num_leaves': 50}
Best AUC score for lgbm: 0.9984

Saving checkpoint for lgbm to Google Dri



---


**Assistance Disclosure:**

I used LLMs (Codey, ChatGPT, Gemini, Claude, Grok) for brainstorming, debugging, feedback, and improving code readability.

# To Do: Fuller list of functions to try

Fuller list of functions:
    features_df["feat_char_count"] = raw_text.str.len()
    features_df["feat_word_count"] = processed_tokens.str.len()
    features_df["feat_sentence_count"] = raw_text.apply(lambda x: len(sent_tokenize(x))).clip(lower=1)
    features_df["feat_paragraph_count"] = raw_text.str.count(r'\n\n') + 1
    features_df["feat_word_diversity"] = processed_tokens.apply(lambda x: len(set(x)))
    if spam_vocab:
        features_df["feat_spam_word_count"] = processed_tokens.apply(lambda t: sum(1 for w in t if w in spam_vocab))
    features_df["feat_avg_word_len"] = raw_text.str.split().str.join('').str.len() / features_df["feat_word_count"].clip(lower=1)
    features_df["feat_word_diversity_ratio"] = features_df["feat_word_diversity"] / features_df["feat_word_count"].clip(lower=1)
    features_df["feat_uppercase_char_ratio"] = raw_text.str.count(r"[A-Z]") / features_df["feat_char_count"].clip(lower=1)
    features_df["feat_word_per_sentence"] = features_df["feat_word_count"] / features_df["feat_sentence_count"].clip(lower=1)
    features_df["feat_word_per_paragraph"] = features_df["feat_word_count"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_sentence_per_paragraph"] = features_df["feat_sentence_count"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_unique_word_per_sentence"] = features_df["feat_word_diversity"] / features_df["feat_sentence_count"].clip(lower=1)
    features_df["feat_unique_word_per_paragraph"] = features_df["feat_word_diversity"] / features_df["feat_paragraph_count"].clip(lower=1)
    features_df["feat_hyperlink_count"] = raw_text.str.lower().str.count(PATTERNS["hyperlink"])
    features_df["feat_exclamation_count"] = raw_text.str.count("!")
    features_df["feat_question_count"] = raw_text.str.count(r"\?")
    features_df["feat_digit_count"] = raw_text.str.count(PATTERNS["digit"])
    features_df["feat_uppercase_word_count"] = raw_text.str.count(PATTERNS["upper_word"])
    features_df["feat_special_char_count"] = raw_text.str.count(fr"[{re.escape(string.punctuation)}]")
    features_df["feat_currency_symbol_count"] = raw_text.str.count(PATTERNS["currency"])
    features_df["feat_phone_pattern_count"] = raw_text.str.count(PATTERNS["phone"])
    features_df["feat_repeat_char_count"] = raw_text.str.count(PATTERNS["repeat_char"])
    if spam_vocab:
        features_df["feat_spam_word_ratio"] = features_df["feat_spam_word_count"] / features_df["feat_word_count"].clip(lower=1)
        features_df["feat_spam_word_per_sentence"] = features_df["feat_spam_word_count"] / features_df["feat_sentence_count"].clip(lower=1)
        features_df["feat_spam_word_per_paragraph"] = features_df["feat_spam_word_count"] / features_df["feat_paragraph_count"].clip(lower=1)
        features_df["feat_spam_word_per_unique_word"] = features_df["feat_spam_word_count"] / features_df["feat_word_diversity"].clip(lower=1)
