<a href="https://colab.research.google.com/github/seanpaz478/AAI-510-Final-Project-Group7/blob/main/Usd%2C_ML%2C_Final_Group_Project%2C_Stacking_w_inference_time_(YSternberg).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import os
import json
from datetime import datetime
import pandas as pd
import numpy as np
import joblib
from time import time # MODIFICATION: Added the time library

# NLTK imports - ensure you have these downloaded
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    matthews_corrcoef,
    roc_auc_score,
)

# Model imports
import lightgbm as lgb

# Progress bar for pandas
from tqdm.auto import tqdm
tqdm.pandas()

# Google Drive integration
from google.colab import drive

# NLTK data packages requested by this script; each one is downloaded with
# quiet=True, one call per package.
_NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'punkt_tab')
for resource in _NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

# --- Feature Engineering Setup ---

# (name, regex source) pairs; each source is compiled exactly once below and
# looked up by name through PATTERNS. Order is preserved in the resulting dict.
_PATTERN_SOURCES = (
    ("url", r"http[s]?://\S+"),                                # http:// or https:// up to whitespace
    ("email", r"\S+@\S+"),                                     # non-space run containing an '@'
    ("phone", r"\b(?:\d{3}[-.\s]?)?\d{3}[-.\s]?\d{4}\b"),      # 7 digits with optional 3-digit prefix
    ("hyperlink", r"(http|www|\.com)"),                        # link markers (lowercase only)
    ("currency", r"[$£€]"),                                    # a single currency symbol
    ("non_alnum", r"[^a-zA-Z0-9\s]"),                          # anything but ASCII letters, digits, whitespace
    ("digit", r"\d"),                                          # a single digit
    ("upper_word", r"\b[A-Z]{2,}\b"),                          # whole word of 2+ capital letters
    ("repeat_char", r"(.)\1{2,}"),                             # same character 3+ times in a row
)

PATTERNS = {label: re.compile(source) for label, source in _PATTERN_SOURCES}

# Module-level NLTK helpers shared by preprocess_text: built once here so they
# are not recreated for every email.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

def preprocess_text(text):
    """
    Normalize one raw email body into a space-joined string of lemmas.

    URLs, e-mail addresses and every character that is not an ASCII letter,
    digit or whitespace are replaced by a space; the result is lower-cased,
    tokenized, and tokens that are stopwords or a single character long are
    dropped before lemmatization.

    Args:
        text: The raw text. Anything that is not a ``str`` yields "".

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: URLs and e-mails must be removed while their punctuation
    # is still present, i.e. before the non-alphanumeric sweep.
    cleaned = text
    for pattern_name in ("url", "email", "non_alnum"):
        cleaned = PATTERNS[pattern_name].sub(" ", cleaned)

    lemmas = []
    for token in word_tokenize(cleaned.lower()):
        if len(token) > 1 and token not in stop_words:
            lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

def safe_division(numerator, denominator):
    """
    Element-wise division that yields 0 wherever the denominator is 0.

    The previous implementation clipped the denominator to a minimum of 1,
    which returned the *numerator* (not 0) for a zero denominator and also
    distorted any denominator between 0 and 1.

    Args:
        numerator (pd.Series): Values to divide.
        denominator (pd.Series): Values to divide by; expected to share the
            numerator's index.

    Returns:
        pd.Series: ``numerator / denominator`` as floats, with 0 in every
        position where ``denominator`` equals 0.
    """
    # Blank out zero denominators (-> NaN) so the division itself never
    # produces inf, then write the documented 0 into exactly those positions.
    zero_denominator = denominator == 0
    quotient = numerator / denominator.where(~zero_denominator)
    return quotient.mask(zero_denominator, 0)

def create_features(df, spam_vocab=None):
    """
    Generates a full set of features from the input text data.

    Every generated column is prefixed with ``feat_``. A missing value in
    'text' is treated as an empty message (it used to be converted to the
    literal string "nan" and counted as three characters).

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'processed_text' columns.
        spam_vocab (set, optional): A set of spam-related words. When empty or
            None, the two spam-word features are not created. Defaults to None.

    Returns:
        pd.DataFrame: A copy of ``df`` with the feature columns added; NaN
        values in the feature columns are replaced by 0. ``df`` itself is
        not modified.
    """
    features_df = df.copy()
    # fillna *before* astype(str): astype alone would turn NaN into "nan".
    raw_text = features_df["text"].fillna("").astype(str)
    # Rows without processed text become an empty token list.
    processed_tokens = features_df["processed_text"].str.split().fillna("").apply(list)
    use_spam_vocab = bool(spam_vocab)

    # --- Core features ---
    features_df["feat_char_count"] = raw_text.str.len()
    features_df["feat_word_count"] = processed_tokens.str.len()
    # The hyperlink pattern is lowercase-only, hence the lower() first.
    features_df["feat_hyperlink_count"] = raw_text.str.lower().str.count(PATTERNS["hyperlink"])
    features_df["feat_digit_count"] = raw_text.str.count(PATTERNS["digit"])

    # Calculate spam word count if vocab is provided
    if use_spam_vocab:
        features_df["feat_spam_word_count"] = processed_tokens.apply(
            lambda tokens: sum(1 for word in tokens if word in spam_vocab)
        )

    # --- Detailed linguistic features ---
    # Clipped to at least 1 so that empty texts still count as one sentence.
    features_df["feat_sentence_count"] = raw_text.apply(lambda x: len(sent_tokenize(x))).clip(lower=1)
    # Paragraphs are separated by a blank line.
    features_df["feat_paragraph_count"] = raw_text.str.count(r'\n\n') + 1
    features_df["feat_word_diversity"] = processed_tokens.apply(lambda x: len(set(x)))
    features_df["feat_uppercase_char_count"] = raw_text.str.count(r"[A-Z]")

    # --- Ratio-based features ---
    # Note: the numerator is the raw text length without spaces, while the
    # denominator counts processed (stopword-free) tokens.
    features_df["feat_avg_word_len"] = safe_division(raw_text.str.replace(" ", "").str.len(), features_df["feat_word_count"])
    features_df["feat_word_diversity_ratio"] = safe_division(features_df["feat_word_diversity"], features_df["feat_word_count"])
    features_df["feat_uppercase_char_ratio"] = safe_division(features_df["feat_uppercase_char_count"], features_df["feat_char_count"])
    features_df["feat_word_per_sentence"] = safe_division(features_df["feat_word_count"], features_df["feat_sentence_count"])
    features_df["feat_word_per_paragraph"] = safe_division(features_df["feat_word_count"], features_df["feat_paragraph_count"])
    features_df["feat_sentence_per_paragraph"] = safe_division(features_df["feat_sentence_count"], features_df["feat_paragraph_count"])
    if use_spam_vocab:
        features_df["feat_spam_word_ratio"] = safe_division(features_df["feat_spam_word_count"], features_df["feat_word_count"])

    # Final cleanup
    feature_cols = [c for c in features_df.columns if c.startswith("feat_")]
    features_df[feature_cols] = features_df[feature_cols].fillna(0)
    return features_df

# ==============================================================================
# MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == '__main__':
    # End-to-end training script: load data, engineer features, tune two base
    # models and a logistic-regression meta-model (stacking), evaluate on a
    # held-out split, and save the artifacts when Google Drive is available.

    # Monotonic, high-resolution clock for the Step 8 timing (imported here so
    # this block does not depend on the file-level import list).
    from time import perf_counter

    # Shared split settings: the vocabulary fit (Step 2) and the model split
    # (Step 4) must select exactly the same training rows.
    TEST_SIZE = 0.25
    RANDOM_STATE = 42

    # --- Step 1: Load and Preprocess Data ---
    # get_ipython is only defined inside IPython/Jupyter; guard the lookup so a
    # plain script run reaches the local fallback instead of raising NameError.
    try:
        running_in_colab = 'google.colab' in str(get_ipython())
    except NameError:
        running_in_colab = False

    if running_in_colab:
        drive.mount('/content/drive', force_remount=True)
        file_path = '/content/drive/My Drive/spam_Emails_data.csv'
        # Directory for saving progress and final models
        CHECKPOINT_DIR = '/content/drive/My Drive/SpamClassifierProject_Checkpoints'
        os.makedirs(CHECKPOINT_DIR, exist_ok=True)
        print(f"Checkpoint directory is ready at: {CHECKPOINT_DIR}")
    else:
        # Fallback for local execution
        file_path = 'spam_Emails_data.csv' # Assumes file is in the same directory
        CHECKPOINT_DIR = None
        print("Google Drive not connected. Checkpointing is disabled.")

    df = pd.read_csv(file_path)
    # Drop rows without a label, map the text labels to 0/1, then drop again
    # to remove labels that were neither 'Ham' nor 'Spam' (map() gives NaN).
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].str.strip().str.capitalize().map({'Ham': 0, 'Spam': 1})
    df.dropna(subset=['label'], inplace=True)
    df['label'] = df['label'].astype(int)
    print("Data loaded. Starting text preprocessing...")
    df['processed_text'] = df['text'].progress_apply(preprocess_text)
    y = df['label']
    print("Text preprocessing complete.")

    # --- Step 2: Create Data-Driven Spam Vocabulary ---
    # Fit on training rows only. With the same test_size / random_state /
    # stratify as the Step 4 split, applied to a frame with the same rows in
    # the same order, this selects the same training partition. The earlier
    # 70/30 split here overlapped the 25% test set, leaking held-out emails
    # into the vocabulary-based features.
    temp_train_df, _ = train_test_split(
        df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    # NOTE(review): the vectorizer is fitted on ham and spam alike, so these are
    # the 50 most frequent terms of the training corpus rather than terms
    # specific to spam - confirm this is intended.
    tfidf_vocab_gen = TfidfVectorizer(max_features=50, stop_words='english')
    tfidf_vocab_gen.fit(temp_train_df['processed_text'])
    DATA_DRIVEN_SPAM_VOCAB = set(tfidf_vocab_gen.get_feature_names_out())
    print(f"Created a data-driven spam vocabulary with {len(DATA_DRIVEN_SPAM_VOCAB)} words.")

    # --- Step 3: Create Full Feature Set ---
    print("\n--- Creating the full feature set... ---")
    features_df = create_features(df.drop('label', axis=1), spam_vocab=DATA_DRIVEN_SPAM_VOCAB)
    print("Feature creation complete.")

    # --- Step 4: Define Hyperparameter Grids & Prepare Data for Tuning ---
    # Keys are prefixed with the pipeline step name ('clf' for the base models,
    # 'classifier' for the meta-model) as GridSearchCV requires.
    lgbm_param_grid = {
        'clf__n_estimators': [100, 200], 'clf__learning_rate': [0.05, 0.1], 'clf__num_leaves': [31, 50]
    }
    rf_param_grid = {
        'clf__n_estimators': [100, 200], 'clf__max_depth': [10, 20, None], 'clf__min_samples_split': [2, 5]
    }
    logistic_param_grid = {
        'classifier__C': [0.1, 1.0, 10.0], 'classifier__penalty': ['l1', 'l2']
    }

    # Split data for training and testing
    X_base_train, X_base_test, y_train, y_test = train_test_split(
        features_df, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    # The meta features are the same as the base features in this simplified
    # setup, so reuse the split instead of recomputing an identical one.
    X_meta_train, X_meta_test = X_base_train, X_base_test

    # --- Step 5: Tune Base Models with Checkpointing ---
    print("\n--- Tuning Base Models ---")
    base_preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
            ('numeric', StandardScaler(), [c for c in X_base_train.columns if c.startswith('feat_')])
        ], remainder='drop'
    )
    base_estimators_for_tuning = {
        'lgbm': (Pipeline([('preprocessor', base_preprocessor), ('clf', lgb.LGBMClassifier(random_state=42))]), lgbm_param_grid),
        'rf': (Pipeline([('preprocessor', base_preprocessor), ('clf', RandomForestClassifier(random_state=42))]), rf_param_grid)
    }

    best_base_estimators = {}
    best_base_params = {}

    # NOTE: checkpoints are keyed by model name only. After changing the
    # features, the split or the grids, delete the old checkpoint files so
    # that stale models are not loaded silently.
    for name, (pipeline, param_grid) in base_estimators_for_tuning.items():
        checkpoint_path = os.path.join(CHECKPOINT_DIR, f'base_model_{name}_checkpoint.joblib') if CHECKPOINT_DIR else None

        if checkpoint_path and os.path.exists(checkpoint_path):
            print(f"Found checkpoint for {name}. Loading pre-tuned model.")
            checkpoint_data = joblib.load(checkpoint_path)
            best_base_estimators[name] = checkpoint_data['estimator']
            best_base_params[name] = checkpoint_data['params']
            print(f"Loaded best parameters for {name}: {best_base_params[name]}\n")
        else:
            print(f"No checkpoint found for {name}. Running GridSearchCV...")
            grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
            grid_search.fit(X_base_train, y_train)

            best_base_estimators[name] = grid_search.best_estimator_
            best_base_params[name] = grid_search.best_params_

            print(f"Best parameters for {name}: {grid_search.best_params_}")
            print(f"Best AUC score for {name}: {grid_search.best_score_:.4f}\n")

            if checkpoint_path:
                print(f"Saving checkpoint for {name} to Google Drive...")
                checkpoint_data = {'estimator': grid_search.best_estimator_, 'params': grid_search.best_params_}
                joblib.dump(checkpoint_data, checkpoint_path)
                print("Checkpoint saved.\n")

    # --- Step 6: Generate Level 1 Features using Tuned Models ---
    print("--- Generating Level 1 features using tuned base models... ---")
    oof_train_preds_tuned = []
    test_preds_tuned = []

    for name, best_model in best_base_estimators.items():
        # Out-of-fold probabilities for the training rows: each row is scored
        # by a clone of the model that did not see it during fitting.
        oof_preds = cross_val_predict(best_model, X_base_train, y_train, cv=3, method='predict_proba', n_jobs=-1)[:, 1]
        oof_train_preds_tuned.append(pd.Series(oof_preds, name=f"pred_{name}_tuned", index=X_base_train.index))

        # Test rows are scored by the already-fitted tuned model.
        test_p = best_model.predict_proba(X_base_test)[:, 1]
        test_preds_tuned.append(pd.Series(test_p, name=f"pred_{name}_tuned", index=X_base_test.index))

    # pd.concat builds new frames, so X_meta_train / X_base_train stay untouched.
    X_meta_train_final_tuned = pd.concat([X_meta_train] + oof_train_preds_tuned, axis=1)
    X_meta_test_final_tuned = pd.concat([X_meta_test] + test_preds_tuned, axis=1)
    print("Level 1 features generated.")

    # --- Step 7: Tune the Meta-Model with Checkpointing ---
    print("\n--- Tuning the Meta-Model ---")
    meta_model_checkpoint_path = os.path.join(CHECKPOINT_DIR, 'meta_model_checkpoint.joblib') if CHECKPOINT_DIR else None

    best_meta_model = None
    best_meta_params = None

    if meta_model_checkpoint_path and os.path.exists(meta_model_checkpoint_path):
        print("Found checkpoint for Meta-Model. Loading pre-tuned model.")
        checkpoint_data = joblib.load(meta_model_checkpoint_path)
        best_meta_model = checkpoint_data['estimator']
        best_meta_params = checkpoint_data['params']
        print(f"Loaded best parameters for Meta-Model: {best_meta_params}\n")
    else:
        print("No checkpoint found for Meta-Model. Running GridSearchCV...")
        # The meta-model sees the engineered features plus the base-model predictions.
        meta_numeric_features_tuned = [c for c in X_meta_train_final_tuned.columns if c.startswith('feat_') or c.startswith('pred_')]
        meta_preprocessor = ColumnTransformer(
            transformers=[
                ('tfidf', TfidfVectorizer(max_features=2500), 'processed_text'),
                ('numeric', StandardScaler(), meta_numeric_features_tuned)
            ], remainder='drop'
        )
        # liblinear supports both the 'l1' and 'l2' penalties in the grid.
        meta_model_pipeline = Pipeline([
            ('preprocessor', meta_preprocessor),
            ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
        ])

        meta_grid_search = GridSearchCV(meta_model_pipeline, logistic_param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
        meta_grid_search.fit(X_meta_train_final_tuned, y_train)

        best_meta_model = meta_grid_search.best_estimator_
        best_meta_params = meta_grid_search.best_params_

        print(f"Best parameters for Meta-Model: {meta_grid_search.best_params_}")
        print(f"Best AUC score for Meta-Model on training data: {meta_grid_search.best_score_:.4f}\n")

        if meta_model_checkpoint_path:
            print("Saving checkpoint for Meta-Model to Google Drive...")
            checkpoint_data = {'estimator': best_meta_model, 'params': best_meta_params}
            joblib.dump(checkpoint_data, meta_model_checkpoint_path)
            print("Checkpoint saved.\n")

    # --- Step 8: Evaluate the Final, Fully-Tuned Stacking Model ---
    print(f"\n{'#'*80}\n# FINAL TUNED MODEL PERFORMANCE\n{'#'*80}")

    # Time the meta-model inference with a monotonic clock. predict() and
    # predict_proba() each run the whole pipeline, so the preprocessing cost
    # is included twice in this figure. Base-model inference and feature
    # engineering are not part of it.
    inference_start = perf_counter()

    y_pred_tuned = best_meta_model.predict(X_meta_test_final_tuned)
    y_proba_tuned = best_meta_model.predict_proba(X_meta_test_final_tuned)[:, 1]

    # Rounded once, after subtracting, to avoid per-endpoint truncation error.
    inference_time_ms = round((perf_counter() - inference_start) * 1000)

    report_str = classification_report(y_test, y_pred_tuned, target_names=['Ham (0)', 'Spam (1)'])
    # Without target_names the dict is keyed by the stringified labels ('0', '1').
    report_dict = classification_report(y_test, y_pred_tuned, output_dict=True)

    print("Classification Report for the Tuned Model:")
    print(report_str)

    final_metrics = {
        'Tuned Accuracy': f"{report_dict['accuracy']:.4f}",
        'Tuned Spam F1-Score': f"{report_dict['1']['f1-score']:.4f}",
        'Tuned MCC': f"{matthews_corrcoef(y_test, y_pred_tuned):.4f}",
        'Tuned AUC': f"{roc_auc_score(y_test, y_proba_tuned):.4f}",
        'Inference Time (ms)': inference_time_ms
    }

    print("\nFinal Performance Metrics:")
    for metric, value in final_metrics.items():
        print(f"- {metric}: {value}")
    print("\nHyperparameter tuning complete.")

    # --- Step 9: Save Final Tuned Model Artifacts ---
    if CHECKPOINT_DIR:
        print("\n--- Saving final artifacts for the tuned model stack... ---")

        BASE_PROJECT_PATH = '/content/drive/My Drive/SpamClassifierProject_TunedStacking'
        # One sub-directory per run so earlier results are never overwritten.
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"tuned_run_{timestamp}")
        os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

        print(f"Final artifacts will be saved in: {RUN_SPECIFIC_PATH}")

        # Save each tuned base model
        for name, model in best_base_estimators.items():
            joblib.dump(model, os.path.join(RUN_SPECIFIC_PATH, f'tuned_base_model_{name}.joblib'))

        # Save the tuned meta model
        joblib.dump(best_meta_model, os.path.join(RUN_SPECIFIC_PATH, 'tuned_meta_model.joblib'))
        print("Tuned base and meta models saved successfully.")

        # Save performance reports and vocabulary
        summary_report = {
            'Final Performance Metrics': final_metrics,
            'Best Base Model Parameters': best_base_params,
            'Best Meta Model Parameters': best_meta_params
        }
        with open(os.path.join(RUN_SPECIFIC_PATH, 'summary_report.json'), 'w', encoding='utf-8') as f:
            json.dump(summary_report, f, indent=4)
        with open(os.path.join(RUN_SPECIFIC_PATH, 'classification_report.txt'), 'w', encoding='utf-8') as f:
            f.write(report_str)
        # The vocabulary is needed to rebuild the spam-word features at inference time.
        joblib.dump(DATA_DRIVEN_SPAM_VOCAB, os.path.join(RUN_SPECIFIC_PATH, 'spam_vocabulary.joblib'))
        print("Performance reports, parameters, and vocabulary saved successfully.")

        print("\nAll final artifacts have been saved.")
    else:
        print("\nGoogle Drive not connected. Skipping final artifact saving.")

Mounted at /content/drive
Checkpoint directory is ready at: /content/drive/My Drive/SpamClassifierProject_Checkpoints
Data loaded. Starting text preprocessing...


  0%|          | 0/193852 [00:00<?, ?it/s]

Text preprocessing complete.
Created a data-driven spam vocabulary with 50 words.

--- Creating the full feature set... ---
Feature creation complete.

--- Tuning Base Models ---
Found checkpoint for lgbm. Loading pre-tuned model.
Loaded best parameters for lgbm: {'clf__learning_rate': 0.1, 'clf__n_estimators': 200, 'clf__num_leaves': 50}

Found checkpoint for rf. Loading pre-tuned model.
Loaded best parameters for rf: {'clf__max_depth': None, 'clf__min_samples_split': 5, 'clf__n_estimators': 200}

--- Generating Level 1 features using tuned base models... ---




Level 1 features generated.

--- Tuning the Meta-Model ---
Found checkpoint for Meta-Model. Loading pre-tuned model.
Loaded best parameters for Meta-Model: {'classifier__C': 1.0, 'classifier__penalty': 'l2'}


################################################################################
# FINAL TUNED MODEL PERFORMANCE
################################################################################
Classification Report for the Tuned Model:
              precision    recall  f1-score   support

     Ham (0)       0.99      0.99      0.99     25540
    Spam (1)       0.99      0.99      0.99     22923

    accuracy                           0.99     48463
   macro avg       0.99      0.99      0.99     48463
weighted avg       0.99      0.99      0.99     48463


Final Performance Metrics:
- Tuned Accuracy: 0.9895
- Tuned Spam F1-Score: 0.9889
- Tuned MCC: 0.9789
- Tuned AUC: 0.9993
- Inference Time (ms): 19245

Hyperparameter tuning complete.

--- Saving final artifacts for the tuned

In [2]:
    # --- Step 9: Evaluate the Final, Fully-Tuned Stacking Model ---
    # Re-evaluates the tuned meta-model with the preprocessing and the
    # classification timed separately, then saves all artifacts to a local,
    # timestamped directory. Relies on objects created by the training cell
    # (best_meta_model, X_meta_test_final_tuned, y_test, best_base_estimators,
    # best_base_params, best_meta_params, DATA_DRIVEN_SPAM_VOCAB).

    # perf_counter is monotonic and sub-millisecond; truncating time() to whole
    # milliseconds at each endpoint could be off by a full millisecond, which
    # is significant for the short prediction step.
    from time import perf_counter

    print(f"\n{'#'*80}\n# FINAL TUNED MODEL PERFORMANCE\n{'#'*80}")

    # Deconstruct the pipeline to time steps separately
    preprocessor = best_meta_model.named_steps['preprocessor']
    classifier = best_meta_model.named_steps['classifier']

    # Time the Vectorization / Transformation step
    transform_start = perf_counter()
    X_test_transformed = preprocessor.transform(X_meta_test_final_tuned)
    transform_time_ms = round((perf_counter() - transform_start) * 1000)

    # Time the Prediction step (labels and probabilities on the transformed matrix)
    predict_start = perf_counter()
    y_pred_tuned = classifier.predict(X_test_transformed)
    y_proba_tuned = classifier.predict_proba(X_test_transformed)[:, 1]
    predict_time_ms = round((perf_counter() - predict_start) * 1000)

    report_str = classification_report(y_test, y_pred_tuned, target_names=['Ham (0)', 'Spam (1)'])
    # Without target_names the dict is keyed by the stringified labels ('0', '1').
    report_dict = classification_report(y_test, y_pred_tuned, output_dict=True)

    print("Classification Report for the Tuned Model:")
    print(report_str)

    final_metrics = {
        'Tuned Accuracy': f"{report_dict['accuracy']:.4f}",
        'Tuned Spam F1-Score': f"{report_dict['1']['f1-score']:.4f}",
        'Tuned MCC': f"{matthews_corrcoef(y_test, y_pred_tuned):.4f}",
        'Tuned AUC': f"{roc_auc_score(y_test, y_proba_tuned):.4f}",
        'Vectorization Time (ms)': transform_time_ms,
        'Prediction Time (ms)': predict_time_ms
    }

    print("\nFinal Performance Metrics:")
    for metric, value in final_metrics.items():
        print(f"- {metric}: {value}")
    print("\nHyperparameter tuning complete.")

    # --- Step 10: Save Final Tuned Model Artifacts ---
    print("\n--- Saving final artifacts... ---")

    # Relative path: artifacts land under the current working directory.
    BASE_PROJECT_PATH = 'SpamClassifierProject_TunedStacking'
    # One sub-directory per run so earlier results are never overwritten.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    RUN_SPECIFIC_PATH = os.path.join(BASE_PROJECT_PATH, f"tuned_run_{timestamp}")
    os.makedirs(RUN_SPECIFIC_PATH, exist_ok=True)

    print(f"Final artifacts will be saved in: {RUN_SPECIFIC_PATH}")

    for name, model in best_base_estimators.items():
        joblib.dump(model, os.path.join(RUN_SPECIFIC_PATH, f'tuned_base_model_{name}.joblib'))

    joblib.dump(best_meta_model, os.path.join(RUN_SPECIFIC_PATH, 'tuned_meta_model.joblib'))
    print("Tuned base and meta models saved successfully.")

    summary_report = {
        'Final Performance Metrics': final_metrics,
        'Best Base Model Parameters': best_base_params,
        'Best Meta Model Parameters': best_meta_params
    }
    with open(os.path.join(RUN_SPECIFIC_PATH, 'summary_report.json'), 'w', encoding='utf-8') as f:
        json.dump(summary_report, f, indent=4)
    with open(os.path.join(RUN_SPECIFIC_PATH, 'classification_report.txt'), 'w', encoding='utf-8') as f:
        f.write(report_str)
    joblib.dump(DATA_DRIVEN_SPAM_VOCAB, os.path.join(RUN_SPECIFIC_PATH, 'spam_vocabulary.joblib'))
    print("Performance reports, parameters, and vocabulary saved successfully.")

    print("\nAll final artifacts have been saved.")


################################################################################
# FINAL TUNED MODEL PERFORMANCE
################################################################################
Classification Report for the Tuned Model:
              precision    recall  f1-score   support

     Ham (0)       0.99      0.99      0.99     25540
    Spam (1)       0.99      0.99      0.99     22923

    accuracy                           0.99     48463
   macro avg       0.99      0.99      0.99     48463
weighted avg       0.99      0.99      0.99     48463


Final Performance Metrics:
- Tuned Accuracy: 0.9895
- Tuned Spam F1-Score: 0.9889
- Tuned MCC: 0.9789
- Tuned AUC: 0.9993
- Vectorization Time (ms): 13884
- Prediction Time (ms): 20

Hyperparameter tuning complete.

--- Saving final artifacts... ---
Final artifacts will be saved in: SpamClassifierProject_TunedStacking/tuned_run_2025-06-22_23-51-51
Tuned base and meta models saved successfully.
Performance reports, parameters, and 



---


**Assistance Disclosure:**

I used LLMs (Codey, ChatGPT, Gemini, Claude, Grok) for brainstorming, debugging, feedback, and improving code readability.