# ML pipeline with 5 seeds

In [None]:
import pandas as pd
import joblib
import json
import os
import re
import pickle
import time
import re
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.stem import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Load Datasets

In [None]:
# Folder that holds the pre-processed CSV files, one file per dataset.
folder_path = "../Datasets/processed_datasets"
datasets = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the dataset
# order (and with it every later loop, printout and results row) stable
# across runs and machines.
for file in sorted(os.listdir(folder_path)):
    if file.endswith(".csv"):
        name = file.removesuffix(".csv")
        df = pd.read_csv(os.path.join(folder_path, file))
        datasets[name] = df

# correct the label column data types to int
for name, df in datasets.items():
    # labels that cannot be parsed as numbers become NaN and their rows are dropped
    df["label"] = pd.to_numeric(df["label"], errors="coerce")
    df = df.dropna(subset=["label"]).copy()
    df["label"] = df["label"].astype(int)
    datasets[name] = df  # update dictionary (re-assigning an existing key is safe while iterating)
    print(f"{name}: {df['label'].dtype}, unique values: {df['label'].unique()}")

### Clean datasets

In [None]:
stemmer = PorterStemmer()


def clean_text(text, use_stemming=True):
    """Normalise a raw subject/body string into space-separated tokens.

    Steps, in order: convert to a lowercase string, delete URLs
    (``http...`` / ``www...``), delete every character that is not a-z or
    whitespace, drop tokens found in ``ENGLISH_STOP_WORDS`` and, when
    ``use_stemming`` is true, stem each remaining token with the
    module-level ``stemmer``.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = str(text).lower()
    # URLs are stripped first so their fragments do not end up as TF-IDF tokens
    without_urls = re.sub(r"http\S+|www\S+", "", lowered)
    # characters are deleted, not replaced by a space
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)

    tokens = []
    for token in letters_only.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        tokens.append(stemmer.stem(token) if use_stemming else token)
    return " ".join(tokens)

def _is_nonempty_str(value):
    """Return True if ``value`` is a str containing non-whitespace characters."""
    return isinstance(value, str) and value.strip() != ""


filtered_datasets = {}

for name, df in datasets.items():
    # Keep only rows whose subject AND text are non-empty strings. NaN cells
    # fail the isinstance check, so no extra dropna()/fillna() is needed below.
    usable_rows = df["subject"].apply(_is_nonempty_str) & df["text"].apply(_is_nonempty_str)
    df = df[usable_rows]

    #remove duplicates
    before_dupes = len(df)
    df = df.drop_duplicates(subset=["subject", "text"])
    removed_dupes = before_dupes - len(df) #check how many duplicates were removed
    if removed_dupes > 0:
        print(f"{name}: {removed_dupes} removed duplicates")
    df = df.reset_index(drop=True)

    subject = df["subject"]
    text = df["text"]

    #extract features from subject and text (computed on the raw, uncleaned strings)
    df["subject_length"] = subject.apply(len)
    df["text_length"] = text.apply(len)
    df["num_exclamations"] = text.apply(lambda x: x.count("!"))
    #all-caps tokens longer than one character
    df["num_uppercase_words"] = text.apply(
        lambda x: sum(1 for w in x.split() if w.isupper() and len(w) > 1)
    )
    df["num_urls"] = text.apply(lambda x: len(re.findall(r"http[s]?://", x)))
    df["num_special_chars"] = text.apply(lambda x: len(re.findall(r"[#$%^&*]", x)))

    #cleaning subject and text with clean text function
    df["subject_clean"] = subject.apply(lambda x: clean_text(x, use_stemming=True))
    df["text_clean"] = text.apply(lambda x: clean_text(x, use_stemming=True))

    #check if the dataset has both labels 0 and 1, at least 100 rows, and at least 20 samples for each class
    label_counts = df["label"].value_counts()
    has_both_labels = 0 in label_counts and 1 in label_counts
    sufficient_rows = len(df) >= 100
    class_balance_ok = label_counts.get(0, 0) >= 20 and label_counts.get(1, 0) >= 20

    if has_both_labels and sufficient_rows and class_balance_ok:
        filtered_datasets[name] = df
        print(f"{name} accepted: {len(df)} rows")
    else:
        # List only the checks that actually failed, so the message carries
        # no blank placeholders or stray spaces.
        reasons = []
        if not has_both_labels:
            reasons.append("missing class 0 or 1")
        if not sufficient_rows:
            reasons.append("not enough rows")
        if not class_balance_ok:
            reasons.append("imbalanced classes")
        print(f"{name} skipped: {', '.join(reasons)}")

In [None]:
#show examples from the cleaned datasets
#each column is listed once ("label" was previously selected twice and printed twice)
preview_columns = [
    "subject", "text", "label",
    "subject_clean", "text_clean",
    "subject_length", "num_urls", "num_exclamations",
]

for name, df in filtered_datasets.items():
    print(f"\nExample rows from: {name}:")
    #fixed random_state so the same five rows are shown on every run
    print(df.sample(5, random_state=42)[preview_columns])

### Save cleaned datasets as csv

In [None]:
# Target directory: one CSV per accepted dataset, created if missing.
output_folder = "5seed_cleaned_datasets"
os.makedirs(output_folder, exist_ok=True)

for name, df in filtered_datasets.items():
    filename = os.path.join(output_folder, name + ".csv")
    # index=False: the row index is not written to the file
    df.to_csv(filename, index=False)
    print(f"Saved: {filename}")

### Show Dataset characteristics

In [None]:
# One summary row per accepted dataset: size, class counts, subject availability.
dataset_summary = []

for name, df in filtered_datasets.items():
    counts = df["label"].value_counts().to_dict()
    # True if at least one subject is a non-empty string
    subjects_present = (
        df["subject"]
        .dropna()
        .apply(lambda x: isinstance(x, str) and x.strip() != "")
        .any()
    )

    row = {
        "Dataset": name,
        "Rows": len(df),
        "Label 1 (Phishing)": counts.get(1, 0),
        "Label 0 (Benign)": counts.get(0, 0),
        "Has Subjects": "Yes" if subjects_present else "No",
    }
    dataset_summary.append(row)

summary_df = pd.DataFrame(dataset_summary)
print(summary_df.to_string(index=False))

# Define models

In [None]:
def load_models(seed):
    """Build a fresh set of untrained classifiers for one seed.

    Args:
        seed: value passed as ``random_state`` to every estimator that
            accepts one (all except Naive Bayes).

    Returns:
        dict mapping a display name to an unfitted estimator, in the order
        Logistic Regression, Naive Bayes, Random Forest, Support Vector
        Machine, Neural Network.
    """
    logistic_regression = LogisticRegression(
        C=0.05,                   #small C = strong regularization, to limit overfitting on small datasets
        penalty="l2",             #standard L2 penalty for text classification
        class_weight="balanced",  #compensates for imbalanced classes
        solver="liblinear",       #chosen to cope with the large TF-IDF feature space
        max_iter=1000,
        random_state=seed         #for reproducibility
    )

    naive_bayes = MultinomialNB(
        alpha=0.1  #light smoothing for words unseen during training
    )

    random_forest = RandomForestClassifier(
        n_estimators=300,         #more trees, aiming at better generalization
        max_depth=None,
        min_samples_leaf=2,       #avoids leaves that fit a single sample
        max_features="sqrt",      #standard
        class_weight="balanced",  #compensates for imbalanced classes
        n_jobs=-2,                #presumably "all cores but one" (joblib convention) - confirm
        random_state=seed
    )

    support_vector_machine = SVC(
        kernel="linear",          #good for TF-IDF features
        C=0.05,                   #strong regularization against overfitting on limited data
        class_weight="balanced",
        random_state=seed
    )

    neural_network = MLPClassifier(
        hidden_layer_sizes=(80,),  #single hidden layer with 80 neurons
        activation="relu",
        alpha=0.0005,              #L2 regularization strength
        early_stopping=True,       #stop when the validation score stops improving
        learning_rate_init=0.001,  #initial learning rate
        #NOTE(review): per the scikit-learn docs learning_rate is only used with
        #solver="sgd"; no solver is set here, so this likely has no effect - confirm
        learning_rate="adaptive",
        max_iter=1000,
        random_state=seed
    )

    return {
        "Logistic Regression": logistic_regression,
        "Naive Bayes": naive_bayes,
        "Random Forest": random_forest,
        "Support Vector Machine": support_vector_machine,
        "Neural Network": neural_network,
    }

# Train / Test models on 5 different seeds

In [None]:
seeds = [7, 28, 42, 95, 450]
all_results = []
stored_models = {}
splits = {}

#engineered numerical features from raw email data
_NUMERIC_COLUMNS = [
    "subject_length", "text_length",
    "num_exclamations", "num_uppercase_words",
    "num_urls", "num_special_chars",
]
#model input: the two cleaned text columns followed by the numerical features
_FEATURE_COLUMNS = ["subject_clean", "text_clean", *_NUMERIC_COLUMNS]


def _make_tfidf(max_features, norm):
    """Return a TF-IDF vectorizer with the settings shared by all models.

    Only ``max_features`` (vocabulary cap) and ``norm`` (``None`` or ``"l2"``)
    differ between the subject/text columns and between model families.
    """
    return TfidfVectorizer(
        max_features=max_features, #caps the vocabulary for faster training and less overfitting
        ngram_range=(1, 2), #includes word pairs for better context, good for phishing phrases
        min_df=0.01, #removes words that appear in less than 1% of the emails
        max_df=0.9, #removes words that appear in more than 90% of the emails
        sublinear_tf=True, #applies sublinear term frequency scaling, smoothing for high frequency words
        stop_words=None, #stopwords cleaned in pre-processing
        norm=norm,
        lowercase=True,
        strip_accents="unicode",
    )


def _build_transformer(model):
    """Return the ColumnTransformer that matches ``model``.

    Naive Bayes gets un-normalized TF-IDF on the text columns only (no
    normalization, no negative values); every other model gets L2-normalized
    TF-IDF plus the standard-scaled numerical features.
    """
    is_naive_bayes = isinstance(model, MultinomialNB)
    norm = None if is_naive_bayes else "l2"

    steps = [
        ("subject_tfidf", _make_tfidf(2000, norm), "subject_clean"),
        ("text_tfidf", _make_tfidf(5000, norm), "text_clean"),
    ]
    if not is_naive_bayes:
        steps.append(("numerical", StandardScaler(), list(_NUMERIC_COLUMNS)))
    return ColumnTransformer(steps)


for seed in seeds:
    print(f"\nSplit with random_state={seed}")

    for dataset_name, df in filtered_datasets.items():
        # select relevant features for input
        X = df[_FEATURE_COLUMNS]
        y = df["label"]

        #stratified train/test split to preserve label distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y
        )

        #save dataset splits for later use
        splits[(dataset_name, seed)] = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test
        }

        #load models with seed-specific random_state
        models = load_models(seed)

        for model_name, model in models.items():
            #build pipeline with preprocessing (column transformer) and model;
            #the vectorizers and scaler are fitted inside the pipeline on the
            #training split only, so nothing leaks from the test set
            pipeline = Pipeline([
                ("features", _build_transformer(model)),
                ("clf", model)
            ])

            #train model on training set
            pipeline.fit(X_train, y_train)
            print(f"{model_name} trained on {dataset_name} with seed {seed}")

            #evaluate on test set
            y_pred = pipeline.predict(X_test)

            #store trained model pipeline for later use
            stored_models[(dataset_name, model_name, seed)] = pipeline

            #save performance metrics
            all_results.append({
                "Dataset": dataset_name,
                "Model": model_name,
                "Seed": seed,
                "Accuracy": accuracy_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "F1 Score": f1_score(y_test, y_pred)
            })

# Everything produced by the training cell is written below this directory.
export_dir = "v2_5seed_model_exports"
os.makedirs(export_dir, exist_ok=True)

#save all trained models to pkl file for later use
models_path = os.path.join(export_dir, "stored_models.pkl")
joblib.dump(stored_models, models_path)
print(f"Stored_models saved to '{export_dir}/stored_models.pkl'")

#save train/test splits
splits_path = os.path.join(export_dir, "splits.pkl")
joblib.dump(splits, splits_path)
print(f"Splits saved to '{export_dir}/splits.pkl'")

#results df with all metrics (one row per dataset/model/seed)
results_df = pd.DataFrame(all_results)

#export results to csv
results_path = os.path.join(export_dir, "v2_5seed_model_results.csv")
results_df.to_csv(results_path, index=False)
print(f"v2_5seed_model_results.csv saved to '{export_dir}/v2_5seed_model_results.csv'")

### Load results from csv for later use

In [None]:
#model_dir is otherwise only assigned in a later cell, so running the notebook
#top to bottom raised a NameError here; define it locally (same directory the
#results CSV was exported to) so this cell works on its own
model_dir = "v2_5seed_model_exports"
results_df = pd.read_csv(os.path.join(model_dir, "v2_5seed_model_results.csv"))
print(f"Loaded results_df with shape: {results_df.shape}")

### Reconstruct dataset splits

In [None]:
# One pickle per (dataset, seed) pair is written into this directory.
split_export_dir = "v2_5seed_splits"
os.makedirs(split_export_dir, exist_ok=True)

seeds = [7, 28, 42, 95, 450]

#features used for model training
feature_columns = [
    "subject_clean", "text_clean",
    "subject_length", "text_length",
    "num_exclamations", "num_uppercase_words",
    "num_urls", "num_special_chars"
]

#loop through each seed and dataset to create train/test splits
for seed in seeds:
    for dataset_name, df in filtered_datasets.items():
        #select features and labels
        X, y = df[feature_columns], df["label"]

        #same split parameters as in the training cell (30% test, stratified on the label, seeded)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y
        )

        #bundle the four parts under the keys used elsewhere in the notebook
        split_dict = dict(
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
        )

        #save to separate file for each dataset and seed combination
        split_filename = f"{dataset_name}_seed{seed}_split.pkl"
        split_path = os.path.join(split_export_dir, split_filename)
        joblib.dump(split_dict, split_path)
        print(f"Split saved: {split_filename}")

### Load trained models and dataset splits for later use

In [None]:
model_dir = "v2_5seed_model_exports"
split_dir = "v2_5seed_splits"

#load trained models
#NOTE: joblib.load unpickles arbitrary Python objects - only load files that
#were produced by this notebook or come from a trusted source
stored_models = joblib.load(os.path.join(model_dir, "stored_models.pkl"))
print(f"Loaded {len(stored_models)} trained models from '{model_dir}'.")

#load train/test splits
splits = {}
split_suffix = "_split.pkl"

#sorted() because os.listdir() order is arbitrary; keeps the dict order stable
for file in sorted(os.listdir(split_dir)):
    if not file.endswith(split_suffix):
        continue

    #file names look like "<dataset>_seed<seed>_split.pkl"; strip only the
    #trailing suffix and split on the LAST "_seed", so dataset names that
    #themselves contain "_seed" or "_split.pkl" are parsed correctly
    stem = file.removesuffix(split_suffix)
    dataset_name, separator, seed_text = stem.rpartition("_seed")
    if not separator:
        print(f"Skipping unrecognised split file: {file}")
        continue
    try:
        seed = int(seed_text)
    except ValueError:
        #a stray file must not abort loading of the valid splits
        print(f"Skipping unrecognised split file: {file}")
        continue

    path = os.path.join(split_dir, file)
    splits[(dataset_name, seed)] = joblib.load(path)

print(f"Loaded {len(splits)} dataset splits from '{split_dir}'.")