# ML all-vs-one pipeline (5 seeds)

*Code used in Google Colab*

In [None]:
import pandas as pd
import joblib
import json
import os
import re
import pickle
import time
import re
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.stem import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Load prepared datasets

In [None]:
folder_path = "/content/drive/MyDrive/MASTER EXPERIMENTS/ML allvsone 5seed/5seed_cleaned_datasets"
all_datasets = {}

#read every CSV found directly in folder_path into a DataFrame;
#the dictionary key is the file name with ".csv" removed
for file in os.listdir(folder_path):
    if not file.endswith(".csv"):
        continue  #ignore anything that is not a CSV file

    name = file.replace(".csv", "")
    csv_path = os.path.join(folder_path, file)
    df = pd.read_csv(csv_path)
    all_datasets[name] = df

#summary of what was loaded (display() is provided by the notebook environment)
print(f"Loaded {len(all_datasets)} cleaned datasets")
display(all_datasets.keys())

## Load combined (for training) and single datasets (for testing)

In [None]:
#training data: every "combined_without_*.csv" file in the combined folder,
#keyed by its file name without the ".csv" part
combined_folder = "/content/drive/MyDrive/MASTER EXPERIMENTS/ML allvsone 5seed/all_vs_one_combined_trainsets"
train_sets = {}

for file in os.listdir(combined_folder):
    is_combined_csv = file.startswith("combined_without_") and file.endswith(".csv")
    if not is_combined_csv:
        continue  #skip unrelated files in the folder

    file_path = os.path.join(combined_folder, file)
    df = pd.read_csv(file_path)

    train_set_name = file.replace(".csv", "")
    train_sets[train_set_name] = df

    print(f"Loaded training set '{train_set_name}' ({df.shape[0]} rows)")

#test data: the single cleaned datasets loaded earlier (same dict object, not a copy)
test_sets = all_datasets
print(f"Loaded {len(test_sets)} test sets from all_datasets")
display(test_sets.keys())

## Models

In [None]:
def load_models(seed):
    """Create a fresh, unfitted instance of every classifier compared in this notebook.

    Parameters
    ----------
    seed
        Random seed passed as ``random_state`` to every estimator that
        accepts one (MultinomialNB is constructed without it).

    Returns
    -------
    dict
        Display name -> estimator, in the order Logistic Regression,
        Naive Bayes, Random Forest, Support Vector Machine, Neural Network.
    """
    #logistic regression: L2 penalty with a small C, i.e. strong regularization
    #(author's rationale: SpamEval benchmarks, less overfitting on small datasets)
    logistic_regression = LogisticRegression(
        penalty="l2",
        C=0.05,
        solver="liblinear",        #chosen by the author for large text feature spaces
        class_weight="balanced",   #reweights classes to counter class imbalance
        max_iter=1000,
        random_state=seed
    )

    #naive bayes: light smoothing (alpha=0.1) for words unseen during training
    naive_bayes = MultinomialNB(alpha=0.1)

    #random forest: 300 unrestricted-depth trees, at least 2 samples per leaf
    random_forest = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,            #depth is not capped; the ensemble limits overfitting
        min_samples_leaf=2,        #avoids very specific leaf nodes (regularization)
        max_features="sqrt",       #author's choice for high dimensional text features
        class_weight="balanced",
        n_jobs=-2,                 #joblib convention: use all CPU cores but one
        random_state=seed
    )

    #support vector machine: linear kernel for TF-IDF input, strongly regularized
    support_vector_machine = SVC(
        kernel="linear",
        C=0.05,
        class_weight="balanced",
        random_state=seed
    )

    #neural network: one hidden layer with 80 ReLU units
    #NOTE(review): per the scikit-learn docs `learning_rate` is only used with
    #solver="sgd"; no solver is set here, so the default solver applies and
    #"adaptive" presumably has no effect - confirm whether sgd was intended
    neural_network = MLPClassifier(
        hidden_layer_sizes=(80,),
        activation="relu",
        alpha=0.0005,              #L2 regularization strength
        early_stopping=True,       #stop when the internal validation score stops improving
        learning_rate_init=0.001,
        learning_rate="adaptive",
        max_iter=1000,
        random_state=seed
    )

    return {
        "Logistic Regression": logistic_regression,
        "Naive Bayes": naive_bayes,
        "Random Forest": random_forest,
        "Support Vector Machine": support_vector_machine,
        "Neural Network": neural_network,
    }

# All-vs-one ML pipeline (5 seeds)

In [None]:
seed_list = [7, 28, 42, 95, 450]

#output folder for models & splits
base_output_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/ML allvsone 5seed/ML_allvsone_5seed_output"
os.makedirs(base_output_dir, exist_ok=True)

#folder for results
results_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/ML allvsone 5seed/ML_allvsone_5seed_results"
os.makedirs(results_dir, exist_ok=True)

#combined results csv
#NOTE(review): the file name mentions only seeds 28 and 450 although seed_list holds
#five seeds; it is kept unchanged because the resume logic depends on this exact path
all_results_csv_path = os.path.join(results_dir, "all_vs_one_results_seeds_28_450.csv")

#column layout of the results table / CSV
result_columns = ["Seed", "Trained_On", "Model", "Accuracy", "Precision", "Recall", "F1 Score"]

#model inputs: two text columns (vectorized with TF-IDF) followed by six numeric columns
numeric_features = [
    "subject_length", "text_length",
    "num_exclamations", "num_uppercase_words",
    "num_urls", "num_special_chars"
]
features = ["subject_clean", "text_clean"] + numeric_features


def _atomic_joblib_dump(obj, path):
    """Persist ``obj`` with joblib via a temp file followed by ``os.replace``.

    An interrupted run can then only leave a stray ``.tmp`` file behind, never a
    truncated file at ``path`` that a later ``os.path.exists`` check would
    mistake for a finished artifact.
    """
    temp_path = path + ".tmp"
    joblib.dump(obj, temp_path)
    os.replace(temp_path, path)


def _make_tfidf(max_features, norm):
    """Return the TF-IDF vectorizer shared by all models; only size and norm vary."""
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=0.01,
        max_df=0.9,
        sublinear_tf=True,
        norm=norm,
        stop_words=None,
        lowercase=True,
        strip_accents="unicode"
    )


def _build_transformer(model_name):
    """Return the (unfitted) ColumnTransformer used as feature step for ``model_name``.

    "Naive Bayes" gets un-normalized TF-IDF of subject and text only; every other
    model gets L2-normalized TF-IDF plus the standard-scaled numeric columns.
    """
    if model_name == "Naive Bayes":
        #presumably the scaled numeric columns are left out because StandardScaler
        #yields negative values, which MultinomialNB does not accept - TODO confirm
        return ColumnTransformer([
            ("subject_tfidf", _make_tfidf(2000, None), "subject_clean"),
            ("text_tfidf", _make_tfidf(5000, None), "text_clean")
        ])

    return ColumnTransformer([
        ("subject_tfidf", _make_tfidf(2000, "l2"), "subject_clean"),
        ("text_tfidf", _make_tfidf(5000, "l2"), "text_clean"),
        ("numerical", StandardScaler(), list(numeric_features))
    ])


def _build_result_row(seed, train_set_name, model_name, y_true, y_pred):
    """Return one results-table row with accuracy, precision, recall and F1."""
    return {
        "Seed": seed,
        "Trained_On": train_set_name,
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, zero_division=0)
    }


def _append_result(results_df, row, csv_path):
    """Append ``row`` to ``results_df``, rewrite ``csv_path`` atomically, return the new frame.

    The updated frame is returned only after the CSV was replaced successfully,
    so the in-memory table and the file on disk cannot drift apart.
    """
    row_df = pd.DataFrame([row])
    if results_df.empty:
        #concatenating with an empty frame is deprecated in recent pandas and
        #would turn every column into object dtype
        updated_df = row_df
    else:
        updated_df = pd.concat([results_df, row_df], ignore_index=True)

    temp_path = csv_path + ".tmp"
    updated_df.to_csv(temp_path, index=False)
    os.replace(temp_path, csv_path)
    return updated_df


#load existing results if available
if os.path.exists(all_results_csv_path):
    all_results_df = pd.read_csv(all_results_csv_path)
else:
    all_results_df = pd.DataFrame(columns=result_columns)

for seed in seed_list:
    print(f"\n=== Starting experiments for seed {seed} ===")

    #create output folders per seed
    output_dir = os.path.join(base_output_dir, f"seed_{seed}")
    split_dir = os.path.join(output_dir, "splits")
    model_dir = os.path.join(output_dir, "models")

    os.makedirs(split_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    #load models with current seed
    models = load_models(seed)

    for train_set_name, df_combined in train_sets.items():
        print(f"\n[Seed {seed}] Preparing dataset: {train_set_name}")

        X = df_combined[features].copy()
        y = df_combined["label"]

        #missing text becomes an empty string so the vectorizers always receive str
        X["subject_clean"] = X["subject_clean"].fillna("").astype(str)
        X["text_clean"] = X["text_clean"].fillna("").astype(str)

        #stratified 70/30 split; deterministic for a given seed, which is what
        #allows a resumed run to re-create the same test part
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y
        )

        #save dataset split once per dataset (best effort: a failure is only reported)
        split_path = os.path.join(split_dir, f"{train_set_name}_split.pkl")
        if not os.path.exists(split_path):
            try:
                _atomic_joblib_dump({
                    "X_train": X_train, "X_test": X_test,
                    "y_train": y_train, "y_test": y_test
                }, split_path)
                print(f"[Seed {seed}] Successfully saved split for '{train_set_name}'")
            except Exception as e:
                print(f"[Seed {seed}] Error saving split for '{train_set_name}': {e}")

        for model_name, model in models.items():
            print(f"[Seed {seed}] Dataset '{train_set_name}': checking status for model '{model_name}' ...")

            #check if result already exists in CSV
            result_exists = (
                (all_results_df["Seed"] == seed) &
                (all_results_df["Trained_On"] == train_set_name) &
                (all_results_df["Model"] == model_name)
            ).any()
            if result_exists:
                print(f"[Seed {seed}] Skipping {model_name} on '{train_set_name}' (already in results CSV)")
                continue

            #build model path and check for existing model file
            safe_model_name = model_name.replace(" ", "_").replace("/", "_")
            model_path = os.path.join(
                model_dir, f"{safe_model_name}__{train_set_name.replace('.csv', '')}.pkl"
            )

            if os.path.exists(model_path):
                #resume: model exists but CSV entry is missing
                print(f"[Seed {seed}] Found existing model file for {model_name} on '{train_set_name}'. Loading and evaluating ...")
                try:
                    #only loading/predicting is guarded; an error while writing the
                    #results must not be misreported as a load failure and must not
                    #trigger a retrain that appends a duplicate row
                    #NOTE: joblib.load unpickles the file - only use trusted model files
                    loaded_pipeline = joblib.load(model_path)
                    y_pred = loaded_pipeline.predict(X_test)
                except Exception as e:
                    print(f"[Seed {seed}] Failed to load existing model for {model_name} on '{train_set_name}'. Will retrain. Error: {e}")
                else:
                    new_result = _build_result_row(seed, train_set_name, model_name, y_test, y_pred)
                    all_results_df = _append_result(all_results_df, new_result, all_results_csv_path)

                    print(f"[Seed {seed}] Appended result (resume) for {model_name} on '{train_set_name}'")
                    continue

            #train from scratch
            transformer = _build_transformer(model_name)

            pipeline = Pipeline([
                ("features", transformer),
                ("clf", model)
            ])

            print(f"[Seed {seed}] START training: model='{model_name}' dataset='{train_set_name}'")
            pipeline.fit(X_train, y_train)
            print(f"[Seed {seed}] FINISH training: model='{model_name}' dataset='{train_set_name}'")

            y_pred = pipeline.predict(X_test)

            #the model file is written before the CSV row, so an interruption between
            #the two is picked up by the resume branch above on the next run
            _atomic_joblib_dump(pipeline, model_path)
            print(f"[Seed {seed}] Saved model to: {model_path}")

            new_result = _build_result_row(seed, train_set_name, model_name, y_test, y_pred)
            all_results_df = _append_result(all_results_df, new_result, all_results_csv_path)
            print(f"[Seed {seed}] Appended result for {model_name} on '{train_set_name}'")

print(f"\nAll seeds completed. Final results saved to: {all_results_csv_path}")