# DistilBERT 5-seed pipeline

*This code was run in Google Colab; the file paths assume Google Drive is mounted at `/content/drive`.*

In [None]:
import numpy as np
# Keep a handle on the genuine ``np.array``. When this cell is re-run,
# ``np.array`` is already the shim, so recover the original from the attribute
# stored on the shim instead of wrapping the shim in itself.
_original_array = getattr(np.array, "_unpatched_array", np.array)


def safe_array(obj, *args, **kwargs):
    """Drop-in replacement for ``np.array`` that tolerates ``copy=False``.

    A call with ``copy=False`` is served by ``np.asarray`` (or
    ``np.asanyarray`` when ``subok=True``), i.e. the data is copied only when
    that is unavoidable. Every other call is forwarded unchanged to the
    original ``np.array``.

    NOTE(review): presumably installed so that libraries written for
    NumPy 1.x keep working on NumPy 2, where ``np.array(..., copy=False)``
    raises if a copy is required -- confirm against the installed versions.

    Args:
        obj: Array-like input, as for ``np.array``.
        *args: Extra positional arguments of ``np.array`` (i.e. ``dtype``).
        **kwargs: Keyword arguments of ``np.array``.

    Returns:
        numpy.ndarray: The converted array.
    """
    if kwargs.get("copy", True) is not False:
        return _original_array(obj, *args, **kwargs)

    # Forward everything except ``copy`` so that ``dtype``/``order``/``like``
    # are honoured instead of being silently dropped.
    forwarded = {key: value for key, value in kwargs.items() if key != "copy"}
    # ``np.asarray`` has no ``subok``/``ndmin`` parameters, so emulate them.
    subok = forwarded.pop("subok", False)
    ndmin = forwarded.pop("ndmin", 0)
    convert = np.asanyarray if subok else np.asarray
    result = convert(obj, *args, **forwarded)
    if result.ndim < ndmin:
        # ``np.array`` prepends length-1 axes until ``ndmin`` is reached.
        result = result.reshape((1,) * (ndmin - result.ndim) + result.shape)
    return result


safe_array._unpatched_array = _original_array
np.array = safe_array

In [None]:
from datasets import load_from_disk, concatenate_datasets, load_dataset, Dataset, concatenate_datasets, ClassLabel, DatasetDict
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import torch
import os
import json
from google.colab import drive
import pandas as pd
from datetime import datetime

In [None]:
# Folder holding one CSV file per corpus.
base_path = "/content/drive/MyDrive/MASTER EXPERIMENTS/original_datasets"

# File stems of the corpora that take part in the experiments.
dataset_names = [
    "Al-Subaiey_CEAS",
    "Al-Subaiey_Enron",
    "Al-Subaiey_Ling",
    "Al-Subaiey_SpamAssassin",
    "Champa_Trec",
    "Chatuat_Enhancing_Phishing_Detection",
    "Giri_EnronSpamSubset",
    "Giri_LingSpam",
]

# Read every corpus into a DataFrame, keyed by its name; the insertion order
# follows ``dataset_names``.
original_datasets = {}
for dataset_name in dataset_names:
    csv_path = os.path.join(base_path, f"{dataset_name}.csv")
    original_datasets[dataset_name] = pd.read_csv(csv_path)



In [None]:
# Build the leave-one-dataset-out configurations: for every corpus, the
# training pool is the row-wise concatenation of all *other* corpora and the
# left-out corpus is kept aside as the external test set.
combined_sets = [
    {
        "name": f"combined_without_{excluded_name}",
        "excluded_name": excluded_name,
        "train": pd.concat(
            [
                other_df
                for other_name, other_df in original_datasets.items()
                if other_name != excluded_name
            ],
            ignore_index=True,
        ),
        "excluded_test": excluded_df,
    }
    for excluded_name, excluded_df in original_datasets.items()
]

# Print the resulting sizes as a quick sanity check.
for entry in combined_sets:
    train_size, test_size = len(entry["train"]), len(entry["excluded_test"])
    print(f"{entry['name']}: Train size = {train_size}, Test size (excluded) = {test_size}")



In [None]:
# Fast tokenizer of the ``distilbert-base-uncased`` checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of a batch.

    Entries that are not ``str`` are replaced by the empty string before
    tokenization. Sequences are truncated and padded to ``max_length=512``.

    Args:
        batch: Mapping with a ``"text"`` key holding an iterable of texts.

    Returns:
        The tokenizer output for the cleaned texts.
    """
    cleaned = []
    for value in batch["text"]:
        cleaned.append(value if isinstance(value, str) else "")
    return tokenizer(cleaned, max_length=512, padding="max_length", truncation=True)

def prepare_dataset(ds):
    """Reduce a dataset to the model inputs and switch it to torch format.

    Args:
        ds: Object exposing ``column_names``, ``remove_columns`` and
            ``with_format`` (a ``datasets.Dataset`` in this file).

    Returns:
        The dataset without any column other than ``input_ids``,
        ``attention_mask`` and ``label``, formatted as ``"torch"``.
    """
    model_columns = ["input_ids", "attention_mask", "label"]

    extras = []
    for column in ds.column_names:
        if column not in model_columns:
            extras.append(column)

    # Only call remove_columns when there is something to drop.
    if extras:
        ds = ds.remove_columns(extras)
    return ds.with_format("torch", columns=model_columns)

In [None]:
# One record per (leave-one-out configuration, seed) pair is collected here.
seed_list = [7, 28, 42, 95, 450]
tokenized_sets = []

for entry in combined_sets:
    name, excluded_name = entry["name"], entry["excluded_name"]

    # Tokenize the pooled training corpus once; the per-seed splits below only
    # select rows from this tokenized dataset.
    full_train_df = entry["train"][["text", "label"]].reset_index(drop=True)
    tokenized_full = prepare_dataset(
        Dataset.from_pandas(full_train_df).map(tokenize_function, batched=True)
    )

    # Tokenize the left-out corpus; it is shared by all seeds of this entry.
    excluded_df = entry["excluded_test"][["text", "label"]].reset_index(drop=True)
    tokenized_test = prepare_dataset(
        Dataset.from_pandas(excluded_df).map(tokenize_function, batched=True)
    )

    # Label-stratified 70/30 split of the row positions, one split per seed.
    row_positions = np.arange(len(full_train_df))
    strata = full_train_df["label"].to_numpy()

    for seed in seed_list:
        train_rows, val_rows = train_test_split(
            row_positions,
            test_size=0.3,
            random_state=seed,
            stratify=strata,
        )
        tokenized_sets.append(
            {
                "name": name,
                "excluded_name": excluded_name,
                "seed": seed,
                "train": tokenized_full.select(train_rows.tolist()),
                "val": tokenized_full.select(val_rows.tolist()),
                "excluded_test": tokenized_test,
            }
        )

    print(f"Prepared dataset '{name}' with {len(seed_list)} seeds.")

print(f"\nLoaded and tokenized {len(tokenized_sets)} dataset-seed combinations.")

In [None]:
# Name of the device PyTorch can use: "cuda" when a GPU is visible, else "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device.upper()}")


In [None]:
# Hyper-parameters shared by every run; the dict is unpacked as keyword
# arguments into ``TrainingArguments``.
training_args_template = dict(
    eval_strategy="epoch",  # evaluate once per epoch
    num_train_epochs=6,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_strategy="epoch",  # checkpoint once per epoch
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    save_total_limit=1,
)

In [None]:
# Output locations on Google Drive: one folder for the fine-tuned models and
# one for the results CSV.
bert_model_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/bert_allvsone_5seed_models"
results_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/bert_allvsone_5seed_results"

# Create both folders up front; already existing folders are left untouched.
for _output_folder in (bert_model_dir, results_dir):
    os.makedirs(_output_folder, exist_ok=True)

# Date stamp (YYYY-MM-DD) and an empty result list; neither is referenced
# again in the visible part of this file.
timestamp = datetime.now().strftime("%Y-%m-%d")
results = []

In [None]:
def compute_metrics(eval_pred):
    """Compute the classification metrics reported by the ``Trainer``.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` of the
        predictions with respect to ``labels``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # zero_division=0 matches the evaluation loop of this file: an epoch in
    # which no positive is predicted (or present) scores 0 without emitting
    # an UndefinedMetricWarning.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }

In [None]:
def train_model_on_dataset(name, seed, train_ds, val_ds, output_dir=bert_model_dir):
    """Fine-tune DistilBERT for one dataset/seed pair, or reload a finished run.

    The run directory is ``<output_dir>/bert_<name>_seed<seed>``. If it
    already contains ``model.safetensors`` the stored model is loaded and no
    training takes place (implemented because of session timeouts);
    otherwise ``distilbert-base-uncased`` is fine-tuned with early stopping
    and the model and tokenizer are saved into the run directory.

    Args:
        name: Name of the training configuration (part of the directory name).
        seed: Seed of the run; part of the directory name and used as the
            training seed.
        train_ds: Training dataset handed to the ``Trainer``.
        val_ds: Evaluation dataset handed to the ``Trainer``.
        output_dir: Parent directory of all run directories.

    Returns:
        Trainer: Trainer wrapping the trained (or reloaded) model, ready for
        ``predict``.
    """
    # Local import: ``set_seed`` is not part of the file-level imports.
    from transformers import set_seed

    model_output_dir = os.path.join(output_dir, f"bert_{name}_seed{seed}")
    os.makedirs(model_output_dir, exist_ok=True)

    # check if model already exists but still use it for evaluation
    if os.path.isfile(os.path.join(model_output_dir, "model.safetensors")):
        print(f"Model already exists at: {model_output_dir}.")
        model = DistilBertForSequenceClassification.from_pretrained(model_output_dir)

        training_args = TrainingArguments(
            output_dir=model_output_dir,
            report_to="none",
            seed=seed,
            **training_args_template,
        )

        # No train_dataset here: this trainer is only returned for the
        # evaluation loop.
        return Trainer(
            model=model,
            args=training_args,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
        )

    print(f"[Seed {seed}] Training on: {name}")

    # Seed the RNGs *before* the model is created so that the freshly
    # initialised classification head also depends on ``seed``.
    set_seed(seed)
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        logging_dir=os.path.join(model_output_dir, "logs"),
        report_to="none",
        # Previously missing: without it every run trained with the default
        # seed and only the data split differed between seeds.
        seed=seed,
        **training_args_template,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # stops if no improvements 3x in a row
    )

    trainer.train()

    # save model and tokenizer
    model.save_pretrained(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)

    print(f"[Seed {seed}] Finished training on {name}. Model saved to: {model_output_dir}")
    return trainer

In [None]:
results_path = os.path.join(results_dir, "all_vs_one_bert_5seed_results.csv")


def _result_recorded(df, train_name, train_seed, test_name, eval_type, test_seed=None):
    """Return True if ``df`` already holds a row for the given evaluation.

    ``test_seed`` is only checked when given. It is compared as text because
    the "Test Seed" column mixes integers with the marker "FULL", so it comes
    back from ``pd.read_csv`` as strings ("7", not 7).
    """
    mask = (
        (df["Train Dataset"] == train_name)
        & (df["Train Seed"] == train_seed)
        & (df["Test Dataset"] == test_name)
        & (df["Evaluation Type"] == eval_type)
    )
    if test_seed is not None:
        mask = mask & (df["Test Seed"].astype(str) == str(test_seed))
    return bool(mask.any())


def _evaluate(trainer, dataset):
    """Predict ``dataset`` with ``trainer`` and return the metric columns."""
    output = trainer.predict(dataset)
    preds = np.argmax(output.predictions, axis=1)
    labels = output.label_ids
    return {
        "Accuracy": accuracy_score(labels, preds),
        "Precision": precision_score(labels, preds, zero_division=0),
        "Recall": recall_score(labels, preds, zero_division=0),
        "F1": f1_score(labels, preds, zero_division=0),
    }


def _append_and_save(df, row, path):
    """Append ``row`` to ``df``, write the CSV and return the new frame.

    The CSV is written to ``path + ".tmp"`` first and then moved over
    ``path`` with ``os.replace``, so an interrupted write cannot leave a
    truncated results file behind.
    """
    updated = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    tmp_path = path + ".tmp"
    updated.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)
    return updated


#load existing results if available
if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
    print(f"Loaded existing results with {len(results_df)} entries from {results_path}")
else:
    results_df = pd.DataFrame(columns=[
        "Train Dataset", "Train Seed",
        "Test Dataset",  "Test Seed",
        "Evaluation Type",
        "Train Size", "Test Size",
        "Accuracy", "Precision", "Recall", "F1"
    ])

for entry in tokenized_sets:
    name = entry["name"]
    excluded_name = entry["excluded_name"]
    seed = entry["seed"]

    print(f"\n[Seed {seed}] Preparing dataset: {name}")

    train_ds = entry["train"]
    val_ds   = entry["val"]
    test_ds  = entry["excluded_test"]

    #train or resume model
    trainer = train_model_on_dataset(name, seed, train_ds, val_ds)

    #internal evaluation on the validation split of this seed
    if _result_recorded(results_df, name, seed, name, "Internal", test_seed=seed):
        print(f"[Seed {seed}] skipping internal evaluation for '{name}' (already in results CSV)")
    else:
        print(f"[Seed {seed}] start internal evaluation: dataset='{name}'")
        new_row = {
            "Train Dataset": name,
            "Train Seed":    seed,
            "Test Dataset":  name,
            "Test Seed":     seed,
            "Evaluation Type": "Internal",
            "Train Size": len(train_ds),
            "Test Size":  len(val_ds),
            **_evaluate(trainer, val_ds),
        }
        results_df = _append_and_save(results_df, new_row, results_path)
        print(f"[Seed {seed}] Appended INTERNAL result for '{name}'")

    #external evaluation on full excluded dataset
    if _result_recorded(results_df, name, seed, excluded_name, "External"):
        print(f"[Seed {seed}] skipping external evaluation: train='{name}', test='{excluded_name}' (already in results csv).")
    else:
        print(f"[Seed {seed}] start external evaluation: train='{name}', test='{excluded_name}'.")
        new_row = {
            "Train Dataset": name,
            "Train Seed":    seed,
            "Test Dataset":  excluded_name,
            "Test Seed":     "FULL",
            "Evaluation Type": "External",
            "Train Size": len(train_ds),
            "Test Size":  len(test_ds),
            **_evaluate(trainer, test_ds),
        }
        results_df = _append_and_save(results_df, new_row, results_path)
        print(f"[Seed {seed}] Appended external result train='{name}', test='{excluded_name}'")

print(f"\nAll seeds completed. Final results saved to: {results_path}")