# DistilBERT 5-seed pipeline

*This code was used in Google Colab*

In [None]:
import numpy as np
# Keep a handle on the genuine ``np.array``. If this cell is executed again,
# ``np.array`` is already the wrapper below; unwrapping it here prevents the
# wrapper from calling itself recursively.
_original_array = getattr(np.array, "_wrapped_original", np.array)


def safe_array(obj, *args, **kwargs):
    """Drop-in replacement for ``np.array`` that tolerates ``copy=False``.

    When ``copy=False`` is passed, the call is routed through
    ``np.asarray`` / ``np.asanyarray`` (copy only if needed) instead of
    ``np.array``. Presumably this restores the NumPy 1.x meaning of
    ``copy=False`` for libraries that still rely on it under NumPy 2.

    All other arguments are honoured: ``dtype``, ``order`` and ``like`` are
    forwarded, ``subok`` selects ``np.asanyarray``, and ``ndmin`` is applied
    by prepending length-1 axes, as ``np.array`` does.

    Every other call is forwarded unchanged to the original ``np.array``.
    """
    if kwargs.get("copy") is False:
        forwarded = {key: value for key, value in kwargs.items() if key != "copy"}
        # ``subok`` and ``ndmin`` are not accepted by asarray/asanyarray.
        subok = forwarded.pop("subok", False)
        ndmin = forwarded.pop("ndmin", 0)

        convert = np.asanyarray if subok else np.asarray
        result = convert(obj, *args, **forwarded)

        if result.ndim < ndmin:
            padding = (1,) * (ndmin - result.ndim)
            result = result.reshape(padding + result.shape)
        return result

    return _original_array(obj, *args, **kwargs)


# Marker used above to recover the real function when the cell is re-run.
safe_array._wrapped_original = _original_array
np.array = safe_array

In [None]:
from datasets import load_from_disk, concatenate_datasets, load_dataset, Dataset, concatenate_datasets, ClassLabel, DatasetDict
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import torch
import os
import json
from google.colab import drive
import pandas as pd
from datetime import datetime

In [None]:
# Mount Google Drive before touching any path below /content/drive.
# Creating directories there first would leave a plain local folder at the
# mount point, and a later mount is then expected to be refused because the
# mount point is not empty. Calling mount again on an already mounted drive
# is expected to be a no-op.
drive.mount('/content/drive')

# Input CSVs and the root folder for everything this notebook writes.
base_dataset_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/original_datasets"
output_base_dir = "/content/drive/MyDrive/MASTER EXPERIMENTS/bert_allvsone_5seed_output"

# Sub-directories for trained models and result tables.
model_dir   = os.path.join(output_base_dir, "models")
results_dir = os.path.join(output_base_dir, "results")
os.makedirs(model_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Result file paths.
# NOTE(review): neither path is referenced again in the visible cells;
# confirm whether they are still needed.
internal_results_csv = os.path.join(results_dir, "internal_eval.csv")
cross_results_csv    = os.path.join(results_dir, "cross_eval.csv")

# Helper that builds (and creates) the model directory for one dataset/seed pair.
def model_dir_for(dataset_name, seed):
    """Return ``<model_dir>/<dataset_name>/seed_<seed>``, creating it if missing."""
    seed_folder = f"seed_{seed}"
    target = os.path.join(model_dir, dataset_name, seed_folder)
    os.makedirs(target, exist_ok=True)
    return target

# Summarise the configured output locations, one per line.
print(
    "Structure created",
    f"Models base: {model_dir}",
    f"Results base: {results_dir}",
    f"Internal CSV: {internal_results_csv}",
    f"Cross CSV: {cross_results_csv}",
    sep="\n",
)

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used in
# this notebook all live below that mount point.
drive.mount('/content/drive')

In [None]:
# Shared tokenizer for the "distilbert-base-uncased" checkpoint; used by
# tokenize_function and saved next to each trained model.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(batch):
    """Tokenize the ``"text"`` column of a batch with the module-level tokenizer.

    Entries that are not strings are replaced by an empty string. Every
    sequence is truncated and padded to exactly 512 tokens.
    """
    cleaned = []
    for entry in batch["text"]:
        cleaned.append(entry if isinstance(entry, str) else "")
    return tokenizer(cleaned, truncation=True, padding="max_length", max_length=512)

def prepare_dataset(ds):
    """Reduce ``ds`` to the model inputs and switch it to torch format.

    Every column other than ``input_ids``, ``attention_mask`` and ``label``
    is removed; the result is returned via ``with_format("torch", ...)``
    restricted to those three columns.
    """
    wanted = ("input_ids", "attention_mask", "label")
    extras = [name for name in ds.column_names if name not in wanted]
    # Only call remove_columns when there is something to remove.
    trimmed = ds.remove_columns(extras) if extras else ds
    return trimmed.with_format("torch", columns=list(wanted))

In [None]:
# Seeds used both for the train/validation splits and for training.
seed_list = [7, 28, 42, 95, 450]
tokenized_sets = []

# Collect the dataset CSVs. os.listdir gives no ordering guarantee, so sort
# to process (and later train on) the datasets in the same order every run.
dataset_files = sorted(f for f in os.listdir(base_dataset_dir) if f.endswith(".csv"))

for file in dataset_files:
    name = os.path.splitext(file)[0]
    print(f"Loading: {name}")

    # Load the CSV into a pandas DataFrame.
    df = pd.read_csv(os.path.join(base_dataset_dir, file))

    # Clean up: keep only the two needed columns and drop missing values
    # BEFORE casting the label. Casting a column that still contains NaN to
    # int raises, so the previous order crashed on rows without a label.
    df = df[["text", "label"]].dropna()
    has_text = df["text"].apply(lambda x: isinstance(x, str) and x.strip() != "")
    # .copy() so the label assignment below writes to an independent frame
    # rather than to a filtered view.
    df = df[has_text].copy()
    df["label"] = df["label"].astype(int)
    df = df.reset_index(drop=True)

    # Tokenize the whole dataset once. Every seed only re-partitions the same
    # rows, so tokenizing inside the seed loop repeated identical work.
    tokenized_full = Dataset.from_pandas(df).map(tokenize_function, batched=True)
    row_ids = np.arange(len(df))

    # One train and one validation split per seed.
    train_dict, val_dict = {}, {}

    for seed in seed_list:
        # Stratified 70/30 split of the row positions. Splitting the
        # positions with the same labels and random_state selects the same
        # rows as splitting the DataFrame itself.
        train_idx, val_idx = train_test_split(
            row_ids,
            test_size=0.3,
            stratify=df["label"],
            random_state=seed
        )

        # Pick the rows of each split from the already tokenized dataset and
        # reduce them to the model input columns.
        train_dict[seed] = prepare_dataset(tokenized_full.select(train_idx.tolist()))
        val_dict[seed] = prepare_dataset(tokenized_full.select(val_idx.tolist()))

        print(f" -> Prepared seed {seed} for dataset {name}")

    tokenized_sets.append({
        "name": name,
        "train": train_dict,
        "val": val_dict
    })

print(f"\nLoaded and tokenized {len(tokenized_sets)} datasets with {len(seed_list)} seeds each.")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        The training arguments select the best model by ``"eval_f1"``;
        presumably the Trainer adds the ``eval_`` prefix to the ``f1`` key.

    ``zero_division=0`` matches the scoring in the evaluation loop: when a
    class is never predicted the metric is reported as 0 without emitting an
    undefined-metric warning every epoch.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0)
    }

In [None]:
# Keyword arguments shared by every TrainingArguments instance in this
# notebook; output_dir, seed and reporting options are added per run.
training_args_template = dict(
    # Evaluate, save and log once per epoch.
    eval_strategy="epoch",
    num_train_epochs=6,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_strategy="epoch",
    # Restore the checkpoint with the best "eval_f1" when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    save_total_limit=1,
)

In [None]:
def train_model_single_eval(dataset_name, seed, train_ds, val_ds, training_args_template, model_dir):
    """Return a ``Trainer`` holding the model for one dataset/seed pair.

    The model lives in ``<model_dir>/bert_<dataset_name>_seed<seed>``. If that
    folder already contains ``model.safetensors`` the saved model is loaded
    and wrapped in an evaluation-only Trainer. Otherwise a fresh
    ``distilbert-base-uncased`` classifier with two labels is trained on
    ``train_ds``, evaluated on ``val_ds`` each epoch, and saved together with
    the tokenizer.

    Args:
        dataset_name: Name of the training dataset (part of the folder name).
        seed: Training seed (part of the folder name, passed to TrainingArguments).
        train_ds: Tokenized training split; only used when training.
        val_ds: Tokenized validation split used as the Trainer's eval dataset.
        training_args_template: Extra keyword arguments for TrainingArguments.
        model_dir: Base directory that contains all model folders.

    Returns:
        The Trainer wrapping the loaded or freshly trained model.
    """
    # One output folder per dataset + seed combination.
    model_output_dir = os.path.join(model_dir, f"bert_{dataset_name}_seed{seed}")
    saved_weights = os.path.join(model_output_dir, "model.safetensors")

    if not (os.path.exists(model_output_dir) and os.path.isfile(saved_weights)):
        # No saved model yet: train from the pretrained checkpoint.
        print(f"[Seed {seed}] Training on dataset: {dataset_name}")
        os.makedirs(model_output_dir, exist_ok=True)

        model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

        fit_args = TrainingArguments(
            output_dir=model_output_dir,
            logging_dir=os.path.join(model_output_dir, "logs"),
            report_to="none",
            seed=seed,
            **training_args_template
        )

        # Stop early after 3 evaluations without improvement.
        stopper = EarlyStoppingCallback(early_stopping_patience=3)
        trainer = Trainer(
            model=model,
            args=fit_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
            callbacks=[stopper]
        )

        trainer.train()

        # Persist model and tokenizer in the folder root so the next run
        # finds model.safetensors and skips training.
        model.save_pretrained(model_output_dir)
        tokenizer.save_pretrained(model_output_dir)

        print(f"[Seed {seed}] Finished training on {dataset_name}. Model saved to: {model_output_dir}")
        return trainer

    # A saved model exists: load it and build a Trainer for evaluation only.
    print(f"[Seed {seed}] Model already exists at: {model_output_dir}. Resuming without retraining.")
    restored = DistilBertForSequenceClassification.from_pretrained(model_output_dir)

    eval_args = TrainingArguments(
        output_dir=model_output_dir,
        report_to="none",
        seed=seed,
        **training_args_template
    )

    return Trainer(
        model=restored,
        args=eval_args,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

In [None]:
results_path = os.path.join(results_dir, "single_eval_results.csv")

# Load existing results if available so an interrupted run can be resumed.
if os.path.exists(results_path):
    results_df = pd.read_csv(results_path)
    print(f"Loaded existing results with {len(results_df)} entries from {results_path}")
else:
    results_df = pd.DataFrame(columns=[
        "Trained Dataset", "Trained Seed",
        "Tested Dataset", "Tested Seed",
        "Evaluation Type",
        "Train Size", "Test Size",
        "Accuracy", "Precision", "Recall", "F1"
    ])

results = results_df.to_dict("records")  # start with existing results

# Keys of all evaluations that are already recorded. A set lookup replaces
# re-reading the whole CSV after every single evaluation.
completed_keys = {
    (
        row["Trained Dataset"], row["Trained Seed"],
        row["Tested Dataset"], row["Tested Seed"],
        row["Evaluation Type"],
    )
    for row in results
}


def _score_predictions(trainer, eval_ds):
    """Predict ``eval_ds`` with ``trainer`` and return the four metric columns."""
    output = trainer.predict(eval_ds)
    preds = np.argmax(output.predictions, axis=1)
    labels = output.label_ids
    return {
        "Accuracy": accuracy_score(labels, preds),
        "Precision": precision_score(labels, preds, zero_division=0),
        "Recall": recall_score(labels, preds, zero_division=0),
        "F1": f1_score(labels, preds, zero_division=0)
    }


for train_entry in tokenized_sets:
    dataset_name = train_entry["name"]

    for seed in seed_list:
        print(f"\nTraining + evaluation: dataset={dataset_name}, seed={seed}")

        train_ds = train_entry["train"][seed]
        val_ds = train_entry["val"][seed]

        # Collect the evaluations that are still missing for this model.
        pending = []

        internal_key = (dataset_name, seed, dataset_name, seed, "Internal")
        if internal_key in completed_keys:
            print(f"[Seed {seed}] Internal evaluation already exists → skipping")
        else:
            pending.append((internal_key, val_ds))

        # NOTE(review): "External" also covers the same dataset under a
        # different seed. That validation split can contain rows this model
        # was trained on, because each seed re-splits the same data. Confirm
        # this is intended before interpreting those rows as held-out results.
        for test_entry in tokenized_sets:
            test_dataset_name = test_entry["name"]

            for test_seed in seed_list:
                if test_dataset_name == dataset_name and test_seed == seed:
                    continue  # identical dataset+seed is the internal evaluation

                external_key = (dataset_name, seed, test_dataset_name, test_seed, "External")
                if external_key in completed_keys:
                    print(f"[Seed {seed}] External evaluation train={dataset_name}, test={test_dataset_name} (seed {test_seed}) already exists, skipping")
                    continue

                pending.append((external_key, test_entry["val"][test_seed]))

        # Nothing left to evaluate: do not load (or retrain) the model at all.
        if not pending:
            print(f"[Seed {seed}] All evaluations for {dataset_name} already exist, skipping model load")
            continue

        # Train or resume the model; it is needed for every pending evaluation.
        trainer = train_model_single_eval(dataset_name, seed, train_ds, val_ds, training_args_template, model_dir)

        for key, eval_ds in pending:
            _, _, test_dataset_name, test_seed, evaluation_type = key

            if evaluation_type == "Internal":
                print(f"[Seed {seed}] Internal evaluation for: {dataset_name}")
            else:
                print(f"[Seed {seed}] External evaluation: train={dataset_name} (seed {seed}), test={test_dataset_name} (seed {test_seed})")

            row = {
                "Trained Dataset": dataset_name,
                "Trained Seed": seed,
                "Tested Dataset": test_dataset_name,
                "Tested Seed": test_seed,
                "Evaluation Type": evaluation_type,
                "Train Size": len(train_ds),
                "Test Size": len(eval_ds),
            }
            row.update(_score_predictions(trainer, eval_ds))

            results.append(row)
            completed_keys.add(key)

            # Save after every evaluation so an interrupted session loses at
            # most the evaluation that was running.
            pd.DataFrame(results).to_csv(results_path, index=False)

# Keep results_df in sync with everything recorded in this run.
if results:
    results_df = pd.DataFrame(results)

print(f"\nAll results saved to: {results_path}")
display(pd.DataFrame(results))