# Model Training (NLP)

This notebook fine-tunes five pretrained transformer models for text classification:
- BERT
- ClinicalBERT
- DistilBERT
- BioBERT
- ALBERT

In [None]:
import os
import math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Fix the random seeds (transformers helper plus torch) so runs are repeatable.
set_seed(42)
torch.manual_seed(42)

# Input feature directory and the output roots for metrics, models and figures.
NLP_FEATURE_DIR = Path("../data/processed/nlpfeatures2")
PROC_OUT = Path("../data/processed/nlp2")
MODEL_OUT = Path("../models/nlp2")
FIG_OUT = Path("../figures/nlp2")

# Make sure every output root exists before anything is written to it.
for out_dir in (PROC_OUT, MODEL_OUT, FIG_OUT):
    out_dir.mkdir(parents=True, exist_ok=True)

# Training hyperparameters.
NUM_EPOCHS = 5
PER_DEVICE_TRAIN_BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH_SIZE = 8
MAX_LENGTH = 128  # passed to the tokenizer as max_length (with truncation)
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

## Load the train and test sets

In [None]:
# Paths of the two splits inside the NLP feature directory.
train_csv = NLP_FEATURE_DIR.joinpath("train.csv")
test_csv = NLP_FEATURE_DIR.joinpath("test.csv")

# Fail early, pointing at the upstream notebook, if either split is missing.
if not (train_csv.exists() and test_csv.exists()):
    raise FileNotFoundError(f"Train/test CSVs not found in {NLP_FEATURE_DIR}. Please run 20_feature_engineering_nlp.ipynb first.")

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("\nSample:")
display(train_df.head(2))

# Both splits must contain the text column and the target column.
for required_col in ("Student Information", "Depression Label"):
    for split_df, split_path in ((train_df, train_csv), (test_df, test_csv)):
        if required_col not in split_df.columns:
            raise ValueError(f"Column {required_col} missing in {split_path}")

## Encode target labels to integers

In [None]:
# Fit the encoder on the training labels, then apply the same mapping to the
# test labels (labels are cast to str first in both splits).
le = LabelEncoder()
train_df["label_enc"] = le.fit_transform(train_df["Depression Label"].astype(str))
test_df["label_enc"] = le.transform(test_df["Depression Label"].astype(str))

num_labels = len(le.classes_)
print("Classes:", list(le.classes_))
print("Num labels:", num_labels)

# Source column -> column name used in the Dataset objects.
_HF_COLUMNS = {"Student Information": "text", "label_enc": "label"}


def _to_hf_dataset(frame):
    """Keep only the text/label columns, rename them, and wrap as a Dataset."""
    selected = frame[list(_HF_COLUMNS)]
    return Dataset.from_pandas(selected.rename(columns=_HF_COLUMNS))


train_ds = _to_hf_dataset(train_df)
test_ds = _to_hf_dataset(test_df)

print(train_ds)
print(test_ds)

## Model list: output slugs and Hugging Face checkpoint names

In [None]:
# Each entry is (slug, Hugging Face checkpoint name). The slug names the
# per-model output folders and result files; the checkpoint name is what the
# tokenizer and model are loaded from.
MODELS = [
    ("bert-base-uncased", "bert-base-uncased"),
    ("clinical-bert", "emilyalsentzer/Bio_ClinicalBERT"),
    ("distilbert-base-uncased", "distilbert-base-uncased"),
    ("biobert-base-cased-v1.1", "dmis-lab/biobert-base-cased-v1.1"),
    ("albert-base-v2", "albert-base-v2"),
]

print("Models to run:", [slug for slug, _ in MODELS])

## Define compute_metrics used by Trainer (accuracy, precision, recall, f1)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics for one Trainer evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are computed with ``average="weighted"``
        and ``zero_division=0``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision_score(labels, predicted, **weighted),
        "recall": recall_score(labels, predicted, **weighted),
        "f1": f1_score(labels, predicted, **weighted),
    }

## Training Loop

In [None]:
# Models trained in this run; point this at a subset of MODELS to re-run fewer.
RUN_MODELS = MODELS

# Restrict transformers logging to errors to keep the cell output readable.
from transformers import logging as tf_logging
tf_logging.set_verbosity_error()

# For every (slug, checkpoint) pair: tokenize, train, evaluate on the test
# split, then write per-epoch metrics, final metrics, two figures and the
# label mapping.
for slug, hf_name in RUN_MODELS:
    print("\n" + "="*80)
    print(f"Starting training: {slug}  (HF model: {hf_name})")
    print("="*80)

    # Models and figures get a per-slug folder; metric CSVs share PROC_OUT
    # and are distinguished by a slug prefix in the file name.
    model_out_dir = MODEL_OUT / slug
    fig_out_dir = FIG_OUT / slug
    proc_out_dir = PROC_OUT
    model_out_dir.mkdir(parents=True, exist_ok=True)
    fig_out_dir.mkdir(parents=True, exist_ok=True)
    proc_out_dir.mkdir(parents=True, exist_ok=True)

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(hf_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(hf_name, num_labels=num_labels)

    def tokenize_fn(batch):
        """Tokenize a batch of texts, truncating to MAX_LENGTH; padding is left to the collator."""
        return tokenizer(batch["text"], truncation=True, padding=False, max_length=MAX_LENGTH)

    # Tokenize with this model's own tokenizer; the raw text column is dropped.
    tokenized_train = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_test  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # NOTE(review): load_best_model_at_end is False, so the model saved below
    # is the one from the final epoch; metric_for_best_model presumably only
    # matters if that flag is enabled -- confirm this is intended.
    training_args = TrainingArguments(
        output_dir=str(model_out_dir),
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        disable_tqdm=False,
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=False,
        push_to_hub=False
    )

    # The per-epoch evaluation runs on the test split (eval_dataset below).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    print("Beginning training...")
    train_result = trainer.train()
    trainer.save_model(str(model_out_dir))
    print("Model saved to:", model_out_dir)

    # Snapshot the log history BEFORE the final evaluation. trainer.evaluate()
    # appends its own eval entry to log_history; reading the history afterwards
    # would add a duplicate last-epoch row to the CSV and a duplicate point to
    # the plot.
    logs = list(trainer.state.log_history)

    print("Running final evaluation on test set...")
    metrics = trainer.evaluate(eval_dataset=tokenized_test)
    print("Final metrics:", metrics)

    # One entry per in-training evaluation (only eval entries carry eval_accuracy).
    eval_entries = [entry for entry in logs if "eval_accuracy" in entry]
    if eval_entries:
        epoch_nums = [entry.get("epoch") for entry in eval_entries]
        epoch_acc = [entry.get("eval_accuracy") for entry in eval_entries]
        epoch_prec = [entry.get("eval_precision") for entry in eval_entries]
        epoch_rec = [entry.get("eval_recall") for entry in eval_entries]
        epoch_f1 = [entry.get("eval_f1") for entry in eval_entries]
    else:
        # No per-epoch evaluations were logged: fall back to repeating the
        # final metrics for every epoch so the CSV and plot still have data.
        epoch_nums = [i+1 for i in range(NUM_EPOCHS)]
        epoch_acc = [metrics.get("eval_accuracy")] * NUM_EPOCHS
        epoch_prec = [metrics.get("eval_precision")] * NUM_EPOCHS
        epoch_rec = [metrics.get("eval_recall")] * NUM_EPOCHS
        epoch_f1 = [metrics.get("eval_f1")] * NUM_EPOCHS

    results_df = pd.DataFrame({
        "epoch": epoch_nums,
        "accuracy": epoch_acc,
        "precision": epoch_prec,
        "recall": epoch_rec,
        "f1": epoch_f1
    })
    results_csv = proc_out_dir / f"{slug}_results.csv"
    results_df.to_csv(results_csv, index=False)
    print("Saved epoch metrics CSV ->", results_csv)

    # Single-row CSV of the final evaluation; the aggregation cell reads it.
    summary_out = proc_out_dir / f"{slug}_final_metrics.csv"
    pd.DataFrame([metrics]).to_csv(summary_out, index=False)
    print("Saved final metrics ->", summary_out)

    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(epoch_nums, epoch_acc, marker='o', label='Accuracy')
    ax.plot(epoch_nums, epoch_f1, marker='o', label='F1')
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Score")
    ax.set_title(f"{slug} — Accuracy/F1 vs Epoch")
    ax.set_xticks(epoch_nums)
    ax.legend()
    acc_fig_path = fig_out_dir / f"{slug}_accuracy_epoch.png"
    fig.savefig(acc_fig_path, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print("Saved accuracy vs epoch ->", acc_fig_path)

    print("Computing confusion matrix on test set...")
    preds_output = trainer.predict(tokenized_test)
    pred_labels = np.argmax(preds_output.predictions, axis=-1)
    true_labels = preds_output.label_ids
    # Pin the label order to the encoder's classes so the matrix is always
    # num_labels x num_labels, even if a class is absent from the test labels
    # and predictions, and so the tick labels line up with the rows/columns.
    class_names = list(le.classes_)
    cm = confusion_matrix(true_labels, pred_labels, labels=list(range(num_labels)))
    cm_fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{slug} — Confusion Matrix")
    cm_path = fig_out_dir / f"{slug}_confusion.png"
    cm_fig.savefig(cm_path, dpi=300, bbox_inches="tight")
    plt.close(cm_fig)
    print("Saved confusion matrix ->", cm_path)

    # Store the class-name -> integer mapping next to the saved model.
    mapping_path = model_out_dir / "label_mapping.csv"
    pd.DataFrame({"label": list(le.classes_), "enc": list(range(len(le.classes_)))}).to_csv(mapping_path, index=False)
    print("Saved label mapping ->", mapping_path)

    print(f"Finished model: {slug}")

## Aggregate per-model final results

In [None]:
# Collect each model's final-metrics CSV into one table, tagging rows with the
# model slug; models without a CSV are reported and skipped.
per_model_frames = []
for slug, hf_name in MODELS:
    summary_out = PROC_OUT / f"{slug}_final_metrics.csv"
    if not summary_out.exists():
        print("Summary missing for", slug)
        continue
    metrics_frame = pd.read_csv(summary_out)
    metrics_frame["model"] = slug
    per_model_frames.append(metrics_frame)

if not per_model_frames:
    print("No final metrics found.")
else:
    combined = pd.concat(per_model_frames, ignore_index=True, sort=False)
    # Show only the headline columns; the saved CSV keeps every column.
    summary_columns = ["model", "eval_accuracy", "eval_precision", "eval_recall", "eval_f1", "eval_loss"]
    display(combined[summary_columns])
    combined_path = PROC_OUT / "all_nlp_models_summary.csv"
    combined.to_csv(combined_path, index=False)
    print("Saved combined summary ->", combined_path)