# Model Training (NLP)

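This notebook fine-tunes five pretrained transformer models to classify each student's text description (`Student Information`) into a depression-severity class (`Depression Label`):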
- BERT
- ClinicalBERT
- DistilBERT
- BioBERT
- ALBERT

In [1]:
import os
import math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Reproducibility ---------------------------------------------------------
# One named seed instead of a repeated literal, so changing it is a single edit.
SEED = 42
set_seed(SEED)
torch.manual_seed(SEED)

# --- Paths (relative to the notebook's working directory) --------------------
NLP_FEATURE_DIR = Path("../data/processed/nlpfeatures1")  # input: train.csv / test.csv
PROC_OUT = Path("../data/processed/nlp1")                 # output: metric CSVs
MODEL_OUT = Path("../models/nlp1")                        # output: one sub-directory per model
FIG_OUT = Path("../figures/nlp1")                         # output: one sub-directory per model

# Create the output directories up front; existing directories are left untouched.
for p in [PROC_OUT, MODEL_OUT, FIG_OUT]:
    p.mkdir(parents=True, exist_ok=True)

# --- Training hyperparameters (shared by every model) ------------------------
NUM_EPOCHS = 5
PER_DEVICE_TRAIN_BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH_SIZE = 8
MAX_LENGTH = 128        # tokenizer max_length; longer inputs are truncated
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

## Load the train and test sets

In [2]:
# Locate the train/test splits inside the NLP feature directory.
train_csv = NLP_FEATURE_DIR / "train.csv"
test_csv  = NLP_FEATURE_DIR / "test.csv"

# Fail fast, pointing at the upstream notebook, when either split is absent.
if not (train_csv.exists() and test_csv.exists()):
    raise FileNotFoundError(f"Train/test CSVs not found in {NLP_FEATURE_DIR}. Please run 20_feature_engineering_nlp.ipynb first.")

train_df = pd.read_csv(train_csv)
test_df  = pd.read_csv(test_csv)

# Quick look at what was loaded.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("\nSample:")
display(train_df.head(2))

# Both splits must carry the text column and the target column used below.
required_columns = ("Student Information", "Depression Label")
for c in required_columns:
    for frame, source in ((train_df, train_csv), (test_df, test_csv)):
        if c not in frame.columns:
            raise ValueError(f"Column {c} missing in {source}")

Train shape: (1617, 2)
Test shape: (405, 2)

Sample:


Unnamed: 0,Student Information,Depression Label
0,"The student is around 23-26 years old, male, s...",Severe
1,"The student is around 18-22 years old, male, s...",Severe


## Encode target labels to integers

In [3]:
def _as_text_label_dataset(frame):
    """Return a Hugging Face Dataset with `text` and `label` columns.

    `frame` must already contain the `Student Information` and `label_enc`
    columns; only those two are kept, renamed to `text` and `label`.
    """
    selected = frame[["Student Information", "label_enc"]]
    renamed = selected.rename(columns={"Student Information":"text", "label_enc":"label"})
    return Dataset.from_pandas(renamed)


# Fit the encoder on the training labels only, then reuse it for the test split
# so both splits share the same class -> integer mapping.
le = LabelEncoder()
train_df["label_enc"] = le.fit_transform(train_df["Depression Label"].astype(str))
test_df["label_enc"]  = le.transform(test_df["Depression Label"].astype(str))

num_labels = len(le.classes_)
print("Classes:", list(le.classes_))
print("Num labels:", num_labels)

train_ds = _as_text_label_dataset(train_df)
test_ds  = _as_text_label_dataset(test_df)

print(train_ds)
print(test_ds)

Classes: ['Mild', 'Minimal', 'Moderate', 'Moderately Severe', 'Severe']
Num labels: 5
Dataset({
    features: ['text', 'label'],
    num_rows: 1617
})
Dataset({
    features: ['text', 'label'],
    num_rows: 405
})


## Models list and tokenizer/model names

In [4]:
# Each entry is (slug, Hugging Face model id). The slug names the per-model
# output directories and files; the model id is what gets loaded with
# from_pretrained().
MODELS = [
    ("bert-base-uncased", "bert-base-uncased"),
    ("clinical-bert", "emilyalsentzer/Bio_ClinicalBERT"),
    ("distilbert-base-uncased", "distilbert-base-uncased"),
    ("biobert-base-cased-v1.1", "dmis-lab/biobert-base-cased-v1.1"),
    ("albert-base-v2", "albert-base-v2"),
]

print("Models to run:", [slug for slug, _model_id in MODELS])

Models to run: ['bert-base-uncased', 'clinical-bert', 'distilbert-base-uncased', 'biobert-base-cased-v1.1', 'albert-base-v2']


## Define compute_metrics used by Trainer (accuracy, precision, recall, f1)

In [5]:
def compute_metrics(eval_pred):
    """Score a batch of predictions for the Trainer.

    Args:
        eval_pred: pair that unpacks into ``(logits, labels)``. The predicted
            class is the argmax of ``logits`` over its last axis.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are weighted averages over the classes, and
        undefined values (no predicted/true samples for a class) count as 0.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {"accuracy": accuracy_score(labels, predicted)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in weighted_scorers:
        scores[key] = scorer(labels, predicted, average="weighted", zero_division=0)
    return scores

## Training Loop

In [6]:
# Fine-tune and evaluate every model in RUN_MODELS. For each model this cell:
#   1. loads the tokenizer and a sequence-classification model with num_labels outputs,
#   2. tokenizes the train/test datasets (truncated to MAX_LENGTH tokens),
#   3. trains for NUM_EPOCHS, evaluating on the test split after every epoch,
#   4. saves the model, the per-epoch and final metric CSVs, a score-vs-epoch
#      plot, a confusion matrix and the label mapping.
RUN_MODELS = MODELS

# Silence transformers' info/warning chatter for the duration of the runs.
from transformers import logging as tf_logging
tf_logging.set_verbosity_error()

for slug, hf_name in RUN_MODELS:
    print("\n" + "="*80)
    print(f"Starting training: {slug}  (HF model: {hf_name})")
    print("="*80)

    # Per-model output locations (metric CSVs share PROC_OUT, prefixed by slug).
    model_out_dir = MODEL_OUT / slug
    fig_out_dir = FIG_OUT / slug
    proc_out_dir = PROC_OUT
    model_out_dir.mkdir(parents=True, exist_ok=True)
    fig_out_dir.mkdir(parents=True, exist_ok=True)
    proc_out_dir.mkdir(parents=True, exist_ok=True)

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(hf_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(hf_name, num_labels=num_labels)

    def tokenize_fn(batch):
        """Tokenize a batch of texts; padding is deferred to the data collator."""
        return tokenizer(batch["text"], truncation=True, padding=False, max_length=MAX_LENGTH)

    tokenized_train = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_test  = test_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

    # Pads each batch at collation time instead of padding at tokenization time.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # NOTE(review): with load_best_model_at_end=False the weights saved below are
    # the ones from the final epoch; metric_for_best_model / greater_is_better do
    # not change which weights end up in model_out_dir.
    training_args = TrainingArguments(
        output_dir=str(model_out_dir),
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        disable_tqdm=False,
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=False,
        push_to_hub=False
    )

    # NOTE(review): the per-epoch evaluation set is the test split itself, so the
    # per-epoch numbers are test metrics, not held-out validation metrics.
    # NOTE(review): newer transformers releases warn that `tokenizer=` is
    # superseded by `processing_class=` -- confirm against the installed version
    # before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    print("Beginning training...")
    train_result = trainer.train()
    trainer.save_model(str(model_out_dir))
    print("Model saved to:", model_out_dir)

    # Snapshot the per-epoch evaluation entries BEFORE the extra evaluate() call.
    # evaluate() logs its metrics as well, which appends one more "eval_*" entry
    # to log_history; reading the history afterwards would duplicate the final
    # epoch in the results CSV and in the score plot.
    epoch_logs = [entry for entry in trainer.state.log_history if "eval_accuracy" in entry]

    print("Running final evaluation on test set...")
    metrics = trainer.evaluate(eval_dataset=tokenized_test)
    print("Final metrics:", metrics)

    if epoch_logs:
        epoch_nums = [entry.get("epoch") for entry in epoch_logs]
        epoch_acc = [entry.get("eval_accuracy") for entry in epoch_logs]
        epoch_prec = [entry.get("eval_precision") for entry in epoch_logs]
        epoch_rec = [entry.get("eval_recall") for entry in epoch_logs]
        epoch_f1 = [entry.get("eval_f1") for entry in epoch_logs]
    else:
        # No per-epoch evaluation was logged: fall back to repeating the final
        # metrics so the CSV and the plot still have NUM_EPOCHS points.
        epoch_nums = [i+1 for i in range(NUM_EPOCHS)]
        epoch_acc = [metrics.get("eval_accuracy")] * NUM_EPOCHS
        epoch_prec = [metrics.get("eval_precision")] * NUM_EPOCHS
        epoch_rec = [metrics.get("eval_recall")] * NUM_EPOCHS
        epoch_f1 = [metrics.get("eval_f1")] * NUM_EPOCHS

    # Per-epoch metrics: one row per epoch.
    results_df = pd.DataFrame({
        "epoch": epoch_nums,
        "accuracy": epoch_acc,
        "precision": epoch_prec,
        "recall": epoch_rec,
        "f1": epoch_f1
    })
    results_csv = proc_out_dir / f"{slug}_results.csv"
    results_df.to_csv(results_csv, index=False)
    print("Saved epoch metrics CSV ->", results_csv)

    # Final metrics: a single-row CSV, later aggregated across models.
    summary_out = proc_out_dir / f"{slug}_final_metrics.csv"
    pd.DataFrame([metrics]).to_csv(summary_out, index=False)
    print("Saved final metrics ->", summary_out)

    # Accuracy and F1 against epoch.
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(epoch_nums, epoch_acc, marker='o', label='Accuracy')
    ax.plot(epoch_nums, epoch_f1, marker='o', label='F1')
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Score")
    ax.set_title(f"{slug} — Accuracy/F1 vs Epoch")
    ax.set_xticks(epoch_nums)
    ax.legend()
    acc_fig_path = fig_out_dir / f"{slug}_accuracy_epoch.png"
    fig.savefig(acc_fig_path, dpi=300, bbox_inches="tight")
    plt.close(fig)  # figures are created in a loop; close to free memory
    print("Saved accuracy vs epoch ->", acc_fig_path)

    print("Computing confusion matrix on test set...")
    preds_output = trainer.predict(tokenized_test)
    pred_labels = np.argmax(preds_output.predictions, axis=-1)
    true_labels = preds_output.label_ids
    # Pin the label set so the matrix always has one row/column per class (even
    # if a class never occurs in the test labels or predictions), which keeps
    # the class-name tick labels aligned with the cells.
    class_ids = list(range(num_labels))
    class_names = list(le.classes_)
    cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
    cm_fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{slug} — Confusion Matrix")
    cm_path = fig_out_dir / f"{slug}_confusion.png"
    cm_fig.savefig(cm_path, dpi=300, bbox_inches="tight")
    plt.close(cm_fig)
    print("Saved confusion matrix ->", cm_path)

    # Store the class-name <-> integer mapping next to the model weights.
    mapping_path = model_out_dir / "label_mapping.csv"
    pd.DataFrame({"label": list(le.classes_), "enc": list(range(len(le.classes_)))}).to_csv(mapping_path, index=False)
    print("Saved label mapping ->", mapping_path)

    print(f"Finished model: {slug}")


Starting training: bert-base-uncased  (HF model: bert-base-uncased)
Loading tokenizer and model...


Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

  trainer = Trainer(


Beginning training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2648,1.159849,0.511111,0.505895,0.511111,0.482648
2,1.1586,1.108542,0.530864,0.528295,0.530864,0.525883
3,1.1109,1.137865,0.520988,0.526071,0.520988,0.503904
4,1.0985,1.117982,0.525926,0.521159,0.525926,0.522495
5,1.0603,1.137752,0.525926,0.51701,0.525926,0.518544




Model saved to: ..\models\nlp1\bert-base-uncased
Running final evaluation on test set...




Final metrics: {'eval_loss': 1.1377522945404053, 'eval_accuracy': 0.5259259259259259, 'eval_precision': 0.5170102137231767, 'eval_recall': 0.5259259259259259, 'eval_f1': 0.5185435697995506, 'eval_runtime': 55.3762, 'eval_samples_per_second': 7.314, 'eval_steps_per_second': 0.921, 'epoch': 5.0}
Saved epoch metrics CSV -> ..\data\processed\nlp1\bert-base-uncased_results.csv
Saved final metrics -> ..\data\processed\nlp1\bert-base-uncased_final_metrics.csv
Saved accuracy vs epoch -> ..\figures\nlp1\bert-base-uncased\bert-base-uncased_accuracy_epoch.png
Computing confusion matrix on test set...




Saved confusion matrix -> ..\figures\nlp1\bert-base-uncased\bert-base-uncased_confusion.png
Saved label mapping -> ..\models\nlp1\bert-base-uncased\label_mapping.csv
Finished model: bert-base-uncased

Starting training: clinical-bert  (HF model: emilyalsentzer/Bio_ClinicalBERT)
Loading tokenizer and model...


Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

  trainer = Trainer(


Beginning training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2928,1.145976,0.488889,0.471226,0.488889,0.462728
2,1.147,1.130662,0.501235,0.50189,0.501235,0.495252
3,1.121,1.133102,0.506173,0.497702,0.506173,0.482497
4,1.1025,1.117291,0.511111,0.506243,0.511111,0.502724
5,1.0692,1.135531,0.503704,0.488789,0.503704,0.488983




Model saved to: ..\models\nlp1\clinical-bert
Running final evaluation on test set...




Final metrics: {'eval_loss': 1.135530948638916, 'eval_accuracy': 0.5037037037037037, 'eval_precision': 0.4887894583697052, 'eval_recall': 0.5037037037037037, 'eval_f1': 0.48898262994942987, 'eval_runtime': 54.8795, 'eval_samples_per_second': 7.38, 'eval_steps_per_second': 0.929, 'epoch': 5.0}
Saved epoch metrics CSV -> ..\data\processed\nlp1\clinical-bert_results.csv
Saved final metrics -> ..\data\processed\nlp1\clinical-bert_final_metrics.csv
Saved accuracy vs epoch -> ..\figures\nlp1\clinical-bert\clinical-bert_accuracy_epoch.png
Computing confusion matrix on test set...




Saved confusion matrix -> ..\figures\nlp1\clinical-bert\clinical-bert_confusion.png
Saved label mapping -> ..\models\nlp1\clinical-bert\label_mapping.csv
Finished model: clinical-bert

Starting training: distilbert-base-uncased  (HF model: distilbert-base-uncased)
Loading tokenizer and model...


Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

  trainer = Trainer(


Beginning training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2402,1.125706,0.533333,0.522432,0.533333,0.520754
2,1.1422,1.116738,0.540741,0.534403,0.540741,0.531281
3,1.0976,1.143461,0.525926,0.52475,0.525926,0.512286
4,1.0908,1.112749,0.528395,0.520856,0.528395,0.52284
5,1.0545,1.122648,0.525926,0.517192,0.525926,0.518885




Model saved to: ..\models\nlp1\distilbert-base-uncased
Running final evaluation on test set...




Final metrics: {'eval_loss': 1.1226482391357422, 'eval_accuracy': 0.5259259259259259, 'eval_precision': 0.5171921671014127, 'eval_recall': 0.5259259259259259, 'eval_f1': 0.5188848603765112, 'eval_runtime': 27.5431, 'eval_samples_per_second': 14.704, 'eval_steps_per_second': 1.852, 'epoch': 5.0}
Saved epoch metrics CSV -> ..\data\processed\nlp1\distilbert-base-uncased_results.csv
Saved final metrics -> ..\data\processed\nlp1\distilbert-base-uncased_final_metrics.csv
Saved accuracy vs epoch -> ..\figures\nlp1\distilbert-base-uncased\distilbert-base-uncased_accuracy_epoch.png
Computing confusion matrix on test set...




Saved confusion matrix -> ..\figures\nlp1\distilbert-base-uncased\distilbert-base-uncased_confusion.png
Saved label mapping -> ..\models\nlp1\distilbert-base-uncased\label_mapping.csv
Finished model: distilbert-base-uncased

Starting training: biobert-base-cased-v1.1  (HF model: dmis-lab/biobert-base-cased-v1.1)
Loading tokenizer and model...


Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

  trainer = Trainer(


Beginning training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2575,1.117386,0.516049,0.497165,0.516049,0.49286
2,1.1655,1.099709,0.535802,0.524676,0.535802,0.524526
3,1.1227,1.15114,0.481481,0.479175,0.481481,0.466543
4,1.0997,1.12301,0.511111,0.500067,0.511111,0.50355
5,1.0751,1.139685,0.491358,0.479045,0.491358,0.482077




Model saved to: ..\models\nlp1\biobert-base-cased-v1.1
Running final evaluation on test set...




Final metrics: {'eval_loss': 1.139685034751892, 'eval_accuracy': 0.49135802469135803, 'eval_precision': 0.47904524034279167, 'eval_recall': 0.49135802469135803, 'eval_f1': 0.48207655074698147, 'eval_runtime': 53.7901, 'eval_samples_per_second': 7.529, 'eval_steps_per_second': 0.948, 'epoch': 5.0}
Saved epoch metrics CSV -> ..\data\processed\nlp1\biobert-base-cased-v1.1_results.csv
Saved final metrics -> ..\data\processed\nlp1\biobert-base-cased-v1.1_final_metrics.csv
Saved accuracy vs epoch -> ..\figures\nlp1\biobert-base-cased-v1.1\biobert-base-cased-v1.1_accuracy_epoch.png
Computing confusion matrix on test set...




Saved confusion matrix -> ..\figures\nlp1\biobert-base-cased-v1.1\biobert-base-cased-v1.1_confusion.png
Saved label mapping -> ..\models\nlp1\biobert-base-cased-v1.1\label_mapping.csv
Finished model: biobert-base-cased-v1.1

Starting training: albert-base-v2  (HF model: albert-base-v2)
Loading tokenizer and model...


Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

Map:   0%|          | 0/405 [00:00<?, ? examples/s]

  trainer = Trainer(


Beginning training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4272,1.272791,0.454321,0.392057,0.454321,0.381978
2,1.2549,1.16999,0.501235,0.456789,0.501235,0.4638
3,1.1929,1.148874,0.511111,0.516957,0.511111,0.50098
4,1.162,1.111626,0.538272,0.541485,0.538272,0.538884
5,1.1133,1.129012,0.508642,0.504674,0.508642,0.503896




Model saved to: ..\models\nlp1\albert-base-v2
Running final evaluation on test set...




Final metrics: {'eval_loss': 1.1290124654769897, 'eval_accuracy': 0.508641975308642, 'eval_precision': 0.504674239211894, 'eval_recall': 0.508641975308642, 'eval_f1': 0.5038964881547559, 'eval_runtime': 61.9079, 'eval_samples_per_second': 6.542, 'eval_steps_per_second': 0.824, 'epoch': 5.0}
Saved epoch metrics CSV -> ..\data\processed\nlp1\albert-base-v2_results.csv
Saved final metrics -> ..\data\processed\nlp1\albert-base-v2_final_metrics.csv
Saved accuracy vs epoch -> ..\figures\nlp1\albert-base-v2\albert-base-v2_accuracy_epoch.png
Computing confusion matrix on test set...




Saved confusion matrix -> ..\figures\nlp1\albert-base-v2\albert-base-v2_confusion.png
Saved label mapping -> ..\models\nlp1\albert-base-v2\label_mapping.csv
Finished model: albert-base-v2


## Aggregate per-model final results

In [7]:
# Collect the single-row final-metrics CSV of every model, tagging each with
# its slug; models without a CSV are reported and skipped.
agg = []
for slug, _model_id in MODELS:
    metrics_csv = PROC_OUT / f"{slug}_final_metrics.csv"
    if not metrics_csv.exists():
        print("Summary missing for", slug)
        continue
    agg.append(pd.read_csv(metrics_csv).assign(model=slug))

if not agg:
    print("No final metrics found.")
else:
    # One row per model; show the headline columns and persist the full table.
    combined = pd.concat(agg, ignore_index=True, sort=False)
    combined_csv = PROC_OUT / "all_nlp_models_summary.csv"
    headline_columns = ["model", "eval_accuracy", "eval_precision", "eval_recall", "eval_f1", "eval_loss"]
    display(combined[headline_columns])
    combined.to_csv(combined_csv, index=False)
    print("Saved combined summary ->", combined_csv)

Unnamed: 0,model,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_loss
0,bert-base-uncased,0.525926,0.51701,0.525926,0.518544,1.137752
1,clinical-bert,0.503704,0.488789,0.503704,0.488983,1.135531
2,distilbert-base-uncased,0.525926,0.517192,0.525926,0.518885,1.122648
3,biobert-base-cased-v1.1,0.491358,0.479045,0.491358,0.482077,1.139685
4,albert-base-v2,0.508642,0.504674,0.508642,0.503896,1.129012


Saved combined summary -> ..\data\processed\nlp1\all_nlp_models_summary.csv
