In [1]:
from pathlib import Path
import pandas as pd

# ===== config =====
# File name of the input CSV; it is joined onto the Data_Warehouse folder
# when the data is loaded.
DATASET_NAME = "combined_mental_condition_dataset_no_stress.csv"

# Option A: set this if you know the path. When `data_warehouse` is already
# defined, the auto-discovery step further down is skipped.
# data_warehouse = Path(r"/absolute/path/to/Data_Warehouse")

# Option B: auto locate Data_Warehouse by walking up from current working folder
def find_data_warehouse(start: Path) -> Path:
    """Return the closest ``Data_Warehouse`` directory at or above ``start``.

    Args:
        start: Folder where the search begins. It is resolved to an absolute
            path first so that a relative ``start`` still exposes all of its
            ancestors.

    Returns:
        Path to the first ``Data_Warehouse`` directory found while walking
        from ``start`` up to the filesystem root.

    Raises:
        FileNotFoundError: If no ancestor contains such a directory.
    """
    origin = Path(start).resolve()
    for ancestor in (origin, *origin.parents):
        dw = ancestor / "Data_Warehouse"
        # is_dir() rather than exists(): a regular file that merely carries
        # the same name must not be returned as the data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Resolve `data_warehouse` only if it has not been set already (Option A, or
# an earlier run in the same kernel). Referencing the bare name raises
# NameError when it is undefined, which triggers the auto-discovery.
try:
    data_warehouse
except NameError:
    try:
        script_dir = Path(__file__).resolve().parent  # running as a script
    except NameError:
        # `__file__` is not defined here, so start from the working directory
        script_dir = Path.cwd()                       # running in a notebook
    data_warehouse = find_data_warehouse(script_dir)

# ===== load =====
data_path = data_warehouse / DATASET_NAME
df_ns = pd.read_csv(data_path)

# ===== basic checks =====
required = {"text", "label"}
missing = required - set(df_ns.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# drop empty text or label rows if any
# Missing values have to be removed *before* the cast to str: astype(str)
# turns NaN into the literal string "nan", which would pass the blank filter
# below and end up as a fake sample or a fake "nan" class.
before = len(df_ns)
df_ns = df_ns.dropna(subset=["text", "label"]).copy()

# coerce types and clean minimal issues
df_ns["text"] = df_ns["text"].astype(str)
df_ns["label"] = df_ns["label"].astype(str).str.strip()

# rows that only contain whitespace count as empty as well
df_ns = df_ns[(df_ns["text"].str.strip() != "") & (df_ns["label"].str.strip() != "")]
dropped = before - len(df_ns)

# sanity check for stress leakage
if df_ns["label"].str.lower().eq("stress").any():
    raise ValueError("Found label 'stress' in the no stress dataset. Please verify the input file.")

print(f"Loaded {len(df_ns)} rows from {data_path}")
if dropped:
    print(f"Dropped {dropped} empty text or label rows")

print("\nLabel distribution:")
print(df_ns["label"].value_counts())

print("\nSample rows:")
print(df_ns.sample(min(5, len(df_ns)), random_state=42)[["text", "label"]])


Loaded 4696 rows from d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\combined_mental_condition_dataset_no_stress.csv
Dropped 1 empty text or label rows

Label distribution:
label
depression    2322
suicide        838
none           706
anxiety        416
ptsd           414
Name: count, dtype: int64

Sample rows:
                                                   text       label
1550  Older female fighting depression\nHi I’m a 54 ...  depression
3801  This weekend was terrible, and I guess I wante...        ptsd
3730  Its hard knowing that everyone around you does...        ptsd
584   I am Mark, this is my first post on here. I am...  depression
2240  I'm rewatching Bojack Horseman on Netflix for ...  depression


In [3]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# ===== config =====
RANDOM_STATE = 42
TEST_SIZE = 0.10
VAL_SIZE = 0.10   # portion of the full dataset

# ===== input =====
# df_ns is assumed to exist with columns ["text", "label"]

# ===== manual encode labels =====
# The id lookup is done on lower-cased labels, so the mapping keys must be
# lower-cased too; otherwise a label such as "Anxiety" would map to NaN.
labels_lower = df_ns["label"].str.lower()
unique_labels = sorted(set(labels_lower) - {"none"})

# Assign 0..k-1 to all labels except "none"
class_to_id = {lbl: i for i, lbl in enumerate(unique_labels)}

# Force "none" to the last id. With the four condition labels of this
# dataset that is 4; deriving it from the label count keeps the ids
# contiguous and collision-free if the label set ever changes.
none_id = len(unique_labels)
class_to_id["none"] = none_id

id_to_class = {v: k for k, v in class_to_id.items()}

# Map labels to ids
df_ns = df_ns.copy()
df_ns["label_enc"] = labels_lower.map(class_to_id)
if df_ns["label_enc"].isna().any():
    # Stratified splitting cannot work with unmapped labels; fail early.
    raise ValueError("Some labels could not be mapped to a class id")

print(f"Classes and ids (forced 'none' → {none_id}):")
for k, v in class_to_id.items():
    print(f"{k} -> {v}")

# ===== split 80 10 10 with stratify =====
df_trainval, df_test = train_test_split(
    df_ns,
    test_size=TEST_SIZE,
    stratify=df_ns["label_enc"],
    random_state=RANDOM_STATE,
)

# VAL_SIZE is a share of the full dataset, so rescale it to the share of
# what is left after the test rows were removed.
val_size_relative = VAL_SIZE / (1.0 - TEST_SIZE)  # 0.10 / 0.90 = 0.111...
df_train, df_val = train_test_split(
    df_trainval,
    test_size=val_size_relative,
    stratify=df_trainval["label_enc"],
    random_state=RANDOM_STATE,
)

# ===== quick checks =====
def show_split_stats(name, frame):
    """Print the size of one split followed by its per-class row counts.

    Args:
        name: Title used in the printed header line.
        frame: DataFrame holding a ``label_enc`` column.
    """
    header = f"\n{name} size: {len(frame)}"
    print(header)
    # counts are ordered by class id rather than by frequency
    per_class = frame["label_enc"].value_counts().sort_index()
    print(per_class)

print("\nFinal split sizes")
print(f"Train: {len(df_train)}  Validation: {len(df_val)}  Test: {len(df_test)}")
for split_title, split_frame in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    show_split_stats(split_title, split_frame)

# ===== save artifacts =====
# Fall back to the working directory when no Data_Warehouse was resolved.
try:
    data_warehouse
except NameError:
    data_warehouse = Path.cwd()

out_dir = data_warehouse / "mental_health_splits_no_stress"
out_dir.mkdir(parents=True, exist_ok=True)

# one CSV per split, written without the pandas index
for file_name, split_frame in (("train.csv", df_train), ("val.csv", df_val), ("test.csv", df_test)):
    split_frame.to_csv(out_dir / file_name, index=False)

# label name -> id lookup table
label_table = pd.Series(class_to_id)
label_table.to_csv(out_dir / "label_classes.csv")

print(f"\nSaved splits and label mapping to {out_dir.resolve()}")


Classes and ids (forced 'none' → 4):
anxiety -> 0
depression -> 1
ptsd -> 2
suicide -> 3
none -> 4

Final split sizes
Train: 3756  Validation: 470  Test: 470

Train size: 3756
label_enc
0     332
1    1858
2     332
3     670
4     564
Name: count, dtype: int64

Validation size: 470
label_enc
0     42
1    232
2     41
3     84
4     71
Name: count, dtype: int64

Test size: 470
label_enc
0     42
1    232
2     41
3     84
4     71
Name: count, dtype: int64

Saved splits and label mapping to D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress


In [5]:
# ===== baselines with GPU for embeddings and calibrated SVM for AUC =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

import joblib

# RAPIDS cuML is optional: use its GPU estimators when the import works,
# otherwise stay on scikit-learn.
try:
    import cuml
    from cuml.linear_model import LogisticRegression as cuLogisticRegression
    from cuml.svm import SVC as cuSVC
except Exception:
    use_cuml = False
else:
    use_cuml = True

RANDOM_STATE = 42

# The three split frames must already exist and carry these columns.
for _split_frame in (df_train, df_val, df_test):
    assert {"text", "label", "label_enc"}.issubset(_split_frame.columns)

# ===== embeddings on GPU if available =====
device = "cuda" if torch.cuda.is_available() else "cpu"
st_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1", device=device)

def embed_texts(model, texts, batch_size=256, show_progress=True):
    """Encode an iterable of texts with ``model.encode``.

    Args:
        model: Object exposing an ``encode`` method (a SentenceTransformer in
            this notebook).
        texts: Iterable of strings; it is materialised into a list first.
        batch_size: Batch size forwarded to ``encode``.
        show_progress: Forwarded as ``show_progress_bar``.

    Returns:
        Whatever ``model.encode`` returns; NumPy output and normalised
        embeddings are requested.
    """
    sentences = list(texts)
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": show_progress,
    }
    return model.encode(sentences, **encode_options)

# Embed each split in turn (train, then validation, then test) and pull out
# the matching label vector right after it.
embedded = {}
for split_key, split_df in (("train", df_train), ("val", df_val), ("test", df_test)):
    split_vectors = embed_texts(st_model, split_df["text"])
    split_targets = split_df["label_enc"].to_numpy()
    embedded[split_key] = (split_vectors, split_targets)

X_train, y_train = embedded["train"]
X_val, y_val = embedded["val"]
X_test, y_test = embedded["test"]

print("Embedding shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Embeddings computed on device:", device)

# ===== helper to evaluate =====
def evaluate_model(name, model, X, y_true, proba=None):
    """Score a fitted classifier on one split, print a report, return metrics.

    Args:
        name: Title printed above the report.
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y_true: Ground-truth class ids.
        proba: Optional class-probability matrix; when given, a one-vs-rest
            ROC AUC is computed from it.

    Returns:
        Dict with accuracy, macro/weighted precision, recall and F1, and
        ``roc_auc_ovr`` (``None`` when ``proba`` is not supplied).
    """
    y_pred = model.predict(X)

    results = {"accuracy": float(accuracy_score(y_true, y_pred))}
    # zero_division=0 matches the fine-tuning metric helper in this notebook
    # and avoids undefined-metric warnings when a class is never predicted.
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        for averaging in ("macro", "weighted"):
            results[f"{metric_name}_{averaging}"] = float(
                metric_fn(y_true, y_pred, average=averaging, zero_division=0)
            )

    auc = None
    if proba is not None:
        auc = float(roc_auc_score(y_true, proba, multi_class="ovr"))
    results["roc_auc_ovr"] = auc

    print(f"\n=== {name} ===")
    print(f"accuracy: {results['accuracy']:.4f}")
    print(f"precision_macro: {results['precision_macro']:.4f}  precision_weighted: {results['precision_weighted']:.4f}")
    print(f"recall_macro: {results['recall_macro']:.4f}  recall_weighted: {results['recall_weighted']:.4f}")
    print(f"f1_macro: {results['f1_macro']:.4f}  f1_weighted: {results['f1_weighted']:.4f}")
    if auc is not None:
        print(f"roc_auc_ovr: {auc:.4f}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

    return results

# ===== Logistic Regression =====
if use_cuml:
    # cuML Logistic Regression uses GPU. Its constructor does not take the
    # scikit-learn-only `multi_class` / `random_state` arguments, so passing
    # them would raise a TypeError before any training happens.
    logreg = cuLogisticRegression(
        penalty="l2",
        C=1.0,
        max_iter=2000,
        tol=1e-4,
        fit_intercept=True,
        verbose=0,
    )
else:
    # `multi_class` is deprecated in recent scikit-learn releases; with the
    # saga solver a multinomial model is already what gets fitted for
    # multiclass targets, so leaving it out does not change the model.
    logreg = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=1.0,
        max_iter=2000,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

# Both back ends share the fit / predict_proba interface.
logreg.fit(X_train, y_train)
val_proba_lr = logreg.predict_proba(X_val)
test_proba_lr = logreg.predict_proba(X_test)

metrics_val_lr  = evaluate_model("Logistic Regression validation", logreg, X_val, y_val, proba=val_proba_lr)
metrics_test_lr = evaluate_model("Logistic Regression test", logreg, X_test, y_test, proba=test_proba_lr)

# ===== Linear SVM with probability calibration for valid multiclass AUC =====
# Pick the estimator first, then run the shared fit / predict_proba steps once.
if use_cuml:
    # GPU path: linear-kernel SVC with probability estimates switched on
    svm_base = cuSVC(C=1.0, kernel="linear", probability=True, random_state=RANDOM_STATE)
    svm_for_pred = svm_base
else:
    # CPU path: LinearSVC offers no predict_proba, so wrap it in a
    # sigmoid-calibrated classifier with 5-fold cross-validation
    svm_linear = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    svm = CalibratedClassifierCV(svm_linear, method="sigmoid", cv=5)
    svm_for_pred = svm

svm_for_pred.fit(X_train, y_train)
val_proba_svm = svm_for_pred.predict_proba(X_val)
test_proba_svm = svm_for_pred.predict_proba(X_test)

metrics_val_svm  = evaluate_model("Linear SVM validation", svm_for_pred, X_val, y_val, proba=val_proba_svm)
metrics_test_svm = evaluate_model("Linear SVM test", svm_for_pred, X_test, y_test, proba=test_proba_svm)

# ===== persist artifacts and summary =====
# Use the working directory when no Data_Warehouse path is defined.
try:
    data_warehouse
except NameError:
    data_warehouse = Path.cwd()

art_dir = data_warehouse / "mental_health_splits_no_stress" / "baselines_all_roberta_large_v1"
art_dir.mkdir(parents=True, exist_ok=True)

# Save models; the file names record which back end produced them
if use_cuml:
    logreg_file, svm_file = "logreg_cuml.joblib", "linear_svm_cuml.joblib"
else:
    logreg_file, svm_file = "logreg.joblib", "linear_svm_calibrated.joblib"
joblib.dump(logreg, art_dir / logreg_file)
joblib.dump(svm_for_pred, art_dir / svm_file)

# Save metrics and a compact table for quick comparison
metrics = {
    "val_logreg": metrics_val_lr,
    "test_logreg": metrics_test_lr,
    "val_linear_svm": metrics_val_svm,
    "test_linear_svm": metrics_test_svm,
}
with open(art_dir / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# one row per (split, model) pair, in val-then-test order
summary_rows = [
    {"split": split_key, "model": model_key, **metrics[f"{split_key}_{model_key}"]}
    for split_key in ("val", "test")
    for model_key in ("logreg", "linear_svm")
]
summary = pd.DataFrame(summary_rows)
summary.to_csv(art_dir / "metrics_summary.csv", index=False)

print(f"\nSaved models and metrics to {art_dir.resolve()}")
print(summary)


Batches: 100%|██████████| 15/15 [07:50<00:00, 31.35s/it]
Batches: 100%|██████████| 2/2 [01:05<00:00, 33.00s/it]
Batches: 100%|██████████| 2/2 [01:01<00:00, 30.89s/it]


Embedding shapes: (3756, 1024) (470, 1024) (470, 1024)
Embeddings computed on device: cuda

=== Logistic Regression validation ===
accuracy: 0.8000
precision_macro: 0.8005  precision_weighted: 0.7939
recall_macro: 0.7412  recall_weighted: 0.8000
f1_macro: 0.7634  f1_weighted: 0.7899
roc_auc_ovr: 0.9392
              precision    recall  f1-score   support

           0     0.8056    0.6905    0.7436        42
           1     0.7836    0.9052    0.8400       232
           2     0.7647    0.6341    0.6933        41
           3     0.7018    0.4762    0.5674        84
           4     0.9467    1.0000    0.9726        71

    accuracy                         0.8000       470
   macro avg     0.8005    0.7412    0.7634       470
weighted avg     0.7939    0.8000    0.7899       470

Confusion matrix:
 [[ 29  10   2   1   0]
 [  2 210   4  15   1]
 [  5   8  26   1   1]
 [  0  40   2  40   2]
 [  0   0   0   0  71]]

=== Logistic Regression test ===
accuracy: 0.8021
precision_macro: 0.81

In [1]:
# ==========================================
# Fine tune google-bert/bert-large-uncased for multiclass classification
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility & Windows safety ----------
# Remove HF_DISABLE_PROGRESS_BARS if it is set (no error when absent);
# presumably so the Hugging Face progress bars stay visible.
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
# Switch off tokenizer-level parallelism through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Allow TF32 for CUDA matmul and cuDNN operations.
# NOTE(review): this trades some float32 precision for speed on GPUs that
# support TF32 -- confirm that is acceptable for the reported metrics.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Trainer callback that echoes every log event as one console line."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, epoch, loss and learning rate when they are available.

        Nothing is printed when ``logs`` is empty or ``None``. Fields are
        joined with " | " and the output is flushed immediately.
        """
        if not logs:
            return

        fields = []
        step = state.global_step
        if step is not None:
            fields.append(f"step {step}")
        epoch = state.epoch
        if epoch is not None:
            fields.append(f"epoch {epoch:.2f}")
        # (key in logs, printed label, float format)
        for key, label, spec in (("loss", "loss", ".4f"), ("learning_rate", "lr", ".6f")):
            if key in logs:
                fields.append(f"{label} {logs[key]:{spec}}")
        print(" | ".join(fields), flush=True)

# ---------- config ----------
RANDOM_STATE = 42      # passed to TrainingArguments as the seed
MODEL_NAME = "google-bert/bert-large-uncased"  # checkpoint for tokenizer and model
MAX_LENGTH = 512       # tokenizer truncation length
EPOCHS = 5             # number of training epochs
LR = 2e-5              # learning rate
WD = 0.01              # weight decay
TRAIN_BS = 8           # per-device train batch (small for memory)
EVAL_BS = 16           # per-device eval batch
GRAD_ACCUM_STEPS = 8   # effective batch = 8 * 8 = 64
USE_CLASS_WEIGHTS = True  # use inverse-frequency class weights in the loss

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Find the nearest ``Data_Warehouse`` directory at or above ``start``.

    Args:
        start: Folder to begin the upward search from. It is made absolute
            first, so a relative path still yields its full ancestor chain.

    Returns:
        The first ``Data_Warehouse`` directory met on the way to the root.

    Raises:
        FileNotFoundError: If none of the visited folders contains one.
    """
    base = Path(start).resolve()
    for folder in (base, *base.parents):
        dw = folder / "Data_Warehouse"
        # Only accept a real directory; a file with that name is skipped.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# `__file__` exists when run as a script; in a notebook it does not, so the
# working directory is used as the starting point instead.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load splits (already created earlier) ----------
split_dir = data_warehouse / "mental_health_splits_no_stress"
df_train = pd.read_csv(split_dir / "train.csv")
df_val   = pd.read_csv(split_dir / "val.csv")
df_test  = pd.read_csv(split_dir / "test.csv")

# checks: report the columns that are actually absent, not the whole
# required set, so the message points at the real problem
req = {"text", "label", "label_enc"}
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    missing_cols = req - set(df_.columns)
    if missing_cols:
        raise ValueError(f"{name} split missing required columns: {sorted(missing_cols)}")

# ids are assumed to start at 0, so the class count is the largest id + 1
num_labels = int(df_train["label_enc"].max()) + 1
# build mapping from the split (keeps your 'none' -> 4 if that’s how you saved it)
enc_to_label = df_train[["label_enc", "label"]].drop_duplicates().sort_values("label_enc")
id2label = {
    int(enc): str(lbl)
    for enc, lbl in zip(enc_to_label["label_enc"], enc_to_label["label"])
}
label2id = {v: k for k, v in id2label.items()}

print("Label mapping used:")
for k in sorted(id2label.keys()):
    print(f"{k}: {id2label[k]}")

# ---------- Dataset wrapper with dynamic padding ----------
class TextClsDataset(Dataset):
    """Indexable dataset that tokenizes one text per access.

    Texts and integer labels are copied out of the frame once at
    construction; tokenization happens lazily in ``__getitem__`` and leaves
    padding to the batch collator.
    """

    def __init__(self, df, tokenizer):
        """Store ``df["text"]`` as strings and ``df["label_enc"]`` as ints."""
        text_column = df["text"].astype(str)
        label_column = df["label_enc"].astype(int)
        self.texts = text_column.tolist()
        self.labels = label_column.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for sample ``idx`` plus its label."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,          # dynamic padding via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        sample = {}
        for key, tensor in encoded.items():
            # drop the leading batch axis added by return_tensors="pt"
            sample[key] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# ---------- tokenizer & model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Gradient checkpointing is only switched on when a GPU is present.
if torch.cuda.is_available():
    model.gradient_checkpointing_enable()
model.config.use_cache = False
model.to(device)

# ---------- class weights ----------
class_weights = None
if USE_CLASS_WEIGHTS:
    # Reindex onto every id in [0, num_labels): the loss needs exactly one
    # weight per output class, and value_counts() alone only lists the ids
    # that occur in the training split.
    counts = (
        df_train["label_enc"]
        .value_counts()
        .reindex(range(num_labels), fill_value=0)
        .sort_index()
    )
    total = counts.sum()
    # inverse frequency: total / (num_labels * count_c); ids without any
    # training rows get weight 0 instead of an infinite weight
    weights = (total / (num_labels * counts)).where(counts > 0, 0.0)
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", [float(x) for x in class_weights])

# ---------- custom Trainer with robust compute_loss ----------
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits.

        Uses the module-level ``class_weights`` tensor when it is set and
        plain cross-entropy otherwise. Returns ``(loss, outputs)`` when
        ``return_outputs`` is true, else only the loss. Extra keyword
        arguments are accepted and ignored.
        """
        labels = inputs.get("labels")
        features = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**features)

        loss_options = {} if class_weights is None else {"weight": class_weights}
        loss = F.cross_entropy(outputs.logits, labels, **loss_options)

        if return_outputs:
            return loss, outputs
        return loss

# ---------- datasets & collator ----------
# One dataset wrapper per split, all sharing the same tokenizer.
train_ds, val_ds, test_ds = [
    TextClsDataset(split_df, tokenizer) for split_df in (df_train, df_val, df_test)
]

# Pad each batch to a multiple of 8 only when running on a GPU.
pad_multiple = 8 if torch.cuda.is_available() else None
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_multiple)

# ---------- metrics ----------
def compute_metrics_from_logits(logits, labels):
    """Turn raw logits into predictions and a dict of evaluation metrics.

    Args:
        logits: Array of per-class scores, one row per sample.
        labels: True class ids.

    Returns:
        ``(metrics, preds)`` where ``metrics`` holds accuracy, macro/weighted
        precision, recall and F1, and a one-vs-rest ROC AUC (``None`` if the
        AUC computation raises), and ``preds`` is the row-wise argmax.
    """
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    out = {"accuracy": float(accuracy_score(labels, preds))}
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for prefix, scorer in scorers:
        for averaging in ("macro", "weighted"):
            score = scorer(labels, preds, average=averaging, zero_division=0)
            out[f"{prefix}_{averaging}"] = float(score)

    # AUC is best effort: any failure is recorded as None
    try:
        auc = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        auc = None
    out["roc_auc_ovr"] = auc
    return out, preds

# ---------- output dir ----------
out_dir = data_warehouse / "mental_health_splits_no_stress" / "bert_large_multiclass"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments (Windows-friendly) ----------
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,     # <-- critical on Windows to avoid hangs
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # optional (PyTorch 2.0+ on recent GPUs); enable if available
)

TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Pick whichever name the installed version
# accepts so the cell runs warning-free on new releases and still works on
# older ones.
import inspect

_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,   # (we still do manual eval)
    data_collator=collator,
    **{_tokenizer_kwarg: tokenizer},
)
trainer.add_callback(ConsoleLogger())

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# numbers are stored as floats, anything else as its string form
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump({k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in train_result.metrics.items()}, f, indent=2)

# ---------- manual evaluation ----------
val_out = trainer.predict(val_ds)
val_metrics, _ = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# Pin the label set explicitly. Without `labels=`, a class that is absent
# from both the test targets and the predictions would shrink the report and
# the confusion matrix, which then no longer line up with the class names.
all_label_ids = list(range(num_labels))
# ids without a known name fall back to their number
label_names = [id2label.get(i, str(i)) for i in all_label_ids]

rep = classification_report(
    test_out.label_ids, test_preds,
    labels=all_label_ids,
    target_names=label_names,
    digits=4, zero_division=0
)
cm = confusion_matrix(test_out.label_ids, test_preds, labels=all_label_ids)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)

pd.DataFrame(
    cm,
    index=[f"true_{lbl}" for lbl in label_names],
    columns=[f"pred_{lbl}" for lbl in label_names],
).to_csv(out_dir / "test_confusion_matrix.csv", index=True)

print("\nValidation metrics:", val_metrics)
print("Test metrics:", test_metrics)
print("\nSaved full model and metrics to:", out_dir.resolve())


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: none
Torch device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.262650489807129, 0.40430569648742676, 2.262650489807129, 1.1211940050125122, 1.3319149017333984]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.8306
50,1.26
100,0.8372
150,0.6094
200,0.438
250,0.351


step 1 | epoch 0.02 | loss 1.8306 | lr 0.000020
step 50 | epoch 0.85 | loss 1.2600 | lr 0.000017
step 100 | epoch 1.70 | loss 0.8372 | lr 0.000013
step 150 | epoch 2.54 | loss 0.6094 | lr 0.000010
step 200 | epoch 3.39 | loss 0.4380 | lr 0.000007
step 250 | epoch 4.24 | loss 0.3510 | lr 0.000003
step 295 | epoch 5.00
Finished trainer.train()



Validation metrics: {'accuracy': 0.723404255319149, 'precision_macro': 0.728296305957712, 'precision_weighted': 0.7620640699864728, 'recall_macro': 0.761778337532907, 'recall_weighted': 0.723404255319149, 'f1_macro': 0.7347998942531804, 'f1_weighted': 0.730514158415949, 'roc_auc_ovr': 0.9333625458178337}
Test metrics: {'accuracy': 0.8085106382978723, 'precision_macro': 0.8052368226148715, 'precision_weighted': 0.8268573199943205, 'recall_macro': 0.8187317766988812, 'recall_weighted': 0.8085106382978723, 'f1_macro': 0.8084007414176005, 'f1_weighted': 0.8138389978863159, 'roc_auc_ovr': 0.9601324886755472}

Saved full model and metrics to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress\bert_large_multiclass


In [7]:
# ==========================================
# Fine tune sentence-transformers/all-mpnet-base-v2 for multiclass classification
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility & Windows safety ----------
# Drop HF_DISABLE_PROGRESS_BARS when present (silently ignored otherwise);
# presumably to keep the Hugging Face progress bars enabled.
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
# Disable tokenizer-level parallelism through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Permit TF32 in CUDA matmul and cuDNN kernels.
# NOTE(review): TF32 lowers float32 precision slightly on supporting GPUs --
# confirm this is intended for the reported results.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Callback that writes a compact one-line summary for each log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Emit "step | epoch | loss | lr" for the fields that are present.

        Returns without printing when ``logs`` is missing or empty; the
        printed line is flushed right away.
        """
        if logs:
            pieces = []
            if state.global_step is not None:
                pieces.append("step {}".format(state.global_step))
            if state.epoch is not None:
                pieces.append("epoch {:.2f}".format(state.epoch))
            if "loss" in logs:
                pieces.append("loss {:.4f}".format(logs["loss"]))
            if "learning_rate" in logs:
                pieces.append("lr {:.6f}".format(logs["learning_rate"]))
            print(" | ".join(pieces), flush=True)

# ---------- config ----------
RANDOM_STATE = 42        # passed to TrainingArguments as the seed
# NOTE(review): the tokenizer comes from the sentence-transformers checkpoint
# while the model weights are loaded from microsoft/mpnet-base, so the
# all-mpnet-base-v2 weights themselves are not the ones being fine-tuned even
# though the cell title and output folder name suggest so -- confirm this is
# intentional.
TOKENIZER_NAME = "sentence-transformers/all-mpnet-base-v2"  # tokenizer
BACKBONE_NAME  = "microsoft/mpnet-base"                     # classification head
MAX_LENGTH = 512         # tokenizer truncation length
EPOCHS = 5               # number of training epochs
LR = 2e-5                # learning rate
WD = 0.01                # weight decay
TRAIN_BS = 8             # per-device train batch size
EVAL_BS = 16             # per-device eval batch size
GRAD_ACCUM_STEPS = 8     # effective batch = 64 (reduce if OOM)
USE_CLASS_WEIGHTS = True # use inverse-frequency class weights in the loss

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Walk upwards from ``start`` until a ``Data_Warehouse`` directory is found.

    Args:
        start: Starting folder; resolved to an absolute path so that every
            ancestor is visited even when a relative path is given.

    Returns:
        The nearest ``Data_Warehouse`` directory at or above ``start``.

    Raises:
        FileNotFoundError: If the search reaches the root without a match.
    """
    anchor = Path(start).resolve()
    for parent_dir in (anchor, *anchor.parents):
        dw = parent_dir / "Data_Warehouse"
        # A same-named regular file is not a valid data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Script runs expose `__file__`; notebooks do not, so fall back to the
# current working directory there.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load splits ----------
split_dir = data_warehouse / "mental_health_splits_no_stress"
df_train = pd.read_csv(split_dir / "train.csv")
df_val   = pd.read_csv(split_dir / "val.csv")
df_test  = pd.read_csv(split_dir / "test.csv")

# Name the columns that are really missing instead of echoing the full
# required set.
req = {"text", "label", "label_enc"}
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    missing_cols = req - set(df_.columns)
    if missing_cols:
        raise ValueError(f"{name} split missing required columns: {sorted(missing_cols)}")

# ids are assumed to start at 0, so the class count is the largest id + 1
num_labels = int(df_train["label_enc"].max()) + 1
# id -> label name, taken from the distinct pairs found in the training split
enc_to_label = df_train[["label_enc", "label"]].drop_duplicates().sort_values("label_enc")
id2label = {
    int(enc): str(lbl)
    for enc, lbl in zip(enc_to_label["label_enc"], enc_to_label["label"])
}
label2id = {v: k for k, v in id2label.items()}

print("Label mapping used:")
for k in sorted(id2label.keys()):
    print(f"{k}: {id2label[k]}")

# ---------- dataset (dynamic padding) ----------
class TextClsDataset(Dataset):
    """Dataset over (text, label id) pairs with on-demand tokenization.

    The frame is read once in the constructor; each item lookup tokenizes a
    single text without padding, leaving batch padding to the collator.
    """

    def __init__(self, df, tokenizer):
        """Keep the texts (as str), the label ids (as int) and the tokenizer."""
        self.tokenizer = tokenizer
        self.texts = list(df["text"].astype(str).tolist())
        self.labels = list(df["label_enc"].astype(int).tolist())

    def __len__(self):
        """Return how many samples are stored."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and attach its label under ``"labels"``."""
        text, label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding=False,      # pad per-batch via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # strip the batch axis that return_tensors="pt" puts in front
        features = {field: values.squeeze(0) for field, values in tokenized.items()}
        features["labels"] = torch.tensor(label, dtype=torch.long)
        return features

# ---------- tokenizer & model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    BACKBONE_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# MPNet does not support gradient checkpointing → guard it
gc_enabled = False
try:
    if getattr(model, "supports_gradient_checkpointing", False):
        model.gradient_checkpointing_enable()
        gc_enabled = True
except Exception as e:
    print(f"Gradient checkpointing not available for MPNet: {e}")

# some configs lack use_cache; guard it
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

model.to(device)

# ---------- class weights ----------
class_weights = None
if USE_CLASS_WEIGHTS:
    # The loss expects one weight per output class. value_counts() only
    # lists ids that occur in the training split, so reindex onto the full
    # id range to keep the vector length equal to num_labels.
    counts = (
        df_train["label_enc"]
        .value_counts()
        .reindex(range(num_labels), fill_value=0)
        .sort_index()
    )
    total = counts.sum()
    # inverse frequency; ids with no training rows get weight 0, not infinity
    weights = (total / (num_labels * counts)).where(counts > 0, 0.0)
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", [float(x) for x in class_weights])

# ---------- custom Trainer ----------
class WeightedTrainer(Trainer):
    """Trainer variant that applies class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss from the model logits and the batch labels.

        The labels are withheld from the forward pass. The module-level
        ``class_weights`` tensor is used when defined, otherwise the loss is
        unweighted. Additional keyword arguments are accepted and ignored.
        Returns the loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        model_inputs = dict(
            (field, tensor) for field, tensor in inputs.items() if field != "labels"
        )
        outputs = model(**model_inputs)
        logits = outputs.logits

        if class_weights is None:
            loss = F.cross_entropy(logits, labels)
        else:
            loss = F.cross_entropy(logits, labels, weight=class_weights)

        if not return_outputs:
            return loss
        return (loss, outputs)

# ---------- datasets & collator ----------
# Wrap each split frame in a dataset that shares the tokenizer.
train_ds, val_ds, test_ds = [
    TextClsDataset(split_df, tokenizer) for split_df in (df_train, df_val, df_test)
]

# On GPU, pad batches to a multiple of 8; on CPU, pad to the longest item only.
pad_multiple = 8 if torch.cuda.is_available() else None
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_multiple)

# ---------- metrics ----------
def compute_metrics_from_logits(logits, labels):
    """Summarise classification quality from raw logits.

    Args:
        logits: array of per-class scores, one row per example.
        labels: true class ids aligned with the rows of ``logits``.

    Returns:
        ``(metrics, preds)`` where ``metrics`` maps metric names to floats
        (``roc_auc_ovr`` is ``None`` when it cannot be computed) and ``preds``
        holds the argmax class id per row.
    """
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    out = {"accuracy": float(accuracy_score(labels, preds))}
    # Same key order as before: precision, recall, f1 — each macro then weighted.
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for metric_name, scorer in scorers:
        for averaging in ("macro", "weighted"):
            out[f"{metric_name}_{averaging}"] = float(
                scorer(labels, preds, average=averaging, zero_division=0)
            )

    # ROC-AUC is best-effort: any failure is recorded as None instead of aborting.
    try:
        auc_value = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        auc_value = None
    out["roc_auc_ovr"] = auc_value
    return out, preds

# ---------- output dir ----------
# All artifacts of this run (model, tokenizer, metrics) are written below this folder.
out_dir = data_warehouse / "mental_health_splits_no_stress" / "all_mpnet_base_v2_multiclass"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments ----------
# Hyperparameters come from the config constants; mixed precision (fp16) is only
# requested when CUDA is available, and report_to=[] passes no reporting integrations.
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,   # Windows-safe
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # optional on PyTorch 2.x + recent GPUs
)

# Use the class-weighted loss only when class weighting is switched on.
TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer
# NOTE(review): the captured cell output shows a warning pointing at this call —
# presumably about the `tokenizer=` argument; confirm against the installed
# transformers version.
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
)
trainer.add_callback(ConsoleLogger())

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)
# Persist the final model weights and the tokenizer side by side in out_dir.
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# Numeric training metrics are stored as floats; anything else is stringified
# so the dictionary is always JSON-serialisable.
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump({k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in train_result.metrics.items()}, f, indent=2)

# ---------- manual evaluation ----------
# Score the validation and test splits with the model as it stands after training.
val_out = trainer.predict(val_ds)
val_metrics, _ = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# Per-class text report and confusion matrix, test split only.
rep = classification_report(
    test_out.label_ids, test_preds,
    target_names=[id2label[i] for i in range(num_labels)],
    digits=4, zero_division=0
)
cm = confusion_matrix(test_out.label_ids, test_preds)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix as CSV: rows are labelled true_<label>, columns pred_<label>.
pd.DataFrame(
    cm,
    index=[f"true_{id2label[i]}" for i in range(num_labels)],
    columns=[f"pred_{id2label[i]}" for i in range(num_labels)],
).to_csv(out_dir / "test_confusion_matrix.csv", index=True)

print("\nValidation metrics:", val_metrics)
print("Test metrics:", test_metrics)
print("\nSaved MPNet model and metrics to:", out_dir.resolve())


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: none
Torch device: cuda


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.262650489807129, 0.40430569648742676, 2.262650489807129, 1.1211940050125122, 1.3319149017333984]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.6068
50,1.4168
100,1.1258
150,0.9415
200,0.8048
250,0.7397


step 1 | epoch 0.02 | loss 1.6068 | lr 0.000020
step 50 | epoch 0.85 | loss 1.4168 | lr 0.000017
step 100 | epoch 1.70 | loss 1.1258 | lr 0.000013
step 150 | epoch 2.54 | loss 0.9415 | lr 0.000010
step 200 | epoch 3.39 | loss 0.8048 | lr 0.000007
step 250 | epoch 4.24 | loss 0.7397 | lr 0.000003
step 295 | epoch 5.00
Finished trainer.train()



Validation metrics: {'accuracy': 0.7319148936170212, 'precision_macro': 0.7146765996128218, 'precision_weighted': 0.7601665893553777, 'recall_macro': 0.7511117447606864, 'recall_weighted': 0.7319148936170212, 'f1_macro': 0.7241587920832082, 'f1_weighted': 0.737719563727177, 'roc_auc_ovr': 0.9212729605008072}
Test metrics: {'accuracy': 0.7829787234042553, 'precision_macro': 0.7733648399802349, 'precision_weighted': 0.8035480535608602, 'recall_macro': 0.7887544216800665, 'recall_weighted': 0.7829787234042553, 'f1_macro': 0.7778025677880166, 'f1_weighted': 0.7892983734292202, 'roc_auc_ovr': 0.9464913487644114}

Saved MPNet model and metrics to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress\all_mpnet_base_v2_multiclass


In [4]:
# ==========================================
# Fine tune roberta-large (tokenizer: sentence-transformers/all-roberta-large-v1)
# with overfitting checks, early stopping, and label-wise display
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility & Windows safety ----------
# Remove the variable (if set) so progress bars are not disabled by the environment.
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
# Turn off tokenizer-level parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Allow TF32 for CUDA matmuls and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Trainer callback that echoes each log event as one compact console line."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, epoch, loss and learning rate for whichever of them are available.

        Nothing is printed when ``logs`` is empty or ``None``.
        """
        if logs:
            fields = []
            if state.global_step is not None:
                fields.append(f"step {state.global_step}")
            if state.epoch is not None:
                fields.append(f"epoch {state.epoch:.2f}")
            if "loss" in logs:
                fields.append(f"loss {logs['loss']:.4f}")
            if "learning_rate" in logs:
                fields.append(f"lr {logs['learning_rate']:.6f}")
            print(" | ".join(fields), flush=True)

# ---------- config ----------
# Every tunable of this cell lives here; the code below only reads these names.
RANDOM_STATE = 42        # passed to TrainingArguments as the seed
TOKENIZER_NAME = "sentence-transformers/all-roberta-large-v1"  # ST tokenizer
BACKBONE_NAME  = "roberta-large"                               # classification backbone
MAX_LENGTH = 512         # truncation length (in tokens) applied by TextClsDataset
EPOCHS = 5
LR = 2e-5                # learning rate
WD = 0.01                # weight decay
TRAIN_BS = 4             # roberta-large is heavy; keep modest
EVAL_BS = 16
GRAD_ACCUM_STEPS = 8     # effective batch = 4 * 8 = 32
USE_CLASS_WEIGHTS = True # selects WeightedTrainer and inverse-frequency class weights
LABEL_SMOOTHING = 0.0    # e.g., set 0.05 if you observe overfitting

EARLY_STOP_PATIENCE = 2  # epochs without improvement before stopping
EARLY_STOP_MONITOR = "f1_macro"  # validation metric key watched by the early-stopping callback

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Locate the nearest ``Data_Warehouse`` directory at or above ``start``.

    The search checks ``start`` itself first and then each of its parents,
    closest first. Only directories qualify: a regular file that happens to be
    named ``Data_Warehouse`` is skipped so the search continues upwards.

    Args:
        start: folder to begin the upward search from (a ``Path`` or a path string).

    Returns:
        Path of the first ``Data_Warehouse`` directory found.

    Raises:
        FileNotFoundError: if no such directory exists at or above ``start``.
    """
    start = Path(start)
    for candidate_root in (start, *start.parents):
        dw = candidate_root / "Data_Warehouse"
        # is_dir() rather than exists(): the data files are read from inside this folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Start the search from the script's folder when run as a script; in a notebook
# __file__ is undefined, so the current working directory is used instead.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load splits ----------
# The three splits are read from CSV files that must already exist in split_dir.
split_dir = data_warehouse / "mental_health_splits_no_stress"
df_train = pd.read_csv(split_dir / "train.csv")
df_val   = pd.read_csv(split_dir / "val.csv")
df_test  = pd.read_csv(split_dir / "test.csv")

# checks
# Every split must provide the raw text, the string label and its integer encoding.
req = {"text", "label", "label_enc"}
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    missing_cols = req - set(df_.columns)
    if missing_cols:
        # Report only the columns that are actually absent.
        raise ValueError(f"{name} split missing required columns: {sorted(missing_cols)}")

# The label space is derived from the training split: ids are expected to be 0..max.
num_labels = int(df_train["label_enc"].max()) + 1
enc_to_label = df_train[["label_enc", "label"]].drop_duplicates().sort_values("label_enc")
id2label = {
    int(enc): str(lab)
    for enc, lab in zip(enc_to_label["label_enc"], enc_to_label["label"])
}
label2id = {v: k for k, v in id2label.items()}

# Fail early with a readable message instead of a bare KeyError further down
# (or a silently overwritten mapping) when the encoding is incomplete or ambiguous.
absent_ids = [i for i in range(num_labels) if i not in id2label]
if absent_ids:
    raise ValueError(f"train split has no rows for label ids {absent_ids}")
if len(enc_to_label) != len(id2label) or len(label2id) != len(id2label):
    raise ValueError("train split does not map label_enc to label one-to-one")

label_names = [id2label[i] for i in range(num_labels)]

print("Label mapping used:")
for k in sorted(id2label.keys()):
    print(f"{k}: {id2label[k]}")

# ---------- dataset (dynamic padding) ----------
class TextClsDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and attaches its label.

    Padding is deliberately left to the batch collator, so every item keeps its
    own (truncated) length.
    """

    def __init__(self, df, tokenizer):
        """Store texts (as ``str``) and labels (as ``int``) taken from ``df``.

        Args:
            df: frame with a ``text`` column and a ``label_enc`` column.
            tokenizer: callable tokenizer applied lazily in ``__getitem__``.
        """
        self.tokenizer = tokenizer
        text_column = df["text"].astype(str)
        label_column = df["label_enc"].astype(int)
        self.texts = text_column.tolist()
        self.labels = label_column.tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,      # pad per-batch via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch dimension of size one; drop it.
        item = {}
        for key, tensor in encoded.items():
            item[key] = tensor.squeeze(0)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ---------- tokenizer & model ----------
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

# NOTE(review): the tokenizer comes from TOKENIZER_NAME while the weights come
# from BACKBONE_NAME; this presumably relies on both sharing one vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    BACKBONE_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Try to enable gradient checkpointing to save memory
# (attempted on CUDA only; a failure is reported and training continues without it)
if torch.cuda.is_available():
    try:
        model.gradient_checkpointing_enable()
    except Exception as e:
        print(f"Could not enable gradient checkpointing: {e}")

# Not every config defines use_cache, so it is only switched off where present.
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

# Move the model parameters onto the selected device.
model.to(device)

# ---------- class weights ----------
# Inverse-frequency weights: w_c = N / (K * n_c), with N training rows, K classes
# and n_c rows of class c. A perfectly balanced split gives every class weight 1.0.
class_weights = None
if USE_CLASS_WEIGHTS:
    # Reindex over every label id so the tensor always has exactly num_labels
    # entries in label-id order, even if an id never occurs in the train split.
    counts = (
        df_train["label_enc"]
        .value_counts()
        .reindex(range(num_labels), fill_value=0)
    )
    empty_ids = [int(i) for i in counts.index[counts == 0]]
    if empty_ids:
        # Fail here with a clear message instead of a shape error (or an infinite
        # weight) surfacing later inside the loss computation.
        raise ValueError(
            f"Cannot compute class weights: train split has no rows for label ids {empty_ids}"
        )
    total = counts.sum()
    weights = total / (num_labels * counts)  # inverse frequency
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", [float(x) for x in class_weights])

# ---------- custom Trainer (weighted CE + optional label smoothing) ----------
class WeightedTrainer(Trainer):
    """Trainer using cross-entropy with the module-level class weights and label smoothing."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without its labels and compute the loss.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping; the ``"labels"`` entry is withheld from the forward pass.
            return_outputs: when true, return ``(loss, outputs)`` instead of only the loss.
            **kwargs: extra arguments supplied by the caller; accepted and ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)

        # Non-positive settings disable smoothing altogether.
        smoothing = LABEL_SMOOTHING if LABEL_SMOOTHING > 0 else 0.0
        # weight=None is the default of F.cross_entropy, so one call serves both
        # the weighted and the unweighted configuration.
        loss = F.cross_entropy(
            outputs.logits,
            targets,
            weight=class_weights,
            label_smoothing=smoothing,
        )
        if return_outputs:
            return loss, outputs
        return loss

# ---------- datasets & collator ----------
# One dataset wrapper per split, all sharing the same tokenizer.
train_ds = TextClsDataset(df_train, tokenizer)
val_ds   = TextClsDataset(df_val, tokenizer)
test_ds  = TextClsDataset(df_test, tokenizer)

# TextClsDataset leaves items unpadded, so the collator pads each batch; on CUDA
# the padded length is rounded up to a multiple of 8 (presumably for
# GPU-friendly tensor shapes — confirm).
collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)

# ---------- metrics (overall) ----------
def compute_metrics_from_logits(logits, labels):
    """Summarise classification quality from raw logits.

    Args:
        logits: array of per-class scores, one row per example.
        labels: true class ids aligned with the rows of ``logits``.

    Returns:
        ``(metrics, preds, probs)``: ``metrics`` maps metric names to floats
        (``roc_auc_ovr`` is ``None`` when it cannot be computed), ``preds`` is the
        argmax class id per row and ``probs`` the softmax of ``logits``.
    """
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    out = {"accuracy": float(accuracy_score(labels, preds))}
    # Same key order as before: precision, recall, f1 — each macro then weighted.
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for metric_name, scorer in scorers:
        for averaging in ("macro", "weighted"):
            out[f"{metric_name}_{averaging}"] = float(
                scorer(labels, preds, average=averaging, zero_division=0)
            )

    # ROC-AUC is best-effort: any failure is recorded as None instead of aborting.
    try:
        auc_value = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        auc_value = None
    out["roc_auc_ovr"] = auc_value
    return out, preds, probs

# ---------- label-wise metrics (DISPLAY ONLY) ----------
def per_label_report(logits, labels, label_names):
    """Build a per-class metrics table (display only).

    Args:
        logits: array of per-class scores, one row per example.
        labels: true class ids aligned with the rows of ``logits``.
        label_names: class names ordered by class id (position ``i`` names class ``i``).

    Returns:
        DataFrame with one row per class plus the summary rows produced by
        ``classification_report``, restricted to the columns ``precision``,
        ``recall``, ``f1-score``, ``support`` and ``roc_auc_ovr``. The AUC is a
        one-vs-rest score per class and NaN for summary rows or when undefined.
    """
    label_names = list(label_names)
    class_ids = list(range(len(label_names)))
    y_true = np.asarray(labels)
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    # Passing `labels` pins the report to the full class list, so a class that is
    # absent from this split does not make the number of target_names mismatch.
    rep = classification_report(
        y_true, preds,
        labels=class_ids,
        target_names=label_names,
        output_dict=True, zero_division=0,
    )
    df = pd.DataFrame(rep).T

    # add one-vs-rest ROC-AUC per class
    aucs = {}
    for i, name in enumerate(label_names):
        try:
            aucs[name] = float(roc_auc_score((y_true == i).astype(int), probs[:, i]))
        except (ValueError, IndexError):
            # Undefined when the class has no positives/negatives in this split,
            # or when the logits carry no column for this class.
            aucs[name] = np.nan

    # A float Series aligned on the report index keeps the column numeric
    # (summary rows become NaN), so rounding for display always applies.
    df["roc_auc_ovr"] = pd.Series(aucs, dtype=float).reindex(df.index)

    # keep useful columns for display; reindex creates any missing one as NaN
    cols = ["precision", "recall", "f1-score", "support", "roc_auc_ovr"]
    return df.reindex(columns=cols)

def show_labelwise(df, title):
    """Print ``title`` framed by ``=`` rules, followed by ``df`` rounded to 4 decimals."""
    rule = "=" * len(title)
    table_text = df.round(4).to_string()
    # A leading blank line separates this table from whatever was printed before.
    print("\n" + rule, title, rule, table_text, sep="\n")

# ---------- output dir ----------
# All artifacts of this run (models, tokenizer, metrics) are written below this folder.
out_dir = data_warehouse / "mental_health_splits_no_stress" / "all_roberta_large_v1_multiclass"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments (Windows-friendly) ----------
# Hyperparameters come from the config constants. No evaluation or
# checkpoint-selection option is set here: per-epoch validation and early
# stopping are handled by EvalEveryEpochCallback further down.
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,   # Windows: avoid hangs
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # optional on PyTorch 2.x + recent GPUs
)

# Use the class-weighted loss only when class weighting is switched on.
TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer
# NOTE(review): the captured cell output shows a warning pointing at this call —
# presumably about the `tokenizer=` argument; confirm against the installed
# transformers version.
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,   # we run our own eval per epoch too
    tokenizer=tokenizer,
    data_collator=collator,
)

# Echo every log event to the console as a single line.
trainer.add_callback(ConsoleLogger())

# ---------- Overfitting monitor: eval each epoch + early stopping + best checkpoint ----------
class EvalEveryEpochCallback(TrainerCallback):
    """Validate after every epoch, keep the best checkpoint and stop early.

    After each epoch the validation split is scored, the metrics are appended
    as one JSON line to ``<out_dir>/epoch_metrics.jsonl`` and the monitored
    metric is compared with the best value seen so far. An improvement saves
    the model to ``<out_dir>/best``; ``patience`` consecutive epochs without
    improvement request that training stops.

    Args:
        trainer_ref: trainer used for prediction and for saving the model.
        val_ds: validation dataset scored at the end of each epoch.
        out_dir: directory receiving the metrics log and the ``best`` checkpoint.
        patience: number of non-improving epochs tolerated before stopping.
        monitor: key of the metric (higher is better) in the metrics dict.
    """

    def __init__(self, trainer_ref, val_ds, out_dir, patience=2, monitor="f1_macro"):
        self.trainer_ref = trainer_ref
        self.val_ds = val_ds
        self.out_dir = Path(out_dir)
        self.monitor = monitor
        self.best = -float("inf")
        self.bad_epochs = 0
        self.patience = patience
        self.log_path = self.out_dir / "epoch_metrics.jsonl"

    def on_train_begin(self, args, state, control, **kwargs):
        # Make sure the log file's folder exists before the first epoch ends.
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        out = self.trainer_ref.predict(self.val_ds)
        metrics, _, _ = compute_metrics_from_logits(out.predictions, out.label_ids)
        metrics["epoch"] = float(state.epoch)

        # Open, append and close per epoch: no file handle stays open if
        # training is interrupted before on_train_end would have run.
        with open(self.log_path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(metrics) + "\n")

        print(f"[epoch {metrics['epoch']:.2f}] val f1_macro={metrics['f1_macro']:.4f} "
              f"acc={metrics['accuracy']:.4f}", flush=True)

        # A metric may be absent or None (roc_auc_ovr is None when it cannot be
        # computed); treat both as "no improvement" instead of comparing None.
        score = metrics.get(self.monitor)
        if score is None:
            score = -float("inf")

        if score > self.best + 1e-8:
            self.best = score
            self.bad_epochs = 0
            self.trainer_ref.save_model(self.out_dir / "best")
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                print(f"Early stopping: no improvement in {self.patience} epoch(s).", flush=True)
                control.should_training_stop = True

# Register per-epoch validation, best-checkpoint saving and early stopping.
trainer.add_callback(EvalEveryEpochCallback(trainer, val_ds, out_dir, patience=EARLY_STOP_PATIENCE, monitor=EARLY_STOP_MONITOR))

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)

# save "last" model + tokenizer
# (the best-scoring epoch is stored separately under out_dir / "best" by the callback)
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# Numeric training metrics are stored as floats; anything else is stringified
# so the dictionary is always JSON-serialisable.
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump({k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in train_result.metrics.items()}, f, indent=2)

# ---------- manual evaluation (last model) ----------
# These scores describe the model held by `trainer` when training ended, which is
# not necessarily the epoch saved under "best".
val_out = trainer.predict(val_ds)
val_metrics, val_preds, val_probs = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds, test_probs = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

# also evaluate TRAIN to check generalization gap
#train_out = trainer.predict(train_dataset=train_ds)
train_out = trainer.predict(train_ds)
train_metrics, train_preds, train_probs = compute_metrics_from_logits(train_out.predictions, train_out.label_ids)

# save overall metrics
# (one JSON file per split, written into out_dir)
with open(out_dir / "train_eval_metrics.json", "w") as f:
    json.dump(train_metrics, f, indent=2)
with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# ---------- DISPLAY label-wise metrics for each split ----------
# These tables are printed only; they are not written to disk.
train_labelwise = per_label_report(train_out.predictions, train_out.label_ids, label_names)
val_labelwise   = per_label_report(val_out.predictions,   val_out.label_ids,   label_names)
test_labelwise  = per_label_report(test_out.predictions,  test_out.label_ids,  label_names)

show_labelwise(train_labelwise, "Label-wise metrics — TRAIN")
show_labelwise(val_labelwise,   "Label-wise metrics — VAL")
show_labelwise(test_labelwise,  "Label-wise metrics — TEST")

# ---------- classification report & confusion matrix on test ----------
rep = classification_report(
    test_out.label_ids, test_preds,
    target_names=label_names,
    digits=4, zero_division=0
)
cm = confusion_matrix(test_out.label_ids, test_preds)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)
# Confusion matrix as CSV: rows are labelled true_<label>, columns pred_<label>.
pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names],
).to_csv(out_dir / "test_confusion_matrix.csv", index=True)

# ---------- simple gap printout ----------
# Train-minus-validation differences of the last model; larger positive values
# mean the model scores better on data it was trained on than on held-out data.
print("\nGeneralization gaps (train - val):",
      {"acc": train_metrics["accuracy"] - val_metrics["accuracy"],
       "f1_macro": train_metrics["f1_macro"] - val_metrics["f1_macro"],
       "f1_weighted": train_metrics["f1_weighted"] - val_metrics["f1_weighted"]})

print("\nTrain metrics:", train_metrics)
print("Val metrics:", val_metrics)
print("Test metrics:", test_metrics)

print("\nSaved model, best checkpoint (if any), and metrics to:", out_dir.resolve())

# ---------- (Optional) Evaluate the saved 'best' checkpoint and DISPLAY label-wise ----------
# Runs only when a "best" folder exists in out_dir. The checkpoint is reloaded
# from disk into a fresh model and scored with a plain Trainer that reuses the
# same arguments, tokenizer and collator.
best_dir = out_dir / "best"
if best_dir.exists():
    print("\nEvaluating best checkpoint ...")
    best_model = AutoModelForSequenceClassification.from_pretrained(best_dir).to(device)
    best_trainer = Trainer(
        model=best_model,
        args=args,
        tokenizer=tokenizer,
        data_collator=collator,
    )
    b_val = best_trainer.predict(val_ds)
    b_val_metrics, _, _ = compute_metrics_from_logits(b_val.predictions, b_val.label_ids)
    b_test = best_trainer.predict(test_ds)
    b_test_metrics, _, _ = compute_metrics_from_logits(b_test.predictions, b_test.label_ids)

    # These metrics are printed only; nothing from this section is written to disk.
    print("\nBest checkpoint metrics:")
    print("Val:", b_val_metrics)
    print("Test:", b_test_metrics)

    # label-wise display for best test
    b_test_labelwise = per_label_report(b_test.predictions, b_test.label_ids, label_names)
    show_labelwise(b_test_labelwise, "Label-wise metrics — TEST (Best checkpoint)")


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: none
Torch device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.262650489807129, 0.40430569648742676, 2.262650489807129, 1.1211940050125122, 1.3319149017333984]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.7303
50,1.1968
100,0.704
150,0.5011
200,0.4195
250,0.4003
300,0.2804
350,0.2734
400,0.1728
450,0.1641


step 1 | epoch 0.01 | loss 1.7303 | lr 0.000020
step 50 | epoch 0.43 | loss 1.1968 | lr 0.000018
step 100 | epoch 0.85 | loss 0.7040 | lr 0.000017
[epoch 1.00] val f1_macro=0.6953 acc=0.7021
step 150 | epoch 1.27 | loss 0.5011 | lr 0.000015
step 200 | epoch 1.70 | loss 0.4195 | lr 0.000013
[epoch 2.00] val f1_macro=0.7736 acc=0.8064
step 250 | epoch 2.12 | loss 0.4003 | lr 0.000012
step 300 | epoch 2.55 | loss 0.2804 | lr 0.000010
step 350 | epoch 2.97 | loss 0.2734 | lr 0.000008
[epoch 3.00] val f1_macro=0.7879 acc=0.8149
step 400 | epoch 3.39 | loss 0.1728 | lr 0.000006
step 450 | epoch 3.82 | loss 0.1641 | lr 0.000005
[epoch 4.00] val f1_macro=0.7912 acc=0.8191
step 500 | epoch 4.24 | loss 0.1272 | lr 0.000003
step 550 | epoch 4.66 | loss 0.0783 | lr 0.000001
[epoch 5.00] val f1_macro=0.7946 acc=0.8191
step 590 | epoch 5.00
Finished trainer.train()



Label-wise metrics — TRAIN
              precision  recall  f1-score    support  roc_auc_ovr
anxiety          0.9764  0.9970    0.9866   332.0000       0.9998
depression       0.9950  0.9645    0.9795  1858.0000       0.9974
ptsd             0.9970  0.9940    0.9955   332.0000       1.0000
suicide          0.9196  0.9896    0.9533   670.0000       0.9970
none             1.0000  1.0000    1.0000   564.0000       1.0000
accuracy         0.9798  0.9798    0.9798     0.9798          NaN
macro avg        0.9776  0.9890    0.9830  3756.0000          NaN
weighted avg     0.9808  0.9798    0.9799  3756.0000          NaN

Label-wise metrics — VAL
              precision  recall  f1-score   support  roc_auc_ovr
anxiety          0.6731  0.8333    0.7447   42.0000       0.9772
depression       0.8690  0.8578    0.8633  232.0000       0.9456
ptsd             0.8056  0.7073    0.7532   41.0000       0.9810
suicide          0.6548  0.6548    0.6548   84.0000       0.9280
none             0.9710  0.

  best_trainer = Trainer(



Best checkpoint metrics:
Val: {'accuracy': 0.8191489361702128, 'precision_macro': 0.7946809018671559, 'precision_weighted': 0.823076647727828, 'recall_macro': 0.7993665807573221, 'recall_weighted': 0.8191489361702128, 'f1_macro': 0.7946345860413336, 'f1_weighted': 0.8200252475126119, 'roc_auc_ovr': 0.9661677556738134}
Test: {'accuracy': 0.8808510638297873, 'precision_macro': 0.8749145805884936, 'precision_weighted': 0.8867921864341846, 'recall_macro': 0.8691845635066355, 'recall_weighted': 0.8808510638297873, 'f1_macro': 0.8704369633950826, 'f1_weighted': 0.8827005935343879, 'roc_auc_ovr': 0.9792260466017186}

Label-wise metrics — TEST (Best checkpoint)
              precision  recall  f1-score   support  roc_auc_ovr
anxiety          0.7955  0.8333    0.8140   42.0000       0.9705
depression       0.9200  0.8922    0.9059  232.0000       0.9697
ptsd             0.9444  0.8293    0.8831   41.0000       0.9976
suicide          0.7292  0.8333    0.7778   84.0000       0.9596
none        