In [1]:
# Step 1
from pathlib import Path
import pandas as pd

# ===== config =====
DATASET_NAME = "combined_mental_condition_dataset_balanced.csv"

# Option A: set this if you know the path
# data_warehouse = Path(r"/absolute/path/to/Data_Warehouse")

# Option B: auto locate Data_Warehouse by walking up from current working folder
def find_data_warehouse(start: Path) -> Path:
    """Return the nearest ``Data_Warehouse`` directory at or above ``start``.

    Parameters
    ----------
    start : Path
        Folder where the upward search begins. Relative paths are resolved
        first so that every ancestor is actually visited.

    Returns
    -------
    Path
        ``<ancestor>/Data_Warehouse`` for the closest ancestor (including
        ``start`` itself) that contains such a directory.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains a ``Data_Warehouse`` directory.
    """
    origin = Path(start).resolve()
    for ancestor in (origin, *origin.parents):
        dw = ancestor / "Data_Warehouse"
        # is_dir() instead of exists(): a stray *file* with this name must not
        # be mistaken for the data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Keep data_warehouse if it is already defined (Option A above, or a previous
# run in this kernel); the bare name lookup raises NameError only when unset.
try:
    data_warehouse
except NameError:
    # __file__ is undefined in a notebook kernel, hence the second probe.
    try:
        script_dir = Path(__file__).resolve().parent  # running as a script
    except NameError:
        script_dir = Path.cwd()                       # running in a notebook
    data_warehouse = find_data_warehouse(script_dir)

# ===== load =====
data_path = data_warehouse / DATASET_NAME
df_all = pd.read_csv(data_path)

# ===== basic checks =====
required = {"text", "label"}
missing = required - set(df_all.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Record genuinely missing cells BEFORE the string coercion below:
# astype(str) turns NaN into the literal "nan", which would otherwise slip
# past the empty-value filter and survive as a fake text / fake label.
is_missing = df_all["text"].isna() | df_all["label"].isna()

# coerce types and minimal cleanup
df_all["text"] = df_all["text"].astype(str)
df_all["label"] = df_all["label"].astype(str).str.strip()

# drop any missing or empty text or label
before = len(df_all)
is_blank = (df_all["text"].str.strip() == "") | (df_all["label"] == "")
df_all = df_all[~(is_missing | is_blank)]
dropped = before - len(df_all)

# expected label set and sanity checks
expected_labels = {"anxiety", "depression", "ptsd", "suicide", "stress", "none"}
labels_lower = df_all["label"].str.lower()  # case-insensitive view used for all checks below
observed = set(labels_lower.unique())
missing_expected = expected_labels.difference(observed)

if missing_expected:
    print("Warning: the following expected labels were not found:", sorted(missing_expected))

# final echo
print(f"Loaded {len(df_all)} rows from {data_path}")
if dropped:
    print(f"Dropped {dropped} empty text or label rows")

# Counts are reported in alphabetical label order.
label_counts = labels_lower.value_counts().reindex(sorted(observed), fill_value=0)
print("\nLabel distribution:")
print(label_counts)

# Fixed seed so the preview is the same on every run; capped for tiny frames.
n_preview = min(5, len(df_all))
preview = df_all.sample(n_preview, random_state=42)
print("\nSample rows:")
print(preview[["text", "label"]])


Loaded 6970 rows from d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\combined_mental_condition_dataset_balanced.csv
Dropped 1 empty text or label rows

Label distribution:
label
anxiety        416
depression    2322
none           706
ptsd           414
stress        2274
suicide        838
Name: count, dtype: int64

Sample rows:
                                                   text       label
132   I always sound like I’m about to cry. I get re...     anxiety
3236  How do migraines work? Yesterday I had an AWFU...        none
2168  I have fallen down further, when I thought I c...  depression
4086  I've been planning on cleaning but then stupid...      stress
4410  This week has been really rough, for some reas...      stress


In [2]:
# Step 2
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# ===== config =====
RANDOM_STATE = 42
TEST_SIZE = 0.10
VAL_SIZE = 0.10   # portion of the full dataset

# ===== input =====
# df_all is assumed to exist with columns ["text", "label"]

# ===== fixed label to id mapping with 'none' = 5 =====
# Ids are the positions in canonical_order, so the list is the single place
# where the label <-> id correspondence is defined.
canonical_order = ["anxiety", "depression", "ptsd", "suicide", "stress", "none"]
class_to_id = {label: idx for idx, label in enumerate(canonical_order)}
id_to_class = dict(enumerate(canonical_order))

# normalize and map
# assign() builds a new frame, so the object bound to df_all before this cell
# is not modified.
df_all = df_all.assign(label_norm=df_all["label"].str.lower().str.strip())

# Fail fast on any label that has no id in the fixed mapping.
unknown = sorted(set(df_all["label_norm"].unique()).difference(class_to_id))
if unknown:
    raise ValueError(f"Found unknown labels in dataset: {unknown}")

df_all["label_enc"] = df_all["label_norm"].map(class_to_id)

print("Classes and ids:")
for name in canonical_order:
    print(f"{name} -> {class_to_id[name]}")

# ===== split 80 10 10 with stratify =====
# Stage 1: hold out the test set, stratified on the encoded label.
df_trainval, df_test = train_test_split(
    df_all,
    test_size=TEST_SIZE,
    stratify=df_all["label_enc"],
    random_state=RANDOM_STATE,
)

# Stage 2: VAL_SIZE is expressed as a share of the full dataset, so it is
# rescaled to a share of the remaining train+val portion before splitting.
val_size_relative = VAL_SIZE / (1.0 - TEST_SIZE)  # 0.10 / 0.90
df_train, df_val = train_test_split(
    df_trainval,
    test_size=val_size_relative,
    stratify=df_trainval["label_enc"],
    random_state=RANDOM_STATE,
)

# ===== quick checks =====
def show_split_stats(name, frame):
    """Print a split's row count and its class counts, first by id then by name.

    ``name`` is the text used in the printed headings; ``frame`` must carry
    the ``label_enc`` and ``label_norm`` columns. Counts by name are listed in
    ``canonical_order``, with 0 for classes absent from the split.
    """
    print(f"\n{name} size: {len(frame)}")

    by_id = frame["label_enc"].value_counts().sort_index()
    print(by_id)

    by_name = frame["label_norm"].value_counts().reindex(canonical_order, fill_value=0)
    heading = "\n" + name + " label counts by name:"
    print(heading)
    print(by_name)

print("\nFinal split sizes")
print(f"Train: {len(df_train)}  Validation: {len(df_val)}  Test: {len(df_test)}")
for _split_title, _split_frame in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    show_split_stats(_split_title, _split_frame)

# ===== save artifacts =====
# Fall back to the working directory when no data_warehouse was set earlier.
try:
    data_warehouse
except NameError:
    data_warehouse = Path.cwd()

out_dir = data_warehouse / "mental_health_splits_with_stress"
out_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split, restricted to the columns the later cells read.
cols_to_save = ["text", "label_norm", "label_enc"]
for _stem, _split_frame in (("train", df_train), ("val", df_val), ("test", df_test)):
    _split_frame[cols_to_save].to_csv(out_dir / f"{_stem}.csv", index=False)

# Label names go out as the index, ids as the "id" column.
label_table = pd.Series(class_to_id).rename("id")
label_table.to_csv(out_dir / "label_classes.csv")

print(f"\nSaved splits and label mapping to {out_dir.resolve()}")


Classes and ids:
anxiety -> 0
depression -> 1
ptsd -> 2
suicide -> 3
stress -> 4
none -> 5

Final split sizes
Train: 5576  Validation: 697  Test: 697

Train size: 5576
label_enc
0     332
1    1858
2     332
3     670
4    1820
5     564
Name: count, dtype: int64

Train label counts by name:
label_norm
anxiety        332
depression    1858
ptsd           332
suicide        670
stress        1820
none           564
Name: count, dtype: int64

Validation size: 697
label_enc
0     42
1    232
2     41
3     84
4    227
5     71
Name: count, dtype: int64

Validation label counts by name:
label_norm
anxiety        42
depression    232
ptsd           41
suicide        84
stress        227
none           71
Name: count, dtype: int64

Test size: 697
label_enc
0     42
1    232
2     41
3     84
4    227
5     71
Name: count, dtype: int64

Test label counts by name:
label_norm
anxiety        42
depression    232
ptsd           41
suicide        84
stress        227
none           71
Name: count,

In [4]:
# ===== baselines with GPU for embeddings and calibrated SVM for AUC =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

import joblib

# Optional GPU accelerated models via RAPIDS cuML if available
try:
    import cuml
    from cuml.linear_model import LogisticRegression as cuLogisticRegression
    from cuml.svm import SVC as cuSVC
except Exception:
    # Any failure while importing cuML disables the GPU estimator branches.
    use_cuml = False
else:
    use_cuml = True

RANDOM_STATE = 42

# ===== locate Data_Warehouse and load splits if needed =====
def find_data_warehouse(start: Path) -> Path:
    """Return the nearest ``Data_Warehouse`` directory at or above ``start``.

    Parameters
    ----------
    start : Path
        Folder where the upward search begins. Relative paths are resolved
        first so that every ancestor is actually visited.

    Returns
    -------
    Path
        ``<ancestor>/Data_Warehouse`` for the closest ancestor (including
        ``start`` itself) that contains such a directory.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains a ``Data_Warehouse`` directory.
    """
    origin = Path(start).resolve()
    for ancestor in (origin, *origin.parents):
        dw = ancestor / "Data_Warehouse"
        # is_dir() instead of exists(): a stray *file* with this name must not
        # be mistaken for the data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Reuse data_warehouse if an earlier cell already defined it; otherwise search
# upward from the script folder (or the working directory in a notebook,
# where __file__ is undefined).
try:
    data_warehouse
except NameError:
    try:
        script_dir = Path(__file__).resolve().parent
    except NameError:
        script_dir = Path.cwd()
    data_warehouse = find_data_warehouse(script_dir)

# Folder holding train.csv / val.csv / test.csv read below.
splits_dir = data_warehouse / "mental_health_splits_with_stress"

# If df_train df_val df_test are not already present, load them
# (all three are re-read from disk as soon as any one of them is missing).
globals_present = {"df_train", "df_val", "df_test"}.issubset(globals())
if not globals_present:
    df_train = pd.read_csv(splits_dir / "train.csv")
    df_val   = pd.read_csv(splits_dir / "val.csv")
    df_test  = pd.read_csv(splits_dir / "test.csv")

# These columns are saved by your Step 2 script
required_cols = {"text", "label_norm", "label_enc"}
for _split_frame in (df_train, df_val, df_test):
    assert required_cols.issubset(_split_frame.columns)

# ===== embeddings on GPU if available =====
# Pick CUDA when torch reports it, else CPU, and place the sentence encoder there.
device = "cuda" if torch.cuda.is_available() else "cpu"
st_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1", device=device)

def embed_texts(model, texts, batch_size=256, show_progress=True):
    """Encode ``texts`` with ``model.encode`` and return whatever it returns.

    ``texts`` may be any iterable; it is materialised into a list first.
    The encoder is always asked for NumPy output with normalised embeddings;
    ``batch_size`` and ``show_progress`` are forwarded as given.
    """
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": show_progress,
    }
    sentences = [*texts]
    return model.encode(sentences, **encode_options)

# Targets first (cheap column extraction), then one encoder pass per split.
y_train, y_val, y_test = (
    _split_frame["label_enc"].to_numpy() for _split_frame in (df_train, df_val, df_test)
)

X_train = embed_texts(st_model, df_train["text"])
X_val = embed_texts(st_model, df_val["text"])
X_test = embed_texts(st_model, df_test["text"])

print("Embedding shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Embeddings computed on device:", device)

# ===== helper to evaluate =====
def evaluate_model(name, model, X, y_true, proba=None):
    """Score ``model`` on ``(X, y_true)``, print a report and return the metrics.

    Predictions come from ``model.predict(X)``. Accuracy plus macro- and
    weighted-averaged precision, recall and F1 are always computed; a
    one-vs-rest ROC AUC is added only when ``proba`` is supplied. ``name`` is
    used solely in the printed heading.

    Returns a dict of plain floats keyed ``accuracy``, ``precision_macro``,
    ``precision_weighted``, ``recall_macro``, ``recall_weighted``,
    ``f1_macro``, ``f1_weighted`` and ``roc_auc_ovr`` (``None`` without
    ``proba``).
    """
    y_pred = model.predict(X)

    scores = {"accuracy": float(accuracy_score(y_true, y_pred))}
    metric_families = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for family, metric_fn in metric_families:
        for averaging in ("macro", "weighted"):
            scores[f"{family}_{averaging}"] = float(metric_fn(y_true, y_pred, average=averaging))

    if proba is None:
        scores["roc_auc_ovr"] = None
    else:
        scores["roc_auc_ovr"] = float(roc_auc_score(y_true, proba, multi_class="ovr"))

    print(f"\n=== {name} ===")
    print(f"accuracy: {scores['accuracy']:.4f}")
    for family, _ in metric_families:
        macro_value = scores[f"{family}_macro"]
        weighted_value = scores[f"{family}_weighted"]
        print(f"{family}_macro: {macro_value:.4f}  {family}_weighted: {weighted_value:.4f}")
    if scores["roc_auc_ovr"] is not None:
        print(f"roc_auc_ovr: {scores['roc_auc_ovr']:.4f}")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

    return scores

# ===== Logistic Regression =====
# Only the constructor is backend specific; fitting and scoring are shared so
# the two paths cannot drift apart.
if use_cuml:
    # cuML's LogisticRegression signature (per the RAPIDS API reference) has
    # no `multi_class` or `random_state` parameter, so passing them makes the
    # GPU path fail with TypeError before any training happens.
    logreg = cuLogisticRegression(
        penalty="l2",
        C=1.0,
        max_iter=2000,
        tol=1e-4,
        fit_intercept=True,
        verbose=0,
    )
else:
    # `multi_class` is deprecated in recent scikit-learn releases; with the
    # saga solver the default already fits a multinomial model when there are
    # more than two classes, so dropping it keeps the same estimator.
    logreg = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=1.0,
        max_iter=2000,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

logreg.fit(X_train, y_train)
val_proba_lr = logreg.predict_proba(X_val)
test_proba_lr = logreg.predict_proba(X_test)

metrics_val_lr  = evaluate_model("Logistic Regression validation", logreg, X_val, y_val, proba=val_proba_lr)
metrics_test_lr = evaluate_model("Logistic Regression test", logreg, X_test, y_test, proba=test_proba_lr)

# ===== Linear SVM with probability calibration for valid multiclass AUC =====
# Build the backend-specific estimator first; both expose fit / predict /
# predict_proba, so training and scoring below are written once.
if use_cuml:
    svm_base = cuSVC(C=1.0, kernel="linear", probability=True, random_state=RANDOM_STATE)
    svm_for_pred = svm_base
else:
    # LinearSVC is wrapped in a sigmoid calibrator (5-fold) to obtain
    # class probabilities.
    svm_linear = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    svm = CalibratedClassifierCV(svm_linear, method="sigmoid", cv=5)
    svm_for_pred = svm

svm_for_pred.fit(X_train, y_train)
val_proba_svm = svm_for_pred.predict_proba(X_val)
test_proba_svm = svm_for_pred.predict_proba(X_test)

metrics_val_svm  = evaluate_model("Linear SVM validation", svm_for_pred, X_val, y_val, proba=val_proba_svm)
metrics_test_svm = evaluate_model("Linear SVM test", svm_for_pred, X_test, y_test, proba=test_proba_svm)

# ===== persist artifacts and summary =====
art_dir = splits_dir / "baselines_all_roberta_large_v1_none5"
art_dir.mkdir(parents=True, exist_ok=True)

# Save models (file names record which backend produced them)
if use_cuml:
    logreg_file, svm_file = "logreg_cuml.joblib", "linear_svm_cuml.joblib"
else:
    logreg_file, svm_file = "logreg.joblib", "linear_svm_calibrated.joblib"
joblib.dump(logreg, art_dir / logreg_file)
joblib.dump(svm_for_pred, art_dir / svm_file)

# Save metrics and a compact table for quick comparison
metrics = {
    "val_logreg": metrics_val_lr,
    "test_logreg": metrics_test_lr,
    "val_linear_svm": metrics_val_svm,
    "test_linear_svm": metrics_test_svm,
}
with open(art_dir / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# One row per (split, model); the metric dict supplies the remaining columns.
summary_rows = []
for split in ("val", "test"):
    for model_name in ("logreg", "linear_svm"):
        m = metrics[f"{split}_{model_name}"]
        summary_rows.append({"split": split, "model": model_name, **m})
summary = pd.DataFrame(summary_rows)
summary.to_csv(art_dir / "metrics_summary.csv", index=False)

print(f"\nSaved models and metrics to {art_dir.resolve()}")
print(summary)


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding shapes: (5576, 1024) (697, 1024) (697, 1024)
Embeddings computed on device: cuda





=== Logistic Regression validation ===
accuracy: 0.7690
precision_macro: 0.7209  precision_weighted: 0.7559
recall_macro: 0.6732  recall_weighted: 0.7690
f1_macro: 0.6905  f1_weighted: 0.7583
roc_auc_ovr: 0.9473
              precision    recall  f1-score   support

           0     0.6667    0.5714    0.6154        42
           1     0.7395    0.8319    0.7830       232
           2     0.6071    0.4146    0.4928        41
           3     0.5254    0.3690    0.4336        84
           4     0.8423    0.8943    0.8675       227
           5     0.9444    0.9577    0.9510        71

    accuracy                         0.7690       697
   macro avg     0.7209    0.6732    0.6905       697
weighted avg     0.7559    0.7690    0.7583       697

Confusion matrix:
 [[ 24  10   3   1   4   0]
 [  3 193   2  23  11   0]
 [  5   3  17   3  13   0]
 [  1  42   0  31   8   2]
 [  3  12   6   1 203   2]
 [  0   1   0   0   2  68]]

=== Logistic Regression test ===
accuracy: 0.7920
precision_m

In [5]:
# ==========================================
# Fine tune google-bert/bert-large-uncased for multiclass classification
# using the with_stress splits and label ids:
# anxiety 0, depression 1, ptsd 2, suicide 3, stress 4, none 5
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility and windows safety ----------
# Drop HF_DISABLE_PROGRESS_BARS if it is set (no error when it is absent).
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Permit TF32 in CUDA matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Trainer callback that prints one compact line for every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``step | epoch | loss | lr`` using whichever fields are present.

        Nothing is printed when ``logs`` is empty or ``None``.
        """
        if not logs:
            return

        parts = []
        step, epoch = state.global_step, state.epoch
        if step is not None:
            parts.append(f"step {step}")
        if epoch is not None:
            parts.append(f"epoch {epoch:.2f}")
        # (key in logs, printed tag, number format)
        for key, tag, spec in (("loss", "loss", ".4f"), ("learning_rate", "lr", ".6f")):
            if key in logs:
                parts.append(f"{tag} {logs[key]:{spec}}")
        print(" | ".join(parts), flush=True)

# ---------- config ----------
# Tunable settings for this fine-tuning run; every later section of the cell
# reads these module-level names.
RANDOM_STATE = 42
MODEL_NAME = "google-bert/bert-large-uncased"
MAX_LENGTH = 256          # 256 is usually enough and faster than 512
EPOCHS = 5
LR = 2e-5
WD = 0.01
TRAIN_BS = 8             # per device train batch
EVAL_BS = 16             # per device eval batch
GRAD_ACCUM_STEPS = 8     # effective batch = 8 * 8 = 64
USE_CLASS_WEIGHTS = True  # True -> inverse-frequency class weights in the loss

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Return the nearest ``Data_Warehouse`` directory at or above ``start``.

    Parameters
    ----------
    start : Path
        Folder where the upward search begins. Relative paths are resolved
        first so that every ancestor is actually visited.

    Returns
    -------
    Path
        ``<ancestor>/Data_Warehouse`` for the closest ancestor (including
        ``start`` itself) that contains such a directory.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains a ``Data_Warehouse`` directory.
    """
    origin = Path(start).resolve()
    for ancestor in (origin, *origin.parents):
        dw = ancestor / "Data_Warehouse"
        # is_dir() instead of exists(): a stray *file* with this name must not
        # be mistaken for the data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# __file__ only exists when running as a script; a notebook uses the cwd.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load with_stress splits ----------
split_dir = data_warehouse / "mental_health_splits_with_stress"
df_train, df_val, df_test = (
    pd.read_csv(split_dir / f"{_stem}.csv") for _stem in ("train", "val", "test")
)

# accept either label_norm or label for readability
label_col = "label_norm" if "label_norm" in df_train.columns else "label"
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    req = {"text", label_col, "label_enc"}
    if req - set(df_.columns):
        raise ValueError(f"{name} split missing required columns: {req}")

# Number of classes is taken from the largest id seen in the training split.
num_labels = int(df_train["label_enc"].max()) + 1

# build id2label and label2id from the split to stay aligned with ids
enc_to_label = df_train[["label_enc", label_col]].drop_duplicates().sort_values("label_enc")
id2label = {
    int(_enc): str(_label)
    for _enc, _label in zip(enc_to_label["label_enc"], enc_to_label[label_col])
}
label2id = {_label: _enc for _enc, _label in id2label.items()}
print("Label mapping used:")
for k in range(num_labels):
    print(f"{k}: {id2label[k]}")

# ---------- Dataset wrapper with dynamic padding ----------
class TextClsDataset(Dataset):
    """Dataset that tokenizes one text per access and leaves padding to the collator.

    Texts are taken from the ``text`` column (coerced to ``str``) and targets
    from the ``label_enc`` column (coerced to ``int``) of ``df``.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.texts = df["text"].astype(str).to_list()
        self.labels = df["label_enc"].astype(int).to_list()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer fields for sample ``idx`` plus a ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,          # dynamic padding via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # squeeze(0) strips the leading axis of each field (presumably the
        # size-1 batch axis that comes with return_tensors="pt").
        item = {}
        for field, tensor in encoded.items():
            item[field] = tensor.squeeze(0)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ---------- tokenizer and model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

# Tokenizer and classifier are both loaded from MODEL_NAME; the label maps are
# handed to from_pretrained together with the number of classes.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Gradient checkpointing is switched on only when a GPU is present.
if torch.cuda.is_available():
    model.gradient_checkpointing_enable()
model.config.use_cache = False
model.to(device)

# ---------- class weights ----------
if not USE_CLASS_WEIGHTS:
    class_weights = None
else:
    # Per-class training counts ordered by class id.
    counts = df_train["label_enc"].value_counts().sort_index()
    total = counts.sum()
    weights = total / (num_labels * counts)   # inverse frequency
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", class_weights.tolist())

# ---------- custom Trainer to apply class weights ----------
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without the labels and compute the (weighted) loss.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``. Extra keyword arguments are accepted and ignored.
        """
        labels = inputs.get("labels")
        features = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**features)
        # weight=None is F.cross_entropy's default, so a single call covers
        # both the weighted and the unweighted case.
        loss = F.cross_entropy(outputs.logits, labels, weight=class_weights)
        if return_outputs:
            return loss, outputs
        return loss

# ---------- datasets and collator ----------
# One dataset per split, all sharing the same tokenizer.
train_ds = TextClsDataset(df_train, tokenizer)
val_ds   = TextClsDataset(df_val, tokenizer)
test_ds  = TextClsDataset(df_test, tokenizer)

# Batches are padded by the collator; on GPU the padded length is rounded up
# to a multiple of 8, on CPU no rounding is requested.
collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)

# ---------- metrics ----------
def compute_metrics_from_logits(logits, labels):
    """Turn raw logits into classification metrics and hard predictions.

    Parameters
    ----------
    logits : array-like, indexed as (sample, class)
        Raw model scores; converted to float32 before use.
    labels : array-like
        True class ids, one per sample.

    Returns
    -------
    (dict, numpy.ndarray)
        The metric dict (accuracy, macro/weighted precision, recall and F1,
        and ``roc_auc_ovr``) and the argmax predictions. ``roc_auc_ovr`` is
        ``None`` when the AUC cannot be computed; a ``RuntimeWarning`` then
        reports the reason instead of hiding it.
    """
    import warnings  # local import keeps this cell self-contained

    # Work in float32: the run may produce half-precision logits (fp16 is
    # enabled on GPU), and softmax in half precision is lossy and not
    # available on CPU in every torch build.
    logits = np.asarray(logits, dtype=np.float32)
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
    out = {
        "accuracy": float(accuracy_score(labels, preds)),
        "precision_macro": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "precision_weighted": float(precision_score(labels, preds, average="weighted", zero_division=0)),
        "recall_macro": float(recall_score(labels, preds, average="macro", zero_division=0)),
        "recall_weighted": float(recall_score(labels, preds, average="weighted", zero_division=0)),
        "f1_macro": float(f1_score(labels, preds, average="macro", zero_division=0)),
        "f1_weighted": float(f1_score(labels, preds, average="weighted", zero_division=0)),
    }
    try:
        out["roc_auc_ovr"] = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception as exc:
        # Still best effort (the other metrics remain usable), but no longer
        # silent: a missing AUC in the saved JSON is now explained.
        warnings.warn(f"roc_auc_ovr could not be computed: {exc}", RuntimeWarning)
        out["roc_auc_ovr"] = None
    return out, preds

# ---------- output dir ----------
# Checkpoints, the final model and all metric files go under this folder.
out_dir = split_dir / "bert_large_with_stress"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments ----------
# NOTE(review): no evaluation or best-checkpoint options are set here, so the
# eval_dataset passed to the trainer is presumably not used during training;
# validation/test numbers come from the explicit predict() calls after
# training — confirm this is intended.
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,     # safer on windows
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # enable if available for your gpu and pytorch
)

# The weighted-loss subclass is used only when class weights are enabled.
TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
)
trainer.add_callback(ConsoleLogger())

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# Numbers are stored as floats, anything else as its string form, so the
# training summary is always JSON serialisable.
train_metrics_json = {}
for _key, _value in train_result.metrics.items():
    train_metrics_json[_key] = float(_value) if isinstance(_value, (int, float)) else str(_value)
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump(train_metrics_json, f, indent=2)

# ---------- manual evaluation ----------
val_out = trainer.predict(val_ds)
val_metrics, _ = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# Class names in id order, shared by the text report and the matrix labels.
class_names = [id2label[i] for i in range(num_labels)]

rep = classification_report(
    test_out.label_ids,
    test_preds,
    target_names=class_names,
    digits=4,
    zero_division=0,
)
cm = confusion_matrix(test_out.label_ids, test_preds)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)

cm_frame = pd.DataFrame(
    cm,
    index=[f"true_{_name}" for _name in class_names],
    columns=[f"pred_{_name}" for _name in class_names],
)
cm_frame.to_csv(out_dir / "test_confusion_matrix.csv", index=True)

print("\nValidation metrics:", val_metrics)
print("Test metrics:", test_metrics)
print("\nSaved full model and metrics to:", out_dir.resolve())


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: stress
5: none
Torch device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.799196720123291, 0.5001794099807739, 2.799196720123291, 1.3870646953582764, 0.5106227397918701, 1.647754192352295]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.9689
50,1.5067
100,1.0325
150,0.7335
200,0.5788
250,0.4717
300,0.3702
350,0.3128
400,0.2436


step 1 | epoch 0.01 | loss 1.9689 | lr 0.000020
step 50 | epoch 0.57 | loss 1.5067 | lr 0.000018
step 100 | epoch 1.14 | loss 1.0325 | lr 0.000016
step 150 | epoch 1.71 | loss 0.7335 | lr 0.000013
step 200 | epoch 2.28 | loss 0.5788 | lr 0.000011
step 250 | epoch 2.85 | loss 0.4717 | lr 0.000009
step 300 | epoch 3.41 | loss 0.3702 | lr 0.000006
step 350 | epoch 3.99 | loss 0.3128 | lr 0.000004
step 400 | epoch 4.55 | loss 0.2436 | lr 0.000002
step 440 | epoch 5.00
Finished trainer.train()



Validation metrics: {'accuracy': 0.7661406025824964, 'precision_macro': 0.70922170074391, 'precision_weighted': 0.7898243363525839, 'recall_macro': 0.7561041838909265, 'recall_weighted': 0.7661406025824964, 'f1_macro': 0.7252895924914541, 'f1_weighted': 0.7729052581721234, 'roc_auc_ovr': 0.9522017216091903}
Test metrics: {'accuracy': 0.7919655667144907, 'precision_macro': 0.7397003765274226, 'precision_weighted': 0.8045580865993706, 'recall_macro': 0.7680656297525918, 'recall_weighted': 0.7919655667144907, 'f1_macro': 0.7506335637887044, 'f1_weighted': 0.7959770912560766, 'roc_auc_ovr': 0.9570098201171756}

Saved full model and metrics to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\bert_large_with_stress


In [6]:
# ==========================================
# Fine tune sentence-transformers/all-mpnet-base-v2 for multiclass classification
# Dataset: Data_Warehouse/mental_health_splits_with_stress
# Label ids preserved from splits: anxiety 0, depression 1, ptsd 2, suicide 3, stress 4, none 5
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility & Windows safety ----------
# Drop HF_DISABLE_PROGRESS_BARS if it is set (no error when it is absent).
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Permit TF32 in CUDA matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Trainer callback that echoes each log event as a single console line."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, epoch, loss and learning rate, skipping absent fields.

        Returns without printing when ``logs`` is empty or ``None``.
        """
        if not logs:
            return

        parts = []
        if state.global_step is not None:
            parts.append(f"step {state.global_step}")
        if state.epoch is not None:
            parts.append(f"epoch {state.epoch:.2f}")

        loss_value = logs.get("loss") if "loss" in logs else None
        if "loss" in logs:
            parts.append(f"loss {loss_value:.4f}")
        if "learning_rate" in logs:
            parts.append("lr {:.6f}".format(logs["learning_rate"]))

        print(" | ".join(parts), flush=True)

# ---------- config ----------
# NOTE(review): the tokenizer comes from the sentence-transformers checkpoint,
# but the model weights are loaded from BACKBONE_NAME ("microsoft/mpnet-base"),
# while this cell's header and output directory name refer to
# all-mpnet-base-v2 — confirm which checkpoint is meant to be fine-tuned.
RANDOM_STATE = 42
TOKENIZER_NAME = "sentence-transformers/all-mpnet-base-v2"  # tokenizer
BACKBONE_NAME  = "microsoft/mpnet-base"                     # classification head
MAX_LENGTH = 256        # faster than 512 for short social text
EPOCHS = 5
LR = 2e-5
WD = 0.01
TRAIN_BS = 8
EVAL_BS = 16
GRAD_ACCUM_STEPS = 8     # effective batch = 64
USE_CLASS_WEIGHTS = True  # True -> inverse-frequency class weights in the loss

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Return the nearest ``Data_Warehouse`` directory at or above ``start``.

    Parameters
    ----------
    start : Path
        Folder where the upward search begins. Relative paths are resolved
        first so that every ancestor is actually visited.

    Returns
    -------
    Path
        ``<ancestor>/Data_Warehouse`` for the closest ancestor (including
        ``start`` itself) that contains such a directory.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains a ``Data_Warehouse`` directory.
    """
    origin = Path(start).resolve()
    for ancestor in (origin, *origin.parents):
        dw = ancestor / "Data_Warehouse"
        # is_dir() instead of exists(): a stray *file* with this name must not
        # be mistaken for the data folder.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# __file__ only exists when running as a script; a notebook uses the cwd.
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load splits (with stress) ----------
split_dir = data_warehouse / "mental_health_splits_with_stress"
df_train, df_val, df_test = (
    pd.read_csv(split_dir / f"{_stem}.csv") for _stem in ("train", "val", "test")
)

# accept either label_norm or label (Step 2 saved label_norm)
label_col = "label_norm" if "label_norm" in df_train.columns else "label"
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    req = {"text", label_col, "label_enc"}
    if req - set(df_.columns):
        raise ValueError(f"{name} split missing required columns: {req}")

# Class count from the largest id in the training split; the id <-> name maps
# are read from the distinct (id, name) pairs of that same split.
num_labels = int(df_train["label_enc"].max()) + 1
enc_to_label = df_train[["label_enc", label_col]].drop_duplicates().sort_values("label_enc")
id2label = {
    int(_enc): str(_label)
    for _enc, _label in zip(enc_to_label["label_enc"], enc_to_label[label_col])
}
label2id = {_label: _enc for _enc, _label in id2label.items()}

print("Label mapping used:")
for k in range(num_labels):
    print(f"{k}: {id2label[k]}")

# ---------- dataset (dynamic padding) ----------
class TextClsDataset(Dataset):
    """Dataset yielding unpadded tokenizer output and a label tensor per sample.

    ``df`` supplies the ``text`` column (coerced to ``str``) and the
    ``label_enc`` column (coerced to ``int``); padding is left to the collator.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.texts = df["text"].astype(str).to_list()
        self.labels = df["label_enc"].astype(int).to_list()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` (truncated to MAX_LENGTH) and attach its label."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding=False,      # pad per-batch via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # squeeze(0) strips the leading axis of each field (presumably the
        # size-1 batch axis that comes with return_tensors="pt").
        item = dict((field, tensor.squeeze(0)) for field, tensor in encoded.items())
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

# ---------- tokenizer & model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

# The tokenizer is loaded from TOKENIZER_NAME while the classifier weights are
# loaded from BACKBONE_NAME — two different checkpoint names in this cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    BACKBONE_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# MPNet gradient checkpointing is not always supported; enable if available
# (a failure here is reported and training continues without it).
try:
    if getattr(model, "supports_gradient_checkpointing", False):
        model.gradient_checkpointing_enable()
except Exception as e:
    print(f"Gradient checkpointing not available for MPNet: {e}")

# Only touch use_cache when the config actually defines it.
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

model.to(device)

# ---------- class weights ----------
if not USE_CLASS_WEIGHTS:
    class_weights = None
else:
    # Per-class training counts ordered by class id.
    counts = df_train["label_enc"].value_counts().sort_index()
    total = counts.sum()
    weights = total / (num_labels * counts)  # inverse frequency
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", class_weights.tolist())

# ---------- custom Trainer ----------
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the (optionally weighted) loss here.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise ``loss``.
        """
        targets = inputs.get("labels")
        model_inputs = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**model_inputs)
        # weight=None is F.cross_entropy's default, so one call covers both the
        # weighted and the unweighted case.
        loss = F.cross_entropy(outputs.logits, targets, weight=class_weights)
        if return_outputs:
            return loss, outputs
        return loss

# ---------- datasets & collator ----------
# One lazily-tokenizing dataset per split; all three share the same tokenizer.
train_ds = TextClsDataset(df_train, tokenizer)
val_ds   = TextClsDataset(df_val, tokenizer)
test_ds  = TextClsDataset(df_test, tokenizer)

# Padding is deferred to the collator (the datasets return unpadded encodings).
# On CUDA the padded length is additionally rounded up to a multiple of 8.
collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)

# ---------- metrics ----------
def compute_metrics_from_logits(logits, labels):
    """Score raw logits against integer labels.

    Returns ``(metrics, preds)`` where ``metrics`` holds accuracy plus macro- and
    weighted-averaged precision/recall/F1 and a one-vs-rest ROC-AUC (``None`` when
    it cannot be computed), and ``preds`` is the arg-max class per row.
    """
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    out = {"accuracy": float(accuracy_score(labels, preds))}
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for metric_name, scorer in scorers:
        for averaging in ("macro", "weighted"):
            out[f"{metric_name}_{averaging}"] = float(
                scorer(labels, preds, average=averaging, zero_division=0)
            )

    # ROC-AUC is best effort: any failure is recorded as None instead of aborting.
    try:
        auc = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        auc = None
    out["roc_auc_ovr"] = auc
    return out, preds

# ---------- output dir ----------
# All artefacts of this run (model, tokenizer, metrics) are written below split_dir.
# NOTE(review): the folder name mentions all_mpnet_base_v2 while the weights are
# loaded from BACKBONE_NAME — confirm they refer to the same checkpoint.
out_dir = split_dir / "all_mpnet_base_v2_with_stress_none5"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments ----------
# No evaluation or checkpoint-selection options are set here; validation and test
# scoring is done manually after training.
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,   # Windows-safe
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # optional on PyTorch 2.x + recent GPUs
)

# The weighted-loss subclass is only used when class weighting is switched on.
TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
)
trainer.add_callback(ConsoleLogger())

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)
# Model and tokenizer are saved side by side in out_dir.
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# Numeric metric values are stored as floats; anything else is stringified so the
# dict is always JSON-serialisable.
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump({k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in train_result.metrics.items()}, f, indent=2)

# ---------- manual evaluation ----------
# Score the trained model on the validation and test splits from raw logits.
val_out = trainer.predict(val_ds)
val_metrics, _ = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# Pin the label set explicitly: without `labels=`, scikit-learn derives the classes
# from the data, so a class missing from both the test labels and the predictions
# would make `target_names` mismatch and shrink the confusion matrix below
# num_labels x num_labels (breaking the DataFrame index/columns further down).
all_label_ids = list(range(num_labels))
all_label_names = [id2label[i] for i in all_label_ids]

rep = classification_report(
    test_out.label_ids, test_preds,
    labels=all_label_ids,
    target_names=all_label_names,
    digits=4, zero_division=0
)
cm = confusion_matrix(test_out.label_ids, test_preds, labels=all_label_ids)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)

# Rows are true classes, columns are predicted classes.
pd.DataFrame(
    cm,
    index=[f"true_{name}" for name in all_label_names],
    columns=[f"pred_{name}" for name in all_label_names],
).to_csv(out_dir / "test_confusion_matrix.csv", index=True)

print("\nValidation metrics:", val_metrics)
print("Test metrics:", test_metrics)
print("\nSaved MPNet model and metrics to:", out_dir.resolve())


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: stress
5: none
Torch device: cuda


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.799196720123291, 0.5001794099807739, 2.799196720123291, 1.3870646953582764, 0.5106227397918701, 1.647754192352295]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.7949
50,1.6446
100,1.2998
150,1.112
200,0.9804
250,0.8895
300,0.8073
350,0.7784
400,0.7231


step 1 | epoch 0.01 | loss 1.7949 | lr 0.000020
step 50 | epoch 0.57 | loss 1.6446 | lr 0.000018
step 100 | epoch 1.14 | loss 1.2998 | lr 0.000016
step 150 | epoch 1.71 | loss 1.1120 | lr 0.000013
step 200 | epoch 2.28 | loss 0.9804 | lr 0.000011
step 250 | epoch 2.85 | loss 0.8895 | lr 0.000009
step 300 | epoch 3.41 | loss 0.8073 | lr 0.000006
step 350 | epoch 3.99 | loss 0.7784 | lr 0.000004
step 400 | epoch 4.55 | loss 0.7231 | lr 0.000002
step 440 | epoch 5.00
Finished trainer.train()



Validation metrics: {'accuracy': 0.7589670014347202, 'precision_macro': 0.7044509493515791, 'precision_weighted': 0.8053597583209949, 'recall_macro': 0.7698897287742175, 'recall_weighted': 0.7589670014347202, 'f1_macro': 0.7222192495132426, 'f1_weighted': 0.769913016663927, 'roc_auc_ovr': 0.9506540530295969}
Test metrics: {'accuracy': 0.7919655667144907, 'precision_macro': 0.7339690340084853, 'precision_weighted': 0.8150682474836102, 'recall_macro': 0.7727898153367679, 'recall_weighted': 0.7919655667144907, 'f1_macro': 0.7477005138049471, 'f1_weighted': 0.7982862365338192, 'roc_auc_ovr': 0.9497015352897065}

Saved MPNet model and metrics to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\all_mpnet_base_v2_with_stress_none5


In [7]:
# ==========================================
# Fine tune roberta-large (tokenizer: sentence-transformers/all-roberta-large-v1)
# with overfitting checks, early stopping, and label-wise display
# Dataset: Data_Warehouse/mental_health_splits_with_stress
# Label ids preserved from splits: anxiety 0, depression 1, ptsd 2, suicide 3, stress 4, none 5
# ==========================================

from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from torch.utils.data import Dataset

# ---------- progress visibility & Windows safety ----------
os.environ.pop("HF_DISABLE_PROGRESS_BARS", None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class ConsoleLogger(TrainerCallback):
    """Print one compact, pipe-separated line for every Trainer log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Emit step, epoch, loss and learning rate, skipping whichever is absent."""
        if not logs:
            return

        fields = []
        if state.global_step is not None:
            fields.append(f"step {state.global_step}")
        if state.epoch is not None:
            fields.append(f"epoch {state.epoch:.2f}")
        if "loss" in logs:
            fields.append(f"loss {logs['loss']:.4f}")
        if "learning_rate" in logs:
            fields.append(f"lr {logs['learning_rate']:.6f}")

        line = " | ".join(fields)
        print(line, flush=True)

# ---------- config ----------
RANDOM_STATE = 42
# The tokenizer and the model weights are loaded from two different names.
TOKENIZER_NAME = "sentence-transformers/all-roberta-large-v1"  # ST tokenizer
BACKBONE_NAME  = "roberta-large"                               # classification backbone
MAX_LENGTH = 512         # truncation length used by TextClsDataset
EPOCHS = 5
LR = 2e-5
WD = 0.01
TRAIN_BS = 4             # roberta-large is heavy; keep modest
EVAL_BS = 16
GRAD_ACCUM_STEPS = 8     # effective batch = 4 * 8 = 32
USE_CLASS_WEIGHTS = True
LABEL_SMOOTHING = 0.0    # set ~0.05 if you observe overfitting

EARLY_STOP_PATIENCE = 2  # epochs without improvement before stopping
# Must be one of the keys produced by compute_metrics_from_logits.
EARLY_STOP_MONITOR = "f1_macro"

# ---------- locate Data_Warehouse ----------
def find_data_warehouse(start: Path) -> Path:
    """Return the closest ``Data_Warehouse`` entry at or above ``start``.

    ``start`` itself is checked first, then each of its parents in order.

    Raises:
        FileNotFoundError: if no ancestor contains an entry named ``Data_Warehouse``.
    """
    candidates = (base / "Data_Warehouse" for base in (start, *start.parents))
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is None:
        raise FileNotFoundError("Could not locate a folder named Data_Warehouse")
    return found

# Resolve the starting folder: the script's own directory when run as a file,
# the current working directory when run inside a notebook (no __file__ there).
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()
data_warehouse = find_data_warehouse(script_dir)

# ---------- load splits (WITH STRESS) ----------
split_dir = data_warehouse / "mental_health_splits_with_stress"
df_train = pd.read_csv(split_dir / "train.csv")
df_val   = pd.read_csv(split_dir / "val.csv")
df_test  = pd.read_csv(split_dir / "test.csv")

# prefer label_norm if present
label_col = "label_norm" if "label_norm" in df_train.columns else "label"
for name, df_ in [("train", df_train), ("val", df_val), ("test", df_test)]:
    req = {"text", label_col, "label_enc"}
    # Report only the columns that are actually absent, not the whole required set.
    missing_cols = req - set(df_.columns)
    if missing_cols:
        raise ValueError(f"{name} split missing required columns: {sorted(missing_cols)}")

# Label ids come from the training split; names are looked up from the same rows.
num_labels = int(df_train["label_enc"].max()) + 1
enc_to_label = df_train[["label_enc", label_col]].drop_duplicates().sort_values("label_enc")
id2label = {int(r.label_enc): str(r[label_col]) for _, r in enc_to_label.iterrows()}

# Every id in 0..num_labels-1 needs a name; fail with a clear message instead of a
# bare KeyError when the training split has a gap in its label ids.
unnamed_ids = [i for i in range(num_labels) if i not in id2label]
if unnamed_ids:
    raise ValueError(f"train split has no rows for label_enc ids: {unnamed_ids}")

label2id = {v: k for k, v in id2label.items()}
label_names = [id2label[i] for i in range(num_labels)]

print("Label mapping used:")
for k in range(num_labels):
    print(f"{k}: {id2label[k]}")

# ---------- dataset (dynamic padding) ----------
class TextClsDataset(Dataset):
    """Lazily tokenized classification dataset built from a split DataFrame.

    Uses the ``text`` column as model input and ``label_enc`` as the integer
    target. Encodings are truncated to ``MAX_LENGTH`` and returned unpadded.
    """

    def __init__(self, df, tokenizer):
        texts = df["text"].astype(str)
        targets = df["label_enc"].astype(int)
        self.texts = texts.tolist()
        self.labels = targets.tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            truncation=True,
            padding=False,      # pad per-batch via collator
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)
        # Squeeze the first axis of every returned tensor before batching.
        example = {field: tensor.squeeze(0) for field, tensor in encoded.items()}
        example["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return example

# ---------- tokenizer & model ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

# The tokenizer comes from TOKENIZER_NAME while the weights come from BACKBONE_NAME.
# NOTE(review): this relies on both using the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    BACKBONE_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Try to enable gradient checkpointing to save memory
# (attempted on CUDA only; a failure is reported and training continues without it)
if torch.cuda.is_available():
    try:
        model.gradient_checkpointing_enable()
    except Exception as e:
        print(f"Could not enable gradient checkpointing: {e}")

# Only touched when the config actually exposes a `use_cache` attribute.
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

model.to(device)

# ---------- class weights ----------
# weight_c = N / (num_labels * n_c), computed from the training split only.
# Counts are sorted by class id so position i of the tensor lines up with logit i.
class_weights = None
if USE_CLASS_WEIGHTS:
    counts = df_train["label_enc"].value_counts().sort_index()
    total = counts.sum()
    weights = total / (num_labels * counts)  # inverse frequency
    class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float, device=device)
    print("Using class weights:", [float(x) for x in class_weights])

# ---------- custom Trainer (weighted CE + optional label smoothing) ----------
class WeightedTrainer(Trainer):
    """Trainer using class-weighted cross-entropy with optional label smoothing.

    Reads the module-level ``class_weights`` and ``LABEL_SMOOTHING`` settings.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Forward the batch without its labels and compute the loss manually.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise ``loss``.
        """
        targets = inputs.get("labels")
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)

        # Non-positive smoothing values are treated as "no smoothing".
        smoothing = LABEL_SMOOTHING if LABEL_SMOOTHING > 0 else 0.0
        # weight=None is F.cross_entropy's default, so one call covers both the
        # weighted and the unweighted case.
        loss = F.cross_entropy(
            outputs.logits,
            targets,
            weight=class_weights,
            label_smoothing=smoothing,
        )
        if return_outputs:
            return loss, outputs
        return loss

# ---------- datasets & collator ----------
# One lazily-tokenizing dataset per split; all three share the same tokenizer.
train_ds = TextClsDataset(df_train, tokenizer)
val_ds   = TextClsDataset(df_val, tokenizer)
test_ds  = TextClsDataset(df_test, tokenizer)

# Padding is deferred to the collator (the datasets return unpadded encodings).
# On CUDA the padded length is additionally rounded up to a multiple of 8.
collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)

# ---------- metrics (overall) ----------
def compute_metrics_from_logits(logits, labels):
    """Score raw logits against integer labels.

    Returns ``(metrics, preds, probs)``: a dict with accuracy, macro/weighted
    precision, recall and F1 plus a one-vs-rest ROC-AUC (``None`` when it cannot
    be computed), the arg-max predictions, and the softmax probabilities.
    """
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    out = {"accuracy": float(accuracy_score(labels, preds))}
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for metric_name, scorer in scorers:
        for averaging in ("macro", "weighted"):
            out[f"{metric_name}_{averaging}"] = float(
                scorer(labels, preds, average=averaging, zero_division=0)
            )

    # ROC-AUC is best effort: any failure is recorded as None instead of aborting.
    try:
        auc = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        auc = None
    out["roc_auc_ovr"] = auc
    return out, preds, probs

# ---------- label-wise metrics (DISPLAY ONLY) ----------
def per_label_report(logits, labels, label_names):
    """Build a per-class metrics table (display only).

    Args:
        logits: 2-D array of raw model scores, one row per example.
        labels: integer class ids, one per example.
        label_names: class names ordered by class id (position ``i`` names class ``i``).

    Returns:
        DataFrame indexed by class name plus scikit-learn's summary rows, with the
        columns ``precision``, ``recall``, ``f1-score``, ``support`` and
        ``roc_auc_ovr``. The AUC is NaN on summary rows and on classes for which
        it could not be computed.
    """
    y_true = np.asarray(labels)
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()

    # Pin the label set: without `labels=`, scikit-learn infers the classes from
    # the data and rejects `target_names` when a class is absent from this split.
    class_ids = list(range(len(label_names)))
    rep = classification_report(
        y_true, preds, labels=class_ids, target_names=label_names,
        output_dict=True, zero_division=0
    )
    df = pd.DataFrame(rep).T

    # add one-vs-rest ROC-AUC per class; each class is tried independently so one
    # undefined AUC does not affect the others.
    aucs = {}
    for i, name in enumerate(label_names):
        try:
            y_bin = (y_true == i).astype(int)
            aucs[name] = float(roc_auc_score(y_bin, probs[:, i]))
        except Exception:
            aucs[name] = np.nan

    # Create the column as float up front so a failed first class cannot turn it
    # into an object column (which DataFrame.round would then skip).
    df["roc_auc_ovr"] = np.nan
    for name, auc in aucs.items():
        if name in df.index:
            df.loc[name, "roc_auc_ovr"] = auc

    cols = ["precision", "recall", "f1-score", "support", "roc_auc_ovr"]
    for c in cols:
        if c not in df.columns:
            df[c] = np.nan
    return df[cols]

def show_labelwise(df, title):
    """Print ``title`` framed by ``=`` rules, then ``df`` rounded to 4 decimals."""
    rule = "=" * len(title)
    print("\n" + rule)
    print(title)
    print(rule)
    rounded = df.round(4)
    print(rounded.to_string())

# ---------- output dir ----------
# All artefacts of this run (model, tokenizer, metrics, best/) live below split_dir.
out_dir = split_dir / "all_roberta_large_v1_with_stress"
out_dir.mkdir(parents=True, exist_ok=True)

# ---------- TrainingArguments (Windows-friendly) ----------
# No evaluation or best-model options are configured here; per-epoch validation,
# early stopping and the "best" checkpoint are handled by EvalEveryEpochCallback.
args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WD,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    dataloader_num_workers=0,   # Windows: avoid hangs
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    disable_tqdm=False,
    logging_steps=50,
    logging_first_step=True,
    report_to=[],
    seed=RANDOM_STATE,
    # optim="adamw_torch_fused",  # optional on PyTorch 2.x + recent GPUs
)

# The weighted-loss subclass is only used when class weighting is switched on.
TrainerClass = WeightedTrainer if USE_CLASS_WEIGHTS else Trainer
trainer = TrainerClass(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,   # we run our own eval per epoch too
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer.add_callback(ConsoleLogger())

# ---------- Overfitting monitor: eval each epoch + early stopping + best checkpoint ----------
class EvalEveryEpochCallback(TrainerCallback):
    """Validate after every epoch, keep the best checkpoint, and stop early.

    After each epoch the validation set is scored with
    ``compute_metrics_from_logits``; the metrics are appended as one JSON line to
    ``<out_dir>/epoch_metrics.jsonl``. When the monitored metric improves, the
    model is saved to ``<out_dir>/best``; after ``patience`` consecutive epochs
    without improvement, training is asked to stop.

    Args:
        trainer_ref: the Trainer driving the run (used for ``predict``/``save_model``).
        val_ds: validation dataset scored after every epoch.
        out_dir: directory receiving the metrics log and the ``best`` checkpoint.
        patience: number of non-improving epochs tolerated before stopping.
        monitor: key of the metrics dict to maximise.
    """

    def __init__(self, trainer_ref, val_ds, out_dir, patience=2, monitor="f1_macro"):
        self.trainer_ref = trainer_ref
        self.val_ds = val_ds
        self.out_dir = Path(out_dir)
        self.monitor = monitor
        self.best = -float("inf")
        self.bad_epochs = 0
        self.patience = patience
        self.log_path = self.out_dir / "epoch_metrics.jsonl"
        # Retained for compatibility; the log is now opened per write, so no handle
        # stays open (and none can leak if training aborts with an exception).
        self._fh = None

    def on_train_begin(self, args, state, control, **kwargs):
        # Make sure the log destination exists before the first epoch ends.
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        out = self.trainer_ref.predict(self.val_ds)
        metrics, _, _ = compute_metrics_from_logits(out.predictions, out.label_ids)
        metrics["epoch"] = float(state.epoch)

        # Append-and-close on every epoch: each line is durable immediately.
        with open(self.log_path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(metrics) + "\n")

        print(f"[epoch {metrics['epoch']:.2f}] val f1_macro={metrics['f1_macro']:.4f} "
              f"acc={metrics['accuracy']:.4f}", flush=True)

        # The monitored value may be missing (unknown key) or None (e.g. an
        # undefined ROC-AUC); comparing None with a float would raise TypeError.
        score = metrics.get(self.monitor)
        if score is None:
            print(f"Warning: monitored metric '{self.monitor}' is unavailable this epoch; "
                  f"counting it as no improvement.", flush=True)
            score = -float("inf")

        if score > self.best + 1e-8:
            self.best = score
            self.bad_epochs = 0
            self.trainer_ref.save_model(self.out_dir / "best")
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                print(f"Early stopping: no improvement in {self.patience} epoch(s).", flush=True)
                control.should_training_stop = True

    def on_train_end(self, args, state, control, **kwargs):
        # Nothing to release: the log file is never held open between epochs.
        self._fh = None

# Per-epoch validation, early stopping and best-checkpoint saving are attached here.
trainer.add_callback(EvalEveryEpochCallback(trainer, val_ds, out_dir, patience=EARLY_STOP_PATIENCE, monitor=EARLY_STOP_MONITOR))

# ---------- train ----------
print("\nStarting trainer.train() ...", flush=True)
train_result = trainer.train()
print("Finished trainer.train()", flush=True)

# save "last" model + tokenizer
# (this is the model as it stands when training ends, which need not be the one
# stored under out_dir / "best")
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

# Numeric metric values are stored as floats; anything else is stringified so the
# dict is always JSON-serialisable.
with open(out_dir / "train_metrics.json", "w") as f:
    json.dump({k: (float(v) if isinstance(v, (int, float)) else str(v)) for k, v in train_result.metrics.items()}, f, indent=2)

# ---------- manual evaluation (last model) ----------
val_out = trainer.predict(val_ds)
val_metrics, val_preds, val_probs = compute_metrics_from_logits(val_out.predictions, val_out.label_ids)

test_out = trainer.predict(test_ds)
test_metrics, test_preds, test_probs = compute_metrics_from_logits(test_out.predictions, test_out.label_ids)

# also evaluate TRAIN to check generalization gap
train_out = trainer.predict(train_ds)
train_metrics, train_preds, train_probs = compute_metrics_from_logits(train_out.predictions, train_out.label_ids)

# save overall metrics
with open(out_dir / "train_eval_metrics.json", "w") as f:
    json.dump(train_metrics, f, indent=2)
with open(out_dir / "val_metrics.json", "w") as f:
    json.dump(val_metrics, f, indent=2)
with open(out_dir / "test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

# ---------- DISPLAY label-wise metrics for each split ----------
# These tables are only printed; they are not written to disk here.
train_labelwise = per_label_report(train_out.predictions, train_out.label_ids, label_names)
val_labelwise   = per_label_report(val_out.predictions,   val_out.label_ids,   label_names)
test_labelwise  = per_label_report(test_out.predictions,  test_out.label_ids,  label_names)

show_labelwise(train_labelwise, "Label-wise metrics — TRAIN")
show_labelwise(val_labelwise,   "Label-wise metrics — VAL")
show_labelwise(test_labelwise,  "Label-wise metrics — TEST")

# ---------- classification report & confusion matrix on test ----------
# Pin the label set explicitly: without `labels=`, scikit-learn derives the classes
# from the data, so a class missing from both the test labels and the predictions
# would make `target_names` mismatch and shrink the confusion matrix below
# num_labels x num_labels (breaking the DataFrame index/columns below).
all_label_ids = list(range(num_labels))

rep = classification_report(
    test_out.label_ids, test_preds,
    labels=all_label_ids,
    target_names=label_names,
    digits=4, zero_division=0
)
cm = confusion_matrix(test_out.label_ids, test_preds, labels=all_label_ids)

with open(out_dir / "test_classification_report.txt", "w") as f:
    f.write(rep)

# Rows are true classes, columns are predicted classes.
pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names],
).to_csv(out_dir / "test_confusion_matrix.csv", index=True)

# ---------- simple gap printout ----------
# Positive values mean the model scores higher on train than on validation.
print("\nGeneralization gaps (train - val):",
      {"acc": train_metrics["accuracy"] - val_metrics["accuracy"],
       "f1_macro": train_metrics["f1_macro"] - val_metrics["f1_macro"],
       "f1_weighted": train_metrics["f1_weighted"] - val_metrics["f1_weighted"]})

print("\nTrain metrics:", train_metrics)
print("Val metrics:", val_metrics)
print("Test metrics:", test_metrics)

print("\nSaved model, best checkpoint (if any), and metrics to:", out_dir.resolve())

# ---------- (Optional) Evaluate the saved 'best' checkpoint and DISPLAY label-wise ----------
# NOTE(review): nothing in this cell clears out_dir / "best" beforehand, so a folder
# left behind by an earlier run would also satisfy the existence check — confirm.
best_dir = out_dir / "best"
if best_dir.exists():
    print("\nEvaluating best checkpoint ...")
    best_model = AutoModelForSequenceClassification.from_pretrained(best_dir).to(device)
    # A plain Trainer is enough here: it is only used for prediction, so no
    # datasets or loss customisation are passed.
    best_trainer = Trainer(
        model=best_model,
        args=args,
        tokenizer=tokenizer,
        data_collator=collator,
    )
    b_val = best_trainer.predict(val_ds)
    b_val_metrics, _, _ = compute_metrics_from_logits(b_val.predictions, b_val.label_ids)
    b_test = best_trainer.predict(test_ds)
    b_test_metrics, _, _ = compute_metrics_from_logits(b_test.predictions, b_test.label_ids)

    print("\nBest checkpoint metrics:")
    print("Val:", b_val_metrics)
    print("Test:", b_test_metrics)

    b_test_labelwise = per_label_report(b_test.predictions, b_test.label_ids, label_names)
    show_labelwise(b_test_labelwise, "Label-wise metrics — TEST (Best checkpoint)")


Label mapping used:
0: anxiety
1: depression
2: ptsd
3: suicide
4: stress
5: none
Torch device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using class weights: [2.799196720123291, 0.5001794099807739, 2.799196720123291, 1.3870646953582764, 0.5106227397918701, 1.647754192352295]

Starting trainer.train() ...


  trainer = TrainerClass(


Step,Training Loss
1,1.9331
50,1.4169
100,0.7863
150,0.6689
200,0.5094
250,0.4856
300,0.4853
350,0.4474
400,0.3072
450,0.2879


step 1 | epoch 0.01 | loss 1.9331 | lr 0.000020
step 50 | epoch 0.29 | loss 1.4169 | lr 0.000019
step 100 | epoch 0.57 | loss 0.7863 | lr 0.000018
step 150 | epoch 0.86 | loss 0.6689 | lr 0.000017
[epoch 1.00] val f1_macro=0.7529 acc=0.8063
step 200 | epoch 1.14 | loss 0.5094 | lr 0.000015
step 250 | epoch 1.43 | loss 0.4856 | lr 0.000014
step 300 | epoch 1.72 | loss 0.4853 | lr 0.000013
step 350 | epoch 2.00 | loss 0.4474 | lr 0.000012
[epoch 2.00] val f1_macro=0.7607 acc=0.8164
step 400 | epoch 2.29 | loss 0.3072 | lr 0.000011
step 450 | epoch 2.57 | loss 0.2879 | lr 0.000010
step 500 | epoch 2.86 | loss 0.2907 | lr 0.000009
[epoch 3.00] val f1_macro=0.7829 acc=0.8307
step 550 | epoch 3.14 | loss 0.2301 | lr 0.000007
step 600 | epoch 3.43 | loss 0.1862 | lr 0.000006
step 650 | epoch 3.72 | loss 0.1765 | lr 0.000005
step 700 | epoch 4.00 | loss 0.1817 | lr 0.000004
[epoch 4.00] val f1_macro=0.7827 acc=0.8350
step 750 | epoch 4.29 | loss 0.0952 | lr 0.000003
step 800 | epoch 4.57 | los


Label-wise metrics — TRAIN
              precision  recall  f1-score    support  roc_auc_ovr
anxiety          0.9678  0.9970    0.9822   332.0000       0.9998
depression       0.9944  0.9634    0.9787  1858.0000       0.9985
ptsd             0.9509  0.9910    0.9705   332.0000       0.9996
suicide          0.9156  0.9881    0.9505   670.0000       0.9980
stress           0.9983  0.9879    0.9931  1820.0000       0.9995
none             1.0000  1.0000    1.0000   564.0000       1.0000
accuracy         0.9817  0.9817    0.9817     0.9817          NaN
macro avg        0.9712  0.9879    0.9792  5576.0000          NaN
weighted avg     0.9826  0.9817    0.9819  5576.0000          NaN

Label-wise metrics — VAL
              precision  recall  f1-score  support  roc_auc_ovr
anxiety          0.6545  0.8571    0.7423   42.000       0.9801
depression       0.8528  0.8491    0.8510  232.000       0.9708
ptsd             0.5897  0.5610    0.5750   41.000       0.9615
suicide          0.6310  0.631

  best_trainer = Trainer(



Best checkpoint metrics:
Val: {'accuracy': 0.8350071736011477, 'precision_macro': 0.773365483583706, 'precision_weighted': 0.8390996029388941, 'recall_macro': 0.7980127700492518, 'recall_weighted': 0.8350071736011477, 'f1_macro': 0.7832217674780155, 'f1_weighted': 0.8359738816038714, 'roc_auc_ovr': 0.974675144298052}
Test: {'accuracy': 0.8794835007173601, 'precision_macro': 0.8362884187080905, 'precision_weighted': 0.8818867128337583, 'recall_macro': 0.842822970693169, 'recall_weighted': 0.8794835007173601, 'f1_macro': 0.8389433261884545, 'f1_weighted': 0.8803947198142223, 'roc_auc_ovr': 0.9770603878619849}

Label-wise metrics — TEST (Best checkpoint)
              precision  recall  f1-score   support  roc_auc_ovr
anxiety          0.6809  0.7619    0.7191   42.0000       0.9529
depression       0.9039  0.8922    0.8980  232.0000       0.9783
ptsd             0.7436  0.7073    0.7250   41.0000       0.9781
suicide          0.7753  0.8214    0.7977   84.0000       0.9652
stress        

In [None]:
# ==========================================
# Fine tune mental/mental-roberta-base  (SAVES MODEL + BEST CHECKPOINT)
# Trainer-free, AMP-enabled
# Assumes: data_warehouse is already defined (from Step 1/2)
# Expects: df_train, df_val, df_test with columns: text, label_enc
# Saves to: Data_Warehouse/mental_health_splits_with_stress/mental_roberta_base_with_stress(/best)
# ==========================================

import os, time, json, copy
import numpy as np
import pandas as pd
from pathlib import Path
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)

# ---------------------- config ----------------------
RANDOM_STATE = 42
MODEL_NAME = "mental/mental-roberta-base"
# Read from the environment so no credential is stored in the notebook;
# None when the variable is unset.
HF_TOKEN = os.getenv('HF_TOKEN') # set if gated
MAX_LENGTH = 512         # truncation length used by TextClsDataset
EPOCHS = 5
LR = 2e-5
WD = 0.01
TRAIN_BS = 8
EVAL_BS = 32
EARLY_STOP_PATIENCE = 2
LABEL_SMOOTHING = 0.0
GRAD_ACCUM_STEPS = 1

# ---------------- save dirs (uses data_warehouse from Step 1/2) ----------------
# NOTE: `data_warehouse` is not defined in this cell; it must already exist in the
# kernel from an earlier cell.
splits_dir = data_warehouse / "mental_health_splits_with_stress"
out_dir = splits_dir / "mental_roberta_base_with_stress"
best_dir = out_dir / "best"
out_dir.mkdir(parents=True, exist_ok=True)
best_dir.mkdir(parents=True, exist_ok=True)
print("Saving to:", out_dir.resolve())

# ---------------- label maps from your splits ----------------
# NOTE: `df_train` is likewise expected to exist already (see the cell header).
num_labels = int(df_train["label_enc"].max()) + 1
if "label" in df_train.columns:
    # Names are taken from the (label_enc, label) pairs present in the training split.
    enc_to_label = df_train[["label_enc", "label"]].drop_duplicates().sort_values("label_enc")
    id2label = {int(r.label_enc): str(r.label) for _, r in enc_to_label.iterrows()}
else:
    # No name column available: fall back to synthetic names label_0, label_1, ...
    id2label = {i: f"label_{i}" for i in range(num_labels)}
label2id = {v: k for k, v in id2label.items()}
label_names = [id2label[i] for i in range(num_labels)]
print("Label map:", id2label)

# ---------------- dataset ----------------
class TextClsDataset(Dataset):
    """Lazily tokenized classification dataset built from a split DataFrame.

    Uses the ``text`` column as model input and ``label_enc`` as the integer
    target. Encodings are truncated to ``MAX_LENGTH`` and returned unpadded so
    the collate function can pad per batch.
    """

    def __init__(self, df, tokenizer):
        self.tok = tokenizer
        self.texts = df["text"].astype(str).tolist()
        self.labels = df["label_enc"].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        encoded = self.tok(
            self.texts[i],
            max_length=MAX_LENGTH,
            truncation=True,
            padding=False,     # per-batch padding via collate
            return_tensors="pt",
        )
        # Squeeze the first axis of every returned tensor before batching.
        example = {}
        for field, tensor in encoded.items():
            example[field] = tensor.squeeze(0)
        example["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return example

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch device:", device)

# Tokenizer and model are loaded from the same checkpoint name; HF_TOKEN is passed
# through to both calls.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, token=HF_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    token=HF_TOKEN
).to(device)
# Only touched when the config actually exposes a `use_cache` attribute.
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

# ---------------- DataLoader with dynamic padding ----------------
def collate(batch):
    """Pad a list of per-example dicts into one batch dict.

    ``input_ids`` is padded with the tokenizer's pad id; ``attention_mask`` and
    ``token_type_ids`` are padded with 0. A field is only included when every
    example in the batch carries it. ``labels`` is stacked into a 1-D long tensor.
    """
    present = batch[0].keys()
    padded = {}
    for field in ("input_ids", "attention_mask", "token_type_ids"):
        if field not in present or not all(field in sample for sample in batch):
            continue
        # The tokenizer is only consulted for the field that needs its pad id.
        fill = 0 if field != "input_ids" else tokenizer.pad_token_id
        sequences = [sample[field] for sample in batch]
        padded[field] = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=fill
        )
    label_values = [sample["labels"].item() for sample in batch]
    padded["labels"] = torch.tensor(label_values, dtype=torch.long)
    return padded

train_ds = TextClsDataset(df_train, tokenizer)
val_ds   = TextClsDataset(df_val, tokenizer)
test_ds  = TextClsDataset(df_test, tokenizer)

# A dedicated, seeded generator drives the shuffling of the training loader only;
# validation and test loaders iterate in dataset order.
g = torch.Generator(); g.manual_seed(RANDOM_STATE)
train_loader = DataLoader(train_ds, batch_size=TRAIN_BS, shuffle=True,  collate_fn=collate, generator=g)
val_loader   = DataLoader(val_ds,   batch_size=EVAL_BS, shuffle=False, collate_fn=collate)
test_loader  = DataLoader(test_ds,  batch_size=EVAL_BS, shuffle=False, collate_fn=collate)

# ---------------- class weights ----------------
# weight_c = N / (num_labels * n_c), computed from the training split only.
# Counts are sorted by class id so position i of the tensor lines up with logit i.
# Unlike the Trainer-based cells, the weights here are always applied (no toggle).
counts = df_train["label_enc"].value_counts().sort_index()
weights = (counts.sum() / (num_labels * counts)).astype("float32").to_numpy()
class_weights = torch.tensor(weights, device=device)
print("Class weights:", [float(x) for x in class_weights])

# ---------------- optim + sched + amp ----------------
optim = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
# Schedule length: batches per epoch x epochs, floor-divided by the accumulation factor.
# NOTE(review): assumes the training loop steps the scheduler once per optimizer
# update — confirm against the loop below.
num_steps = max(1, len(train_loader)) * EPOCHS // max(1, GRAD_ACCUM_STEPS)
sched = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=int(0.06 * num_steps),  # first 6% of the schedule is warm-up
    num_training_steps=num_steps
)
# Gradient scaling is only enabled when running on CUDA.
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

# ---------------- helpers ----------------
@torch.no_grad()
def forward_eval(loader):
    """Run the module-level ``model`` over ``loader`` in eval mode.

    Returns ``(mean_loss, logits, labels)``: the class-weighted cross-entropy
    averaged over batches (not over examples), and the concatenated logits and
    labels as NumPy arrays.
    """
    model.eval()
    logit_chunks = []
    label_chunks = []
    total_loss = 0.0
    batches_seen = 0

    for raw_batch in loader:
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        # Only these two fields are forwarded to the model; anything else the
        # collate function produced is ignored here.
        logits = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).logits
        batch_loss = F.cross_entropy(
            logits, batch["labels"], weight=class_weights, label_smoothing=LABEL_SMOOTHING
        )
        logit_chunks.append(logits.detach().cpu())
        label_chunks.append(batch["labels"].detach().cpu())
        total_loss += batch_loss.item()
        batches_seen += 1

    all_logits = torch.cat(logit_chunks, dim=0).numpy()
    all_labels = torch.cat(label_chunks, dim=0).numpy()
    mean_loss = total_loss / max(1, batches_seen)
    return mean_loss, all_logits, all_labels

def compute_metrics_from_logits(logits, labels, names):
    """Compute aggregate and per-class classification metrics from raw logits.

    Args:
        logits: 2-D array of scores, one row per sample and one column per class.
        labels: 1-D array of integer class ids (the true classes).
        names: class names ordered by class id, so ``names[i]`` names class ``i``.

    Returns:
        ``(out, rep_df)`` where ``out`` is a dict of scalar metrics (accuracy,
        macro/weighted precision, recall and F1, plus ``roc_auc_ovr``, which is
        ``None`` when it cannot be computed) and ``rep_df`` is the
        classification report as a DataFrame with the ``f1-score`` column renamed
        to ``f1_score`` and an extra ``roc_auc_ovr`` column. That column always
        exists; it is NaN for the summary rows and for any class whose
        one-vs-rest AUC cannot be computed.
    """
    class_ids = list(range(len(names)))
    preds = np.argmax(logits, axis=1)
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
    out = {
        "accuracy": float(accuracy_score(labels, preds)),
        "precision_macro": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "precision_weighted": float(precision_score(labels, preds, average="weighted", zero_division=0)),
        "recall_macro": float(recall_score(labels, preds, average="macro", zero_division=0)),
        "recall_weighted": float(recall_score(labels, preds, average="weighted", zero_division=0)),
        "f1_macro": float(f1_score(labels, preds, average="macro", zero_division=0)),
        "f1_weighted": float(f1_score(labels, preds, average="weighted", zero_division=0)),
    }
    # Best effort: the multiclass AUC is undefined for some inputs (for example
    # when a class never occurs in `labels`); report None rather than failing.
    try:
        out["roc_auc_ovr"] = float(roc_auc_score(labels, probs, multi_class="ovr"))
    except Exception:
        out["roc_auc_ovr"] = None

    # Pin the label set so the report always has one row per entry of `names`,
    # even when a class is absent from both `labels` and `preds`; without this
    # the length of `target_names` would not match the classes found in the data.
    rep = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=names,
        digits=4,
        zero_division=0,
        output_dict=True,
    )
    rep_df = pd.DataFrame(rep).T.rename(columns={"f1-score": "f1_score"})

    # per class auc
    # Create the column up front so downstream code can always select it, and
    # guard each class separately so one undefined AUC does not discard the
    # values of the classes that follow it.
    rep_df["roc_auc_ovr"] = np.nan
    for i, n in enumerate(names):
        y_bin = (labels == i).astype(int)
        try:
            rep_df.loc[n, "roc_auc_ovr"] = float(roc_auc_score(y_bin, probs[:, i]))
        except Exception:
            continue  # leave NaN for this class

    return out, rep_df

def save_metrics_block(prefix: str, metrics: dict, rep_df: pd.DataFrame):
    """Persist one split's metrics under ``out_dir``.

    Writes ``<prefix>_metrics.json`` (the scalar metrics dict, indented JSON)
    and ``<prefix>_classification_report.csv`` (the report table, index kept).
    """
    metrics_path = out_dir / f"{prefix}_metrics.json"
    report_path = out_dir / f"{prefix}_classification_report.csv"

    serialized = json.dumps(metrics, indent=2)
    metrics_path.write_text(serialized)
    rep_df.to_csv(report_path, index=True)

def save_confusion_matrix(prefix: str, labels, logits):
    """Write the confusion matrix for one split to ``<prefix>_confusion_matrix.csv``.

    Args:
        prefix: file-name prefix (for example the split name).
        labels: 1-D array of true integer class ids.
        logits: 2-D array of scores; the predicted class is the arg-max per row.

    Rows are true classes and columns are predicted classes, both labelled with
    the entries of ``label_names`` in class-id order.
    """
    preds = np.argmax(logits, axis=1)
    # Pin the label set: by default confusion_matrix only covers classes that
    # occur in `labels` or `preds`, which would yield a smaller matrix than
    # `label_names` (and a shape error below) whenever a class is absent.
    class_ids = list(range(len(label_names)))
    cm = confusion_matrix(labels, preds, labels=class_ids)
    cm_df = pd.DataFrame(cm, index=[f"true_{n}" for n in label_names], columns=[f"pred_{n}" for n in label_names])
    cm_df.to_csv(out_dir / f"{prefix}_confusion_matrix.csv", index=True)

# ---------------- training loop with best checkpoint ----------------
# `best_f1` is the best validation macro-F1 seen so far; `bad_epochs` counts
# consecutive epochs without improvement and drives early stopping.
best_f1 = -1.0
bad_epochs = 0

print("\nStarting training")
model.train()
optim.zero_grad(set_to_none=True)
step_in_accum = 0  # micro-batch counter; it is not reset between epochs

for ep in range(1, EPOCHS + 1):
    t0 = time.time()
    model.train()  # forward_eval() switches the model to eval mode
    running = 0.0; nsteps = 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # torch.cuda.amp.autocast(...) is deprecated and warns on every use;
        # torch.amp.autocast("cuda", ...) is the supported spelling.
        with torch.amp.autocast("cuda", enabled=(device == "cuda")):
            out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            loss = F.cross_entropy(out.logits, batch["labels"], weight=class_weights, label_smoothing=LABEL_SMOOTHING)

        step_in_accum += 1
        # Scale the loss down so the accumulated gradient is the mean over the
        # accumulation window rather than the sum.
        scaler.scale(loss / GRAD_ACCUM_STEPS).backward()
        if step_in_accum % GRAD_ACCUM_STEPS == 0:
            scaler.step(optim); scaler.update()
            optim.zero_grad(set_to_none=True)
            sched.step()

        # Track the unscaled loss for the per-epoch log line.
        running += loss.item(); nsteps += 1

    # end-epoch eval
    val_loss, val_logits, val_labels = forward_eval(val_loader)
    val_metrics, _ = compute_metrics_from_logits(val_logits, val_labels, label_names)
    dt = time.time() - t0
    print(f"epoch {ep} train_loss {running/max(1,nsteps):.4f} "
          f"val_loss {val_loss:.4f} val_f1_macro {val_metrics['f1_macro']:.4f} time {dt:.1f}s")

    # save best immediately
    if val_metrics["f1_macro"] > best_f1 + 1e-8:
        best_f1 = val_metrics["f1_macro"]
        # Put the label maps on the live config before saving so that
        # save_pretrained() writes them directly, instead of reloading the whole
        # model from disk only to patch and re-save its config.
        cfg = model.config
        cfg.id2label = id2label; cfg.label2id = label2id
        model.save_pretrained(best_dir)
        tokenizer.save_pretrained(best_dir)
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= EARLY_STOP_PATIENCE:
            # Report the number of stale epochs, not the current epoch index.
            print(f"Early stopping after {bad_epochs} epoch(s) without improvement")
            break

# ---------------- final eval + SAVE last ----------------
# These metrics describe the model as it stands after the last trained epoch
# (the "last" model), which is also what gets saved to out_dir below.
train_loss, train_logits, train_labels = forward_eval(train_loader)
val_loss,   val_logits,   val_labels   = forward_eval(val_loader)
test_loss,  test_logits,  test_labels  = forward_eval(test_loader)

train_metrics, train_rep = compute_metrics_from_logits(train_logits, train_labels, label_names)
val_metrics,   val_rep   = compute_metrics_from_logits(val_logits,   val_labels,   label_names)
test_metrics,  test_rep  = compute_metrics_from_logits(test_logits,  test_labels,  label_names)

save_metrics_block("train", train_metrics, train_rep)
save_metrics_block("val",   val_metrics,   val_rep)
save_metrics_block("test",  test_metrics,  test_rep)
save_confusion_matrix("test", test_labels, test_logits)

# Put the label maps on the live config before saving so that save_pretrained()
# writes them directly, instead of reloading the whole model from disk only to
# patch and re-save its config.
cfg = model.config
cfg.id2label = id2label; cfg.label2id = label2id
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)

summary = {
    "epochs_trained": ep,
    "best_val_f1_macro": best_f1,
    "out_dir": str(out_dir.resolve()),
    "best_dir": str(best_dir.resolve()),
}
(out_dir / "summary.json").write_text(json.dumps(summary, indent=2))

print("\nSaved LAST model to:", out_dir.resolve())
print("Saved BEST checkpoint to:", best_dir.resolve())


Saving to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\mental_roberta_base_with_stress
Label map: {0: 'anxiety', 1: 'depression', 2: 'ptsd', 3: 'suicide', 4: 'stress', 5: 'none'}
Torch device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mental/mental-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: [2.799196720123291, 0.5001794099807739, 2.799196720123291, 1.3870646953582764, 0.5106227397918701, 1.647754192352295]

Starting training


  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


epoch 1 train_loss 0.9748 val_loss 0.5404 val_f1_macro 0.7546 time 105.0s


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


epoch 2 train_loss 0.4693 val_loss 0.5203 val_f1_macro 0.7660 time 106.5s


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


epoch 3 train_loss 0.2743 val_loss 0.5080 val_f1_macro 0.7903 time 107.3s


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


epoch 4 train_loss 0.1609 val_loss 0.6293 val_f1_macro 0.7912 time 107.8s


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


epoch 5 train_loss 0.0989 val_loss 0.6699 val_f1_macro 0.7986 time 107.7s

Saved LAST model to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\mental_roberta_base_with_stress
Saved BEST checkpoint to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\mental_roberta_base_with_stress\best


In [6]:
# ---- DISPLAY label-wise metrics (precision/recall/F1/support + per-class AUC) ----
cols = ["precision", "recall", "f1_score", "support", "roc_auc_ovr"]

def show_labelwise(df, title):
    """Print the per-label rows of a classification-report DataFrame.

    Args:
        df: report table indexed by row name (class names plus summary rows).
        title: heading printed above the table, framed by ``=`` rules.

    Only rows whose index is in ``label_names`` are shown, restricted to the
    columns listed in ``cols`` and rounded to 4 decimals.
    """
    # keep only label rows (drop 'accuracy', 'macro avg', 'weighted avg' if present)
    label_rows = [n for n in df.index if n in label_names]
    # reindex instead of .loc column selection: a report that lacks one of the
    # columns (e.g. no per-class AUC could be computed) then shows NaN for it
    # rather than raising KeyError.
    df_show = df.loc[label_rows].reindex(columns=cols)
    print("\n" + "="*len(title))
    print(title)
    print("="*len(title))
    print(df_show.round(4).to_string())

# Print the per-label table for each split, in train, val, test order.
for split_rep, split_title in (
    (train_rep, "Label-wise metrics — TRAIN"),
    (val_rep,   "Label-wise metrics — VAL"),
    (test_rep,  "Label-wise metrics — TEST"),
):
    show_labelwise(split_rep, split_title)



Label-wise metrics — TRAIN
            precision  recall  f1_score  support  roc_auc_ovr
anxiety        0.9566  0.9970    0.9764    332.0       0.9995
depression     0.9955  0.9510    0.9727   1858.0       0.9977
ptsd           0.9375  0.9940    0.9649    332.0       0.9992
suicide        0.8850  0.9881    0.9337    670.0       0.9971
stress         0.9983  0.9824    0.9903   1820.0       0.9995
none           0.9982  0.9982    0.9982    564.0       1.0000

Label-wise metrics — VAL
            precision  recall  f1_score  support  roc_auc_ovr
anxiety        0.6364  0.8333    0.7216     42.0       0.9737
depression     0.8491  0.8491    0.8491    232.0       0.9693
ptsd           0.6923  0.6585    0.6750     41.0       0.9636
suicide        0.6667  0.6429    0.6545     84.0       0.9530
stress         0.9312  0.8943    0.9124    227.0       0.9871
none           0.9722  0.9859    0.9790     71.0       0.9997

Label-wise metrics — TEST
            precision  recall  f1_score  support  r

In [7]:
from sklearn.metrics import confusion_matrix
print("\nConfusion matrix — TEST (rows=true, cols=pred):")
# Pin the label set so the matrix always has one row and column per entry of
# label_names; by default confusion_matrix only covers classes that occur in the
# data, which would not line up with the index/columns below if a class is absent.
print(pd.DataFrame(
    confusion_matrix(
        test_labels,
        np.argmax(test_logits, axis=1),
        labels=list(range(len(label_names))),
    ),
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names],
).to_string())



Confusion matrix — TEST (rows=true, cols=pred):
                 pred_anxiety  pred_depression  pred_ptsd  pred_suicide  pred_stress  pred_none
true_anxiety               32                4          1             0            5          0
true_depression             6              195          0            27            4          0
true_ptsd                   1                1         30             0            9          0
true_suicide                2               14          0            64            4          0
true_stress                 9                4          4             0          209          1
true_none                   0                1          0             1            1         68
