
#### Robustness eval (no sampling)


In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, roc_auc_score
)



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ----------------------------
# config (edit this list!)
# ----------------------------
# Checkpoint directories to evaluate; each entry is passed to
# `from_pretrained` for both the tokenizer and the classification model.
model_dirs = [
    "Data_Warehouse/mental_health_splits_with_stress/all_roberta_large_v1_with_stress/best",
    "Data_Warehouse/mental_health_splits_with_stress/mental_roberta_base_with_stress/best",
    "Data_Warehouse/mental_health_splits_no_stress/all_roberta_large_v1_multiclass/best",
    "Data_Warehouse/mental_health_splits_no_stress/mental_roberta_base_no_stress/best",
]
# Point this to the dataset you want to evaluate (e.g., 50/50 sample).
# The CSV must contain `text` and `label` columns. Alternative samples are kept
# below as commented-out options; leave exactly one assignment active.
#dataset_csv = "Data_Warehouse/erisk_task2_sample_50_50.csv"
#dataset_csv = "Data_Warehouse/erisk_task2_sample_100_100.csv"
#dataset_csv = "Data_Warehouse/erisk_task2_sample_200_200.csv"
#dataset_csv = "Data_Warehouse/erisk_task2_sample_50_250.csv"
dataset_csv = "Data_Warehouse/erisk_task2_userlevel_50_50.csv"
# Number of texts per inference batch.
batch_size = 16
# Tokenizer truncation limit (in tokens) applied to every text.
max_length = 512
# Seed applied to the NumPy and PyTorch RNGs before evaluation.
seed = 42


In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, roc_auc_score, classification_report, balanced_accuracy_score
)

# ----------------------------
# data utilities
# ----------------------------
class PostDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Items are returned unpadded; padding to a common length is left to the
    collate function used by the DataLoader.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # Materialize the iterable so items can be indexed repeatedly.
        self.texts = [*texts]
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        encoded = self.tok(
            str(self.texts[i]),
            truncation=True,
            padding=False,   # dynamic padding via collator
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # drop it so the collator receives per-example tensors.
        item = {}
        for field, tensor in encoded.items():
            item[field] = tensor.squeeze(0)
        return item

def normalize_label(s: str) -> str:
    """Collapse any label to 'depression' or 'non depression'.

    The input is stringified, stripped and lower-cased; only an exact match
    with "depression" maps to the positive class, everything else is negative.
    """
    cleaned = str(s).strip().lower()
    if cleaned == "depression":
        return "depression"
    return "non depression"

def to_binary_int(lbl: str) -> int:
    """Encode a label as 1 when it normalizes to 'depression', otherwise 0."""
    is_depression = normalize_label(lbl) == "depression"
    return int(is_depression)

# ----------------------------
# inference utilities
# ----------------------------
def find_depression_index(model) -> int:
    """Return the class index whose label name is 'depression'.

    Label names come from ``model.config.id2label`` (or synthetic
    ``label_<i>`` names when that mapping is missing or empty) and are compared
    after stripping and lower-casing. If no name matches, index 1 is used as a
    fallback for models with more than one label (a warning is printed);
    otherwise a ValueError is raised.
    """
    config = model.config
    id2label = getattr(config, "id2label", None)
    if not id2label:
        id2label = {idx: f"label_{idx}" for idx in range(config.num_labels)}

    # Keys may be stored as strings (e.g. after a JSON round trip), so coerce.
    normalized = {int(key): str(value).strip().lower() for key, value in id2label.items()}
    match = next((idx for idx, label in normalized.items() if label == "depression"), None)
    if match is not None:
        return int(match)

    if config.num_labels <= 1:
        raise ValueError(f"Could not resolve 'depression' class from id2label: {id2label}")

    print("warning: 'depression' not found in id2label; falling back to index 1")
    return 1

@torch.no_grad()
def infer(model, tokenizer, texts, batch_size=16, max_length=512, device="cpu"):
    """Run the classifier over ``texts`` in order and return ``(preds, probs)``.

    ``probs`` is a NumPy array holding the softmax of the logits along axis 1
    (one row per text); ``preds`` is the arg-max column of each row. Batches are
    padded dynamically by ``DataCollatorWithPadding`` and moved to ``device``
    before the forward pass; logits are gathered on the CPU.
    """
    cuda_ok = torch.cuda.is_available()
    loader = DataLoader(
        PostDataset(texts, tokenizer, max_length=max_length),
        batch_size=batch_size,
        shuffle=False,          # keep outputs aligned with the input order
        num_workers=0,
        pin_memory=cuda_ok,
        collate_fn=DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=8 if cuda_ok else None,
        ),
    )

    model.eval()
    batch_logits = []
    for batch in loader:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_logits.append(model(**on_device).logits.detach().cpu())

    all_logits = torch.cat(batch_logits, dim=0)
    probs = torch.softmax(all_logits, dim=1).numpy()
    preds = probs.argmax(axis=1)
    return preds, probs

def evaluate_binary(y_true, y_pred, p_dep=None):
    """Compute binary metrics with class 1 as the positive class.

    Returns ``(accuracy, precision, recall, f1, auc, confusion_matrix)``.
    The confusion matrix is laid out with label order ``[1, 0]``. ``auc`` is
    ``None`` when ``p_dep`` is not given, when ``y_true`` holds a single class,
    or when the ROC-AUC computation raises.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    matrix = confusion_matrix(y_true, y_pred, labels=[1, 0]).astype(int)

    roc_auc = None
    if p_dep is not None:
        # ROC-AUC is undefined unless both classes occur in the ground truth.
        if len(np.unique(y_true)) > 1:
            try:
                roc_auc = roc_auc_score(y_true, p_dep)
            except Exception:
                roc_auc = None

    return accuracy, precision, recall, f_score, roc_auc, matrix

def overall_metrics(y_true, y_pred):
    """Return macro- and weighted-averaged precision/recall/F1 plus balanced accuracy.

    The result is a dict keyed ``precision_<avg>``, ``recall_<avg>``,
    ``f1_<avg>`` for ``<avg>`` in ``macro`` and ``weighted``, followed by
    ``balanced_accuracy``.
    """
    summary = {}
    for averaging in ("macro", "weighted"):
        scores = precision_recall_fscore_support(
            y_true, y_pred, average=averaging, zero_division=0
        )
        # scores is (precision, recall, fscore, support); support is unused.
        keys = (f"precision_{averaging}", f"recall_{averaging}", f"f1_{averaging}")
        summary.update(zip(keys, scores[:3]))
    summary["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    return summary

def labelwise_report(y_true, y_pred, p_dep=None):
    """
    Build a per-class metrics table for the binary depression task.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        Ground-truth and predicted labels, encoded 0 = non depression,
        1 = depression.
    p_dep : array-like of float, optional
        Score/probability of the depression class for each sample.

    Returns
    -------
    pandas.DataFrame
        Rows ['non depression', 'depression'] (in that order) with columns
        precision, recall, f1, support and roc_auc. roc_auc is NaN when
        ``p_dep`` is not given, when ``y_true`` contains a single class, or
        when the AUC computation fails.
    """
    target_names = ['non depression', 'depression']  # map 0 -> non dep, 1 -> dep
    y_true_arr = np.asarray(y_true)

    # Pin the label set explicitly: without `labels`, classification_report
    # infers the classes from the data and raises when only one class is
    # present because the two target_names no longer match.
    rep = classification_report(
        y_true_arr, y_pred,
        labels=[0, 1],
        target_names=target_names,
        digits=4, zero_division=0, output_dict=True
    )
    df_rep = pd.DataFrame(rep).T.loc[target_names, ["precision", "recall", "f1-score", "support"]]
    df_rep = df_rep.rename(columns={"f1-score": "f1"})

    # Default to NaN; overwritten below only when the AUC is well defined.
    df_rep["roc_auc"] = np.nan
    if p_dep is not None and len(np.unique(y_true_arr)) > 1:
        try:
            # Convert to arrays so `1 - x` also works for plain Python lists.
            p_dep_arr = np.asarray(p_dep, dtype=float)
            auc_dep = roc_auc_score(y_true_arr, p_dep_arr)          # positive=depression (1)
            auc_non = roc_auc_score(1 - y_true_arr, 1 - p_dep_arr)  # treat non-dep as positive
        except (TypeError, ValueError):
            # Best effort: keep the NaN placeholders if the scores are unusable.
            pass
        else:
            # NOTE: for a binary task the two values coincide, since flipping
            # both the labels and the scores leaves the ranking-based AUC unchanged.
            df_rep.loc["depression", "roc_auc"] = float(auc_dep)
            df_rep.loc["non depression", "roc_auc"] = float(auc_non)
    return df_rep

# ----------------------------
# main logic (notebook style)
# ----------------------------
# (keep your existing config vars set above this block)
# model_dirs = [...]
# dataset_csv = ...
# batch_size, max_length, seed = ...

# Seed the NumPy and PyTorch RNGs so repeated runs start from the same state.
np.random.seed(seed)
torch.manual_seed(seed)

# Load the evaluation set and validate its schema before any model is loaded.
df = pd.read_csv(dataset_csv)
if not {"text", "label"}.issubset(df.columns):
    raise ValueError("dataset_csv must contain columns: text,label")

# Collapse labels to {'depression', 'non depression'}; the mapping is
# idempotent, so re-running this cell leaves the column unchanged.
df["label"] = df["label"].map(normalize_label)
texts = df["text"].astype(str).tolist()
y_true = np.array([to_binary_int(x) for x in df["label"]])
print(f"Loaded dataset: {len(df)} rows | class counts:\n{df['label'].value_counts()}")

device = "cuda" if torch.cuda.is_available() else "cpu"

mdl = None
for mdir in model_dirs:
    print("\nEvaluating:", mdir)

    # Drop the reference to the previous checkpoint *before* loading the next
    # one so two models are never resident at the same time.
    mdl = None
    if device == "cuda":
        torch.cuda.empty_cache()

    tok = AutoTokenizer.from_pretrained(mdir, use_fast=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(mdir).to(device)
    mdl.eval()

    # Reduce the model's (possibly multi-class) output to depression vs. rest.
    dep_idx = find_depression_index(mdl)
    yhat_ids, probs = infer(mdl, tok, texts, batch_size=batch_size, max_length=max_length, device=device)
    y_pred = (np.asarray(yhat_ids) == dep_idx).astype(int)
    p_dep = probs[:, dep_idx]

    acc, prec, rec, f1, auc, cm = evaluate_binary(y_true, y_pred, p_dep)
    # auc may be None (single-class ground truth), so format it separately to
    # keep the same 4-decimal precision as the other metrics.
    auc_str = f"{auc:.4f}" if auc is not None else "n/a"
    print(f"accuracy={acc:.4f}  precision(dep)={prec:.4f}  recall(dep)={rec:.4f}  f1(dep)={f1:.4f}  auc(dep)={auc_str}")
    print("confusion matrix rows [dep, non]:")
    print(cm)

    # ---- per-class table ----
    lw = labelwise_report(y_true, y_pred, p_dep)
    print("\nLabel-wise metrics (per class):")
    print(lw.round(4).to_string())

    # ---- overall metrics (macro/weighted) ----
    om = overall_metrics(y_true, y_pred)
    print("\nOverall metrics:")
    print(
        "macro  - precision={:.4f} recall={:.4f} f1={:.4f}\n"
        "weighted - precision={:.4f} recall={:.4f} f1={:.4f}\n"
        "balanced_accuracy={:.4f}".format(
            om["precision_macro"], om["recall_macro"], om["f1_macro"],
            om["precision_weighted"], om["recall_weighted"], om["f1_weighted"],
            om["balanced_accuracy"]
        )
    )


Loaded dataset: 100 rows | class counts:
label
depression        50
non depression    50
Name: count, dtype: int64

Evaluating: Data_Warehouse/mental_health_splits_with_stress/all_roberta_large_v1_with_stress/best
accuracy=0.6300  precision(dep)=0.5890  recall(dep)=0.8600  f1(dep)=0.6992  auc(dep)=0.7639999999999999
confusion matrix rows [dep, non]:
[[43  7]
 [30 20]]

Label-wise metrics (per class):
                precision  recall      f1  support  roc_auc
non depression     0.7407    0.40  0.5195     50.0    0.764
depression         0.5890    0.86  0.6992     50.0    0.764

Overall metrics:
macro  - precision=0.6649 recall=0.6300 f1=0.6093
weighted - precision=0.6649 recall=0.6300 f1=0.6093
balanced_accuracy=0.6300

Evaluating: Data_Warehouse/mental_health_splits_with_stress/mental_roberta_base_with_stress/best
accuracy=0.5800  precision(dep)=0.5513  recall(dep)=0.8600  f1(dep)=0.6719  auc(dep)=0.716
confusion matrix rows [dep, non]:
[[43  7]
 [35 15]]

Label-wise metrics (per clas