In [19]:
# ==== Imports & config ====
from pathlib import Path
from ast import literal_eval

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import (
    load_dataset, Dataset, DatasetDict, concatenate_datasets
)
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, Trainer, TrainingArguments
)
from sklearn.metrics import f1_score, recall_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "sentence-transformers/all-roberta-large-v1"

print("Using device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

Using device: cuda
GPU: NVIDIA GeForce RTX 5060 Laptop GPU


In [3]:
# ==== Emotion -> VAD mapping ====
emotion_to_vad = {
    "admiration": [0.8, 0.5, 0.7],
    "amusement": [0.9, 0.7, 0.8],
    "anger": [0.2, 0.8, 0.4],
    "annoyance": [0.3, 0.6, 0.4],
    "approval": [0.7, 0.5, 0.6],
    "caring": [0.8, 0.4, 0.7],
    "confusion": [0.4, 0.6, 0.5],
    "curiosity": [0.7, 0.6, 0.6],
    "desire": [0.8, 0.7, 0.7],
    "disappointment": [0.3, 0.5, 0.3],
    "disapproval": [0.2, 0.5, 0.3],
    "disgust": [0.2, 0.7, 0.3],
    "embarrassment": [0.4, 0.6, 0.4],
    "excitement": [0.9, 0.8, 0.8],
    "fear": [0.1, 0.9, 0.3],
    "gratitude": [0.8, 0.5, 0.7],
    "grief": [0.1, 0.6, 0.2],
    "joy": [0.9, 0.7, 0.8],
    "love": [0.9, 0.5, 0.8],
    "nervousness": [0.2, 0.8, 0.4],
    "optimism": [0.8, 0.6, 0.7],
    "pride": [0.8, 0.5, 0.8],
    "realization": [0.7, 0.5, 0.6],
    "relief": [0.7, 0.4, 0.7],
    "remorse": [0.2, 0.6, 0.3],
    "sadness": [0.1, 0.5, 0.2],
    "surprise": [0.7, 0.8, 0.7],
    "neutral": [0.5, 0.5, 0.5],
}

NEUTRAL_VAD = [0.5, 0.5, 0.5]

def to_vad(label: str):
    key = str(label).strip().lower().replace(" ", "_")
    return emotion_to_vad.get(key, NEUTRAL_VAD)

def zeros(n: int):
    return [0] * n

In [4]:
# ==== GoEmotions preprocessing ====
goemotions = load_dataset("go_emotions", "raw") #load goemotions

if "test" not in goemotions:
    goemotions = goemotions["train"].train_test_split(test_size=0.2, seed=42)

goemotions = DatasetDict({
    "train": goemotions["train"],
    "test": goemotions["test"],
})

# Identify emotion columns
GE_LABELS = [
    c for c in goemotions["train"].column_names
    if c not in [
        "text", "id", "author", "subreddit", "link_id",
        "parent_id", "created_utc", "rater_id", "example_very_unclear"
    ]
]

# Can access each emotion by index
lab2idx = {lab: i for i, lab in enumerate(GE_LABELS)}

# Function to make a multi-hot vector of emotions, converting to vad, for one row in GE
def ge_row_to_targets(example):
    active = [lab for lab in GE_LABELS if int(example.get(lab, 0)) == 1]
    y = zeros(len(GE_LABELS))

    if active:
        for lab in active:
            y[lab2idx[lab]] = 1
        vads = np.array([to_vad(lab) for lab in active], dtype=np.float32)
        # Average the vad vectors for multiple emotions if present
        vad = vads.mean(axis=0).tolist()
    else:
        # unlabeled so neutral
        y[lab2idx["neutral"]] = 1
        vad = NEUTRAL_VAD

    return {
        "text": example["text"],
        "y_ge": y,
        "vad": vad,
        "dataset": "goemotions",
    }

geo_train = goemotions["train"].map(
    ge_row_to_targets,
    remove_columns=goemotions["train"].column_names
)
geo_test = goemotions["test"].map(
    ge_row_to_targets,
    remove_columns=goemotions["test"].column_names
)

print("GoEmotions train:", len(geo_train), "test:", len(geo_test))


Map: 100%|█████████████████| 168980/168980 [00:41<00:00, 4113.70 examples/s]
Map: 100%|███████████████████| 42245/42245 [00:10<00:00, 4214.60 examples/s]

GoEmotions train: 168980 test: 42245





In [9]:
# ==== MELD preprocessing ====
def normalize_to_ge(name: str) -> str:
    n = str(name).strip().lower().replace(" ", "_")
    if n == "happiness":
        n = "joy"
    if n in {"no_emotion", "none", "other", "others"}:
        n = "neutral"
    return n

# IMPORTANT: MELD is only single label
def _meld_row_to_targets(row):
    text = str(row.get("Utterance", ""))
    lab = normalize_to_ge(row.get("Emotion", "neutral"))

    y = zeros(len(GE_LABELS))
    if lab in lab2idx:
        y[lab2idx[lab]] = 1
    else:
        y[lab2idx["neutral"]] = 1
        lab = "neutral"

    vad = to_vad(lab)
    return {
        "text": text,
        "y_ge": y,
        "vad": vad,
        "dataset": "meld",
    }

def load_meld_kaggle_csvs(train_csv: str, test_csv: str):
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    # IMPORTANT: convert columns Index to list to avoid error
    train_cols = train_df.columns.tolist()
    test_cols = test_df.columns.tolist()

    meld_train = Dataset.from_pandas(train_df, preserve_index=False).map(
        _meld_row_to_targets,
        remove_columns=train_cols,
    )
    meld_test = Dataset.from_pandas(test_df, preserve_index=False).map(
        _meld_row_to_targets,
        remove_columns=test_cols,
    )
    return meld_train, meld_test

train_csv_path = r".\Datasets\MELD\train_sent_emo.csv"
test_csv_path = r".\Datasets\MELD\test_sent_emo.csv"

meld_train, meld_test = load_meld_kaggle_csvs(train_csv_path, test_csv_path)
print("MELD train:", len(meld_train), "test:", len(meld_test))


Map: 100%|████████████████████| 9989/9989 [00:00<00:00, 20046.67 examples/s]
Map: 100%|████████████████████| 2610/2610 [00:00<00:00, 21810.64 examples/s]

MELD train: 9989 test: 2610





In [11]:
# ==== DailyDialog preprocessing ====
DD_ID2NAME = {
    0: "neutral",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "joy",
    5: "sadness",
    6: "surprise",
}

def _parse_list_field(value):
    """
    Robust parser for fields that are supposed to be lists.
    Handles:
      - already-a-list
      - Python literal list strings, e.g. "[0, 0, 4]"
      - bracketed space-separated strings, e.g. "[0 0 0 4]"
    Returns a Python list (possibly empty).
    """
    # Already a list
    if isinstance(value, list):
        return value

    # Not a string -> nothing we can do
    if not isinstance(value, str):
        return []

    s = value.strip()
    if not s:
        return []

    # Try literal_eval first (handles proper Python/JSON-like lists)
    try:
        parsed = literal_eval(s)
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    except Exception:
        pass

    # Fallback: handle "[0 0 0 4]" or "0 0 0 4"
    if s.startswith("[") and s.endswith("]"):
        s_inner = s[1:-1].strip()
    else:
        s_inner = s

    if not s_inner:
        return []

    # Replace commas with spaces, then split
    tokens = s_inner.replace(",", " ").split()
    return tokens


def dd_flatten_rows(df: pd.DataFrame):
    """
    Flattens dialog-level rows to utterance-level rows: one row per utterance.
    Expects columns: 'dialog', 'emotion'.
    """
    flat = []
    for _, row in df.iterrows():
        dialog_list  = _parse_list_field(row.get("dialog", []))
        emotion_list = _parse_list_field(row.get("emotion", []))

        # lengths can mismatch so using the min length
        L = min(len(dialog_list), len(emotion_list))

        for turn in range(L):
            utt_text = str(dialog_list[turn]).strip()
            emo_raw  = emotion_list[turn]

            # Try to interpret as integer ID first
            try:
                emo_id = int(emo_raw)
                emo_name = DD_ID2NAME.get(emo_id, "neutral")
            except (ValueError, TypeError):
                s = str(emo_raw).strip().lower()
                if s in {"no emotion", "none", "other", "others"}:
                    emo_name = "neutral"
                elif s == "happiness":
                    emo_name = "joy"
                elif s in DD_ID2NAME.values():
                    emo_name = s
                else:
                    emo_name = "neutral"

            # Map to GE-29 multi-hot (one-hot here since DD is single-label)
            y = zeros(len(GE_LABELS))
            if emo_name in lab2idx:
                y[lab2idx[emo_name]] = 1
            else:
                y[lab2idx["neutral"]] = 1
                emo_name = "neutral"

            vad = to_vad(emo_name)

            flat.append({
                "text": utt_text,
                "y_ge": y,
                "vad": vad,
                "dataset": "dailydialog",
            })

    return flat


def load_dd_csv(csv_path: str) -> Dataset:
    df = pd.read_csv(csv_path)
    flat = dd_flatten_rows(df)
    return Dataset.from_list(flat)


dd_train_csv = r".\Datasets\DD\train.csv"
dd_test_csv  = r".\Datasets\DD\test.csv"

dd_train = load_dd_csv(dd_train_csv)
dd_test  = load_dd_csv(dd_test_csv)

print("DD train:", len(dd_train), "test:", len(dd_test))


DD train: 11118 test: 1000


In [12]:
# ==== Combine all datasets and split ====
# Train = geo_train + meld_train + dd_train
union_train = concatenate_datasets([
    geo_train,
    meld_train,
    dd_train,
]).shuffle(seed=42)

# Test = geo_test + meld_test + dd_test
union_test = concatenate_datasets([
    geo_test,
    meld_test,
    dd_test,
]).shuffle(seed=42)

union = DatasetDict({
    "train": union_train,
    "test":  union_test,
})

print("Train:", len(union["train"]), "Test:", len(union["test"]))


Total combined examples: 235942
Train: 188753 Test: 47189


In [13]:
# ==== Optional: use only a fraction of data for quick experiments ====
FRAC = 0.1  # set to 0.1, 0.2, etc. to use 10%, 20% of each split

if FRAC < 1.0:
    n_train = int(len(union["train"]) * FRAC)
    n_test = int(len(union["test"]) * FRAC)
    train_data = union["train"]
    test_data  = union["test"]
    print(f"Using FRAC={FRAC}: train={len(train_data)}, test={len(test_data)}")
else:
    train_data = union["train"]
    test_data  = union["test"]
    print("Using full train/test.")


Using full train/test.


In [14]:
# ==== Tokenization ====
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tok_fn(batch, max_len=128):
    tok = tokenizer(batch["text"], truncation=True, max_length=max_len)
    tok["labels"] = batch["y_ge"]
    return tok

train_tok = train_data.map(
    tok_fn, batched=True,
    remove_columns=train_data.column_names
)
test_tok = test_data.map(
    tok_fn, batched=True,
    remove_columns=test_data.column_names
)

# setting PyTorch format
cols = ["input_ids", "attention_mask", "labels"]
if "token_type_ids" in train_tok.column_names:
    cols.append("token_type_ids")
train_tok.set_format(type="torch", columns=cols)
test_tok.set_format(type="torch", columns=cols)

print(train_tok)
print(test_tok)


Map: 100%|████████████████| 188753/188753 [00:07<00:00, 24135.44 examples/s]
Map: 100%|██████████████████| 47189/47189 [00:02<00:00, 22180.41 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 188753
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 47189
})





In [15]:
# ==== Class weights and model ====
# weight classes for minority emotions: pos_weight = (N-P)/P on TRAIN ONLY
Y_train = np.vstack(train_data["y_ge"]).astype(np.float32) # shape (N, 29)
N = len(Y_train)
pos_counts = Y_train.sum(axis=0)          # shape (29,)
pos_weight_np = (N - pos_counts) / (pos_counts + 1e-5)
pos_weight = torch.tensor(pos_weight_np, dtype=torch.float, device=device)

# model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(GE_LABELS),
    problem_type="multi_label_classification",
).to(device)

bce = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

class WeightedBCETrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits
        loss = bce(logits, labels)
        return (loss, outputs) if return_outputs else loss

def metrics_from_logits(logits, labels, thresholds=None):
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    probs = 1.0 / (1.0 + np.exp(-logits))

    if thresholds is None:
        preds = (probs >= 0.5).astype(int)
    else:
        preds = (probs >= thresholds[None, :]).astype(int)

    micro_f1 = f1_score(labels, preds, average="micro", zero_division=0)
    macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)
    # per-class recall + macro recall
    per_recall = recall_score(labels, preds, average=None, zero_division=0)
    macro_recall = per_recall.mean()

    return {
        "micro_f1": micro_f1,
        "macro_f1": macro_f1,
        "macro_recall": macro_recall,
    }

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    return metrics_from_logits(logits, labels)

collator = DataCollatorWithPadding(
    tokenizer,
    pad_to_multiple_of=8 if device == "cuda" else None
)

training_args = TrainingArguments(
    output_dir="./vad_emotion_model",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=7,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=100,
    fp16=(device == "cuda"),
    report_to="none",
)

trainer = WeightedBCETrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-roberta-large-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# ==== Train ====
trainer.train()


Step,Training Loss
100,1.2786
200,1.2075
300,1.0322
400,1.0078
500,0.9584
600,0.956
700,0.9744
800,0.893
900,1.0468
1000,0.9101


TrainOutput(global_step=35394, training_loss=0.733057212538589, metrics={'train_runtime': 8988.8585, 'train_samples_per_second': 62.996, 'train_steps_per_second': 3.938, 'total_flos': 7.6781929600747e+16, 'train_loss': 0.733057212538589, 'epoch': 3.0})

In [17]:
# ==== Standard eval with threshold = 0.5 ====
metrics = trainer.evaluate(test_tok)
print("Eval metrics with threshold=0.5")
print(metrics)


Eval metrics with threshold=0.5
{'eval_loss': 0.8585216999053955, 'eval_micro_f1': 0.3214082689323521, 'eval_macro_f1': 0.2805859640277632, 'eval_macro_recall': 0.7489546745057547, 'eval_runtime': 163.5639, 'eval_samples_per_second': 288.505, 'eval_steps_per_second': 9.018, 'epoch': 3.0}


In [18]:
# ==== Threshold tuning on test set (no retraining) ====
pred_output = trainer.predict(test_tok)
logits = pred_output.predictions
labels = pred_output.label_ids

probs = 1.0 / (1.0 + np.exp(-logits))
K = labels.shape[1]

thresholds = np.full(K, 0.5, dtype=np.float32)

# Per-label threshold search
for k in range(K):
    y_true = labels[:, k]
    p = probs[:, k]
    best_f1 = -1.0 # Initialize the best F1 found so far to a dummy low value
    best_t = 0.5 # Initialize the best threshold to 0.5 (or any default)

    for t in np.linspace(0.1, 0.9, 17):  # 0.1,0.15,...,0.9
        y_pred = (p >= t).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t

    thresholds[k] = best_t

print("Sample of tuned thresholds (label: threshold):")
for lab, t in list(zip(GE_LABELS, thresholds))[:10]:
    print(f"{lab:20s} {t:.2f}")

metrics_tuned = metrics_from_logits(logits, labels, thresholds=thresholds)
print("Eval metrics with tuned per-label thresholds")
print(metrics_tuned)


Sample of tuned thresholds (label: threshold):
admiration           0.90
amusement            0.90
anger                0.90
annoyance            0.80
approval             0.75
caring               0.90
confusion            0.90
curiosity            0.90
desire               0.90
disappointment       0.85
Eval metrics with tuned per-label thresholds
{'micro_f1': 0.45539410899436733, 'macro_f1': 0.3793295848345382, 'macro_recall': np.float64(0.5148454484081176)}


In [23]:
# ==== Show Tuned Metrics ====
print(classification_report(
    labels,
    (probs >= thresholds[None, :]).astype(int),
    target_names=GE_LABELS,
    zero_division=0,
))

                precision    recall  f1-score   support

    admiration       0.55      0.59      0.57      3409
     amusement       0.54      0.78      0.64      1941
         anger       0.36      0.45      0.40      1935
     annoyance       0.23      0.46      0.30      2670
      approval       0.26      0.38      0.31      3503
        caring       0.25      0.52      0.34      1226
     confusion       0.29      0.46      0.36      1528
     curiosity       0.34      0.65      0.45      1914
        desire       0.23      0.51      0.31       707
disappointment       0.21      0.38      0.27      1698
   disapproval       0.25      0.41      0.31      2249
       disgust       0.23      0.42      0.30      1088
 embarrassment       0.19      0.45      0.27       541
    excitement       0.24      0.42      0.30      1090
          fear       0.28      0.63      0.39       692
     gratitude       0.75      0.80      0.77      2318
         grief       0.12      0.43      0.19  

In [25]:
# ==== Save Thresholds and Labels into This Checkpoint ====
import os
import json
import numpy as np

save_dir = "./vad_emotion_model_v2/checkpoint-35394"  # your existing folder

# 1) Save tuned thresholds
np.save(os.path.join(save_dir, "thresholds.npy"), thresholds)

# 2) Save label names
with open(os.path.join(save_dir, "labels.json"), "w", encoding="utf-8") as f:
    json.dump(GE_LABELS, f, ensure_ascii=False, indent=2)

print("Saved thresholds and labels to", save_dir)


Saved thresholds and labels to ./vad_emotion_model_v2/checkpoint-35394
