In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
import os, sys, json
module_path = os.path.abspath(os.path.join('..', '..')) # or the path to your source code
sys.path.insert(0, module_path)
from src.utils import load_emorynlp, load_isear

# ===========
# 1) load dataframe
#train_df = load_emorynlp(split='train')
#val_df = load_emorynlp(split='dev')
#test_df = load_emorynlp(split='test')
#df = load_isear()
df = pd.read_csv("hf://datasets/gsri-18/ISEAR-dataset-complete/ISEAR_dataset_complete.csv")

train_df, val_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42, stratify=val_df['label'])

train_df.head()

2025-08-28 14:03:32.151653: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loaded 7102 samples from ISEAR dataset


Unnamed: 0,id,label,text
1297,21794,fear,Ffs dreadful defending
4679,20333,fear,@markhberman2003 @LanceZierlein @790blessing e...
2559,40950,sadness,@Magrove86Mark @clareftballnews jersey gloves?...
163,11104,anger,@inthefade going back to blissful ignorance?!
5661,30168,joy,@followAdamA looking back on recent tweets see...


In [2]:


# ===========
# 2) encode labels (string -> int)
# ===========
le = LabelEncoder()
le.fit(train_df["label"])

train_df["label_id"] = le.transform(train_df["label"])
val_df["label_id"]   = le.transform(val_df["label"])
test_df["label_id"]  = le.transform(test_df["label"])

id2label = {i: l for i, l in enumerate(le.classes_)}
label2id = {l: i for i, l in id2label.items()}

print("Classes:", id2label)


# ===========
# 3) Convert pandas -> Hugging Face Dataset
# ===========
train_ds = Dataset.from_pandas(train_df[["text","label_id"]].rename(columns={"label_id":"labels"}))
val_ds   = Dataset.from_pandas(val_df[["text","label_id"]].rename(columns={"label_id":"labels"}))
test_ds  = Dataset.from_pandas(test_df[["text","label_id"]].rename(columns={"label_id":"labels"}))

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})

# ===========
# 4) Tokenizer & model
# ===========
model_name = "distilbert-base-multilingual-cased"
#model_name = "xlm-roberta-base"
#model_name = "meta-llama/Llama-3.2-1B" #"cardiffnlp/twitter-xlm-roberta-base-emotion"
tok = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tok(batch["text"], truncation=True, max_length=64) #128

dataset_tok = dataset.map(tokenize, batched=True)
collator = DataCollatorWithPadding(tokenizer=tok)

cfg = AutoConfig.from_pretrained(model_name, num_labels=len(le.classes_), id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)

# ===========
# 5) Metrics
# ===========
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

# ===========
# 6) Trainer
# ===========
args = TrainingArguments(
    output_dir="mdistilbert_emotions",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,#5e-5,
    per_device_train_batch_size=8,#16,
    per_device_eval_batch_size=16,#32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_tok["train"],
    eval_dataset=dataset_tok["validation"],
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# ===========
# 7) Train and evaluate
# ===========
trainer.train()
print("Final test results:", trainer.evaluate(dataset_tok["test"]))

Classes: {0: 'anger', 1: 'fear', 2: 'joy', 3: 'sadness'}


Map:   0%|          | 0/4971 [00:00<?, ? examples/s]

Map:   0%|          | 0/1065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/1866 [00:00<?, ?it/s]

{'loss': 1.4256, 'grad_norm': 4.1073503494262695, 'learning_rate': 0.00048660235798499466, 'epoch': 0.08}
{'loss': 1.4245, 'grad_norm': 3.7308976650238037, 'learning_rate': 0.0004732047159699893, 'epoch': 0.16}
{'loss': 1.393, 'grad_norm': 3.5613629817962646, 'learning_rate': 0.00045980707395498397, 'epoch': 0.24}
{'loss': 1.4177, 'grad_norm': 2.5845584869384766, 'learning_rate': 0.00044640943193997856, 'epoch': 0.32}
{'loss': 1.3927, 'grad_norm': 2.9472012519836426, 'learning_rate': 0.0004330117899249732, 'epoch': 0.4}
{'loss': 1.3901, 'grad_norm': 2.973587989807129, 'learning_rate': 0.00041961414790996787, 'epoch': 0.48}
{'loss': 1.411, 'grad_norm': 2.1490581035614014, 'learning_rate': 0.0004062165058949625, 'epoch': 0.56}
{'loss': 1.3915, 'grad_norm': 2.877133846282959, 'learning_rate': 0.0003928188638799571, 'epoch': 0.64}
{'loss': 1.384, 'grad_norm': 1.8343064785003662, 'learning_rate': 0.00037942122186495177, 'epoch': 0.72}
{'loss': 1.4001, 'grad_norm': 2.6202657222747803, 'learn

  0%|          | 0/67 [00:00<?, ?it/s]

{'eval_loss': 1.3821744918823242, 'eval_accuracy': 0.3173708920187793, 'eval_f1_macro': 0.12045616535994298, 'eval_runtime': 66.6407, 'eval_samples_per_second': 15.981, 'eval_steps_per_second': 1.005, 'epoch': 1.0}
{'loss': 1.4054, 'grad_norm': 1.8488689661026, 'learning_rate': 0.0003258306538049303, 'epoch': 1.05}
{'loss': 1.3872, 'grad_norm': 1.5437819957733154, 'learning_rate': 0.00031243301178992503, 'epoch': 1.13}
{'loss': 1.3745, 'grad_norm': 1.076107382774353, 'learning_rate': 0.0002990353697749196, 'epoch': 1.21}
{'loss': 1.3693, 'grad_norm': 0.9571390151977539, 'learning_rate': 0.0002856377277599143, 'epoch': 1.29}
{'loss': 1.3816, 'grad_norm': 2.0335021018981934, 'learning_rate': 0.0002722400857449089, 'epoch': 1.37}
{'loss': 1.389, 'grad_norm': 1.1628592014312744, 'learning_rate': 0.0002588424437299036, 'epoch': 1.45}
{'loss': 1.3863, 'grad_norm': 0.8662708401679993, 'learning_rate': 0.0002454448017148982, 'epoch': 1.53}
{'loss': 1.3844, 'grad_norm': 0.9215345978736877, 'lea

  0%|          | 0/67 [00:00<?, ?it/s]

{'eval_loss': 1.3764488697052002, 'eval_accuracy': 0.3173708920187793, 'eval_f1_macro': 0.12045616535994298, 'eval_runtime': 68.5075, 'eval_samples_per_second': 15.546, 'eval_steps_per_second': 0.978, 'epoch': 2.0}
{'loss': 1.3805, 'grad_norm': 1.47127103805542, 'learning_rate': 0.00016505894962486604, 'epoch': 2.01}
{'loss': 1.3717, 'grad_norm': 1.474887728691101, 'learning_rate': 0.00015166130760986066, 'epoch': 2.09}
{'loss': 1.3835, 'grad_norm': 1.297361969947815, 'learning_rate': 0.00013826366559485531, 'epoch': 2.17}
{'loss': 1.3746, 'grad_norm': 0.9128406047821045, 'learning_rate': 0.00012486602357984997, 'epoch': 2.25}
{'loss': 1.3754, 'grad_norm': 1.133837342262268, 'learning_rate': 0.00011146838156484459, 'epoch': 2.33}
{'loss': 1.3722, 'grad_norm': 1.0803107023239136, 'learning_rate': 9.807073954983923e-05, 'epoch': 2.41}
{'loss': 1.3696, 'grad_norm': 0.7425651550292969, 'learning_rate': 8.467309753483388e-05, 'epoch': 2.49}
{'loss': 1.3837, 'grad_norm': 1.3321400880813599, 

RuntimeError: [enforce fail at inline_container.cc:595] . unexpected pos 24128 vs 24020

{'loss': 24.9592, 'grad_norm': 1.0552517175674438, 'learning_rate': 0.048660235798499464, 'epoch': 0.08}
{'loss': 1.5166, 'grad_norm': 0.22348372638225555, 'learning_rate': 0.04732047159699893, 'epoch': 0.16}
{'loss': 1.3658, 'grad_norm': 0.3004882335662842, 'learning_rate': 0.045980707395498394, 'epoch': 0.24}
{'loss': 1.3918, 'grad_norm': 0.25394704937934875, 'learning_rate': 0.04464094319399786, 'epoch': 0.32}
{'loss': 1.3852, 'grad_norm': 0.3115262985229492, 'learning_rate': 0.04330117899249732, 'epoch': 0.4}
{'loss': 1.3673, 'grad_norm': 0.3080769181251526, 'learning_rate': 0.04196141479099679, 'epoch': 0.48}
{'loss': 1.3941, 'grad_norm': 0.1880466192960739, 'learning_rate': 0.04062165058949625, 'epoch': 0.56}