In [1]:
#!pip install -U datasets optuna

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [2]:
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import set_seed, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
import optuna
import pandas as pd

In [3]:
seed = 42
set_seed(seed)

In [4]:
train = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="train")
# train_eng = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "ukr", split="train")

val = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="dev")
# val_eng = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "eng", split="dev")
test = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="test")

In [6]:
# train = concatenate_datasets([train, train_eng])

In [7]:
# val = concatenate_datasets([val, val_eng])

In [8]:
emotion_cols = ['anger', 'fear', 'joy', 'disgust', 'sadness', 'surprise']

In [9]:
def create_labels(examples):
    labels = []
    for i in range(len(examples['text'])):
        label = [float(examples[col][i]) for col in emotion_cols]
        labels.append(label)
    examples['labels'] = labels
    return examples

train = train.map(create_labels, batched=True)
val = val.map(create_labels, batched=True)
test = test.map(create_labels, batched=True)

Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

In [10]:
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=False, max_length=512)

train_tokenized = train.map(tokenize_function, batched=True)
val_tokenized = val.map(tokenize_function, batched=True)

Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

In [12]:
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [13]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(predictions)).numpy()
    y_pred = predictions > 0.5

    results = {}
    for average in ['micro', 'macro']:
        results[f'{average}_recall'] = recall_score(labels, y_pred, average=average, zero_division=0)
        results[f'{average}_precision'] = precision_score(labels, y_pred, average=average, zero_division=0)
        results[f'{average}_f1'] = f1_score(labels, y_pred, average=average, zero_division=0)

    return results

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)
    num_epochs = trial.suggest_int("num_train_epochs", 2, 7)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(emotion_cols),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir=f'./results/trial_{trial.number}',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        logging_steps=50,
        eval_strategy="epoch",
        metric_for_best_model="eval_macro_f1",
        logging_dir=f'./logs/trial_{trial.number}',
        save_strategy="no",
        report_to=None,
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    trainer.train()

    eval_results = trainer.evaluate()

    return eval_results["eval_macro_f1"]

In [17]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("\nBest hyperparameters:")
print(study.best_params)
print(f"Best macro F1: {study.best_value:.4f}")

[I 2025-06-12 22:50:46,047] A new study created in memory with name: no-name-8dc3dd17-b8a6-4239-b0b6-998bec4f3f12
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2569,0.224113,0.709845,0.761111,0.734584,0.720573,0.798664,0.749046
2,0.1783,0.19261,0.766839,0.8,0.783069,0.781301,0.807663,0.787782
3,0.1412,0.194267,0.740933,0.877301,0.803371,0.752696,0.89588,0.809233
4,0.098,0.175378,0.772021,0.866279,0.816438,0.78885,0.877467,0.82731
5,0.0644,0.18399,0.756477,0.884848,0.815642,0.770042,0.889119,0.82207
6,0.0479,0.194129,0.782383,0.877907,0.827397,0.798349,0.88304,0.834268
7,0.0239,0.194067,0.782383,0.862857,0.820652,0.798349,0.869818,0.828647


[I 2025-06-12 22:53:21,906] Trial 0 finished with value: 0.8286472173023985 and parameters: {'learning_rate': 4.0880000250683644e-05, 'batch_size': 8, 'weight_decay': 0.1896718711665509, 'warmup_steps': 94, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4219,0.384289,0.202073,0.866667,0.327731,0.172951,0.274962,0.20754
2,0.3224,0.292123,0.435233,0.923077,0.591549,0.427599,0.756774,0.533381


[I 2025-06-12 22:53:52,202] Trial 1 finished with value: 0.5333814333814334 and parameters: {'learning_rate': 1.097708908459796e-05, 'batch_size': 16, 'weight_decay': 0.02690975343175591, 'warmup_steps': 260, 'num_train_epochs': 2}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2906,0.257719,0.601036,0.822695,0.694611,0.602244,0.848238,0.67177
2,0.1921,0.208931,0.725389,0.858896,0.786517,0.739293,0.882121,0.793057
3,0.1407,0.184106,0.704663,0.877419,0.781609,0.721193,0.890516,0.788572


[I 2025-06-12 22:54:59,200] Trial 2 finished with value: 0.7885720072716738 and parameters: {'learning_rate': 4.837599564129313e-05, 'batch_size': 8, 'weight_decay': 0.1478881164993717, 'warmup_steps': 465, 'num_train_epochs': 3}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3973,0.439399,0.0,0.0,0.0,0.0,0.0,0.0
2,0.4289,0.438878,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4411,0.442411,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4331,0.43827,0.0,0.0,0.0,0.0,0.0,0.0
5,0.426,0.437776,0.0,0.0,0.0,0.0,0.0,0.0
6,0.4306,0.438905,0.0,0.0,0.0,0.0,0.0,0.0


[I 2025-06-12 22:57:10,533] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 0.00020096667289333227, 'batch_size': 8, 'weight_decay': 0.08350964310886518, 'warmup_steps': 197, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4451,0.441182,0.0,0.0,0.0,0.0,0.0,0.0
2,0.4289,0.440606,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4385,0.442471,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4328,0.438971,0.0,0.0,0.0,0.0,0.0,0.0
5,0.4254,0.438445,0.0,0.0,0.0,0.0,0.0,0.0
6,0.4298,0.439886,0.0,0.0,0.0,0.0,0.0,0.0


[I 2025-06-12 22:59:23,565] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 0.00020769807822696386, 'batch_size': 8, 'weight_decay': 0.27862641151560985, 'warmup_steps': 193, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3611,0.318311,0.331606,0.744186,0.458781,0.350122,0.372312,0.358608
2,0.2855,0.351824,0.279793,0.666667,0.394161,0.32159,0.455314,0.343062
3,0.2608,0.2821,0.590674,0.695122,0.638655,0.574798,0.654118,0.600279
4,0.2476,0.280462,0.590674,0.76,0.664723,0.581559,0.666206,0.61911
5,0.2043,0.281284,0.621762,0.710059,0.662983,0.611599,0.646681,0.623912
6,0.1883,0.256199,0.647668,0.78125,0.708215,0.637769,0.687343,0.658568
7,0.1519,0.25715,0.632124,0.767296,0.693182,0.625775,0.680613,0.648272


[I 2025-06-12 23:01:58,047] Trial 5 finished with value: 0.6482715659537543 and parameters: {'learning_rate': 0.00013954209946765182, 'batch_size': 8, 'weight_decay': 0.1484940977614722, 'warmup_steps': 250, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.5414,0.350888,0.233161,0.75,0.355731,0.202363,0.247868,0.222751
2,0.2628,0.201046,0.699482,0.90604,0.789474,0.717072,0.901413,0.795182
3,0.1754,0.185641,0.725389,0.903226,0.804598,0.732702,0.914414,0.807528
4,0.1507,0.21986,0.663212,0.888889,0.759644,0.668438,0.909352,0.753056
5,0.1165,0.199624,0.751295,0.863095,0.803324,0.763209,0.864637,0.807713


[I 2025-06-12 23:03:00,200] Trial 6 finished with value: 0.8077129643832414 and parameters: {'learning_rate': 0.00011404349552787183, 'batch_size': 32, 'weight_decay': 0.23353708835005116, 'warmup_steps': 344, 'num_train_epochs': 5}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2747,0.248663,0.595855,0.746753,0.662824,0.615578,0.758761,0.675893
2,0.1826,0.221408,0.735751,0.78453,0.759358,0.749455,0.816191,0.778443
3,0.1606,0.186275,0.746114,0.9,0.815864,0.75582,0.915235,0.825667
4,0.1171,0.201565,0.735751,0.8875,0.804533,0.751843,0.898081,0.815112
5,0.0846,0.193448,0.746114,0.90566,0.818182,0.760291,0.907979,0.823274
6,0.0622,0.186879,0.772021,0.871345,0.818681,0.777968,0.882786,0.823307


[I 2025-06-12 23:05:14,126] Trial 7 finished with value: 0.8233069577941862 and parameters: {'learning_rate': 6.861230099609194e-05, 'batch_size': 8, 'weight_decay': 0.03700627649030581, 'warmup_steps': 73, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3314,0.598687,0.19171,0.229814,0.20904,0.248474,0.214527,0.157259
2,0.4329,0.440779,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4412,0.441157,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4326,0.438315,0.0,0.0,0.0,0.0,0.0,0.0
5,0.4275,0.437057,0.0,0.0,0.0,0.0,0.0,0.0
6,0.4303,0.43709,0.0,0.0,0.0,0.0,0.0,0.0
7,0.415,0.437974,0.0,0.0,0.0,0.0,0.0,0.0


[I 2025-06-12 23:07:47,740] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 0.0002059288170791529, 'batch_size': 8, 'weight_decay': 0.1022161063626476, 'warmup_steps': 417, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3285,0.2993,0.409326,0.724771,0.523179,0.436059,0.584375,0.452072
2,0.413,0.436703,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4343,0.446019,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4274,0.441123,0.0,0.0,0.0,0.0,0.0,0.0
5,0.4356,0.439794,0.0,0.0,0.0,0.0,0.0,0.0
6,0.4283,0.437522,0.0,0.0,0.0,0.0,0.0,0.0
7,0.4186,0.4385,0.0,0.0,0.0,0.0,0.0,0.0


[I 2025-06-12 23:09:29,783] Trial 9 finished with value: 0.0 and parameters: {'learning_rate': 0.00047947502621289377, 'batch_size': 16, 'weight_decay': 0.19436212778263576, 'warmup_steps': 357, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4604,0.316791,0.352332,0.819277,0.492754,0.366624,0.631112,0.442442
2,0.247,0.224705,0.668394,0.86,0.752187,0.689188,0.871307,0.762448
3,0.1754,0.183538,0.73057,0.903846,0.808023,0.739705,0.917386,0.814989
4,0.1434,0.178058,0.740933,0.89375,0.810198,0.752373,0.897091,0.815579


[I 2025-06-12 23:10:19,554] Trial 10 finished with value: 0.8155792620078334 and parameters: {'learning_rate': 2.8118082064076645e-05, 'batch_size': 32, 'weight_decay': 0.2108715764777704, 'warmup_steps': 16, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8286472173023985.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2708,0.250155,0.683938,0.729282,0.705882,0.695758,0.778007,0.722978
2,0.1807,0.193995,0.746114,0.79558,0.770053,0.758829,0.811112,0.779469
3,0.1348,0.166914,0.777202,0.887574,0.828729,0.783552,0.897775,0.833799
4,0.0878,0.16913,0.777202,0.862069,0.817439,0.783023,0.872561,0.824112
5,0.0688,0.178134,0.766839,0.902439,0.829132,0.775931,0.89969,0.830439


[I 2025-06-12 23:12:11,881] Trial 11 finished with value: 0.8304391511179802 and parameters: {'learning_rate': 3.6080447584106924e-05, 'batch_size': 8, 'weight_decay': 0.01575328599390322, 'warmup_steps': 33, 'num_train_epochs': 5}. Best is trial 11 with value: 0.8304391511179802.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2698,0.227168,0.689119,0.810976,0.745098,0.680764,0.883433,0.746411
2,0.1721,0.178116,0.756477,0.863905,0.80663,0.767905,0.87258,0.812162
3,0.1255,0.166217,0.761658,0.907407,0.828169,0.769574,0.912677,0.832382
4,0.088,0.162058,0.766839,0.89697,0.826816,0.776559,0.902448,0.832499


[I 2025-06-12 23:13:39,811] Trial 12 finished with value: 0.8324990546825356 and parameters: {'learning_rate': 2.868318448990469e-05, 'batch_size': 8, 'weight_decay': 0.004901939692684662, 'warmup_steps': 87, 'num_train_epochs': 4}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2942,0.258236,0.61658,0.804054,0.697947,0.608187,0.802675,0.673322
2,0.1794,0.184504,0.715026,0.896104,0.795389,0.734067,0.891982,0.802361
3,0.1469,0.176652,0.746114,0.872727,0.804469,0.761019,0.876137,0.812172
4,0.1091,0.178653,0.73057,0.892405,0.803419,0.748872,0.899714,0.812444


[I 2025-06-12 23:15:07,688] Trial 13 finished with value: 0.8124441130074933 and parameters: {'learning_rate': 1.9099410076449836e-05, 'batch_size': 8, 'weight_decay': 0.013061325859571286, 'warmup_steps': 18, 'num_train_epochs': 4}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.6003,0.386904,0.150259,0.966667,0.26009,0.123932,0.161111,0.140097
2,0.3334,0.255036,0.569948,0.839695,0.679012,0.565678,0.862515,0.66578
3,0.2095,0.208959,0.673575,0.878378,0.762463,0.690444,0.895207,0.76536
4,0.1642,0.182298,0.756477,0.874251,0.811111,0.761027,0.880656,0.813876
5,0.1359,0.175694,0.746114,0.872727,0.804469,0.754411,0.871324,0.807315


[I 2025-06-12 23:16:09,459] Trial 14 finished with value: 0.807314511019552 and parameters: {'learning_rate': 2.0782918150713653e-05, 'batch_size': 32, 'weight_decay': 0.07459355483959683, 'warmup_steps': 119, 'num_train_epochs': 5}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3855,0.35046,0.233161,0.865385,0.367347,0.204176,0.603831,0.259517
2,0.2879,0.266206,0.523316,0.827869,0.64127,0.51524,0.687037,0.576435
3,0.2265,0.244682,0.569948,0.846154,0.681115,0.578813,0.859662,0.665903


[I 2025-06-12 23:16:54,056] Trial 15 finished with value: 0.6659028165720425 and parameters: {'learning_rate': 1.1679984563154842e-05, 'batch_size': 16, 'weight_decay': 0.05590774502094219, 'warmup_steps': 144, 'num_train_epochs': 3}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2543,0.216022,0.725389,0.8,0.76087,0.732872,0.843691,0.774051
2,0.1623,0.191692,0.772021,0.841808,0.805405,0.78912,0.85383,0.813855
3,0.1195,0.162737,0.751295,0.900621,0.819209,0.760821,0.900599,0.820389
4,0.0776,0.15706,0.777202,0.887574,0.828729,0.785007,0.89345,0.830656


[I 2025-06-12 23:18:23,645] Trial 16 finished with value: 0.8306558552978859 and parameters: {'learning_rate': 3.669527734640834e-05, 'batch_size': 8, 'weight_decay': 0.006464328297945909, 'warmup_steps': 48, 'num_train_epochs': 4}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.269,0.251701,0.694301,0.716578,0.705263,0.701647,0.789679,0.713287
2,0.1636,0.188981,0.740933,0.856287,0.794444,0.757374,0.85558,0.799377
3,0.1385,0.173772,0.751295,0.868263,0.805556,0.763784,0.868298,0.809589


[I 2025-06-12 23:19:31,090] Trial 17 finished with value: 0.8095885474800011 and parameters: {'learning_rate': 6.831230146811595e-05, 'batch_size': 8, 'weight_decay': 0.11994908355548693, 'warmup_steps': 175, 'num_train_epochs': 3}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3462,0.307444,0.295337,0.850746,0.438462,0.262101,0.568793,0.329893
2,0.2289,0.217714,0.658031,0.907143,0.762763,0.669283,0.907359,0.761017
3,0.1637,0.182761,0.735751,0.898734,0.809117,0.748827,0.900708,0.815086
4,0.1311,0.178386,0.740933,0.89375,0.810198,0.744508,0.890126,0.808349


[I 2025-06-12 23:20:29,766] Trial 18 finished with value: 0.8083493272696304 and parameters: {'learning_rate': 1.945216770154496e-05, 'batch_size': 16, 'weight_decay': 0.0032558539641831833, 'warmup_steps': 62, 'num_train_epochs': 4}. Best is trial 12 with value: 0.8324990546825356.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.6385,0.410631,0.056995,1.0,0.107843,0.047009,0.166667,0.073333
2,0.3721,0.312186,0.38342,0.850575,0.528571,0.370661,0.719124,0.474571


[I 2025-06-12 23:20:55,365] Trial 19 finished with value: 0.474571137370383 and parameters: {'learning_rate': 2.8803463262445056e-05, 'batch_size': 32, 'weight_decay': 0.05853958492161683, 'warmup_steps': 286, 'num_train_epochs': 2}. Best is trial 12 with value: 0.8324990546825356.



Best hyperparameters:
{'learning_rate': 2.868318448990469e-05, 'batch_size': 8, 'weight_decay': 0.004901939692684662, 'warmup_steps': 87, 'num_train_epochs': 4}
Best macro F1: 0.8325


In [18]:
Best hyperparameters:
{'learning_rate': 1.952733015072204e-05, 'batch_size': 16, 'weight_decay': 0.28658541378102453, 'warmup_steps': 471, 'num_train_epochs': 7}
Best macro F1: 0.9054

SyntaxError: invalid syntax (4240920430.py, line 1)

In [19]:
best_params = study.best_params
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_cols),
    problem_type="multi_label_classification"
)

training_args = TrainingArguments(
    output_dir='./best_model',
    num_train_epochs=best_params["num_train_epochs"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    warmup_steps=best_params["warmup_steps"],
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir='./logs/best_model',
    seed=seed,
)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2625,0.234877,0.637306,0.81457,0.715116,0.636522,0.823705,0.704607
2,0.1661,0.189025,0.761658,0.825843,0.792453,0.775043,0.833577,0.798757
3,0.1305,0.164405,0.761658,0.913043,0.830508,0.765201,0.920944,0.831661
4,0.0907,0.160479,0.777202,0.892857,0.831025,0.780635,0.899369,0.833552


TrainOutput(global_step=1340, training_loss=0.1875365776802177, metrics={'train_runtime': 87.6756, 'train_samples_per_second': 122.223, 'train_steps_per_second': 15.284, 'total_flos': 248538900917736.0, 'train_loss': 0.1875365776802177, 'epoch': 4.0})

In [20]:
def find_best_threshold(model, val_dataset, thresholds=np.arange(0.1, 0.9, 0.05)):
    model.eval()

    predictions = trainer.predict(val_dataset)
    probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
    true_labels = predictions.label_ids

    optimal_thresholds = {}
    best_f1_scores = {}
    
    print("Finding optimal thresholds for each emotion:")
    print("=" * 60)
    
    # Find optimal threshold for each emotion separately
    for i, emotion in enumerate(emotion_cols):
        best_threshold = 0.5
        best_f1 = 0
        
        print(f"\n{emotion.upper()}:")
        print("-" * 30)
        
        for threshold in thresholds:
            # Binary predictions for this emotion only
            y_pred_emotion = (probs[:, i] > threshold).astype(int)
            y_true_emotion = true_labels[:, i]
            
            # Calculate F1 for this emotion
            f1_emotion = f1_score(y_true_emotion, y_pred_emotion, zero_division=0)
            
            print(f"Threshold {threshold:.2f}: F1 = {f1_emotion:.4f}")
            
            if f1_emotion > best_f1:
                best_f1 = f1_emotion
                best_threshold = threshold
        
        optimal_thresholds[emotion] = best_threshold
        best_f1_scores[emotion] = best_f1
        
        print(f"Best threshold for {emotion}: {best_threshold:.2f} (F1: {best_f1:.4f})")
    return optimal_thresholds

In [21]:
def find_best_threshold(model, val_dataset, thresholds=np.arange(0.1, 0.9, 0.05)):
    model.eval()
    predictions = trainer.predict(val_dataset)
    probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
    true_labels = predictions.label_ids
    best_threshold = 0.5
    best_f1 = 0
    print("Threshold optimization:")
    for threshold in thresholds:
        y_pred = probs > threshold
        f1_macro = f1_score(true_labels, y_pred, average='macro', zero_division=0)
        print(f"Threshold {threshold:.2f}: Macro F1 = {f1_macro:.4f}")
        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_threshold = threshold
    print(f"\nBest threshold: {best_threshold:.2f} (Macro F1: {best_f1:.4f})")
    return best_threshold

In [22]:
best_threshold = find_best_threshold(model, val_tokenized)

Threshold optimization:
Threshold 0.10: Macro F1 = 0.7770
Threshold 0.15: Macro F1 = 0.8143
Threshold 0.20: Macro F1 = 0.8246
Threshold 0.25: Macro F1 = 0.8375
Threshold 0.30: Macro F1 = 0.8383
Threshold 0.35: Macro F1 = 0.8480
Threshold 0.40: Macro F1 = 0.8481
Threshold 0.45: Macro F1 = 0.8437
Threshold 0.50: Macro F1 = 0.8336
Threshold 0.55: Macro F1 = 0.8290
Threshold 0.60: Macro F1 = 0.8329
Threshold 0.65: Macro F1 = 0.8325
Threshold 0.70: Macro F1 = 0.8344
Threshold 0.75: Macro F1 = 0.8222
Threshold 0.80: Macro F1 = 0.8157
Threshold 0.85: Macro F1 = 0.7954

Best threshold: 0.40 (Macro F1: 0.8481)


In [23]:
# test_tokenized = test.map(tokenize_function, batched=True)
# test_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# test_predictions = trainer.predict(test_tokenized)
# test_probs = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()
# test_pred_labels = test_probs > best_threshold
# true_test_labels = test_predictions.label_ids

In [24]:
test_tokenized = test.map(tokenize_function, batched=True)
test_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

test_predictions = trainer.predict(test_tokenized)
test_probs = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()
test_pred_labels = np.zeros_like(test_probs)
for i, emotion in enumerate(emotion_cols):
    test_pred_labels[:, i] = (test_probs[:, i] > 0.5).astype(int)
true_test_labels = test_predictions.label_ids

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [25]:
# print(f"\nTest Results with optimal threshold ({best_threshold:.2f}):")
for average in ['micro', 'macro']:
    recall = recall_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    precision = precision_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    f1 = f1_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    print(f'{average.upper()} recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}')

print(f"\nPer-class Results:")
class_recall = recall_score(true_test_labels, test_pred_labels, average=None, zero_division=0)
class_precision = precision_score(true_test_labels, test_pred_labels, average=None, zero_division=0)
class_f1 = f1_score(true_test_labels, test_pred_labels, average=None, zero_division=0)

for i, emotion in enumerate(emotion_cols):
    print(f'{emotion.upper()}: recall: {round(class_recall[i], 4)}, precision: {round(class_precision[i], 4)}, f1: {round(class_f1[i], 4)}')

print(f"\nClass distribution in test set:")
for i, emotion in enumerate(emotion_cols):
    true_count = int(true_test_labels[:, i].sum())
    pred_count = int(test_pred_labels[:, i].sum())
    total = len(true_test_labels)
    print(f'{emotion.upper()}: true: {true_count}/{total} ({true_count/total:.1%}), predicted: {pred_count}/{total} ({pred_count/total:.1%})')

MICRO recall: 0.7777, precision: 0.8712, f1: 0.8218
MACRO recall: 0.7782, precision: 0.8762, f1: 0.8225

Per-class Results:
ANGER: recall: 0.7611, precision: 0.839, f1: 0.7981
FEAR: recall: 0.8611, precision: 0.8774, f1: 0.8692
JOY: recall: 0.8238, precision: 0.8933, f1: 0.8571
DISGUST: recall: 0.7049, precision: 0.9451, f1: 0.8075
SADNESS: recall: 0.7376, precision: 0.8889, f1: 0.8062
SURPRISE: recall: 0.7805, precision: 0.8136, f1: 0.7967

Class distribution in test set:
ANGER: true: 452/2000 (22.6%), predicted: 410/2000 (20.5%)
FEAR: true: 216/2000 (10.8%), predicted: 212/2000 (10.6%)
JOY: true: 386/2000 (19.3%), predicted: 356/2000 (17.8%)
DISGUST: true: 244/2000 (12.2%), predicted: 182/2000 (9.1%)
SADNESS: true: 282/2000 (14.1%), predicted: 234/2000 (11.7%)
SURPRISE: true: 246/2000 (12.3%), predicted: 236/2000 (11.8%)


In [36]:
np.save('bert.npy', test_probs)



In [37]:
arr_loaded

array([[0.01563433, 0.9697105 , 0.01789523, 0.0131724 , 0.01356052,
        0.01505389],
       [0.00515464, 0.00593253, 0.00646108, 0.00496511, 0.00630429,
        0.00407498],
       [0.98380107, 0.01522074, 0.01516229, 0.02613696, 0.02344373,
        0.01456004],
       ...,
       [0.09306297, 0.00388846, 0.00349513, 0.00452616, 0.03717936,
        0.00292134],
       [0.0076504 , 0.02186554, 0.02283526, 0.01351371, 0.01696276,
        0.96584934],
       [0.00909997, 0.01079418, 0.98410505, 0.00929993, 0.0122868 ,
        0.00937763]], dtype=float32)