In [1]:
#!pip install -U datasets optuna

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [2]:
from datasets import load_dataset
from transformers import set_seed, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
import optuna
import pandas as pd

In [3]:
seed = 42
set_seed(seed)

In [4]:
train = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="train")
val = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="dev")
test = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", "rus", split="test")

In [5]:
emotion_cols = ['anger', 'fear', 'joy', 'disgust', 'sadness', 'surprise']

In [6]:
def create_labels(examples):
    labels = []
    for i in range(len(examples['text'])):
        label = [float(examples[col][i]) for col in emotion_cols]
        labels.append(label)
    examples['labels'] = labels
    return examples

train = train.map(create_labels, batched=True)
val = val.map(create_labels, batched=True)
test = test.map(create_labels, batched=True)

In [7]:
model_name = "DeepPavlov/rubert-base-cased-conversational"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=False, max_length=512)

train_tokenized = train.map(tokenize_function, batched=True)
val_tokenized = val.map(tokenize_function, batched=True)

Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

In [9]:
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(predictions)).numpy()
    y_pred = predictions > 0.5

    results = {}
    for average in ['micro', 'macro']:
        results[f'{average}_recall'] = recall_score(labels, y_pred, average=average, zero_division=0)
        results[f'{average}_precision'] = precision_score(labels, y_pred, average=average, zero_division=0)
        results[f'{average}_f1'] = f1_score(labels, y_pred, average=average, zero_division=0)

    return results

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)
    num_epochs = trial.suggest_int("num_train_epochs", 2, 7)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(emotion_cols),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir=f'./results/trial_{trial.number}',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        logging_steps=50,
        eval_strategy="epoch",
        metric_for_best_model="eval_macro_f1",
        logging_dir=f'./logs/trial_{trial.number}',
        save_strategy="no",
        report_to=None,
        seed=seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    trainer.train()

    eval_results = trainer.evaluate()

    return eval_results["eval_macro_f1"]

In [13]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("\nBest hyperparameters:")
print(study.best_params)
print(f"Best macro F1: {study.best_value:.4f}")

[I 2025-06-13 11:52:09,852] A new study created in memory with name: no-name-a8c9761e-cc31-4f90-bf99-51cc96d486cb
  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.413,0.332565,0.243523,0.921569,0.385246,0.210811,0.471703,0.251958
2,0.2019,0.183849,0.818653,0.840426,0.829396,0.826916,0.854287,0.839073
3,0.1075,0.132825,0.813472,0.923529,0.865014,0.825965,0.923163,0.868217
4,0.0651,0.121228,0.834197,0.914773,0.872629,0.845267,0.917648,0.876775


[I 2025-06-13 11:53:02,401] Trial 0 finished with value: 0.8767750219225855 and parameters: {'learning_rate': 2.9556907845227148e-05, 'batch_size': 16, 'weight_decay': 0.2925600475266136, 'warmup_steps': 345, 'num_train_epochs': 4}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.6614,0.42152,0.0,0.0,0.0,0.0,0.0,0.0
2,0.313,0.225734,0.673575,0.890411,0.766962,0.669614,0.897478,0.743425


[I 2025-06-13 11:53:24,051] Trial 1 finished with value: 0.7434251011949037 and parameters: {'learning_rate': 5.097294412875871e-05, 'batch_size': 32, 'weight_decay': 0.0654128843393699, 'warmup_steps': 330, 'num_train_epochs': 2}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.5006,0.422754,0.0,0.0,0.0,0.0,0.0,0.0
2,0.3158,0.270965,0.595855,0.858209,0.703364,0.569833,0.746447,0.632194


[I 2025-06-13 11:53:51,395] Trial 2 finished with value: 0.6321942398563923 and parameters: {'learning_rate': 1.119686969635339e-05, 'batch_size': 16, 'weight_decay': 0.01945222259408077, 'warmup_steps': 376, 'num_train_epochs': 2}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.6336,0.38835,0.067358,0.8125,0.124402,0.06184,0.294872,0.094732
2,0.2639,0.199064,0.746114,0.862275,0.8,0.746375,0.907056,0.811221
3,0.1317,0.147282,0.818653,0.940476,0.875346,0.829681,0.939281,0.87999
4,0.0876,0.144712,0.823834,0.913793,0.866485,0.82468,0.920712,0.860859
5,0.0581,0.143002,0.823834,0.913793,0.866485,0.83197,0.922094,0.873109
6,0.0434,0.133697,0.839378,0.9,0.868633,0.848912,0.902664,0.873761


[I 2025-06-13 11:54:52,314] Trial 3 finished with value: 0.8737606477537044 and parameters: {'learning_rate': 3.488716597796794e-05, 'batch_size': 32, 'weight_decay': 0.04321943339845891, 'warmup_steps': 172, 'num_train_epochs': 6}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2637,0.209471,0.782383,0.794737,0.788512,0.79974,0.819131,0.798103
2,0.1604,0.133964,0.803109,0.939394,0.865922,0.816906,0.946728,0.873539


[I 2025-06-13 11:55:19,688] Trial 4 finished with value: 0.8735392219427572 and parameters: {'learning_rate': 7.74612866828299e-05, 'batch_size': 16, 'weight_decay': 0.07771574963877871, 'warmup_steps': 244, 'num_train_epochs': 2}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2223,0.235574,0.683938,0.8,0.73743,0.70496,0.851263,0.7364
2,0.1637,0.165714,0.740933,0.910828,0.817143,0.749086,0.914455,0.81352
3,0.1212,0.133956,0.818653,0.918605,0.865753,0.825785,0.922508,0.867789
4,0.0944,0.153795,0.80829,0.886364,0.845528,0.811761,0.932045,0.859369
5,0.0632,0.151911,0.818653,0.88764,0.851752,0.826916,0.895166,0.857488
6,0.0424,0.148767,0.84456,0.881081,0.862434,0.852404,0.893675,0.870294
7,0.0185,0.154249,0.849741,0.863158,0.856397,0.86158,0.863509,0.861149


[I 2025-06-13 11:56:49,127] Trial 5 finished with value: 0.8611492211492212 and parameters: {'learning_rate': 0.00011435644240136533, 'batch_size': 16, 'weight_decay': 0.0672829866714041, 'warmup_steps': 202, 'num_train_epochs': 7}. Best is trial 0 with value: 0.8767750219225855.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2499,0.205779,0.715026,0.932432,0.809384,0.731338,0.933124,0.815074
2,0.1378,0.139798,0.823834,0.873626,0.848,0.833174,0.877318,0.852247
3,0.081,0.110247,0.860104,0.912088,0.885333,0.870656,0.911469,0.889836
4,0.0512,0.10579,0.88601,0.9,0.89295,0.889895,0.900856,0.894425
5,0.0319,0.117404,0.870466,0.913043,0.891247,0.878628,0.913559,0.8951
6,0.0243,0.122622,0.854922,0.891892,0.873016,0.869516,0.888059,0.876416
7,0.0159,0.12035,0.860104,0.892473,0.875989,0.868672,0.890368,0.878263


[I 2025-06-13 11:58:19,508] Trial 6 finished with value: 0.878262662573913 and parameters: {'learning_rate': 4.027084504854194e-05, 'batch_size': 16, 'weight_decay': 0.23960723774351578, 'warmup_steps': 128, 'num_train_epochs': 7}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4777,0.261662,0.590674,0.919355,0.719243,0.561304,0.769444,0.646781
2,0.1993,0.184756,0.761658,0.91875,0.832861,0.774575,0.924089,0.839189
3,0.1275,0.156403,0.792746,0.921687,0.852368,0.800018,0.924614,0.85412
4,0.1025,0.145294,0.80829,0.939759,0.869081,0.812587,0.942583,0.870224
5,0.0862,0.142406,0.803109,0.911765,0.853994,0.809822,0.914587,0.856959


[I 2025-06-13 11:59:10,916] Trial 7 finished with value: 0.8569591842855012 and parameters: {'learning_rate': 2.209837626245053e-05, 'batch_size': 32, 'weight_decay': 0.14777327013929262, 'warmup_steps': 7, 'num_train_epochs': 5}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2081,0.234443,0.715026,0.797688,0.754098,0.741304,0.841793,0.762383
2,0.1664,0.170358,0.782383,0.904192,0.838889,0.787719,0.904784,0.836482
3,0.0975,0.138411,0.792746,0.889535,0.838356,0.79851,0.904748,0.846715


[I 2025-06-13 11:59:51,034] Trial 8 finished with value: 0.8467152609277674 and parameters: {'learning_rate': 0.00013782664087898998, 'batch_size': 16, 'weight_decay': 0.1941214096850256, 'warmup_steps': 193, 'num_train_epochs': 3}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3827,0.25318,0.642487,0.760736,0.696629,0.612666,0.874899,0.640538
2,0.2153,0.203717,0.756477,0.884848,0.815642,0.76689,0.892951,0.823772
3,0.1903,0.222892,0.709845,0.872611,0.782857,0.716201,0.877752,0.780549
4,0.1573,0.206456,0.746114,0.837209,0.789041,0.759987,0.853685,0.799238
5,0.1267,0.214721,0.715026,0.890323,0.793103,0.721982,0.904952,0.798363
6,0.1,0.195265,0.782383,0.867816,0.822888,0.796418,0.878634,0.832019


[I 2025-06-13 12:00:53,281] Trial 9 finished with value: 0.8320187454652755 and parameters: {'learning_rate': 0.00034006706559355586, 'batch_size': 32, 'weight_decay': 0.14669860864574422, 'warmup_steps': 58, 'num_train_epochs': 6}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.3301,0.38408,0.419689,0.435484,0.427441,0.463585,0.459713,0.40046
2,0.4419,0.441987,0.0,0.0,0.0,0.0,0.0,0.0
3,0.4422,0.438143,0.0,0.0,0.0,0.0,0.0,0.0
4,0.4388,0.440072,0.0,0.0,0.0,0.0,0.0,0.0
5,0.4322,0.438093,0.0,0.0,0.0,0.0,0.0,0.0
6,0.4323,0.439386,0.0,0.0,0.0,0.0,0.0,0.0
7,0.4151,0.438462,0.0,0.0,0.0,0.0,0.0,0.0


[I 2025-06-13 12:03:25,196] Trial 10 finished with value: 0.0 and parameters: {'learning_rate': 0.00043680912171499434, 'batch_size': 8, 'weight_decay': 0.2753962266062293, 'warmup_steps': 475, 'num_train_epochs': 7}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.296,0.238006,0.689119,0.869281,0.768786,0.684795,0.916418,0.769453
2,0.1621,0.159129,0.792746,0.9,0.842975,0.808207,0.906068,0.847788
3,0.1041,0.141109,0.80829,0.923077,0.861878,0.816861,0.925908,0.864795
4,0.0765,0.130507,0.803109,0.922619,0.858726,0.81045,0.925908,0.860418


[I 2025-06-13 12:04:18,168] Trial 11 finished with value: 0.8604176102300634 and parameters: {'learning_rate': 2.0702813679546856e-05, 'batch_size': 16, 'weight_decay': 0.29737015462791966, 'warmup_steps': 103, 'num_train_epochs': 4}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2239,0.190267,0.740933,0.928571,0.824207,0.740288,0.932345,0.821189
2,0.1201,0.150947,0.797927,0.900585,0.846154,0.814716,0.909072,0.85419
3,0.0723,0.131573,0.834197,0.899441,0.865591,0.845419,0.904121,0.870215
4,0.0464,0.131189,0.854922,0.887097,0.870712,0.866634,0.890678,0.875759


[I 2025-06-13 12:05:46,642] Trial 12 finished with value: 0.8757585317833766 and parameters: {'learning_rate': 3.188146379706791e-05, 'batch_size': 8, 'weight_decay': 0.23320414929841077, 'warmup_steps': 327, 'num_train_epochs': 4}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.5642,0.452132,0.0,0.0,0.0,0.0,0.0,0.0
2,0.3658,0.305073,0.435233,0.857143,0.57732,0.368359,0.590835,0.437805
3,0.2085,0.203786,0.73057,0.921569,0.815029,0.73518,0.923573,0.812556
4,0.1413,0.163961,0.782383,0.932099,0.850704,0.785689,0.939067,0.852893
5,0.1157,0.157136,0.792746,0.905325,0.845304,0.800494,0.906985,0.848332


[I 2025-06-13 12:06:51,142] Trial 13 finished with value: 0.8483322447494409 and parameters: {'learning_rate': 1.028354169058226e-05, 'batch_size': 16, 'weight_decay': 0.2440217932081633, 'warmup_steps': 462, 'num_train_epochs': 5}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2281,0.185208,0.787565,0.826087,0.806366,0.79605,0.854824,0.820914
2,0.138,0.133236,0.818653,0.923977,0.868132,0.831037,0.92745,0.875749
3,0.0807,0.125325,0.839378,0.9,0.868633,0.848184,0.900599,0.872589


[I 2025-06-13 12:07:30,628] Trial 14 finished with value: 0.8725894972406599 and parameters: {'learning_rate': 5.9512365711679685e-05, 'batch_size': 16, 'weight_decay': 0.198743096461684, 'warmup_steps': 132, 'num_train_epochs': 3}. Best is trial 6 with value: 0.878262662573913.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4124,0.344172,0.238342,0.938776,0.380165,0.197774,0.45846,0.256595
2,0.2173,0.194067,0.756477,0.843931,0.797814,0.759142,0.863711,0.797943
3,0.1243,0.134984,0.813472,0.94012,0.872222,0.822221,0.941077,0.874882
4,0.0788,0.127742,0.854922,0.921788,0.887097,0.866059,0.92858,0.89462
5,0.0569,0.128004,0.84456,0.920904,0.881081,0.86263,0.925069,0.889298
6,0.0498,0.118489,0.854922,0.921788,0.887097,0.867415,0.927152,0.894607


[I 2025-06-13 12:08:47,129] Trial 15 finished with value: 0.8946073182742188 and parameters: {'learning_rate': 1.913996384380825e-05, 'batch_size': 16, 'weight_decay': 0.2573376860542623, 'warmup_steps': 286, 'num_train_epochs': 6}. Best is trial 15 with value: 0.8946073182742188.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2646,0.229851,0.725389,0.838323,0.777778,0.727548,0.839071,0.775631
2,0.137,0.143073,0.818653,0.913295,0.863388,0.836918,0.91332,0.869943
3,0.0922,0.123146,0.839378,0.910112,0.873315,0.850321,0.913313,0.878924
4,0.0591,0.127748,0.839378,0.910112,0.873315,0.848184,0.91068,0.877297
5,0.0354,0.126904,0.849741,0.916201,0.88172,0.858868,0.922723,0.887313
6,0.0318,0.125889,0.84456,0.900552,0.871658,0.852458,0.901297,0.87405


[I 2025-06-13 12:10:59,146] Trial 16 finished with value: 0.8740500301156038 and parameters: {'learning_rate': 1.791602412982399e-05, 'batch_size': 8, 'weight_decay': 0.18770390070768944, 'warmup_steps': 266, 'num_train_epochs': 6}. Best is trial 15 with value: 0.8946073182742188.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.2174,0.209829,0.73057,0.792135,0.760108,0.740773,0.845204,0.772781
2,0.2299,0.270709,0.673575,0.730337,0.700809,0.667154,0.613048,0.635242
3,0.2049,0.212455,0.704663,0.912752,0.795322,0.72289,0.925346,0.805152
4,0.1723,0.22518,0.709845,0.825301,0.763231,0.720465,0.875876,0.776808
5,0.1394,0.194053,0.772021,0.856322,0.811989,0.79624,0.858184,0.821686
6,0.1183,0.189034,0.782383,0.89881,0.836565,0.799857,0.912537,0.848182
7,0.0915,0.174372,0.797927,0.88,0.836957,0.810496,0.895399,0.845772


[I 2025-06-13 12:12:28,824] Trial 17 finished with value: 0.8457717051467052 and parameters: {'learning_rate': 0.0002091026818573618, 'batch_size': 16, 'weight_decay': 0.24447385408638483, 'warmup_steps': 259, 'num_train_epochs': 7}. Best is trial 15 with value: 0.8946073182742188.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4667,0.410509,0.0,0.0,0.0,0.0,0.0,0.0
2,0.2753,0.236483,0.663212,0.870748,0.752941,0.656677,0.892372,0.724262
3,0.155,0.1564,0.813472,0.928994,0.867403,0.82161,0.934255,0.869977
4,0.0964,0.142663,0.80829,0.917647,0.859504,0.812111,0.931788,0.86409
5,0.0727,0.142179,0.823834,0.898305,0.859459,0.834224,0.904339,0.862655
6,0.0613,0.131762,0.834197,0.889503,0.860963,0.845518,0.90216,0.869484


[I 2025-06-13 12:13:45,705] Trial 18 finished with value: 0.8694840809209832 and parameters: {'learning_rate': 1.5460476763315463e-05, 'batch_size': 16, 'weight_decay': 0.11877940018507384, 'warmup_steps': 408, 'num_train_epochs': 6}. Best is trial 15 with value: 0.8946073182742188.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.1738,0.15393,0.782383,0.909639,0.841226,0.795996,0.911195,0.845193
2,0.1062,0.147999,0.797927,0.885057,0.839237,0.811277,0.88697,0.840757
3,0.0771,0.139129,0.839378,0.880435,0.859416,0.853302,0.879569,0.863031
4,0.0403,0.116752,0.865285,0.912568,0.888298,0.869354,0.913639,0.889317
5,0.0222,0.140474,0.823834,0.893258,0.857143,0.831037,0.895888,0.860577
6,0.0151,0.136804,0.854922,0.911602,0.882353,0.865853,0.91632,0.889682
7,0.0063,0.139276,0.854922,0.896739,0.875332,0.864498,0.900224,0.880579


[I 2025-06-13 12:16:18,813] Trial 19 finished with value: 0.8805786587235577 and parameters: {'learning_rate': 4.4713771999601775e-05, 'batch_size': 8, 'weight_decay': 0.22245469980190463, 'warmup_steps': 83, 'num_train_epochs': 7}. Best is trial 15 with value: 0.8946073182742188.



Best hyperparameters:
{'learning_rate': 1.913996384380825e-05, 'batch_size': 16, 'weight_decay': 0.2573376860542623, 'warmup_steps': 286, 'num_train_epochs': 6}
Best macro F1: 0.8946


In [14]:
best_params = study.best_params
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_cols),
    problem_type="multi_label_classification"
)

training_args = TrainingArguments(
    output_dir='./best_model',
    num_train_epochs=best_params["num_train_epochs"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    warmup_steps=best_params["warmup_steps"],
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir='./logs/best_model',
    seed=seed,
)

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Micro Recall,Micro Precision,Micro F1,Macro Recall,Macro Precision,Macro F1
1,0.4117,0.343846,0.264249,1.0,0.418033,0.215836,0.5,0.298086
2,0.2136,0.185761,0.766839,0.89697,0.826816,0.78051,0.906276,0.836338
3,0.1216,0.139141,0.803109,0.933735,0.86351,0.813314,0.934391,0.866683
4,0.0778,0.127788,0.849741,0.896175,0.87234,0.856785,0.915462,0.882698
5,0.0562,0.137023,0.834197,0.894444,0.863271,0.849181,0.903866,0.870918
6,0.0475,0.127227,0.849741,0.916201,0.88172,0.864039,0.927268,0.891356


TrainOutput(global_step=1008, training_loss=0.1778020536497472, metrics={'train_runtime': 74.6981, 'train_samples_per_second': 215.186, 'train_steps_per_second': 13.494, 'total_flos': 307607912530812.0, 'train_loss': 0.1778020536497472, 'epoch': 6.0})

In [21]:
def find_best_threshold(model, val_dataset, thresholds=np.arange(0.1, 0.9, 0.05)):
    model.eval()
    predictions = trainer.predict(val_dataset)
    probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
    true_labels = predictions.label_ids
    best_threshold = 0.5
    best_f1 = 0
    print("Threshold optimization:")
    for threshold in thresholds:
        y_pred = probs > threshold
        f1_macro = f1_score(true_labels, y_pred, average='macro', zero_division=0)
        print(f"Threshold {threshold:.2f}: Macro F1 = {f1_macro:.4f}")
        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_threshold = threshold
    print(f"\nBest threshold: {best_threshold:.2f} (Macro F1: {best_f1:.4f})")
    return best_threshold

In [22]:
best_threshold = find_best_threshold(model, val_tokenized)

Threshold optimization:
Threshold 0.10: Macro F1 = 0.8174
Threshold 0.15: Macro F1 = 0.8610
Threshold 0.20: Macro F1 = 0.8696
Threshold 0.25: Macro F1 = 0.8751
Threshold 0.30: Macro F1 = 0.8695
Threshold 0.35: Macro F1 = 0.8812
Threshold 0.40: Macro F1 = 0.8860
Threshold 0.45: Macro F1 = 0.8878
Threshold 0.50: Macro F1 = 0.8914
Threshold 0.55: Macro F1 = 0.8886
Threshold 0.60: Macro F1 = 0.8834
Threshold 0.65: Macro F1 = 0.8677
Threshold 0.70: Macro F1 = 0.8561
Threshold 0.75: Macro F1 = 0.8505
Threshold 0.80: Macro F1 = 0.8468
Threshold 0.85: Macro F1 = 0.8245

Best threshold: 0.50 (Macro F1: 0.8914)


In [23]:
test_tokenized = test.map(tokenize_function, batched=True)
test_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

test_predictions = trainer.predict(test_tokenized)
test_probs = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()
test_pred_labels = test_probs > best_threshold
true_test_labels = test_predictions.label_ids

In [24]:
for average in ['micro', 'macro']:
    recall = recall_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    precision = precision_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    f1 = f1_score(true_test_labels, test_pred_labels, average=average, zero_division=0)
    print(f'{average.upper()} recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}')

print(f"\nPer-class Results:")
class_recall = recall_score(true_test_labels, test_pred_labels, average=None, zero_division=0)
class_precision = precision_score(true_test_labels, test_pred_labels, average=None, zero_division=0)
class_f1 = f1_score(true_test_labels, test_pred_labels, average=None, zero_division=0)

for i, emotion in enumerate(emotion_cols):
    print(f'{emotion.upper()}: recall: {round(class_recall[i], 4)}, precision: {round(class_precision[i], 4)}, f1: {round(class_f1[i], 4)}')

print(f"\nClass distribution in test set:")
for i, emotion in enumerate(emotion_cols):
    true_count = int(true_test_labels[:, i].sum())
    pred_count = int(test_pred_labels[:, i].sum())
    total = len(true_test_labels)
    print(f'{emotion.upper()}: true: {true_count}/{total} ({true_count/total:.1%}), predicted: {pred_count}/{total} ({pred_count/total:.1%})')

MICRO recall: 0.851, precision: 0.8993, f1: 0.8745
MACRO recall: 0.8502, precision: 0.8983, f1: 0.8734

Per-class Results:
ANGER: recall: 0.8496, precision: 0.8972, f1: 0.8727
FEAR: recall: 0.8889, precision: 0.9697, f1: 0.9275
JOY: recall: 0.886, precision: 0.9607, f1: 0.9218
DISGUST: recall: 0.8279, precision: 0.8783, f1: 0.8523
SADNESS: recall: 0.7872, precision: 0.8222, f1: 0.8043
SURPRISE: recall: 0.8618, precision: 0.8618, f1: 0.8618

Class distribution in test set:
ANGER: true: 452/2000 (22.6%), predicted: 428/2000 (21.4%)
FEAR: true: 216/2000 (10.8%), predicted: 198/2000 (9.9%)
JOY: true: 386/2000 (19.3%), predicted: 356/2000 (17.8%)
DISGUST: true: 244/2000 (12.2%), predicted: 230/2000 (11.5%)
SADNESS: true: 282/2000 (14.1%), predicted: 270/2000 (13.5%)
SURPRISE: true: 246/2000 (12.3%), predicted: 246/2000 (12.3%)
