In [None]:
import os

import numpy as np
import pandas as pd
import torch
import wandb
from datasets import concatenate_datasets, load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding

In [None]:
# Emotion names in class-index order: an entry's position is its label id.
EMOTION_NAMES = [
    "admiration",
    "amusement",
    "anger",
    "annoyance",
    "approval",
    "caring",
    "confusion",
    "curiosity",
    "desire",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "grief",
    "joy",
    "love",
    "nervousness",
    "optimism",
    "pride",
    "realization",
    "relief",
    "remorse",
    "sadness",
    "surprise",
    "neutral",
]

# Top-level keys (project / config / name) are later unpacked straight into
# wandb.init(); the nested "config" holds every tunable of the experiment.
full_config = dict(
    project="VKR",
    config=dict(
        dataset="seara/ru_go_emotions",
        num_labels=len(EMOTION_NAMES),
        labels=dict(enumerate(EMOTION_NAMES)),
        model="cointegrated/rubert-tiny2",
        tokenizer="cointegrated/rubert-tiny2",
        problem_type="multi_label_classification",
        batch_size=64,
        epochs=50,
        lr=1e-5,
    ),
    name="rubert-tiny2-ru-go-emotions",
)
# Shorthand for the experiment settings used throughout the notebook.
config = full_config["config"]

In [None]:
def binarize_labels(labels, num_labels):
    """Turn a collection of class indices into a 0/1 indicator list.

    Args:
        labels: container of class indices that are active for one example.
        num_labels: length of the indicator list to produce.

    Returns:
        A list of ``num_labels`` ints where position ``i`` is 1 if ``i`` is in
        ``labels`` and 0 otherwise. Indices outside ``range(num_labels)`` are
        simply never matched.
    """
    indicator = []
    for class_index in range(num_labels):
        indicator.append(1 if class_index in labels else 0)
    return indicator


def label2id(class_labels):
    """Build the two lookup tables between label names and integer ids.

    Args:
        class_labels: iterable of label names; a name's position is its id.

    Returns:
        A ``(name_to_id, id_to_name)`` tuple of dicts. The reverse table is
        derived from the forward one, so a repeated name keeps only its last
        position in both tables.
    """
    name_to_id = {}
    for position, name in enumerate(class_labels):
        name_to_id[name] = position

    id_to_name = {}
    for name, position in name_to_id.items():
        id_to_name[position] = name

    return name_to_id, id_to_name

In [None]:
def calculate_aucs(y_true, y_pred, num_labels):
    """Compute the ROC AUC of every label column separately.

    Args:
        y_true: 2-D array of ground-truth indicators, indexed ``[sample, label]``.
        y_pred: 2-D array of predicted scores with the same layout.
        num_labels: number of leading label columns to score.

    Returns:
        A list with one ``roc_auc_score`` value per label column.
    """
    per_label_auc = []
    for column in range(num_labels):
        per_label_auc.append(roc_auc_score(y_true[:, column], y_pred[:, column]))
    return per_label_auc


def calculate_f1_score(y_true, y_pred, average, num_labels):
    """Compute one F1 score per label column.

    Column ``i`` of ``y_pred`` is binarized with a fixed ``> 0.5`` cut-off and
    compared against column ``i`` of ``y_true``.

    NOTE(review): every call scores a single binary column, so ``average`` is
    applied across that column's two classes rather than across emotions —
    confirm this is the intended meaning of "micro" / "macro" here.

    Args:
        y_true: 2-D array of ground-truth indicators, indexed ``[sample, label]``.
        y_pred: 2-D array of predicted scores with the same layout.
        average: averaging mode forwarded to ``f1_score``.
        num_labels: number of leading label columns to score.

    Returns:
        A list with one F1 value per label column.
    """
    scores = []
    for column in range(num_labels):
        hard_predictions = y_pred[:, column] > 0.5
        scores.append(f1_score(y_true[:, column], hard_predictions, average=average))
    return scores


def calculate_metrics(y_true, y_pred, num_labels):
    """Assemble a per-label metrics table.

    Args:
        y_true: 2-D array of ground-truth indicators, indexed ``[sample, label]``.
        y_pred: 2-D array of predicted scores with the same layout.
        num_labels: number of label columns to score.

    Returns:
        A DataFrame with rows "AUC ROC", "F1 micro" and "F1 macro", one column
        per label name taken from the module-level ``config["labels"]``, plus
        a trailing "mean" column holding each row's average over the labels.
    """
    metric_rows = {
        "AUC ROC": calculate_aucs(y_true, y_pred, num_labels),
        "F1 micro": calculate_f1_score(y_true, y_pred, "micro", num_labels),
        "F1 macro": calculate_f1_score(y_true, y_pred, "macro", num_labels),
    }

    report = pd.DataFrame(
        list(metric_rows.values()),
        index=list(metric_rows.keys()),
        columns=list(config["labels"].values()),
    )
    # Row-wise average, computed before the "mean" column itself exists.
    report["mean"] = report.mean(axis=1)
    return report

In [None]:
def predict(model, dataloader):
    """Run the model over a dataloader and collect labels and probabilities.

    The classifier is trained as a multi-label model (each emotion is an
    independent yes/no decision), so logits are mapped to probabilities with
    an element-wise sigmoid. A softmax would force the label scores of one
    example to sum to 1, which distorts per-label ranking and makes the fixed
    0.5 decision threshold used downstream almost unreachable.

    Args:
        model: callable accepting the batch as keyword arguments and returning
            an object with a ``logits`` tensor; must expose ``device``.
        dataloader: iterable of batches that support ``.to(device)``, keyword
            unpacking, and a ``labels`` attribute.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays: the concatenated batch labels and
        the matching per-label probabilities, in iteration order.
    """
    collected_labels = []
    collected_probs = []
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(model.device)
            output = model(**batch)
            collected_labels.append(batch.labels)
            # Independent per-label probability (multi-label), not a
            # distribution across labels.
            collected_probs.append(torch.sigmoid(output.logits))
    return torch.cat(collected_labels).cpu().numpy(), torch.cat(collected_probs).cpu().numpy()

In [None]:
def train(model, train_dataloader, optimizer, epochs, test_dataloader):
    """Fine-tune the model and log per-epoch metrics to wandb.

    Each epoch performs one optimisation pass over ``train_dataloader``, then
    switches the model to eval mode and computes the mean per-label ROC AUC on
    both the training and the test loader.

    The logged ``train_loss`` is the mean loss over all batches of the epoch
    rather than the loss of whichever batch happened to come last, so the
    curve reflects the whole epoch instead of single-batch noise.

    Args:
        model: model whose forward pass returns an object with ``loss``;
            must expose ``device``.
        train_dataloader: loader yielding training batches (with labels).
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over ``train_dataloader``.
        test_dataloader: loader used for the held-out AUC.

    Side effects:
        Updates the model weights in place, leaves the model in eval mode,
        updates the tqdm description and calls ``wandb.log`` once per epoch.
        Reads ``config["num_labels"]`` from module scope.
    """
    progress = tqdm(range(epochs))

    for _ in progress:
        model.train()
        running_loss = 0.0
        num_batches = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            batch = batch.to(model.device)
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # max() only guards the division; an empty loader still fails in
        # predict() below, exactly as it did before.
        epoch_loss = running_loss / max(num_batches, 1)

        model.eval()

        y_true, y_pred = predict(model, train_dataloader)
        train_auc = np.mean(calculate_aucs(y_true, y_pred, config["num_labels"]))

        y_true, y_pred = predict(model, test_dataloader)
        test_auc = np.mean(calculate_aucs(y_true, y_pred, config["num_labels"]))

        progress.set_description(f"loss: {epoch_loss:4.4f}, AUC: {test_auc:4.4f}")
        wandb.log({"train_auc": train_auc, "test_auc": test_auc, "train_loss": epoch_loss})

In [None]:
# Load the dataset named in the experiment config; the result is indexed by
# split name further down ("train", "validation", "test").
dataset = load_dataset(config["dataset"])

Using custom data configuration seara--ru-go-emotions-010f1c10233a04e9
Found cached dataset parquet (/home/seara/.cache/huggingface/datasets/seara___parquet/seara--ru-go-emotions-010f1c10233a04e9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Tokenizer checkpoint is configured separately from the model checkpoint,
# although both currently point at the same name in the config.
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"])

In [None]:
# Derive name->id and id->name maps from the ordered label names and keep them
# in the config so they can be handed to the model constructor.
config["label2id"], config["id2label"] = label2id(config["labels"].values())
# Two passes over every split:
#   1. batched tokenization of the "text" column (truncating long inputs);
#   2. per-example conversion of the list of label ids into a float multi-hot
#      vector stored under "label", after which the raw columns are dropped.
# NOTE(review): the "ru_text" column is removed without ever being tokenized,
# so the model sees only "text" — confirm that is the intended input column
# for a run named "rubert-tiny2-ru-go-emotions".
processed_dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True), batched=True).map(
    lambda x: {"label": [float(y) for y in binarize_labels(x["labels"], config["num_labels"])]},
    batched=False,
    remove_columns=["text", "labels", "id", "ru_text"],
)

Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/seara___parquet/seara--ru-go-emotions-010f1c10233a04e9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-6a398f7cc851dfd3.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/seara___parquet/seara--ru-go-emotions-010f1c10233a04e9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f9ddef66c5d9538a.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/seara___parquet/seara--ru-go-emotions-010f1c10233a04e9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-cfc1b6bfe6f1a825.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/seara___parquet/seara--ru-go-emotions-010f1c10233a04e9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ec812005e89d8824.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/

In [None]:
# Instantiate the sequence classifier from the pretrained checkpoint, passing
# the number of labels, the problem type ("multi_label_classification" in the
# config) and both label maps built earlier.
model = BertForSequenceClassification.from_pretrained(
    config["model"],
    num_labels=config["num_labels"],
    problem_type=config["problem_type"],
    label2id=config["label2id"],
    id2label=config["id2label"],
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [None]:
# Collator that pads the tokenized examples when a batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

# The training loader merges the train and validation splits and is
# reshuffled every epoch.
train_dataloader = DataLoader(
    concatenate_datasets([processed_dataset["train"], processed_dataset["validation"]]),
    batch_size=config["batch_size"],
    drop_last=False,
    shuffle=True,
    num_workers=0,
    collate_fn=data_collator,
)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# predictions aligned with the dataset rows and makes runs repeatable.
test_dataloader = DataLoader(
    processed_dataset["test"],
    batch_size=config["batch_size"],
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

In [None]:
# Plain Adam over all model parameters with the learning rate from the config.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config["lr"])

In [None]:
# Point wandb at this notebook's file: assumed to live in the current working
# directory and to be named after the run.
os.environ["WANDB_NOTEBOOK_NAME"] = f"{os.getcwd()}/{full_config['name']}.ipynb"
wandb.login()
# full_config's top-level keys (project, config, name) become the keyword
# arguments of wandb.init.
wandb.init(**full_config)

[34m[1mwandb[0m: Currently logged in as: [33mseara[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing; train() and predict() follow model.device for the batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train(model, train_dataloader, optimizer, config["epochs"], test_dataloader)

  0%|          | 0/50 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Score the trained model on the held-out test split and show the per-label
# metrics table rounded to four decimals.
test_true, test_scores = predict(model, test_dataloader)
test_report = calculate_metrics(test_true, test_scores, config["num_labels"])
test_report.round(4)

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,mean
AUC ROC,0.9102,0.9579,0.8627,0.7612,0.7321,0.7984,0.8912,0.8858,0.8444,0.7569,...,0.7929,0.8556,0.858,0.6598,0.9064,0.9909,0.8728,0.866,0.8012,0.8573
F1 micro,0.9355,0.9792,0.963,0.9396,0.9315,0.9751,0.9718,0.9488,0.9865,0.972,...,0.9958,0.972,0.9971,0.9746,0.998,0.9932,0.9764,0.9753,0.7481,0.9667
F1 macro,0.797,0.8848,0.685,0.5804,0.6324,0.6545,0.6403,0.6934,0.6846,0.5458,...,0.4989,0.7411,0.4993,0.5512,0.4995,0.8428,0.742,0.7191,0.7109,0.6821


In [None]:
# Uncomment to authenticate with the Hugging Face Hub before pushing the model below.
# notebook_login()

In [None]:
# Publish the fine-tuned weights and the tokenizer under the same repository
# name (the run name from full_config).
model.push_to_hub(full_config["name"])
tokenizer.push_to_hub(full_config["name"])

pytorch_model.bin:   0%|          | 0.00/117M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/seara/rubert-tiny2-ru-go-emotions/commit/b6625e16f22a96889e8ca513bc268a4ae4278cf4', commit_message='Upload tokenizer', commit_description='', oid='b6625e16f22a96889e8ca513bc268a4ae4278cf4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Close the wandb run started earlier in the notebook.
wandb.finish()

0,1
test_auc,▁▁▂▃▄▅▅▆▇▇▇▇████████████████████████████
train_auc,▁▁▂▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
train_loss,▇▅▆▆▅▆▅▅▃▂▄█▇▄▂▃▃▃▂▃▄▂▁▃▂▃▂▂▄▁▁▆▃▄▁▃▃▂▂▁

0,1
test_auc,0.85726
train_auc,0.96611
train_loss,0.01727
