In [None]:
import os

import numpy as np
import pandas as pd
import torch
import wandb
from datasets import load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification, DataCollatorWithPadding

In [None]:
# Hyperparameters plus dataset/model identifiers for this run.
# The very same dict object is nested under full_config["config"], so keys
# added to `config` later are also visible through `full_config`.
config = {
    "dataset": "cedr",
    "num_labels": 6,
    # Class id -> human-readable name; id 0 is the "no emotion" class.
    "labels": dict(
        enumerate(["no emotion", "joy", "sadness", "surprise", "fear", "anger"])
    ),
    "model": "DeepPavlov/rubert-base-cased",
    "tokenizer": "DeepPavlov/rubert-base-cased",
    "problem_type": "multi_label_classification",
    "batch_size": 64,
    "epochs": 60,
    "lr": 1e-5,
}

# Top-level run description (project / config / name).
# NOTE(review): the name says "rubert-tiny2" while "model"/"tokenizer" point at
# DeepPavlov/rubert-base-cased — confirm which one is intended, because this
# name is also what gets passed to push_to_hub() further down.
full_config = {
    "project": "VKR",
    "config": config,
    "name": "rubert-tiny2-cedr",
}

In [None]:
def binarize_labels(labels, num_labels):
    """Turn a collection of class ids into a multi-hot list with a leading "empty" flag.

    Args:
        labels: sized container of class ids present for one example.
        num_labels: number of class ids to encode (ids ``0 .. num_labels - 1``).

    Returns:
        A list of ``num_labels + 1`` ints. Element 0 is 1 only when ``labels``
        is empty; element ``k + 1`` is 1 when id ``k`` occurs in ``labels``.
    """
    multi_hot = [1 if len(labels) == 0 else 0]
    for class_id in range(num_labels):
        multi_hot.append(1 if class_id in labels else 0)
    return multi_hot


def label2id(class_labels):
    """Build forward and reverse lookups between label names and positions.

    Args:
        class_labels: iterable of label names, in class-id order.

    Returns:
        A ``(name_to_id, id_to_name)`` tuple of dicts. The reverse mapping is
        derived from the forward one, so a repeated name keeps only its last
        position in both dicts.
    """
    name_to_id = {}
    for position, name in enumerate(class_labels):
        name_to_id[name] = position
    id_to_name = {position: name for name, position in name_to_id.items()}
    return name_to_id, id_to_name

In [None]:
def calculate_aucs(y_true, y_pred, num_labels):
    """Compute one ROC AUC value per label column.

    Args:
        y_true: 2-D array-like indexed as ``[:, column]`` holding the targets.
        y_pred: 2-D array-like indexed as ``[:, column]`` holding the scores.
        num_labels: number of leading columns to evaluate.

    Returns:
        A list with ``num_labels`` AUC values, in column order.
    """
    per_label = []
    for column in range(num_labels):
        per_label.append(roc_auc_score(y_true[:, column], y_pred[:, column]))
    return per_label


def calculate_f1_score(y_true, y_pred, average, num_labels):
    """Compute one F1 score per label column after thresholding the scores at 0.5.

    Args:
        y_true: 2-D array-like indexed as ``[:, column]`` holding the targets.
        y_pred: 2-D array-like of scores; values strictly above 0.5 count as positive.
        average: forwarded unchanged to ``f1_score`` as its ``average`` argument.
        num_labels: number of leading columns to evaluate.

    Returns:
        A list with ``num_labels`` F1 values, in column order.
    """
    per_label = []
    for column in range(num_labels):
        hard_prediction = y_pred[:, column] > 0.5
        per_label.append(f1_score(y_true[:, column], hard_prediction, average=average))
    return per_label


def calculate_metrics(y_true, y_pred, num_labels):
    """Summarise per-label AUC ROC and F1 (micro / macro) scores in one table.

    Args:
        y_true: 2-D array of targets, one column per label.
        y_pred: 2-D array of predicted scores, one column per label.
        num_labels: number of label columns to score; expected to equal the
            number of entries in ``config["labels"]``, which supplies the
            column names.

    Returns:
        A DataFrame with rows "AUC ROC", "F1 micro", "F1 macro", one column per
        label name, plus:
          * "mean" — average over all label columns;
          * "mean(emotions)" — average over all label columns except "no emotion".
    """
    auc_rocs = calculate_aucs(y_true, y_pred, num_labels)
    f1_scores_micro = calculate_f1_score(y_true, y_pred, "micro", num_labels)
    f1_scores_macro = calculate_f1_score(y_true, y_pred, "macro", num_labels)

    df = pd.DataFrame(
        [auc_rocs, f1_scores_micro, f1_scores_macro],
        columns=list(config["labels"].values()),
        index=["AUC ROC", "F1 micro", "F1 macro"],
    )

    # Compute both aggregates BEFORE attaching either of them to the frame.
    # Previously "mean" was added first and then leaked into the
    # "mean(emotions)" average, because dropping "no emotion" still left the
    # freshly added "mean" column in place.
    overall_mean = df.mean(axis=1)
    emotions_mean = df.drop("no emotion", axis=1).mean(axis=1)

    df["mean"] = overall_mean
    df["mean(emotions)"] = emotions_mean
    return df

In [None]:
def predict(model, dataloader):
    """Run the model over every batch and collect targets and per-label probabilities.

    The notebook trains with ``problem_type="multi_label_classification"``, so
    each logit is an independent binary decision. Probabilities are therefore
    obtained with an element-wise sigmoid; a softmax across labels would force
    the labels to compete and would never let two labels both exceed 0.5.

    This function does not switch the model to eval mode; the caller is
    responsible for calling ``model.eval()`` beforehand.

    Args:
        model: callable as ``model(**batch)`` returning an object with a
            ``.logits`` tensor; must expose ``.device``.
        dataloader: iterable of batches that support ``.to(device)``,
            ``**`` unpacking and a ``.labels`` attribute.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays, concatenated over all batches
        along the first axis.

    Raises:
        RuntimeError: from ``torch.cat`` if the dataloader yields no batches.
    """
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(model.device)
            output = model(**batch)
            # Move results off the accelerator right away so that device
            # memory is not held for the whole evaluation pass.
            y_true.append(batch.labels.cpu())
            y_pred.append(torch.sigmoid(output.logits).cpu())
    return torch.cat(y_true).numpy(), torch.cat(y_pred).numpy()

In [None]:
def train(model, train_dataloader, optimizer, epochs, test_dataloader):
    """Fine-tune the model and report the mean per-label ROC AUC after every epoch.

    For each epoch the model takes one optimizer step per training batch, then
    is switched to eval mode and scored on both the training and the test
    loader. The progress-bar text and the wandb record use the loss of the
    last mini-batch of the epoch.

    Args:
        model: model callable as ``model(**batch)`` whose output exposes ``.loss``.
        train_dataloader: iterable of training batches supporting ``.to(device)``.
        optimizer: optimizer already bound to the model parameters.
        epochs: number of passes over ``train_dataloader``.
        test_dataloader: iterable of held-out batches used for evaluation.
    """

    def mean_auc(loader):
        # Average of the per-label ROC AUC values over all configured labels.
        targets, scores = predict(model, loader)
        return np.mean(calculate_aucs(targets, scores, config["num_labels"]))

    progress = tqdm(range(epochs))

    for _ in progress:
        model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            outputs = model(**batch.to(model.device))
            batch_loss = outputs.loss
            batch_loss.backward()
            optimizer.step()

        model.eval()
        train_auc = mean_auc(train_dataloader)
        test_auc = mean_auc(test_dataloader)

        last_loss = batch_loss.item()
        progress.set_description(f"loss: {last_loss:4.4f}, AUC: {test_auc:4.4f}")
        wandb.log({"train_auc": train_auc, "test_auc": test_auc, "train_loss": last_loss})

In [None]:
# Load the dataset named by config["dataset"]; later cells index the result
# by the "train" and "test" splits.
dataset = load_dataset(config["dataset"])

No config specified, defaulting to: cedr/main
Found cached dataset cedr (/home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenizer is loaded from the checkpoint id stored in config["tokenizer"].
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"])

In [None]:
# Name <-> id lookups derived from the configured label names; stored in
# `config` so they can be handed to the model constructor.
config["label2id"], config["id2label"] = label2id(config["labels"].values())


def _tokenize_batch(batch):
    """Tokenize a batch of raw texts, truncating over-long sequences."""
    return tokenizer(batch["text"], truncation=True)


def _encode_target(example):
    """Convert one example's label ids into a float multi-hot "label" vector."""
    # One slot is reserved for "no emotion", hence num_labels - 1 emotion ids.
    multi_hot = binarize_labels(example["labels"], config["num_labels"] - 1)
    return {"label": [float(flag) for flag in multi_hot]}


tokenized_dataset = dataset.map(_tokenize_batch, batched=True)
processed_dataset = tokenized_dataset.map(
    _encode_target,
    batched=False,
    remove_columns=["text", "labels", "source"],
)

Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-13dea0cb312c0820.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-988743ea8596b197.arrow


  0%|          | 0/7528 [00:00<?, ?ex/s]

  0%|          | 0/1882 [00:00<?, ?ex/s]

In [None]:
# Keyword arguments for the classification head are taken verbatim from
# `config`, under the same key names the constructor expects.
model_kwargs = {
    key: config[key]
    for key in ("num_labels", "problem_type", "label2id", "id2label")
}
model = BertForSequenceClassification.from_pretrained(config["model"], **model_kwargs)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
# Collates examples into batches, padding them with the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)


def _make_dataloader(split, shuffle):
    """Build a DataLoader over one split of `processed_dataset` with the shared settings.

    Args:
        split: split name used to index `processed_dataset` (e.g. "train").
        shuffle: whether the loader reshuffles the examples on every pass.
    """
    return DataLoader(
        processed_dataset[split],
        batch_size=config["batch_size"],
        drop_last=False,
        shuffle=shuffle,
        num_workers=0,
        collate_fn=data_collator,
    )


train_dataloader = _make_dataloader("train", shuffle=True)
# Evaluation metrics do not depend on example order, so the test loader keeps
# the dataset order; this makes prediction order deterministic across runs.
test_dataloader = _make_dataloader("test", shuffle=False)

In [None]:
# Adam over all model parameters with the learning rate from config["lr"].
optimizer = torch.optim.Adam(params=model.parameters(), lr=config["lr"])

In [None]:
# Tell wandb which notebook file this run belongs to.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook from another location.
os.environ["WANDB_NOTEBOOK_NAME"] = "/home/seara/Desktop/Github/vkr/new_era/rubert-tiny2-cedr.ipynb"
# Log in, then start a run; full_config is unpacked into the "project",
# "config" and "name" keyword arguments of wandb.init().
wandb.login()
wandb.init(**full_config)

[34m[1mwandb[0m: Currently logged in as: [33mseara[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Move the model to the CUDA device, then train for config["epochs"] epochs;
# test_dataloader is used by train() for the per-epoch evaluation.
model.cuda()
train(model, train_dataloader, optimizer, config["epochs"], test_dataloader)

  0%|          | 0/60 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 5.78 GiB total capacity; 3.35 GiB already allocated; 335.81 MiB free; 4.51 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Per-label AUC ROC / F1 table for the test loader, rounded to 4 decimals.
calculate_metrics(*predict(model, test_dataloader), config["num_labels"]).round(4)

Unnamed: 0,no emotion,joy,sadness,surprise,fear,anger,mean,mean(emotions)
AUC ROC,0.9261,0.9501,0.9593,0.8951,0.891,0.7711,0.8988,0.8942
F1 micro,0.8587,0.9325,0.9277,0.9421,0.9458,0.9118,0.9198,0.93
F1 macro,0.851,0.8854,0.889,0.8252,0.7862,0.6733,0.8183,0.8129


In [None]:
# Uncomment to log in to the Hugging Face Hub from the notebook before pushing:
# notebook_login()

In [None]:
# Upload the model and the tokenizer to the Hugging Face Hub, both under the
# repository name stored in full_config["name"].
model.push_to_hub(full_config["name"])
tokenizer.push_to_hub(full_config["name"])

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/117M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/seara/rubert-tiny2-cedr/commit/49d08f3e9f0d5ec37bf4207a19c25f7e858693be', commit_message='Upload tokenizer', commit_description='', oid='49d08f3e9f0d5ec37bf4207a19c25f7e858693be', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Close the wandb run that was started by wandb.init().
wandb.finish()

0,1
test_auc,▁▁▃▅▆▆▇▇▇▇▇▇▇█▇████▇█▇███▇███▇█▇▇█▇███▇█
train_auc,▁▁▃▄▅▅▆▆▇▇▇▇▇███████████████████████████
train_loss,█▇▅▆▅▄▄▄▄▄▃▃▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test_auc,0.89878
train_auc,0.99884
train_loss,0.00968
