In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from src.data.cedr import get_dataloaders
from src.model.models import get_model
from src.trainer.eval import eval
from transformers import pipeline

# Show up to 80 characters of each text cell when DataFrames are rendered.
pd.set_option("display.max_colwidth", 80)

# Label name as emitted by the pipeline (English) -> Russian display name.
labels_trans = dict(
    no_emotion="нет эмоции",
    joy="радость",
    sadness="грусть",
    surprise="удивление",
    fear="страх",
    anger="злость",
)

# Integer label id, as found in the dataset's "labels" field -> Russian display name.
labels_id = dict(enumerate(["радость", "грусть", "удивление", "страх", "злость"]))

In [None]:
# Load the CEDR dataset; no config name is passed, so the library picks its default.
dataset = load_dataset("cedr")
# Inference pipeline built from the "seara/rubert-tiny2-cedr" checkpoint.
# No task is given explicitly, so it is resolved from the model itself.
pipe = pipeline(model="seara/rubert-tiny2-cedr")

No config specified, defaulting to: cedr/main
Found cached dataset cedr (/home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Score every test text and keep the three highest-scoring labels per text.
answers = pipe(dataset["test"]["text"], top_k=3)
# Preview only the first few predictions; displaying the full list floods the
# notebook output with one entry per test text.
answers[:5]

[[{'label': 'no_emotion', 'score': 0.9811407327651978},
  {'label': 'joy', 'score': 0.013523037545382977},
  {'label': 'sadness', 'score': 0.011762895621359348}],
 [{'label': 'no_emotion', 'score': 0.9634974002838135},
  {'label': 'joy', 'score': 0.029892388731241226},
  {'label': 'anger', 'score': 0.013595214113593102}],
 [{'label': 'no_emotion', 'score': 0.9808269739151001},
  {'label': 'sadness', 'score': 0.01337971817702055},
  {'label': 'joy', 'score': 0.011798517778515816}],
 [{'label': 'anger', 'score': 0.568587601184845},
  {'label': 'surprise', 'score': 0.24865998327732086},
  {'label': 'fear', 'score': 0.11023784428834915}],
 [{'label': 'fear', 'score': 0.8533727526664734},
  {'label': 'joy', 'score': 0.20098423957824707},
  {'label': 'sadness', 'score': 0.06891262531280518}],
 [{'label': 'no_emotion', 'score': 0.8050156831741333},
  {'label': 'surprise', 'score': 0.04751094803214073},
  {'label': 'anger', 'score': 0.030041445046663284}],
 [{'label': 'sadness', 'score': 0.936

In [None]:
# Minimum score a label beyond the first one needs in order to be kept.
SCORE_THRESHOLD = 0.5

# For each text: always keep the first returned label, and keep any further
# label only when its score reaches SCORE_THRESHOLD. Labels are translated to
# their Russian display names via `labels_trans`.
processed_answers = [
    [
        labels_trans[candidate["label"]]
        for rank, candidate in enumerate(sample_answers)
        if rank == 0 or candidate["score"] >= SCORE_THRESHOLD
    ]
    for sample_answers in answers
]

In [None]:
# Preview the first few post-processed predictions rather than the whole list.
processed_answers[:10]

[['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['злость'],
 ['страх'],
 ['нет эмоции'],
 ['грусть'],
 ['грусть'],
 ['удивление'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['удивление'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['радость'],
 ['радость'],
 ['страх'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['грусть'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['удивление'],
 ['грусть'],
 ['грусть'],
 ['нет эмоции'],
 ['радость'],
 ['радость'],
 ['радость'],
 ['радость'],
 ['грусть'],
 ['радость'],
 ['нет эмоции'],
 ['удивление'],
 ['нет эмоции'],
 ['грусть'],
 ['удивление'],
 ['грусть'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['злость'],
 ['нет эмоции'],
 ['грусть'],
 ['нет эмоции'],
 ['нет эмоции'],
 ['удивление'],
 ['нет эмоции'],
 ['удивление'],
 ['радость'],
 ['нет эмоции'],
 ['радость'],
 ['нет эмоции'],
 ['удивление'],
 ['грусть'],
 ['радость'],
 ['нет эмоции'],
 ['грусть'],
 ['нет эмоции'],
 ['грусть'],

In [None]:
# Raw ground truth for the test split: one list of integer label ids per text.
true_answers = dataset["test"]["labels"]

In [None]:
# Translate the ground-truth label ids to Russian names; a text with no labels
# becomes ["нет эмоции"].
# Built directly from the dataset (not from the previous value of
# `true_answers`) so that re-running this cell is safe: the old version
# overwrote its own input and raised KeyError on a second run.
true_answers = [
    [labels_id[label_id] for label_id in label_ids] or ["нет эмоции"]
    for label_ids in dataset["test"]["labels"]
]

In [None]:
# One row per test text, holding the predicted and the true label lists.
errors = pd.DataFrame(
    data={
        "text": dataset["test"]["text"],
        "Предсказание": processed_answers,
        "Факт": true_answers,
    }
)
# "slava" = number of labels shared by truth and prediction (0 means no overlap).
errors["slava"] = [
    len(set(fact) & set(predicted))
    for fact, predicted in zip(errors["Факт"], errors["Предсказание"])
]

In [None]:
# Look at one raw test record (its text, label ids and source).
dataset["test"][1878]

{'text': 'Как же я соскучился по Испании :(',
 'labels': [1],
 'source': 'twitter'}

In [None]:
# Five texts where prediction and truth share no label at all.
# A fixed random_state makes the sample reproducible across re-runs.
errors[errors["slava"] == 0].sample(5, random_state=42).drop(columns="slava")

Unnamed: 0,text,Предсказание,Факт
1476,"Я всегда знала,что ""девки"",которые играю в бакскетбол, злые кобылочки Ну не ...",[радость],[злость]
1007,"Приснилось то,чего я больше всего боюсь в жизни:(проснулась и обрадовалась,ч...",[страх],"[грусть, радость]"
1771,"Тебе мои песни , О немая , суровая мать !",[грусть],[нет эмоции]
1140,"да пацан, но как то все оно странно(",[грусть],[удивление]
1590,"И перед нами разворачивается эта почти "" шекспировкая "" трагедия , где финал...",[нет эмоции],[грусть]


In [None]:
# Second evaluation path: use the project's own model / dataloader helpers
# instead of the transformers pipeline used above.
dataset = load_dataset("cedr")
# NOTE(review): the three positional None arguments and the "eval" string are
# interpreted by src.model.models.get_model — confirm their meaning there.
tokenizer, model = get_model("seara/rubert-tiny2-cedr", None, None, None, "eval")
# shuffle=False and drop_last=False are passed so that no batch is reordered or
# dropped; the error table built later pairs predictions with
# dataset["test"]["text"] by position, so it presumably relies on the test
# dataloader preserving dataset order — TODO confirm in src.data.cedr.
train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
    tokenizer,
    max_length=None,
    batch_size=64,
    shuffle=False,
    num_workers=4,
    pin_memory=False,
    drop_last=False,
)

No config specified, defaulting to: cedr/main
Found cached dataset cedr (/home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66)


  0%|          | 0/2 [00:00<?, ?it/s]

Loaded pretrained model


No config specified, defaulting to: cedr/main
Found cached dataset cedr (/home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-c6df481222e9754e.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-b235303d5425ff75.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-26ba0194bb8998e2.arrow
Loading cached processed dataset at /home/seara/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66/cache-a177f94c30ed16ec.arrow


Loaded dataloaders: train 103, val 15, test 30


In [None]:
# Class index -> Russian class name for the six classes used in the
# evaluation below (index 0 is "нет эмоции").
labels = dict(
    enumerate(["нет эмоции", "радость", "грусть", "удивление", "страх", "злость"])
)

In [None]:
# Run the project's evaluation routine on the test dataloader.
# NOTE(review): `eval` here is src.trainer.eval.eval, which shadows Python's
# built-in eval() for the rest of the notebook.
# The trailing True presumably enables printing of the metrics report, and `df`
# presumably holds that report — TODO confirm against src.trainer.eval.
test_y_true, test_y_pred, df = eval(model, test_dataloader, labels, "multi_label_classification", True)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


                нет эмоции   радость    грусть  удивление     страх    злость  \
precision           0.8176    0.8371    0.8425     0.7902    0.7833    0.5467   
recall              0.8365    0.8300    0.8470     0.6647    0.6667    0.3280   
f1-score            0.8269    0.8336    0.8447     0.7220    0.7203    0.4100   
support           734.0000  353.0000  379.0000   170.0000  141.0000  125.0000   
auc-roc             0.9241    0.9649    0.9557     0.9130    0.9118    0.7732   
wrong f1 micro      0.8634    0.9378    0.9373     0.9538    0.9612    0.9373   
wrong f1 macro      0.8571    0.8977    0.9027     0.8484    0.8497    0.6884   

                micro avg  macro avg  weighted avg  samples avg  
precision          0.8110     0.7696        0.8034       0.7811  
recall             0.7760     0.6955        0.7760       0.7792  
f1-score           0.7931     0.7263        0.7870       0.7788  
support         1902.0000  1902.0000     1902.0000    1902.0000  
auc-roc              

In [None]:
# Token ids of the first sample in the first test batch, for comparison with
# the direct tokenizer call in the next cell.
next(iter(test_dataloader))["input_ids"][0]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([    2,  5634, 30011, 59193,   314, 30773,  8896,     1,  2389,  5943,
        30011, 64445,   778, 80086,    18,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])

In [None]:
# Tokenize the first test text directly, to compare against the dataloader's first sample.
tokenizer(dataset["test"]["text"][0])

{'input_ids': [2, 5634, 30011, 59193, 314, 30773, 8896, 1, 2389, 5943, 30011, 64445, 778, 80086, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Inspect the ground-truth matrix returned by the evaluation call.
test_y_true

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

In [None]:
# Sanity check: number of evaluated rows (compare with the number of test texts).
len(test_y_true)

1882

In [None]:
# Sanity check: number of texts in the test split (compare with len(test_y_true)).
len(dataset["test"]["text"])

1882

In [None]:
# Rebuild `errors` from the evaluation outputs; this replaces the
# pipeline-based table of the same name created earlier in the notebook.
# NOTE(review): np.argmax keeps a single class index per row, so a row with
# several active labels is reduced to one class in this table.
errors = pd.DataFrame(
    {
        "text": dataset["test"]["text"],
        "Предсказание": np.argmax(test_y_pred, axis=-1),
        "Факт": np.argmax(test_y_true, axis=-1),
    }
)

In [None]:
# Replace class indices with Russian class names.
# The names are derived from the evaluation arrays rather than from the
# current column values, so re-running this cell is safe: the previous in-place
# version raised KeyError on a second run because the columns then already
# held names instead of indices.
errors["Факт"] = [labels[class_id] for class_id in np.argmax(test_y_true, axis=-1)]
errors["Предсказание"] = [labels[class_id] for class_id in np.argmax(test_y_pred, axis=-1)]

In [None]:
# One misclassified example per true class.
# A fixed random_state makes the sample reproducible across re-runs.
errors[errors["Факт"] != errors["Предсказание"]].groupby("Факт").sample(1, random_state=42)

Unnamed: 0,text,Предсказание,Факт
1404,Эта проблема возникала у меня периодически один - два раза в год на протяжен...,нет эмоции,грусть
1064,"Вот это состояние..давно такого не было..:( мне плохо и я злая капец, если к...",грусть,злость
1101,"И как удивительно, что именно мужчина решил обесценить эту тему», — написал он.",удивление,нет эмоции
1349,"не знаю :D тут что-то странное в классе происходит,какие-то фигуристы, какие...",удивление,радость
851,"Злоумышленник подошел к пострадавшей, осторожно ступая и щурясь.",нет эмоции,страх
530,"А на стоянке поезда топая за мороженкой ... оторопел , увидев Екатерину , ту...",злость,удивление
