<a href="https://colab.research.google.com/gist/avidale/e678c5478086c1d1adc52a85cb2b93e6/train-rubert-tiny-sentiment-classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

В этом блокноте мы обучаем маленький BERT распознавать в коротких русских текстах сентимент. Результирующая модель выложена в открытый доступ: https://huggingface.co/cointegrated/rubert-tiny-sentiment-balanced. 

Данные возьмём из проекта Сметанина: https://github.com/sismetanin/sentiment-analysis-in-russian

Для удобства я вручную скачал все данные и сложил их на Google Диск. Результирующий датасет (он собирается в середине этого блокнота) вы можете скачать [тут](https://drive.google.com/file/d/1dir_lixYfReDXxRS5oGGljH8T_f7vVqm/view?usp=sharing).


In [None]:
import gc
import random

import joblib
import numpy as np
import pandas as pd
import torch
import xmltodict
from datasets import Dataset, DatasetDict
from google.colab import drive
from IPython.display import display
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn.pipeline import make_pipeline
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

# Mount Google Drive at /gd; the dataset and model paths used below live under /gd/MyDrive.
drive.mount("/gd")

Drive already mounted at /gd; to attempt to forcibly remount, call drive.mount("/gd", force_remount=True).


In [None]:
# Install the third-party packages imported by this notebook (datasets, transformers, xmltodict).
# NOTE(review): this cell sits after the import cell; on a fresh runtime it presumably has to run first - confirm.
! pip install datasets transformers xmltodict --quiet

[K     |████████████████████████████████| 264 kB 5.0 MB/s 
[K     |████████████████████████████████| 2.6 MB 65.4 MB/s 
[K     |████████████████████████████████| 118 kB 68.6 MB/s 
[K     |████████████████████████████████| 243 kB 70.7 MB/s 
[K     |████████████████████████████████| 43 kB 2.5 MB/s 
[K     |████████████████████████████████| 895 kB 57.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 61.1 MB/s 
[K     |████████████████████████████████| 636 kB 61.7 MB/s 
[?25h

# Load and unify data

## SentiRuEval2016

http://www.dialog-21.ru/evaluation/2016/sentiment/

https://drive.google.com/drive/folders/0BxlA8wH3PTUfV1F1UTBwVTJPd3c?resourcekey=0-k9mcoCJ0D8bfaHa9h3fIWw

In [None]:
!ls /gd/MyDrive/datasets/nlp/sentiment/SentiRuEval_2016

 banks_test_2016.xml	 tkk_test_2016.xml
 banks_test_etalon.xml	 tkk_test_etalon.xml
 bank_train_2016.xml	 tkk_train_2016.xml
 eval			'Результаты SentiRueval 2016.gsheet'


In [None]:
dirname = "/gd/MyDrive/datasets/nlp/sentiment/SentiRuEval_2016/"

In [None]:
# Parse the SentiRuEval-2016 banks training set into one row per (tweet, bank) pair.
# The file is read as bytes so that the XML parser decodes it using the encoding declared
# in the document itself instead of the platform-dependent default text encoding.
with open(dirname + "bank_train_2016.xml", "rb") as f:
    data = xmltodict.parse(f.read())
# Column names that hold the per-bank sentiment annotation.
objects = [
    "sberbank",
    "vtb",
    "gazprom",
    "alfabank",
    "bankmoskvy",
    "raiffeisen",
    "uralsib",
    "rshb",
]

senti_data_banks = []
for item in data["pma_xml_export"]["database"]["table"]:
    # Flatten the list of {"@name": ..., "#text": ...} cells into a plain dict.
    item_dict = {v["@name"]: v["#text"] for v in item["column"]}
    for o in objects:
        # "NULL" presumably means the tweet carries no annotation for this bank.
        if item_dict[o] != "NULL":
            senti_data_banks.append({"text": item_dict["text"], "label": int(item_dict[o]), "object": o})
senti_data_banks = pd.DataFrame(senti_data_banks)
print(senti_data_banks.shape)
print(senti_data_banks.label.value_counts())

(10725, 3)
 0    7158
-1    2807
 1     760
Name: label, dtype: int64


In [None]:
def foo(x):
    """Translate a numeric polarity code into its text label.

    1 -> "positive", 0 -> "neutral", -1 -> "negative"; any other value is
    returned unchanged.
    """
    for code, name in ((1, "positive"), (0, "neutral"), (-1, "negative")):
        if x == code:
            return name
    return x


senti_data_banks.label = senti_data_banks.label.apply(foo)
senti_data_banks.label.value_counts()

neutral     7158
negative    2807
positive     760
Name: label, dtype: int64

In [None]:
with open(dirname + "tkk_train_2016.xml") as f:
    data = xmltodict.parse(f.read())

In [None]:
data["pma_xml_export"]["database"]["table"][0]

OrderedDict([('@name', 'tkk_train_2016'),
             ('column',
              [OrderedDict([('@name', 'id'), ('#text', '1')]),
               OrderedDict([('@name', 'twitid'),
                            ('#text', '492367588165680000')]),
               OrderedDict([('@name', 'date'), ('#text', '1406224555')]),
               OrderedDict([('@name', 'text'),
                            ('#text',
                             '@mkomov Максим, Вашем письмо мы получили. Наши сотрудники свяжутся с Вами завтра и направят запрос инженерам для проверки. #билайн')]),
               OrderedDict([('@name', 'beeline'), ('#text', '0')]),
               OrderedDict([('@name', 'mts'), ('#text', 'NULL')]),
               OrderedDict([('@name', 'megafon'), ('#text', 'NULL')]),
               OrderedDict([('@name', 'tele2'), ('#text', 'NULL')]),
               OrderedDict([('@name', 'rostelecom'), ('#text', 'NULL')]),
               OrderedDict([('@name', 'komstar'), ('#text', 'NULL')]),
              

In [None]:
# Parse the SentiRuEval-2016 telecom training set into one row per (tweet, operator) pair.
# The file is read as bytes so that the XML parser decodes it using the encoding declared
# in the document itself instead of the platform-dependent default text encoding.
with open(dirname + "tkk_train_2016.xml", "rb") as f:
    data = xmltodict.parse(f.read())
# Column names that hold the per-operator sentiment annotation.
objects = ["beeline", "mts", "megafon", "tele2", "rostelecom", "komstar", "skylink"]

senti_data_tele = []
for item in data["pma_xml_export"]["database"]["table"]:
    # Flatten the list of {"@name": ..., "#text": ...} cells into a plain dict.
    item_dict = {v["@name"]: v["#text"] for v in item["column"]}
    for o in objects:
        # "NULL" presumably means the tweet carries no annotation for this operator.
        if item_dict[o] != "NULL":
            senti_data_tele.append({"text": item_dict["text"], "label": int(item_dict[o]), "object": o})
senti_data_tele = pd.DataFrame(senti_data_tele)
print(senti_data_tele.shape)
print(senti_data_tele.label.value_counts())

(9209, 3)
 0    5213
-1    2611
 1    1385
Name: label, dtype: int64


In [None]:
def foo(x):
    """Map a polarity code (1 / 0 / -1) to "positive" / "neutral" / "negative".

    Values other than these three codes are passed through untouched.
    """
    return "positive" if x == 1 else ("neutral" if x == 0 else ("negative" if x == -1 else x))


senti_data_tele.label = senti_data_tele.label.apply(foo)
senti_data_tele.label.value_counts()

neutral     5213
negative    2611
positive    1385
Name: label, dtype: int64

In [None]:
senti_data_tele["source"] = "SentiRuEval2016_tele"
senti_data_banks["source"] = "SentiRuEval2016_banks"

## SentiRuEval2015

https://drive.google.com/drive/folders/1f2bIJ-JDxIRCI1gEdEdB1kMe7lGJK02m

In [None]:
!ls /gd/MyDrive/datasets/nlp/sentiment/SentiRuEval-2015

Aspects_guidelines.doc		 SentiRuEval_car_markup_train.xml
eval				 SentiRuEval_rest_markup_test.xml
readme.txt			 SentiRuEval_rest_markup_train.xml
SentiRuEval_car_markup_test.xml  SentiRuEval_results.gsheet


In [None]:
dirname = "/gd/MyDrive/datasets/nlp/sentiment/SentiRuEval-2015/"
with open(dirname + "SentiRuEval_car_markup_train.xml") as f:
    data = xmltodict.parse(f.read())
print(len(data["reviews"]["review"]))

217


In [None]:
dirname = "/gd/MyDrive/datasets/nlp/sentiment/SentiRuEval-2015/"
with open(dirname + "SentiRuEval_rest_markup_train.xml") as f:
    data = xmltodict.parse(f.read())
print(len(data["reviews"]["review"]))

201


Этот датасет предназначен для анализа отдельных аспектов, а не текста в целом; самих текстов мало, и оценить их целиком сложно. Поэтому пока что я его не использую.

## RuTweetCorp

https://study.mokoron.com/#download

Нейтральные тексты я сам насэмплил для единообразия. Для этого я обучил простой классификатор (логрег на символьных n-граммах) отличать позитивные и негативные твиты от каких попало твитов, и выбрал 200К случайных твитов, с высокой уверенностью классифицированных как "что попало". 

In [None]:
# Column names for the header-less RuTweetCorp CSV files.
cols = ["id", "date", "username", "text", "label", "rep", "fav", "stcount", "fol", "frien", "listcount", "hz"]

# Both files are ";"-separated and have no header row; the names above are attached after reading.
tweet_pos = pd.read_csv(
    "/gd/MyDrive/datasets/nlp/sentiment/mocoron/positive.csv",
    sep=";",
    header=None,
).set_axis(cols, axis=1)
tweet_neg = pd.read_csv(
    "/gd/MyDrive/datasets/nlp/sentiment/mocoron/negative.csv",
    sep=";",
    header=None,
).set_axis(cols, axis=1)

In [None]:
tweet_neut = pd.read_csv("/gd/MyDrive/datasets/nlp/sentiment/mocoron/neutral.csv")

In [None]:
# Stack the three tweet collections into one frame, labelling each row by the collection it came from.
labelled_tweets = (("positive", tweet_pos), ("negative", tweet_neg), ("neutral", tweet_neut))
tweet_texts, tweet_labels = [], []
for class_name, frame in labelled_tweets:
    tweet_texts.extend(frame.text.tolist())
    tweet_labels.extend([class_name] * frame.shape[0])

tweet_data = pd.DataFrame({"text": tweet_texts, "label": tweet_labels})
print(tweet_data.shape)
tweet_data.label.value_counts()

(519592, 2)


neutral     292758
positive    114911
negative    111923
Name: label, dtype: int64

In [None]:
tweet_data["source"] = "mokoron"

## Linis

In [None]:
linis_data = pd.read_excel(
    "/gd/MyDrive/datasets/nlp/sentiment/linis-crowd-doc_comment_summary.xlsx",
    header=None,
)

In [None]:
linis_data.columns = ["text", "label"]
linis_data.label.value_counts()

0                        13930
-1                        9203
1                         1795
-2                        1534
2                          365
GalinaPozd                   6
Виталия Салина               5
Минченкова Елизавета         2
22158                        2
23523                        1
23486                        1
Иван Мишалкин                1
Yudenkova Dasha              1
21887                        1
Арина Макковеева             1
Анна Вейдер                  1
Анна Аникина                 1
Name: label, dtype: int64

In [None]:
# Keep only rows whose label is one of the five integer scores; the other values in the
# column (names, stray numbers) are dropped.
valid_scores = [-2, -1, 0, 1, 2]
linis_data = linis_data[linis_data.label.isin(valid_scores)].copy()
linis_data.label.value_counts().sort_index()

-2     1534
-1     9203
 0    13930
 1     1795
 2      365
Name: label, dtype: int64

In [None]:
def foo(x):
    """Collapse a numeric score into three classes.

    Scores >= 1 become "positive", 0 becomes "neutral", scores <= -1 become
    "negative"; anything else (e.g. a value strictly between -1 and 1 other
    than 0) is returned unchanged.
    """
    if x >= 1:
        result = "positive"
    elif x == 0:
        result = "neutral"
    elif x <= -1:
        result = "negative"
    else:
        result = x
    return result


linis_data.label = linis_data.label.apply(foo)
linis_data.label.value_counts()

neutral     13930
negative    10737
positive     2160
Name: label, dtype: int64

In [None]:
linis_data["source"] = "linis"

## RuSentiment

In [None]:
rusent_random = pd.read_csv("/gd/MyDrive/datasets/nlp/sentiment/Rusentiment/rusentiment_random_posts.csv")
print(rusent_random.shape)
rusent_random.sample(3)

(21268, 2)


Unnamed: 0,label,text
707,speech,С Днём рождения!!!\nПусть у тебя ВСЁ получится)))
565,speech,С Днем Рождения! Всех благ и просветления!)
18330,neutral,КАКОЙ ВОЗРАСТ ВАШИХ ДЕТОК:


In [None]:
rusent_active = pd.read_csv("/gd/MyDrive/datasets/nlp/sentiment/Rusentiment/rusentiment_preselected_posts.csv")
print(rusent_active.shape)
rusent_active.sample(3)

(6950, 2)


Unnamed: 0,label,text
3686,positive,вот смешно!)
1143,neutral,Все социальные отношения строятся на лжи. Начн...
2077,neutral,"Если победит невежество, то меня первого спаля..."


In [None]:
rusent_random.label.value_counts()

neutral     8323
positive    4635
skip        3190
speech      2826
negative    2294
Name: label, dtype: int64

In [None]:
pd.options.display.max_colwidth = 300

In [None]:
rusent_random.groupby("label").sample(3)

Unnamed: 0,label,text
226,negative,\nРебята мы ведь можем по 4 часа в день трудиться. нас рабски используют.Платят медный грош или совсем забывают заплатить.
140,negative,"Раньше слово ""ЛЮБЛЮ"" было, как платье от Коко Шанель. Теперь, как дешевая майка с вьетнамского рынка"
19127,negative,Бедный Влад..
906,neutral,После работы посидел
14041,neutral,ФУТБОЛЬЧИК МАЛЬЧИКИ ПОХОДЯТ НА КАЧКОВ
3830,neutral,"Ребят, кто оформлял визу в Европу, а именно в Италию, Поделитесь контактами хорошего проверенного визового центра, пожалуйста! ASAP! 🙏🙏🙏"
2342,positive,наконецто пятница и мы уежаем в Харьков...STREET WORKOUT...!!!
20865,positive,Верь в себя ;)
16093,positive,"жду тебя, как воробей лета!"
6728,skip,з прошедчим


In [None]:
rusent_active.label.value_counts()

neutral     2977
positive    1475
negative    1360
skip         904
speech       234
Name: label, dtype: int64

In [None]:
# Merge both RuSentiment parts and keep only the three sentiment classes
# (the "skip" and "speech" rows are discarded).
rusent_data = (
    pd.concat([rusent_active, rusent_random], ignore_index=True)
    .loc[lambda frame: frame.label.isin(["neutral", "negative", "positive"])]
    .copy()
)
rusent_data.label.value_counts()

neutral     11300
positive     6110
negative     3654
Name: label, dtype: int64

In [None]:
rusent_data["source"] = "rusentiment"

## Kaggle data

In [None]:
kaggle_df = pd.read_json("/gd/MyDrive/datasets/nlp/sentiment/kaggle-sentiment-analysis-in-russian/train.json")
print(kaggle_df.shape)

(8263, 3)


In [None]:
kaggle_df.sentiment.value_counts()

neutral     4034
positive    2795
negative    1434
Name: sentiment, dtype: int64

In [None]:
kaggle_df.sample(3)

Unnamed: 0,text,id,sentiment
2934,"Астана. 28 декабря. Kazakhstan Today - Движение поездов на участке Караганда - Астана, нарушенное из-за схода электровоза и 12 грузовых вагонов, восстановлено, сообщила пресс-служба АО ""НК ""Казакстан темір жолы"".\n""27 декабря текущего года в 23.00 движение поездов по станции Бабатай на участке К...",4988,neutral
4255,"\n\nВладельцы старых авто смогут сдать транспортное средство категории М1 (легковой автомобиль) на дальнейшую утилизацию и взамен получить денежную компенсацию, сообщает primeminister.kz. Стоит отметить, что транспортное средство будет оцениваться по двум категориям: полной комплектации – 150 ты...",6309,positive
4993,"С декабря ОТЛК формирует контейнерные поезда по новому маршруту Гамбург - Чунцин в направлении Европа-Китай на еженедельной основе, говорится в сообщении. \n \n У ОАО ""Российские железные дороги"" 99,84% акций в ОТЛК, у АО ""Национальная компания ""Казахстан темир жолы"" (Казахстанские железные доро...",7047,neutral


In [None]:
# Keep the text and its sentiment, exposing the latter under the shared "label" column name.
kaggle_data = kaggle_df[["text", "sentiment"]].rename(columns={"sentiment": "label"})

In [None]:
kaggle_data["source"] = "kaggle_news"

## RuReviews

In [None]:
rureviews_data = pd.read_csv(
    "/gd/MyDrive/datasets/nlp/sentiment/rureviews-women-clothing-accessories.3-class.balanced.csv",
    sep="\t",
)
print(rureviews_data.shape)
print(rureviews_data.sentiment.value_counts())
rureviews_data.sample(3)

(90000, 2)
neautral    30000
negative    30000
positive    30000
Name: sentiment, dtype: int64


Unnamed: 0,review,sentiment
76394,За свои деньги отлично. На рост 170 подошли. Отлично тянутся,positive
73217,"Футболка, хорошего качества, дошла быстро! спасибо!",positive
74278,заказом довольна за такие деньги местами нитки торчат но все швы прошиты правда пакет пришел порваный но вещь не повредилась,positive


In [None]:
rureviews_data.columns = ["text", "label"]


def foo(x):
    """Replace the misspelled class name "neautral" with "neutral"; return other values as-is."""
    return "neutral" if x == "neautral" else x


rureviews_data.label = rureviews_data.label.apply(foo)
rureviews_data.label.value_counts()

negative    30000
positive    30000
neutral     30000
Name: label, dtype: int64

In [None]:
rureviews_data["source"] = "rureviews"

# Join data

In [None]:
all_dfs = [
    senti_data_tele,
    senti_data_banks,
    tweet_data,
    linis_data,
    rusent_data,
    kaggle_data,
    rureviews_data,
]

In [None]:
full_sentiment_data = pd.concat([df[["text", "label", "source"]] for df in all_dfs], ignore_index=True).dropna()

In [None]:
print(full_sentiment_data.shape)
full_sentiment_data.source.value_counts()

(685605, 3)


mokoron                  519592
rureviews                 90000
linis                     26752
rusentiment               21064
SentiRuEval2016_banks     10725
SentiRuEval2016_tele       9209
kaggle_news                8263
Name: source, dtype: int64

In [None]:
full_sentiment_data.label.value_counts()

neutral     364321
negative    163164
positive    158120
Name: label, dtype: int64

In [None]:
def do_split(x):
    """Randomly assign a row to "train", "dev" or "test".

    The argument is ignored. The first random draw sends the row to "train"
    when it is below 0.8; otherwise a second draw chooses between "dev"
    (below 0.5) and "test".
    """
    if random.random() >= 0.8:
        return "dev" if random.random() < 0.5 else "test"
    return "train"


random.seed(1)
full_sentiment_data["split"] = full_sentiment_data.text.apply(do_split)

In [None]:
full_sentiment_data["split"].head(20)

0     train
1      test
2     train
3     train
4     train
5     train
6     train
7     train
8     train
9       dev
10    train
11    train
12    train
13    train
14    train
15     test
16    train
17    train
18    train
19      dev
Name: split, dtype: object

```
0     train
1      test
2     train
3     train
4     train
5     train
6     train
7     train
8     train
9       dev
10    train
11    train
12    train
13    train
14    train
15     test
16    train
17    train
18    train
19      dev
```

In [None]:
full_sentiment_data.to_csv("sentiment_data.tsv", sep="\t", index=None)

# Load the joint data

In [None]:
full_sentiment_data = pd.read_csv("/gd/MyDrive/datasets/nlp/sentiment/joint_sentiment_data.tsv", sep="\t")

In [None]:
# Balanced dev sample: 70 rows from every (source, label) group of the dev split.
dev_rows = full_sentiment_data[full_sentiment_data.split == "dev"]
dev_balanced = dev_rows.groupby(["source", "label"]).sample(n=70, random_state=1)
torch_dev_balanced = Dataset.from_pandas(dev_balanced[["text", "label"]].reset_index(drop=True))

In [None]:
train_data = full_sentiment_data[full_sentiment_data.split == "train"].dropna().reset_index(drop=True)
train_data.shape

(548422, 4)

In [None]:
# Wrap the train and dev frames (text + label only, fresh 0..n-1 index) into a DatasetDict.
split_frames = {
    "train": train_data,
    "dev": full_sentiment_data[full_sentiment_data.split == "dev"],
}
torch_data = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame[["text", "label"]].reset_index(drop=True))
        for split_name, frame in split_frames.items()
    }
)
torch_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 548422
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 68676
    })
})

# Modelling

In [None]:
all_labels = ["negative", "neutral", "positive"]

In [None]:
model_checkpoint = "cointegrated/rubert-tiny"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
def cleanup():
    """Run Python garbage collection, then empty PyTorch's CUDA memory cache."""
    gc.collect()
    torch.cuda.empty_cache()


cleanup()

In [None]:
def evaluate_model(model, dev_dataloader, verbose=False, labels=None):
    """Score `model` on `dev_dataloader` and return the mean of the per-class ROC AUCs.

    Args:
        model: classifier passed on to `predict_with_model`.
        dev_dataloader: batches to evaluate on.
        verbose: when true, display the per-class report and print the AUC values.
        labels: optional class names forwarded to `get_classification_report`.

    Returns:
        The mean of the per-class AUC values.
    """
    y_true, y_prob = predict_with_model(model, dev_dataloader)
    report, class_aucs = get_classification_report(y_true, y_prob, labels)
    mean_auc = np.mean(class_aucs)
    if verbose:
        display(report)
        print("aucs:", class_aucs, mean_auc)
    return mean_auc


def predict_with_model(model, dataloader):
    """Run `model` over every batch of `dataloader` without tracking gradients.

    Returns:
        A pair `(facts, preds)` of numpy arrays: the concatenated `labels` of all
        batches and the concatenated softmax of the model's logits.
    """
    label_chunks, prob_chunks = [], []

    for batch in tqdm(dataloader):
        # Copy the gold labels out before the batch is moved to the model's device.
        label_chunks.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)
        with torch.no_grad():
            output = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids,
            )
        probabilities = torch.softmax(output.logits, dim=-1)
        prob_chunks.append(probabilities.cpu().numpy())

    return np.concatenate(label_chunks), np.concatenate(prob_chunks)


def get_classification_report(facts, preds, labels=None):
    """Build a per-class metrics table plus one-vs-rest ROC AUC values.

    Args:
        facts: 1-D array of integer class ids (ground truth).
        preds: 2-D array of class scores; column `i` is the score of class `i`.
        labels: optional class names used as the row index of the table; there must
            be one name per class present in `facts`.

    Returns:
        `(pfrs, aucs)`: a DataFrame with columns p (precision), r (recall),
        f (F-score), s (support) and a (AUC), one row per class followed by a row
        of column means; and the list of per-class AUC values.
    """
    # Sorted class ids: iterating a set gives no ordering guarantee, and the same ids
    # are passed to both metric calls so their rows always line up.
    class_ids = np.unique(facts)
    precision, recall, fscore, support = precision_recall_fscore_support(
        facts, preds.argmax(1), labels=class_ids
    )
    pfrs = pd.DataFrame({"p": precision, "r": recall, "f": fscore, "s": support})
    aucs = [roc_auc_score(facts == i, preds[:, i]) for i in class_ids]
    pfrs["a"] = aucs
    # DataFrame.append was removed in pandas 2.0; concat adds the row of means instead.
    pfrs = pd.concat([pfrs, pfrs.mean().to_frame().T], ignore_index=True)
    if labels is not None:
        pfrs.index = list(labels) + ["mean"]
    return pfrs, aucs

In [None]:
# Step 1: tokenize the texts (truncating over-long inputs) and drop the raw text column.
torch_dev_balanced_tokenized = torch_dev_balanced.map(
    lambda rows: tokenizer(rows["text"], truncation=True),
    batched=True,
    remove_columns=["text"],
)
# Step 2: replace each label name with its position in `all_labels`.
torch_dev_balanced_tokenized = torch_dev_balanced_tokenized.map(
    lambda rows: {"label": [all_labels.index(name) for name in rows["label"]]},
    batched=True,
)

# Fixed-order loader over the balanced dev sample; the collator pads every batch.
torch_dev_balanced_loader = DataLoader(
    torch_dev_balanced_tokenized,
    batch_size=64,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    collate_fn=data_collator,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Tokenize every split (truncating over-long inputs, dropping the raw text), then replace
# each label name with its position in `all_labels`.
data_tokenized = torch_data.map(
    lambda rows: tokenizer(rows["text"], truncation=True),
    batched=True,
    remove_columns=["text"],
).map(
    lambda rows: {"label": [all_labels.index(name) for name in rows["label"]]},
    batched=True,
)

  0%|          | 0/549 [00:00<?, ?ba/s]

  0%|          | 0/69 [00:00<?, ?ba/s]

  0%|          | 0/549 [00:00<?, ?ba/s]

  0%|          | 0/69 [00:00<?, ?ba/s]

# The first model: unbalanced data

In [None]:
batch_size = 64

# Training batches are drawn in a new random order on every pass over the loader.
train_dataloader = DataLoader(
    data_tokenized["train"],
    batch_size=batch_size,
    drop_last=False,
    shuffle=True,
    num_workers=0,
    collate_fn=data_collator,
)
# The dev loader is used for evaluation only, so it iterates in a fixed order: the metrics
# do not depend on the order, and an unshuffled loader keeps runs deterministic and keeps
# predictions aligned with the rows of the dev set.
dev_dataloader = DataLoader(
    data_tokenized["dev"],
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

In [None]:
# Create the classifier for the first experiment. No earlier cell defines `model`, so
# without this initialisation a fresh "Restart & Run All" fails here with a NameError.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(all_labels))
model.config.id2label = dict(enumerate(all_labels))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}
# Use the GPU when one is available; the helpers below follow `model.device`.
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Baseline quality of the not-yet-fine-tuned model on the dev set.
evaluate_model(model, dev_dataloader, verbose=True)

  0%|          | 0/1074 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s
0,0.267435,0.421399,0.32721,16253.0
1,0.670845,0.53941,0.597991,36412.0
2,0.23564,0.202923,0.218061,16011.0
3,0.391306,0.387911,0.381087,22892.0


aucs: [0.3868994199141944, 0.6943497260285278, 0.578275696874899] 0.5531749476058737


0.5531749476058737

А вот после этого момента можно сразу переходить ко второй версии модели

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [None]:
gradient_accumulation_steps = 1
window = 500
cleanup_step = 100
report_step = 10000

In [None]:
ewm_loss = 0

In [None]:
# Fine-tune the model for three passes over the training data, reporting the dev AUC
# every `report_step` steps and once more at the end.
model.train()
cleanup()

for epoch in trange(3):
    tq = tqdm(train_dataloader)

    for i, batch in enumerate(tq):
        try:
            batch = batch.to(model.device)
            output = model(**batch)
            loss = output.loss
            loss.backward()
        except RuntimeError as e:
            # Best effort: report the failure, skip the batch and keep training.
            print("error on step", i, e)
            loss = None
            # Discard whatever gradients the failed batch left behind so they do not
            # leak into the next optimizer step.
            optimizer.zero_grad()
            cleanup()
            continue

        # Step after every `gradient_accumulation_steps` batches. Counting from i + 1
        # makes the first window as long as the others (the previous `i and i % ...`
        # check skipped step 0 and merged its gradients into the next update).
        if (i + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if i % cleanup_step == 0:
            cleanup()

        # Running average of the training loss: a plain mean over the first `window`
        # steps of the epoch, an exponential moving average afterwards.
        w = 1 / min(i + 1, window)
        ewm_loss = ewm_loss * (1 - w) + loss.item() * w
        tq.set_description(f"loss: {ewm_loss:4.4f}")

        if i % report_step == 0:
            model.eval()
            # Despite its name, `eval_loss` holds the mean dev ROC AUC returned by evaluate_model.
            eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
            model.train()
            print(f"epoch {epoch}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss}")

model.eval()
eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
print(f"epoch {epoch + 1}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss}")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8570 [00:00<?, ?it/s]

  0%|          | 0/8570 [00:00<?, ?it/s]

  0%|          | 0/8570 [00:00<?, ?it/s]

  0%|          | 0/1074 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s
0,0.921001,0.869378,0.894445,16253.0
1,0.921058,0.948149,0.934407,36412.0
2,0.939688,0.930298,0.93497,16011.0
3,0.927249,0.915942,0.921274,22892.0


aucs: [0.9895661727076159, 0.9864192978440336, 0.9935832239563914] 0.9898562315026803
epoch 3, step 8569: train loss: 0.1743  val auc: 0.9898562315026803


In [None]:
NEW_MODEL_NAME = "/gd/MyDrive/models/rubert-tiny-sentiment"
model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)

('/gd/MyDrive/models/rubert-tiny-sentiment/tokenizer_config.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment/special_tokens_map.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment/vocab.txt',
 '/gd/MyDrive/models/rubert-tiny-sentiment/added_tokens.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment/tokenizer.json')

# Evaluation

In [None]:
dev_dataloader_fixed = DataLoader(
    data_tokenized["dev"],
    batch_size=32,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

NameError: ignored

In [None]:
# Ground-truth label ids and predicted class probabilities for the dev set, in dataset order.
# The shared inference helper replaces a verbatim copy of its loop that used to live here.
facts, preds = predict_with_model(model, dev_dataloader_fixed)

  0%|          | 0/1074 [00:00<?, ?it/s]

In [None]:
dev = full_sentiment_data[full_sentiment_data.split == "dev"].copy()
print(dev.shape)
assert (dev.label == [all_labels[i] for i in facts]).all()

(68676, 4)


In [None]:
dev["preds"] = [all_labels[i] for i in preds.argmax(axis=1)]

In [None]:
def _source_metrics(group):
    """Row count, accuracy and macro-averaged F1 of `preds` against `label` for one group."""
    return pd.Series(
        {
            "n": group.shape[0],
            "accuracy": accuracy_score(group.label, group.preds),
            "f1_macro": f1_score(group.label, group.preds, average="macro"),
        }
    )


# Quality of the predictions broken down by the dataset each dev row came from.
dev.groupby("source").apply(_source_metrics)

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,1055.0,0.869194,0.709848
SentiRuEval2016_tele,937.0,0.707577,0.634077
kaggle_news,840.0,0.65119,0.604491
linis,2675.0,0.601495,0.52559
mokoron,51886.0,0.996203,0.996019
rureviews,9171.0,0.737651,0.740745
rusentiment,2112.0,0.643466,0.549202


Мораль: модель, обученная на очень несбалансированном датасете, подстраивается под свою наиболее массивную часть. Нужно попробовать пересэмплировать всё нафиг. 

In [None]:
# Same per-source metrics, computed on a balanced subsample: 70 rows per (source, label) group.
dev_resampled = dev.groupby(["source", "label"]).sample(n=70, random_state=1)
resampled = dev_resampled.groupby("source").apply(
    lambda group: pd.Series(
        {
            "n": group.shape[0],
            "accuracy": accuracy_score(group.label, group.preds),
            "f1_macro": f1_score(group.label, group.preds, average="macro"),
        }
    )
)
resampled

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,210.0,0.67619,0.643551
SentiRuEval2016_tele,210.0,0.585714,0.574823
kaggle_news,210.0,0.552381,0.539622
linis,210.0,0.480952,0.460348
mokoron,210.0,1.0,1.0
rureviews,210.0,0.738095,0.740806
rusentiment,210.0,0.566667,0.538639


In [None]:
resampled.mean()

n           210.000000
accuracy      0.657143
f1_macro      0.642541
dtype: float64

# The second model, resampled and balanced

In [None]:
train_size = sum(full_sentiment_data.split == "train")
print(train_size)

548424


In [None]:
full_sentiment_data.sample(3)

Unnamed: 0,text,label,source,split
842927,Нет человека — нет проблем. (с) Сталин.,,,
1479419,"@xerepacid, #мысли Любые перемены несут с собо...",,,
817467,Нет человека — нет проблем. (с) Сталин.,,,


In [None]:
# Texts of the training split, bucketed by (source, label); rows with missing values are dropped.
train_rows = full_sentiment_data[full_sentiment_data.split == "train"].dropna()
train = {}
for group_key, group in train_rows.groupby(["source", "label"]):
    train[group_key] = group.text.tolist()

In [None]:
print(len(train))
train.keys()

21


dict_keys([('SentiRuEval2016_banks', 'negative'), ('SentiRuEval2016_banks', 'neutral'), ('SentiRuEval2016_banks', 'positive'), ('SentiRuEval2016_tele', 'negative'), ('SentiRuEval2016_tele', 'neutral'), ('SentiRuEval2016_tele', 'positive'), ('kaggle_news', 'negative'), ('kaggle_news', 'neutral'), ('kaggle_news', 'positive'), ('linis', 'negative'), ('linis', 'neutral'), ('linis', 'positive'), ('mokoron', 'negative'), ('mokoron', 'neutral'), ('mokoron', 'positive'), ('rureviews', 'negative'), ('rureviews', 'neutral'), ('rureviews', 'positive'), ('rusentiment', 'negative'), ('rusentiment', 'neutral'), ('rusentiment', 'positive')])

In [None]:
train_keys = list(train.keys())


def sample_batch(n=64):
    """Draw `n` training examples, balanced over (source, label) buckets.

    For every example a bucket key is picked uniformly from `train_keys`, then one
    text is picked uniformly from that bucket in `train`.

    Returns:
        A pair `(texts, labels)`: the sampled texts and, for each of them, the
        position of its label in `all_labels`.
    """
    picked = []
    for _ in range(n):
        source, label = random.choice(train_keys)
        text = random.choice(train[source, label])
        picked.append((text, all_labels.index(label)))
    texts = [text for text, _ in picked]
    labels = [label_id for _, label_id in picked]
    return texts, labels


# Draw a tiny batch to eyeball what sample_batch returns.
sample_batch(2)

(['ждала юбку почти три месяца, так и не пришла. деньги вернули через спор .',
  'хороший дядька был.  Почитал, вроде всё правильно делал... Но процент бедности так и остался довольно высок(.  Бунтарь...  Уго Чавес...Солдат... КомандантеСегодня  покинул  мир...И ангелы  тихо  поют сарабандуА  дьяволы  прут  на  пир...Дикарь...  настоящий... народный индеец(Он  коку  жевал  всю жизнь...)И верил ,  свободы  текила  и перец   -Лекарство  мужицких  жил...И горечь  матэ  заливая в  глотку  Тянул  бечевой  Страну...А  в  русских просторах морозной водкойИ баней  лечил  хандру...Чем  громче  фавелы  ему   молилисьСкандируя:    Уго -Любовь!  Тем  тише шептались  Гудзон с  Пикадилли...Нефть –  вурдалачья кровь ... В  морях  озверело  гребут  шаланды ...Булькает вязкий  груз ...Но  ламу   ведет   перевалом  в  АндыИндейский шаман Иисус ...http://stihi.ru/2013/03/06/12096  и это при всей его биографии ..  вы же не знаете ЧТО там было до него.Я работала в Венесуэле с 97 по 2002 годы.Там нищета был

In [None]:
# Start again from the pretrained checkpoint, with one output per sentiment label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(all_labels),
)
# Keep both label mappings in the config; get_sentiment below reads id2label to name predictions.
model.config.id2label = {label_id: label_name for label_id, label_name in enumerate(all_labels)}
model.config.label2id = {label_name: label_id for label_id, label_name in enumerate(all_labels)}
model.cuda();

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Tokenize the balanced dev set and replace the string labels with their index in all_labels.
torch_dev_balanced_tokenized = torch_dev_balanced.map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=["text"],
).map(lambda x: {"label": [all_labels.index(xl) for xl in x["label"]]}, batched=True)

# Evaluation loader: iterate in dataset order (shuffle=False). Shuffling adds nothing to
# evaluation, and predictions collected from this loader are later written back row by row
# into `dev_balanced`, which is only valid when the order is preserved.
torch_dev_balanced_loader = DataLoader(
    torch_dev_balanced_tokenized,
    batch_size=64,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

In [None]:
# Sanity check before fine-tuning: score the freshly initialised classifier on the balanced dev set.
evaluate_model(model, torch_dev_balanced_loader, verbose=True, labels=all_labels)

  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.283967,0.426531,0.340946,490.0,0.402788
neutral,0.415385,0.110204,0.174194,490.0,0.497675
positive,0.298013,0.367347,0.329068,490.0,0.488076
mean,0.332455,0.301361,0.281402,490.0,0.462846


aucs: [0.40278842149104543, 0.49767492711370265, 0.4880758017492712] 0.4628463834513397


0.4628463834513397

In [None]:
# Plain Adam over all model parameters with a fixed learning rate (no scheduler is set up here).
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [None]:
# cleanup() is a helper defined earlier in the notebook; presumably it frees unused memory — confirm there.
cleanup()

In [None]:
# Number of batches between optimizer steps.
gradient_accumulation_steps = 1
# Upper bound on the averaging window of the running train loss shown in the progress bar.
window = 500
# cleanup() is called every `cleanup_step` training steps.
cleanup_step = 100
# The model is evaluated on the balanced dev set every `report_step` training steps.
report_step = 3000

In [None]:
# Running average of the train loss; the training loop overwrites it with the first batch loss.
ewm_loss = 0

In [None]:
# Number of texts drawn by sample_batch for each training step.
batch_size = 64

In [None]:
model.train()
cleanup()

# Approximately three epochs: batches are sampled at random by sample_batch, so an
# "epoch" here only means train_size / batch_size steps.
tq = trange(int(train_size / batch_size * 3))

for i in tq:
    try:
        texts, labels = sample_batch(n=batch_size)
        inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(model.device)
        output = model(labels=torch.tensor(labels).to(model.device), **inputs)
        loss = output.loss
        loss.backward()
    except RuntimeError as e:
        # Best-effort recovery: skip the failing batch (presumably an out-of-memory error,
        # the cause is not inspected). A failed forward/backward pass may leave partially
        # accumulated gradients behind, so discard them instead of leaking them into the
        # next optimizer step.
        print("error on step", i, e)
        loss = None
        optimizer.zero_grad()
        cleanup()
        continue

    # Step after every `gradient_accumulation_steps` batches, counting the current one.
    # Using (i + 1) makes the very first window the same size as all the others.
    if (i + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

    if i % cleanup_step == 0:
        cleanup()

    # Running mean of the loss: a plain average over the first `window` steps,
    # an exponential moving average with weight 1 / window afterwards.
    w = 1 / min(i + 1, window)
    ewm_loss = ewm_loss * (1 - w) + loss.item() * w
    tq.set_description(f"loss: {ewm_loss:4.4f}")

    if i and i % report_step == 0:
        cleanup()
        model.eval()
        # NOTE: despite its name, `eval_loss` holds the value returned by evaluate_model,
        # which is reported as the validation AUC below.
        eval_loss = evaluate_model(model, torch_dev_balanced_loader, verbose=True, labels=all_labels)
        model.train()
        print(f"step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss}")
        cleanup()

# Final evaluation after the last training step.
model.eval()
eval_loss = evaluate_model(model, torch_dev_balanced_loader, verbose=True, labels=all_labels)
print(f"step {i}: train loss: {ewm_loss:4.4f}  val auc: {eval_loss}")

  0%|          | 0/25707 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.728778,0.718367,0.723535,490.0,0.900676
neutral,0.650108,0.614286,0.631689,490.0,0.825878
positive,0.723282,0.773469,0.747535,490.0,0.894082
mean,0.700723,0.702041,0.70092,490.0,0.873545


aucs: [0.9006757600999582, 0.8258777592669722, 0.8940816326530613] 0.8735450506733305
step 3000: train loss: 0.6469  val auc: 0.8735450506733305


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.744681,0.714286,0.729167,490.0,0.908183
neutral,0.647773,0.653061,0.650407,490.0,0.838877
positive,0.737154,0.761224,0.748996,490.0,0.903807
mean,0.709869,0.709524,0.709523,490.0,0.883622


aucs: [0.908183048729696, 0.8388765097875885, 0.9038067471886714] 0.8836221019019853
step 6000: train loss: 0.5629  val auc: 0.8836221019019853


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.748441,0.734694,0.741504,490.0,0.910551
neutral,0.644269,0.665306,0.654618,490.0,0.845601
positive,0.766046,0.755102,0.760534,490.0,0.904512
mean,0.719585,0.718367,0.718886,490.0,0.886888


aucs: [0.9105508121615993, 0.845600791336943, 0.9045116618075802] 0.8868877551020408
step 9000: train loss: 0.5022  val auc: 0.8868877551020408


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.731939,0.785714,0.757874,490.0,0.913091
neutral,0.683652,0.657143,0.670135,490.0,0.852902
positive,0.773784,0.746939,0.760125,490.0,0.903426
mean,0.729792,0.729932,0.729378,490.0,0.889806


aucs: [0.9130914202415661, 0.8529019158683881, 0.9034256559766765] 0.8898063306955436
step 12000: train loss: 0.4585  val auc: 0.8898063306955436


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.733333,0.785714,0.758621,490.0,0.914041
neutral,0.681529,0.655102,0.668054,490.0,0.854757
positive,0.763713,0.738776,0.751037,490.0,0.900154
mean,0.726192,0.726531,0.725904,490.0,0.889651


aucs: [0.9140410245730947, 0.8547573927530195, 0.9001541024573095] 0.8896508399278079
step 15000: train loss: 0.4233  val auc: 0.8896508399278079


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.763265,0.763265,0.763265,490.0,0.909412
neutral,0.670732,0.673469,0.672098,490.0,0.851519
positive,0.762295,0.759184,0.760736,490.0,0.899475
mean,0.732097,0.731973,0.732033,490.0,0.886802


aucs: [0.909411703456893, 0.8515191586838817, 0.8994752186588921] 0.8868020269332223
step 18000: train loss: 0.3871  val auc: 0.8868020269332223


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.759596,0.767347,0.763452,490.0,0.912465
neutral,0.662745,0.689796,0.676,490.0,0.854447
positive,0.774194,0.734694,0.753927,490.0,0.891818
mean,0.732178,0.730612,0.731126,490.0,0.886243


aucs: [0.9124645980841316, 0.8544471053727613, 0.8918179925031237] 0.8862432319866721
step 21000: train loss: 0.3658  val auc: 0.8862432319866721


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.778993,0.726531,0.751848,490.0,0.911355
neutral,0.650699,0.665306,0.657921,490.0,0.851336
positive,0.736328,0.769388,0.752495,490.0,0.894698
mean,0.722007,0.720408,0.720755,490.0,0.885796


aucs: [0.9113546438983756, 0.851335901707622, 0.8946980424822991] 0.8857961960294323
step 24000: train loss: 0.3407  val auc: 0.8857961960294323


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.750484,0.791837,0.770606,490.0,0.912315
neutral,0.668616,0.7,0.683948,490.0,0.854437
positive,0.793182,0.712245,0.750538,490.0,0.891624
mean,0.737427,0.734694,0.735031,490.0,0.886125


aucs: [0.9123146605581006, 0.8544366930445648, 0.8916243231986672] 0.8861252256004443
step 25706: train loss: 0.3305  val auc: 0.8861252256004443


In [None]:
# Persist the fine-tuned model and its tokenizer side by side under the mounted Google Drive (/gd).
NEW_MODEL_NAME = "/gd/MyDrive/models/rubert-tiny-sentiment-balanced"
model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)

('/gd/MyDrive/models/rubert-tiny-sentiment-balanced/tokenizer_config.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment-balanced/special_tokens_map.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment-balanced/vocab.txt',
 '/gd/MyDrive/models/rubert-tiny-sentiment-balanced/added_tokens.json',
 '/gd/MyDrive/models/rubert-tiny-sentiment-balanced/tokenizer.json')

# Compare with baselines

In [None]:
# model1: loaded from the "rubert-tiny-sentiment" directory (presumably the first, unbalanced model
# trained earlier in the notebook — confirm there).
# model2: the balanced model, loaded from the same path it was saved to as NEW_MODEL_NAME.
model1 = AutoModelForSequenceClassification.from_pretrained("/gd/MyDrive/models/rubert-tiny-sentiment").cuda()
model2 = AutoModelForSequenceClassification.from_pretrained("/gd/MyDrive/models/rubert-tiny-sentiment-balanced").cuda();

In [None]:
# Rebuild the tokenized balanced dev set: token ids from the tokenizer, labels as indices into all_labels.
torch_dev_balanced_tokenized = torch_dev_balanced.map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=["text"],
).map(lambda x: {"label": [all_labels.index(xl) for xl in x["label"]]}, batched=True)

# shuffle=False: the per-source comparison further down assigns predictions from this loader
# to the rows of `dev_balanced` by position, so batches must come in dataset order.
torch_dev_balanced_loader = DataLoader(
    torch_dev_balanced_tokenized,
    batch_size=64,
    drop_last=False,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Score both BERT models on the same balanced dev set. Only the return value of the
# second call is echoed as the cell result.
evaluate_model(model1, torch_dev_balanced_loader, verbose=True, labels=all_labels)
evaluate_model(model2, torch_dev_balanced_loader, verbose=True, labels=all_labels)

  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.79235,0.591837,0.67757,490.0,0.882485
neutral,0.524194,0.795918,0.632091,490.0,0.813036
positive,0.794444,0.583673,0.672941,490.0,0.882904
mean,0.703663,0.657143,0.660867,490.0,0.859475


aucs: [0.8824854227405248, 0.813036234902124, 0.8829039983340274] 0.859475218658892


  0%|          | 0/23 [00:00<?, ?it/s]

Unnamed: 0,p,r,f,s,a
negative,0.738964,0.785714,0.761622,490.0,0.90992
neutral,0.65415,0.67551,0.664659,490.0,0.854236
positive,0.801354,0.72449,0.760986,490.0,0.895053
mean,0.731489,0.728571,0.729089,490.0,0.886403


aucs: [0.9099198250728862, 0.8542357351103707, 0.8950531028738027] 0.8864028876856865


0.8864028876856865

Сбалансированная модель явно лучше. Но как насчёт простого бейзлайна?

In [None]:
# Integer targets for the sklearn baselines: position of each label in all_labels.
dev_balanced["label_id"] = list(map(all_labels.index, dev_balanced["label"]))

In [None]:
# Fixed-seed 100k-row subsample of the (unbalanced) training data for the quick baselines.
train_small = train_data.sample(n=100000, random_state=1)
train_small["label_id"] = list(map(all_labels.index, train_small["label"]))

In [None]:
%%time
pipe = make_pipeline(
    HashingVectorizer(analyzer="char", n_features=300_000, ngram_range=(3, 6)),
    LogisticRegression(max_iter=1000, solver="saga"),
)
pipe.fit(train_small.text, train_small.label_id)
display(get_classification_report(dev_balanced.label_id, pipe.predict_proba(dev_balanced.text), labels=all_labels)[0])

Unnamed: 0,p,r,f,s,a
negative,0.695122,0.581633,0.633333,490.0,0.849076
neutral,0.486202,0.755102,0.591527,490.0,0.758511
positive,0.725753,0.442857,0.550063,490.0,0.793168
mean,0.635692,0.593197,0.591641,490.0,0.800252


CPU times: user 1min 12s, sys: 695 ms, total: 1min 13s
Wall time: 1min 13s


Balance only classes; this definitely helps. 

In [None]:
%%time
pipe_b1 = make_pipeline(
    HashingVectorizer(analyzer="char", n_features=300_000, ngram_range=(3, 6)),
    LogisticRegression(max_iter=1000, solver="saga", class_weight="balanced"),
)
pipe_b1.fit(train_small.text, train_small.label_id)
display(
    get_classification_report(
        dev_balanced.label_id,
        pipe_b1.predict_proba(dev_balanced.text),
        labels=all_labels,
    )[0]
)

Unnamed: 0,p,r,f,s,a
negative,0.64351,0.718367,0.678881,490.0,0.859264
neutral,0.568862,0.581633,0.575177,490.0,0.752591
positive,0.691943,0.595918,0.640351,490.0,0.810263
mean,0.634772,0.631973,0.63147,490.0,0.807373


CPU times: user 1min 18s, sys: 459 ms, total: 1min 19s
Wall time: 1min 18s


What about words? They fare worse

In [None]:
%%time
pipe_b1 = make_pipeline(
    HashingVectorizer(analyzer="word", n_features=300_000, ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000, solver="saga", class_weight="balanced"),
)
pipe_b1.fit(train_small.text, train_small.label_id)
display(
    get_classification_report(
        dev_balanced.label_id,
        pipe_b1.predict_proba(dev_balanced.text),
        labels=all_labels,
    )[0]
)

Unnamed: 0,p,r,f,s,a
negative,0.614159,0.708163,0.65782,490.0,0.83205
neutral,0.543478,0.561224,0.552209,490.0,0.728746
positive,0.62406,0.508163,0.56018,490.0,0.774703
mean,0.593899,0.592517,0.59007,490.0,0.7785


CPU times: user 10.1 s, sys: 20.1 ms, total: 10.2 s
Wall time: 10.1 s


Balance sources and classes

In [None]:
# Rows per (source, label) bucket: 10% of the training data spread evenly over all buckets.
group_size = int(len(train_data) * 0.1 / train_data["source"].nunique() / train_data["label"].nunique())
print(group_size)
# Draw exactly group_size rows from every bucket; sample with replacement only when the
# bucket holds fewer rows than that.
bala_train = train_data.groupby(["source", "label"]).apply(
    lambda bucket: bucket.sample(n=group_size, replace=len(bucket) < group_size, random_state=1)
)
bala_train["label_id"] = list(map(all_labels.index, bala_train["label"]))
print(bala_train.shape)

2611
(54831, 5)


The model is a clear improvement over *unbalanced* BERT

In [None]:
# Character n-gram baseline trained on the source- and class-balanced sample.
pipe_b2 = make_pipeline(
    HashingVectorizer(ngram_range=(3, 6), analyzer="char", n_features=300_000),
    LogisticRegression(class_weight="balanced", solver="saga", max_iter=1000),
)
pipe_b2.fit(bala_train["text"], bala_train["label_id"])
display(
    get_classification_report(
        dev_balanced["label_id"],
        pipe_b2.predict_proba(dev_balanced["text"]),
        labels=all_labels,
    )[0]
)

Unnamed: 0,p,r,f,s,a
negative,0.721545,0.72449,0.723014,490.0,0.891586
neutral,0.630975,0.673469,0.65153,490.0,0.82767
positive,0.740659,0.687755,0.713228,490.0,0.882229
mean,0.697726,0.695238,0.695924,490.0,0.867162


Repeat the experiment with a larger-sized resampled train data 

Training on the full train sample fails because of RAM requirements, so we use roughly 0.3 of the original data.

And still this model is only fractionally worse than BERT. 

In [None]:
# Same balanced resampling as before, but sized to 30% of the training data.
group_size = int(len(train_data) * 0.3 / train_data["source"].nunique() / train_data["label"].nunique())
print(group_size)
bala_train = train_data.groupby(["source", "label"]).apply(
    lambda bucket: bucket.sample(n=group_size, replace=len(bucket) < group_size, random_state=1)
)
bala_train["label_id"] = list(map(all_labels.index, bala_train["label"]))
print(bala_train.shape)

# Refit the character n-gram baseline on the larger balanced sample; this replaces the earlier pipe_b2.
pipe_b2 = make_pipeline(
    HashingVectorizer(ngram_range=(3, 6), analyzer="char", n_features=300_000),
    LogisticRegression(class_weight="balanced", solver="saga", max_iter=1000),
)
pipe_b2.fit(bala_train["text"], bala_train["label_id"])
display(
    get_classification_report(
        dev_balanced["label_id"],
        pipe_b2.predict_proba(dev_balanced["text"]),
        labels=all_labels,
    )[0]
)

7834
(164514, 5)


Unnamed: 0,p,r,f,s,a
negative,0.740443,0.75102,0.745694,490.0,0.904516
neutral,0.656546,0.706122,0.680433,490.0,0.849898
positive,0.782511,0.712245,0.745726,490.0,0.893027
mean,0.7265,0.723129,0.723951,490.0,0.88248


In [None]:
# Save the fitted vectorizer + logistic regression pipeline next to the BERT models on Google Drive.
joblib.dump(pipe_b2, "/gd/MyDrive/models/ru_sentiment_balanced_logreg_classifier.joblib")

['/gd/MyDrive/models/ru_sentiment_balanced_logreg_classifier.joblib']

### Evaluate across sources

In [None]:
# Unshuffled loader over the full dev split, so predictions come back in dataset order.
dev_dataloader_fixed = DataLoader(
    data_tokenized["dev"],
    shuffle=False,
    drop_last=False,
    batch_size=32,
    collate_fn=data_collator,
    num_workers=0,
)

In [None]:
# Independent copy of the dev rows, so prediction columns can be added without touching full_sentiment_data.
dev = full_sentiment_data[full_sentiment_data.split == "dev"].copy()

In [None]:
# Predictions of the balanced model on the full dev split.
facts, preds_m2 = predict_with_model(model2, dev_dataloader_fixed)
# Guard the row alignment: labels coming out of the loader must match the rows of `dev`.
assert (dev["label"] == [all_labels[class_id] for class_id in facts]).all()
dev["preds_m2"] = [all_labels[class_id] for class_id in preds_m2.argmax(axis=1)]

  0%|          | 0/2147 [00:00<?, ?it/s]

In [None]:
# Predictions of the first model on the same loader, so rows line up the same way.
facts, preds_m1 = predict_with_model(model1, dev_dataloader_fixed)
dev["preds_m1"] = [all_labels[class_id] for class_id in preds_m1.argmax(axis=1)]

  0%|          | 0/2147 [00:00<?, ?it/s]

In [None]:
# pipe_b2 was fitted on label_id targets, so predict() yields integer ids; map them back to label names.
dev["preds_lr"] = [all_labels[i] for i in pipe_b2.predict(dev["text"])]

In [None]:
# Per-source size, accuracy and macro-F1 of the first BERT model on the full dev split.
stat_m1 = dev.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_m1"]),
            f1_macro=f1_score(grp["label"], grp["preds_m1"], average="macro"),
        )
    )
)
display(stat_m1)
print(stat_m1.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,1055.0,0.869194,0.709848
SentiRuEval2016_tele,937.0,0.707577,0.634077
kaggle_news,840.0,0.65119,0.604491
linis,2675.0,0.601495,0.52559
mokoron,51886.0,0.996203,0.996019
rureviews,9171.0,0.737651,0.740745
rusentiment,2112.0,0.643466,0.549202


n           9810.857143
accuracy       0.743825
f1_macro       0.679996
dtype: float64


In [None]:
# Per-source size, accuracy and macro-F1 of the balanced BERT model on the full dev split.
stat_m2 = dev.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_m2"]),
            f1_macro=f1_score(grp["label"], grp["preds_m2"], average="macro"),
        )
    )
)
display(stat_m2)
print(stat_m2.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,1055.0,0.895735,0.821613
SentiRuEval2016_tele,937.0,0.779082,0.73888
kaggle_news,840.0,0.641667,0.644934
linis,2675.0,0.558131,0.508081
mokoron,51886.0,0.942778,0.938706
rureviews,9171.0,0.724894,0.726662
rusentiment,2112.0,0.663352,0.648349


n           9810.857143
accuracy       0.743663
f1_macro       0.718175
dtype: float64


In [None]:
# Per-source size, accuracy and macro-F1 of the logistic regression baseline on the full dev split.
stat_lr = dev.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_lr"]),
            f1_macro=f1_score(grp["label"], grp["preds_lr"], average="macro"),
        )
    )
)
display(stat_lr)
print(stat_lr.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,1055.0,0.881517,0.80341
SentiRuEval2016_tele,937.0,0.772679,0.734903
kaggle_news,840.0,0.685714,0.678902
linis,2675.0,0.570093,0.521029
mokoron,51886.0,0.804745,0.767611
rureviews,9171.0,0.720532,0.722225
rusentiment,2112.0,0.675189,0.658313


n           9810.857143
accuracy       0.730067
f1_macro       0.698056
dtype: float64


И всё та же статистика - на сбалансированной выборке, чтобы понять, как дисбаланс влияет на все эти числа. 

Все числа стали больше, но видим, что модель очень хорошо отрабатывает на Мокороне, и очень плохо - на почти всех остальных данных. Увы. Придётся работать с чем есть. 

В целом, второй берт получился относительно ничего. Надеюсь, на своих данных его можно будет нормально дотюнивать. 

In [None]:
# Predictions are attached to `dev_balanced` by position. That is only valid when
# torch_dev_balanced_loader iterates in dataset order (i.e. without shuffling);
# the assert guards the alignment.
facts, preds_m2 = predict_with_model(model2, torch_dev_balanced_loader)
assert (dev_balanced["label"] == [all_labels[class_id] for class_id in facts]).all()
dev_balanced["preds_m2"] = [all_labels[class_id] for class_id in preds_m2.argmax(axis=1)]

facts, preds_m1 = predict_with_model(model1, torch_dev_balanced_loader)
dev_balanced["preds_m1"] = [all_labels[class_id] for class_id in preds_m1.argmax(axis=1)]

# The sklearn pipeline predicts straight from the raw texts, in row order.
dev_balanced["preds_lr"] = [all_labels[class_id] for class_id in pipe_b2.predict(dev_balanced["text"])]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [None]:
# Per-source size, accuracy and macro-F1 of the first BERT model on the balanced dev sample.
stat_m1 = dev_balanced.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_m1"]),
            f1_macro=f1_score(grp["label"], grp["preds_m1"], average="macro"),
        )
    )
)
display(stat_m1)
print(stat_m1.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,210.0,0.67619,0.643551
SentiRuEval2016_tele,210.0,0.585714,0.574823
kaggle_news,210.0,0.552381,0.539622
linis,210.0,0.480952,0.460348
mokoron,210.0,1.0,1.0
rureviews,210.0,0.738095,0.740806
rusentiment,210.0,0.566667,0.538639


n           210.000000
accuracy      0.657143
f1_macro      0.642541
dtype: float64


In [None]:
# Per-source size, accuracy and macro-F1 of the balanced BERT model on the balanced dev sample.
stat_m2 = dev_balanced.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_m2"]),
            f1_macro=f1_score(grp["label"], grp["preds_m2"], average="macro"),
        )
    )
)
display(stat_m2)
print(stat_m2.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,210.0,0.833333,0.8308
SentiRuEval2016_tele,210.0,0.742857,0.741517
kaggle_news,210.0,0.657143,0.659544
linis,210.0,0.5,0.495627
mokoron,210.0,0.980952,0.980839
rureviews,210.0,0.719048,0.719883
rusentiment,210.0,0.666667,0.668986


n           210.000000
accuracy      0.728571
f1_macro      0.728171
dtype: float64


In [None]:
# Per-source size, accuracy and macro-F1 of the logistic regression baseline on the balanced dev sample.
stat_lr = dev_balanced.groupby("source").apply(
    lambda grp: pd.Series(
        dict(
            n=len(grp),
            accuracy=accuracy_score(grp["label"], grp["preds_lr"]),
            f1_macro=f1_score(grp["label"], grp["preds_lr"], average="macro"),
        )
    )
)
display(stat_lr)
print(stat_lr.mean())

Unnamed: 0_level_0,n,accuracy,f1_macro
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SentiRuEval2016_banks,210.0,0.838095,0.837759
SentiRuEval2016_tele,210.0,0.780952,0.779683
kaggle_news,210.0,0.719048,0.72245
linis,210.0,0.547619,0.547372
mokoron,210.0,0.8,0.798673
rureviews,210.0,0.695238,0.69574
rusentiment,210.0,0.680952,0.681018


n           210.000000
accuracy      0.723129
f1_macro      0.723242
dtype: float64


# Usage

In [None]:
!pip install transformers sentencepiece --quiet

[K     |████████████████████████████████| 2.6 MB 8.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 85.1 MB/s 
[K     |████████████████████████████████| 636 kB 53.9 MB/s 
[K     |████████████████████████████████| 895 kB 66.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 55.1 MB/s 
[?25h

In [None]:
# Load the published model and its tokenizer from the Hugging Face hub.
model_checkpoint = "cointegrated/rubert-tiny-sentiment-balanced"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Use the GPU when one is available; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
def get_sentiment(text, return_type="label"):
    """Score the sentiment of `text` with the global `model` and `tokenizer`.

    Args:
        text: the text to classify. Only the first row of the model output is used.
        return_type: 'label' returns the name of the highest-scoring class,
            'score' returns a single number (class scores weighted by -1, 0, +1),
            any other value ('proba') returns the array of per-class scores.

    NOTE(review): the per-class scores are sigmoids of the logits, so they are not
    normalized to sum to 1 (see the printed example: 0.79 + 0.49 + 0.20).
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        logits = model(**encoded).logits
        proba = torch.sigmoid(logits).cpu().numpy()[0]
    if return_type == "label":
        return model.config.id2label[proba.argmax()]
    if return_type == "score":
        return proba.dot([-1, 0, 1])
    return proba


text = "Какая гадость эта ваша заливная рыба!"
# classify the text
print(get_sentiment(text, "label"))  # negative
# score the text on the scale from -1 (very negative) to +1 (very positive)
print(get_sentiment(text, "score"))  # -0.5894946306943893
# calculate the per-label scores (independent sigmoids of the logits, so they need not sum to 1)
print(get_sentiment(text, "proba"))  # [0.7870447  0.4947824  0.19755007]

negative
-0.5894946306943893
[0.7870447  0.4947824  0.19755007]
