## Обработка данных с Kaggle


In [None]:
import pandas as pd

In [None]:
# Load the Kaggle training set from the local JSON file and print its (rows, columns) shape.
kaggle_df = pd.read_json('train.json')
print(kaggle_df.shape)

(8263, 3)


In [None]:
# Class balance: number of rows per sentiment value.
kaggle_df.sentiment.value_counts()

neutral     4034
positive    2795
negative    1434
Name: sentiment, dtype: int64

In [None]:
# Look at three randomly chosen rows.
kaggle_df.sample(3)

Unnamed: 0,text,id,sentiment
4273,С момента как семилетний Азамат Айдаркулов про...,6327,negative
4843,Главная » Публикации » Богатые тоже плачут Бог...,6897,negative
7914,ДКНБ Алматы проводит досудебное расследование ...,9968,negative


In [None]:
# Persist the frame for the training section. index=False keeps the row index out of
# the file, so reading it back does not create a spurious 'Unnamed: 0' column.
kaggle_df.to_csv('kaggle.csv', index=False)

# Initial training

In [None]:
# Read the data back from the 'kaggle.csv' file written by the data-processing section.
full_sentiment_data = pd.read_csv('kaggle.csv')

In [None]:
# Expose the target column under the name 'label'; the frame is rebound rather than mutated in place.
full_sentiment_data = full_sentiment_data.rename(columns={'sentiment':'label'})

In [None]:
import random

def do_split(x):
    """Randomly assign a row to the 'train', 'dev' or 'test' split.

    The argument is ignored; it only exists so the function can be handed to
    ``Series.apply``. A first random draw sends a row to 'train' when it is
    below 0.8. Otherwise a second, independent draw picks 'dev' (below 0.5)
    or 'test'.
    """
    goes_to_train = random.random() < 0.8
    if goes_to_train:
        return 'train'
    # Second draw happens only for rows that were not assigned to 'train'.
    return 'dev' if random.random() < 0.5 else 'test'

# Seed the RNG so the row-to-split assignment is reproducible, then tag every row
# with its split name (do_split ignores the text value it receives).
random.seed(1)
full_sentiment_data['split'] = full_sentiment_data.text.apply(do_split)

In [None]:
!pip install datasets -q

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Class-balanced dev subset: 70 rows drawn per label from the 'dev' split with a fixed seed.
dev_balanced = (
    full_sentiment_data
    .loc[full_sentiment_data.split == 'dev']
    .groupby(['label'])
    .sample(70, random_state=1)
)
# Only the text and label columns go into the Dataset, with a fresh 0..n-1 index.
torch_dev_balanced = Dataset.from_pandas(dev_balanced[['text', 'label']].reset_index(drop=True))

In [None]:
# Training rows only; rows containing any missing value are dropped and the index is renumbered.
train_data = (
    full_sentiment_data
    .loc[full_sentiment_data.split == 'train']
    .dropna()
    .reset_index(drop=True)
)
train_data.shape

(6578, 4)

In [None]:
# Bundle the train and dev splits into one DatasetDict, keeping only 'text' and 'label'.
# NOTE(review): unlike train_data, the 'dev' rows are not passed through dropna() —
# confirm that the dev split contains no missing texts or labels.
torch_data = DatasetDict({
    'train': Dataset.from_pandas(train_data[['text', 'label']].reset_index(drop=True)),
    'dev':   Dataset.from_pandas(full_sentiment_data[full_sentiment_data.split=='dev'][['text', 'label']].reset_index(drop=True))
})
torch_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6578
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 849
    })
})

# Modelling

In [None]:
# Class names; a label's position in this list is used as its integer class id.
all_labels = ['negative', 'neutral', 'positive']

In [None]:
# Checkpoint name from which both the tokenizer and the classifier are loaded.
model_checkpoint = "cointegrated/rubert-tiny"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Tokenizer belonging to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorWithPadding
# Collator built from the tokenizer; it is passed as collate_fn to every DataLoader in this notebook.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
import torch

In [None]:
from torch.utils.data import DataLoader

In [None]:
import gc

def cleanup() -> None:
    """Run Python's garbage collector, then ask PyTorch to empty its CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [None]:
from tqdm.auto import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
from IPython.display import display

def evaluate_model(model, dev_dataloader, verbose=False, labels=None):
    """Score ``model`` on the batches of ``dev_dataloader``.

    Args:
        model: classifier handed to ``predict_with_model``.
        dev_dataloader: iterable of batches to evaluate on.
        verbose: when true, display the per-class table and print the
            per-class AUCs, their mean and the accuracy.
        labels: optional row names forwarded to ``get_classification_report``.

    Returns:
        A ``(mean_auc, accuracy)`` tuple.
    """
    y_true, y_prob = predict_with_model(model, dev_dataloader)
    report, class_aucs, acc = get_classification_report(y_true, y_prob, labels)
    mean_auc = np.mean(class_aucs)
    if verbose:
        display(report)
        print('aucs:', class_aucs, mean_auc)
        print('Accuracy:', acc)
    return mean_auc, acc

def predict_with_model(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` without tracking gradients.

    Each batch must expose ``labels``, ``input_ids``, ``attention_mask`` and
    ``token_type_ids`` and support ``.to(device)``.

    Returns:
        ``(facts, preds)``: the concatenated batch labels and the concatenated
        softmax of the model logits (taken over the last axis), as NumPy arrays.
    """
    label_chunks, prob_chunks = [], []

    for batch in tqdm(dataloader):
        # Labels are copied out before the batch is moved to the model's device.
        label_chunks.append(batch.labels.cpu().numpy())
        batch = batch.to(model.device)
        with torch.no_grad():
            output = model(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                token_type_ids=batch.token_type_ids,
            )
        probabilities = torch.softmax(output.logits, -1)
        prob_chunks.append(probabilities.cpu().numpy())

    return np.concatenate(label_chunks), np.concatenate(prob_chunks)

def get_classification_report(facts, preds, labels=None):
    """Build a per-class metrics table plus a final row of column means.

    Args:
        facts: NumPy array of true integer class ids.
        preds: 2-D array of class scores; column ``i`` holds the score of class ``i``.
        labels: optional row names for the classes. When given, the table index
            becomes ``list(labels) + ['mean']``, so there must be one name per
            class present in ``facts``.

    Returns:
        ``(pfrs, aucs, accuracy)``: the table with columns ``p`` (precision),
        ``r`` (recall), ``f`` (F-score), ``s`` (support) and ``a`` (one-vs-rest
        ROC AUC), whose last row is the mean of each column; the list of
        per-class AUCs; and the overall accuracy.
    """
    predicted = preds.argmax(1)
    # Sorted explicitly: iteration order of a set is not guaranteed, and the AUC
    # list must line up row by row with the precision/recall table.
    class_ids = sorted(set(facts))
    # Restricting the table to class_ids keeps its row count equal to len(aucs)
    # even when the model predicts a class that never occurs in facts.
    pfrs = pd.DataFrame(dict(zip(
        ['p', 'r', 'f', 's'],
        precision_recall_fscore_support(facts, predicted, labels=class_ids),
    )))
    aucs = [roc_auc_score(facts == i, preds[:, i]) for i in class_ids]
    accuracy = accuracy_score(facts, predicted)
    pfrs['a'] = aucs
    # pd.concat returns a new frame; the result has to be assigned, otherwise the
    # mean row is lost and the index assignment below fails on a length mismatch.
    pfrs = pd.concat([pfrs, pfrs.mean().to_frame().transpose()], ignore_index=True)
    if labels is not None:
        pfrs.index = list(labels) + ['mean']
    return pfrs, aucs, accuracy

## Initial Model

In [None]:
# Tokenize the balanced dev texts (with truncation, dropping the raw 'text' column),
# then replace each label string by its position in all_labels.
torch_dev_balanced_tokenized = torch_dev_balanced.map(
    lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=['text']
).map(lambda x: {'label': [all_labels.index(xl) for xl in x['label']]}, batched=True)

# Unshuffled loader over the balanced dev subset; no batch is dropped.
torch_dev_balanced_loader = DataLoader(
    torch_dev_balanced_tokenized,
    batch_size=64, drop_last=False, shuffle=False, num_workers=0, collate_fn=data_collator
)

In [None]:
# Tokenize both splits with truncation and drop the raw 'text' column.
data_tokenized = torch_data.map(
    lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=['text']
)


Map:   0%|          | 0/6578 [00:00<?, ? examples/s]

Map:   0%|          | 0/849 [00:00<?, ? examples/s]

In [None]:
# Replace each label string by its position in all_labels.
# NOTE(review): not idempotent — re-running this cell on already-converted integer labels
# makes all_labels.index raise ValueError.
data_tokenized = data_tokenized.map(lambda x: {'label': [all_labels.index(xl) for xl in x['label']]}, batched=True)

Map:   0%|          | 0/6578 [00:00<?, ? examples/s]

Map:   0%|          | 0/849 [00:00<?, ? examples/s]

In [None]:
# Load the checkpoint as a sequence classifier with one output per entry of all_labels.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(all_labels))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when one is available; fall back to the CPU instead of raising.
# The rest of the notebook reads the device from model.device, so nothing else changes.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Training

In [None]:
batch_size = 64

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    data_tokenized['train'],
    batch_size=batch_size, drop_last=False, shuffle=True, num_workers=0, collate_fn=data_collator
)
# Evaluation gains nothing from shuffling; a fixed order keeps runs reproducible
# and the returned predictions aligned with the dataset rows.
dev_dataloader = DataLoader(
    data_tokenized['dev'],
    batch_size=batch_size, drop_last=False, shuffle=False, num_workers=0, collate_fn=data_collator
)

In [None]:
# Baseline: dev-set metrics of the freshly loaded model, before any optimizer step.
evaluate_model(model, dev_dataloader, verbose=True)

  0%|          | 0/14 [00:00<?, ?it/s]

  pfrs = pfrs.append(pfrs.mean(), ignore_index=True)


Unnamed: 0,p,r,f,s,a
0,0.175355,0.993289,0.298087,149.0,0.423025
1,0.333333,0.002404,0.004773,416.0,0.451895
2,0.5,0.003521,0.006993,284.0,0.381896
3,0.33623,0.333071,0.103284,283.0,0.418939


aucs: [0.4230249280920421, 0.4518953188843489, 0.38189579957621833] 0.41893868218420316


0.41893868218420316

После этого шага можно сразу переходить ко второй версии модели.

In [None]:
# Adam over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [None]:
# Settings read by the training loop.
gradient_accumulation_steps = 1  # number of batches per optimizer step
window = 500  # caps the averaging horizon of the running training loss
cleanup_step = 100  # cleanup() runs when the batch index is a multiple of this
report_step = 10000  # the dev set is evaluated when the batch index is a multiple of this

In [None]:
# Running average of the training loss; updated on every batch of the training loop.
ewm_loss = 0

In [None]:
# Fine-tune for three epochs, tracking a smoothed training loss and reporting dev metrics.
model.train()
cleanup()
optimizer.zero_grad()  # start from clean gradients, also when this cell is re-run

num_batches = len(train_dataloader)
global_step = 0  # successfully processed batches across all epochs; drives the loss smoothing

for epoch in trange(3):
    tq = tqdm(train_dataloader)

    for i, batch in enumerate(tq):
        try:
            batch = batch.to(model.device)
            output = model(**batch)
            loss = output.loss
            # Scale the loss so that accumulated gradients are an average over the
            # accumulation window (a no-op while gradient_accumulation_steps == 1).
            (loss / gradient_accumulation_steps).backward()
        except RuntimeError as e:
            print('error on step', i, e)
            loss = None
            # Drop whatever gradients the failed batch (and its unfinished
            # accumulation window) left behind before moving on.
            optimizer.zero_grad()
            cleanup()
            continue

        # Step after every `gradient_accumulation_steps` batches, counting from the
        # first one, and on the last batch so that no gradients leak into the next epoch.
        if (i + 1) % gradient_accumulation_steps == 0 or i + 1 == num_batches:
            optimizer.step()
            optimizer.zero_grad()

        if i % cleanup_step == 0:
            cleanup()

        # The smoothing weight depends on the global step, so the running average
        # is not reset to a single-batch value at the start of every epoch.
        global_step += 1
        w = 1 / min(global_step, window)
        ewm_loss = ewm_loss * (1-w) + loss.item() * w
        tq.set_description(f'loss: {ewm_loss:4.4f}')

        if i % report_step == 0:
            model.eval()
            # evaluate_model returns (mean AUC, accuracy); only the AUC belongs in the message.
            eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
            val_auc, val_accuracy = eval_loss
            model.train()
            print(f'epoch {epoch}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {val_auc}')

# Final evaluation after the last epoch; the model is left in eval mode.
model.eval()
eval_loss = evaluate_model(model, dev_dataloader, verbose=True)
val_auc, val_accuracy = eval_loss
print(f'epoch {epoch + 1}, step {i}: train loss: {ewm_loss:4.4f}  val auc: {val_auc}')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  pfrs = pfrs.append(pfrs.mean(), ignore_index=True)


Unnamed: 0,p,r,f,s,a
0,0.175355,0.993289,0.298087,149.0,0.423025
1,0.333333,0.002404,0.004773,416.0,0.451895
2,0.5,0.003521,0.006993,284.0,0.381896
3,0.33623,0.333071,0.103284,283.0,0.418939


aucs: [0.4230249280920421, 0.4518953188843489, 0.38189579957621833] 0.41893868218420316
epoch 0, step 0: train loss: 1.1238  val auc: 0.41893868218420316


  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  pfrs = pfrs.append(pfrs.mean(), ignore_index=True)


Unnamed: 0,p,r,f,s,a
0,0.0,0.0,0.0,149.0,0.711031
1,0.489965,0.997596,0.657165,416.0,0.564654
2,0.5,0.003521,0.006993,284.0,0.672551
3,0.329988,0.333706,0.221386,283.0,0.649412


aucs: [0.7110306807286674, 0.564654023805294, 0.6725507914745108] 0.649411832002824
epoch 1, step 0: train loss: 0.9384  val auc: 0.649411832002824


  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  pfrs = pfrs.append(pfrs.mean(), ignore_index=True)


Unnamed: 0,p,r,f,s,a
0,0.0,0.0,0.0,149.0,0.774329
1,0.498615,0.865385,0.632689,416.0,0.616323
2,0.535433,0.239437,0.3309,284.0,0.73504
3,0.344683,0.368274,0.321196,283.0,0.708564


aucs: [0.7743288590604027, 0.6163228370936223, 0.7350398853296771] 0.7085638604945674
epoch 2, step 0: train loss: 1.0005  val auc: 0.7085638604945674


  0%|          | 0/14 [00:00<?, ?it/s]

  pfrs = pfrs.append(pfrs.mean(), ignore_index=True)


Unnamed: 0,p,r,f,s,a
0,0.666667,0.120805,0.204545,149.0,0.813878
1,0.56391,0.721154,0.632911,416.0,0.639501
2,0.589655,0.602113,0.595819,284.0,0.767085
3,0.606744,0.481357,0.477759,283.0,0.740155


aucs: [0.8138782358581017, 0.6395007994315154, 0.7670852548921849] 0.7401547633939339
epoch 3, step 102: train loss: 0.8831  val auc: 0.7401547633939339


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the save cells below write under this path.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Save the fine-tuned model to the mounted Google Drive.
model.save_pretrained('/content/gdrive/MyDrive/dl-model')

In [None]:
# Save the tokenizer files into the same directory as the model.
tokenizer.save_pretrained('/content/gdrive/MyDrive/dl-model')

('/content/gdrive/MyDrive/dl-model/tokenizer_config.json',
 '/content/gdrive/MyDrive/dl-model/special_tokens_map.json',
 '/content/gdrive/MyDrive/dl-model/vocab.txt',
 '/content/gdrive/MyDrive/dl-model/added_tokens.json',
 '/content/gdrive/MyDrive/dl-model/tokenizer.json')