In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [2]:
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from torch.utils.data import DataLoader
from datasets import Dataset
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

In [3]:
# Read the in-domain training CSV into a DataFrame.
# NOTE(review): the `/content/` prefix looks like a Colab session path - adjust when running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/in_domain_train.csv',
)

In [4]:
# Quick sanity check: overall size plus a few example rows.
# `random_state` pins the sample so a fresh re-run displays the same rows.
print(df.shape)
df.sample(3, random_state=1)

(7869, 5)


Unnamed: 0,id,sentence,acceptable,error_type,detailed_source
1364,1364,Он выбирал сотрудницу по анкетным данным.,1,0,Paducheva2004
1833,1833,На складе имеется мука и сахар.,1,0,Paducheva2004
2090,2090,Мне этого заранее не сказали.,1,0,Paducheva2013


In [5]:
# Build a dataset from the sentence / acceptability columns and hold out 20%
# of the rows as a test split (fixed seed so the split is repeatable).
data = (
    Dataset
    .from_dict({'text': df.sentence, 'label': df.acceptable})
    .train_test_split(test_size=0.2, seed=1)
)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6295
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1574
    })
})

In [6]:
# Name of the pretrained checkpoint; reused below when the model itself is loaded.
base_model = 'ai-forever/ruBert-base'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [7]:
def _tokenize_batch(batch):
    """Tokenize a batch of raw sentences, truncating each to at most 512 tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Tokenize both splits batch-wise; the raw 'text' column is dropped afterwards.
data_tokenized = data.map(
    _tokenize_batch,
    batched=True,
    remove_columns=['text'],
)

Map:   0%|          | 0/6295 [00:00<?, ? examples/s]

Map:   0%|          | 0/1574 [00:00<?, ? examples/s]

In [8]:
# Display the tokenized splits to check their columns and row counts.
data_tokenized

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6295
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1574
    })
})

In [9]:
# The collator pads every batch using the tokenizer's padding settings.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Settings shared by the training and validation loaders.
loader_kwargs = dict(batch_size=4, collate_fn=collator)

# Only the training loader shuffles; the validation order stays fixed.
train_dataloader = DataLoader(data_tokenized['train'], shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(data_tokenized['test'], shuffle=False, **loader_kwargs)

In [10]:
from torch.optim import Adam

# Seed torch's global RNG before building the model. The classification head is
# not part of the checkpoint and gets freshly initialized (see the warning
# printed below), so without a seed every run starts from different weights.
# The value matches the seed used for the train/test split.
torch.manual_seed(1)

# Pretrained encoder plus a new 2-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Move the model to the GPU when one is present; otherwise it stays where it is.
if torch.cuda.is_available():
    model.to('cuda')

# Adam over all model parameters with a learning rate of 1e-6.
optimizer = Adam(params=model.parameters(), lr=1e-6)

In [12]:
import gc
# Run a garbage-collection pass first, then ask torch to empty its CUDA cache,
# so memory left over from earlier cells is released before training starts.
gc.collect()
torch.cuda.empty_cache()

In [13]:
# Fine-tune the classifier and evaluate on the held-out split after every epoch.

# Number of passes over the training data.
N_EPOCHS = 10
# Number of mini-batches whose gradients are accumulated before one optimizer
# step. 1 means "step on every batch", which is what the previous
# `i % 1 == 0` check (always true) effectively did.
GRAD_ACCUM_STEPS = 1

# NOTE(review): in the run logged below, eval loss bottoms out around epoch 4
# (~0.50) and climbs to ~0.67 by epoch 10 while accuracy stays near 0.79, so
# the later epochs mostly overfit. Consider fewer epochs or keeping the best
# checkpoint.

# Per-batch training losses across all epochs (used for the running average).
losses = []
for epoch in trange(N_EPOCHS):
    # ---- training ----
    pbar = tqdm(train_dataloader)
    n_batches = len(train_dataloader)
    model.train()
    for i, batch in enumerate(pbar):
        out = model(**batch.to(model.device))
        # Scale the loss so accumulated gradients are an average over the group.
        (out.loss / GRAD_ACCUM_STEPS).backward()
        # Step once a full group has been accumulated; also step on the last
        # batch so a trailing partial group is not carried into the next epoch.
        if (i + 1) % GRAD_ACCUM_STEPS == 0 or (i + 1) == n_batches:
            optimizer.step()
            optimizer.zero_grad()
        losses.append(out.loss.item())
        # Show the mean loss of the most recent 100 batches on the progress bar.
        pbar.set_description(f'loss: {np.mean(losses[-100:]):2.2f}')

    # ---- evaluation ----
    model.eval()
    eval_losses = []
    eval_preds = []
    eval_targets = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            out = model(**batch.to(model.device))
            eval_losses.append(out.loss.item())
            # Predicted class = index of the largest logit.
            eval_preds.extend(out.logits.argmax(1).tolist())
            eval_targets.extend(batch['labels'].tolist())
    accuracy = np.mean(np.array(eval_targets) == np.array(eval_preds))
    print('recent train loss', np.mean(losses[-100:]), 'eval loss', np.mean(eval_losses), 'accuracy', accuracy)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.5761562572419643 eval loss 0.5408334760799021 accuracy 0.7534942820838628


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.46983807027339936 eval loss 0.5282686014238953 accuracy 0.7617534942820838


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.4873246495425701 eval loss 0.5053001086837446 accuracy 0.7738246505717916


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.46821958586573603 eval loss 0.5037883398912583 accuracy 0.7852604828462516


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.3885927601903677 eval loss 0.5161961926440448 accuracy 0.7935196950444727


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.3343437453359365 eval loss 0.5293902217831254 accuracy 0.7922490470139771


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.28254850253462793 eval loss 0.5745340604618722 accuracy 0.7947903430749682


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.25767605621367695 eval loss 0.5873538399762875 accuracy 0.7928843710292249


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.18698279768228532 eval loss 0.6283088214222701 accuracy 0.7935196950444727


  0%|          | 0/1574 [00:00<?, ?it/s]

  0%|          | 0/394 [00:00<?, ?it/s]

recent train loss 0.2316313663031906 eval loss 0.6707306859239482 accuracy 0.7922490470139771
