In [1]:
from IPython.display import clear_output

In [2]:
! pip install git+https://github.com/huggingface/transformers.git
clear_output(wait=False)

In [3]:
# стандартные библиотеки
import os, re
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import Counter
from string import punctuation
import matplotlib.pyplot as plt
%matplotlib inline


# pytortch и huggingface 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from transformers import AutoModel
from transformers import AutoTokenizer

In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [5]:
from google.colab import drive
drive.mount('/content/gdrive/')
!ls /content/gdrive/

Mounted at /content/gdrive/
MyDrive


In [6]:
data = pd.read_csv('/content/gdrive/MyDrive/lenta_sample.csv')
data.dropna(subset=['topic', 'text'], inplace=True)
data.topic.value_counts()

Из жизни             55
Наука и техника      54
Бывший СССР          54
Культура             53
Ценности             45
Дом                  45
Бизнес               44
Интернет и СМИ       44
Силовые структуры    40
Спорт                39
Россия               32
Экономика            32
Мир                  27
69-я параллель       13
Легпром              13
Библиотека           10
Крым                  7
Name: topic, dtype: int64

In [7]:
data.head(2)

Unnamed: 0,url,title,text,topic,tags,date
0,https://lenta.ru/news/2010/12/16/passports/,Московская милиция ужесточила паспортный режим,В Москве после серии массовых беспорядков на н...,Россия,Все,2010/12/16
1,https://lenta.ru/news/2014/05/19/student/,Московского студента ограбили на 6 миллионов р...,Неизвестные вынесли из квартиры московского ст...,Россия,,2014/05/19


In [8]:
texts = data.title.values
id2label = {i:l for i,l in enumerate(set(data.topic))}
label2id = {l:i for i,l in id2label.items()}
targets = [label2id[l] for l in data.topic]

In [9]:
train_texts, valid_texts, train_targets, valid_targets = train_test_split(texts, targets, test_size=0.05)

### rubert-tiny2 из huggingface transformers (обучение через Trainer)

In [10]:
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

In [11]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
model = AutoModelForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=len(label2id)).to(DEVICE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [13]:
MAX_LEN = 2048

In [14]:
train_texts = train_texts.tolist()
valid_texts = valid_texts.tolist()

In [15]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [16]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_dataset = Dataset(train_encodings, train_targets)
valid_dataset = Dataset(valid_encodings, valid_targets)

In [18]:
from sklearn.metrics import f1_score

In [19]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = f1_score(labels, preds, average="micro")
    return {
        'f1_score': acc,
    }

In [20]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=20,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',                
    logging_steps=400,               
    evaluation_strategy="steps",   
)

In [21]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [22]:
trainer.train()



Step,Training Loss,Validation Loss,F1 Score
400,2.6748,2.280325,0.516129
800,1.4773,1.480387,0.516129
1200,0.534,1.424547,0.580645


TrainOutput(global_step=1440, training_loss=1.3538644896613228, metrics={'train_runtime': 42.694, 'train_samples_per_second': 269.827, 'train_steps_per_second': 33.728, 'total_flos': 4156104384000.0, 'train_loss': 1.3538644896613228, 'epoch': 20.0})

In [23]:
from sklearn.metrics import classification_report

In [24]:
# Predicting with model
predictions = trainer.predict(valid_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))

print(classification_report(valid_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

Силовые структуры       0.00      0.00      0.00         3
         Ценности       0.67      1.00      0.80         2
   Интернет и СМИ       0.50      0.50      0.50         2
             Крым       0.00      0.00      0.00         0
         Культура       0.67      0.50      0.57         4
   69-я параллель       0.00      0.00      0.00         0
       Библиотека       1.00      1.00      1.00         1
        Экономика       0.00      0.00      0.00         1
  Наука и техника       0.50      1.00      0.67         4
              Мир       0.00      0.00      0.00         0
            Спорт       1.00      1.00      1.00         1
      Бывший СССР       0.40      0.67      0.50         3
          Легпром       0.00      0.00      0.00         1
           Россия       0.00      0.00      0.00         2
         Из жизни       0.75      0.75      0.75         4
           Бизнес       0.00      0.00      0.00       

Результат хороший. Rubert-tiny2 молодец. И он маленький и быстро учится.

## xlm-roberta-base

In [25]:
tokenizer_xlm = AutoTokenizer.from_pretrained('xlm-roberta-base')
model_xlm = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label2id)).to(DEVICE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [26]:
MAX_LEN = 514

In [27]:
train_encodings = tokenizer_xlm(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
valid_encodings = tokenizer_xlm(valid_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [28]:
train_dataset = Dataset(train_encodings, train_targets)
valid_dataset = Dataset(valid_encodings, valid_targets)

In [29]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=25,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',                
    logging_steps=300,               
    evaluation_strategy="steps",   
)

In [30]:
trainer = Trainer(
    model=model_xlm,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [31]:
trainer.train()



Step,Training Loss,Validation Loss,F1 Score
300,2.6116,1.774326,0.612903
600,1.143,1.660131,0.580645
900,0.3388,2.008941,0.612903
1200,0.048,3.022015,0.516129
1500,0.0074,3.221491,0.580645
1800,0.0101,3.343496,0.516129


TrainOutput(global_step=1800, training_loss=0.6931479565302531, metrics={'train_runtime': 369.8432, 'train_samples_per_second': 38.935, 'train_steps_per_second': 4.867, 'total_flos': 229430846736000.0, 'train_loss': 0.6931479565302531, 'epoch': 25.0})

In [32]:
# Predicting with model
predictions = trainer.predict(valid_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))

print(classification_report(valid_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

Силовые структуры       0.00      0.00      0.00         3
         Ценности       0.50      1.00      0.67         2
   Интернет и СМИ       0.00      0.00      0.00         2
             Крым       0.00      0.00      0.00         0
         Культура       1.00      0.50      0.67         4
   69-я параллель       0.00      0.00      0.00         0
       Библиотека       1.00      1.00      1.00         1
        Экономика       0.00      0.00      0.00         1
  Наука и техника       0.50      0.75      0.60         4
              Мир       0.00      0.00      0.00         0
            Спорт       1.00      1.00      1.00         1
      Бывший СССР       0.40      0.67      0.50         3
          Легпром       0.00      0.00      0.00         1
           Россия       0.00      0.00      0.00         2
         Из жизни       1.00      0.75      0.86         4
           Бизнес       0.00      0.00      0.00       

Странно выучилось -- переобучилось, видно по лоссу на валидации. Качество хорошее, но можно ставить сильно меньше эпох для обучения.

Попробую заморозить все слои кроме classifier.

In [33]:
model_xlm2 = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label2id)).to(DEVICE)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [34]:
for name, param in model_xlm2.named_parameters():
	if 'classifier' not in name: # classifier layer
		param.requires_grad = False

In [35]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=35,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,   
    warmup_steps=500,                
    weight_decay=0.001,               
    logging_dir='./logs',                
    logging_steps=300,               
    evaluation_strategy="steps",   
)

In [36]:
trainer = Trainer(
    model=model_xlm2,                    
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=valid_dataset,          
    compute_metrics=compute_metrics,     
)

In [37]:
trainer.train()



Step,Training Loss,Validation Loss,F1 Score
300,2.8049,2.705001,0.129032
600,2.7345,2.675949,0.193548
900,2.7169,2.648951,0.129032
1200,2.7009,2.647827,0.193548
1500,2.6945,2.638897,0.258065
1800,2.6795,2.629997,0.354839
2100,2.6804,2.633264,0.387097
2400,2.6673,2.628596,0.387097


TrainOutput(global_step=2520, training_loss=2.7075127495659723, metrics={'train_runtime': 78.7725, 'train_samples_per_second': 255.927, 'train_steps_per_second': 31.991, 'total_flos': 321203185430400.0, 'train_loss': 2.7075127495659723, 'epoch': 35.0})

In [38]:
# Predicting with model
predictions = trainer.predict(valid_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))

print(classification_report(valid_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

Силовые структуры       0.00      0.00      0.00         3
         Ценности       0.00      0.00      0.00         2
   Интернет и СМИ       0.00      0.00      0.00         2
             Крым       0.00      0.00      0.00         0
         Культура       0.00      0.00      0.00         4
   69-я параллель       0.00      0.00      0.00         0
       Библиотека       0.00      0.00      0.00         1
        Экономика       0.00      0.00      0.00         1
  Наука и техника       0.57      1.00      0.73         4
              Мир       0.00      0.00      0.00         0
            Спорт       0.00      0.00      0.00         1
      Бывший СССР       0.43      1.00      0.60         3
          Легпром       0.00      0.00      0.00         1
           Россия       0.00      0.00      0.00         2
         Из жизни       0.27      1.00      0.42         4
           Бизнес       0.00      0.00      0.00       

Как-то выучивается, но, видимо, правда, так делать не стоит.

## DeepPavlov/rubert-base-cased

In [39]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=len(label2id)).to(DEVICE)

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [40]:
MAX_LEN = 512

In [41]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [42]:
train_dataset = Dataset(train_encodings, train_targets)
valid_dataset = Dataset(valid_encodings, valid_targets)

In [43]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=20,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,   
    warmup_steps=400,                
    weight_decay=0.001,               
    logging_dir='./logs',                
    logging_steps=400,               
    evaluation_strategy="steps",   
)

In [44]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [45]:
trainer.train()



Step,Training Loss,Validation Loss,F1 Score
400,1.6835,1.132154,0.709677
800,0.087,2.268553,0.548387
1200,0.0025,2.261455,0.645161


TrainOutput(global_step=1440, training_loss=0.492788824149304, metrics={'train_runtime': 195.0217, 'train_samples_per_second': 59.07, 'train_steps_per_second': 7.384, 'total_flos': 130257512985600.0, 'train_loss': 0.492788824149304, 'epoch': 20.0})

In [46]:
# Predicting with model
predictions = trainer.predict(valid_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))

print(classification_report(valid_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

Силовые структуры       0.00      0.00      0.00         3
         Ценности       1.00      1.00      1.00         2
   Интернет и СМИ       1.00      0.50      0.67         2
             Крым       0.00      0.00      0.00         0
         Культура       0.75      0.75      0.75         4
   69-я параллель       0.00      0.00      0.00         0
       Библиотека       1.00      1.00      1.00         1
        Экономика       0.00      0.00      0.00         1
  Наука и техника       0.50      0.50      0.50         4
              Мир       0.00      0.00      0.00         0
            Спорт       1.00      1.00      1.00         1
      Бывший СССР       1.00      0.67      0.80         3
          Легпром       0.00      0.00      0.00         1
           Россия       0.67      1.00      0.80         2
         Из жизни       0.67      1.00      0.80         4
           Бизнес       0.00      0.00      0.00       

DeepPavlov/rubert-base-cased лучшая модель.

## Вопросы

Чем ROBERTA отличается от BERT? В какой статье описана ROBERTA?

Статья про роберту:

RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.


В чем разница берта и роберты:

Роберта была обучена на большем датасете (x10 больше данных, чем берт).
У роберты больше словарь (~50к токенов роберты vs ~30к токенов берта).
Во время обучения у роберты был больший размер батча. 

Стратегия маскирования: у берта статичная, у роберты динамическая (маска меняется при каждой подаче последовательности модели).

Чем модель T5 отличается от ROBERTA/BERT?

У берта и роберты есть только энкодер и они обучены на задачу masked language modeling. 
У T5 есть энкодер и декодер. Это text-to-text модель. Т5 учится предсказывать маскированные токены, как и BERT, но на токен [MASK] заменяются несколько последовательных токенов.
Модель обладает преимуществами и берт-подобных моделей, и гпт.