## Загрузка данных

In [None]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 22.1 MB/s eta 0:00:01[K     |██                              | 20 kB 25.9 MB/s eta 0:00:01[K     |███                             | 30 kB 18.7 MB/s eta 0:00:01[K     |████                            | 40 kB 16.6 MB/s eta 0:00:01[K     |█████                           | 51 kB 11.3 MB/s eta 0:00:01[K     |██████                          | 61 kB 13.0 MB/s eta 0:00:01[K     |███████                         | 71 kB 10.4 MB/s eta 0:00:01[K     |████████                        | 81 kB 11.2 MB/s eta 0:00:01[K     |█████████                       | 92 kB 12.3 MB/s eta 0:00:01[K     |██████████                      | 102 kB 11.8 MB/s eta 0:00:01[K     |███████████                     | 112 kB 11.8 MB/s eta 0:00:01[K     |████████████                    | 122 kB 11.8 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 11.8 MB/s eta

In [None]:
# Download the input CSV files from Google Drive by file id.
# According to the recorded cell output, this id is saved as apps.csv.
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
# According to the recorded cell output, this id is saved as reviews.csv
# (the file that is read with pandas further down).
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

Downloading...
From: https://drive.google.com/uc?id=1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
To: /content/apps.csv
100% 134k/134k [00:00<00:00, 52.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
To: /content/reviews.csv
100% 7.17M/7.17M [00:00<00:00, 94.5MB/s]


In [None]:
import matplotlib.pyplot as plt

import transformers
from transformers import BertModel, AutoTokenizer, BertTokenizer, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch.nn.functional as F

import torch
import numpy as np
import pandas as pd
import seaborn as sns

from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader


In [None]:
class GPReviewDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs tokenizer encodings with labels.

  `encodings` is a mapping from feature name to a per-sample sequence and
  `labels` is the per-sample label sequence; both are stored as given.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The dataset size is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    # Assemble the per-sample dict that is later passed on to the trainer:
    # one tensor per encoding field plus the label under the 'labels' key.
    sample = {}
    for name, values in self.encodings.items():
      sample[name] = torch.tensor(values[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [None]:
# Keep only the first 3500 rows of the reviews file
df = pd.read_csv("reviews.csv").head(3500)

In [None]:
# Re-encode the review score as a coarser sentiment class:
# scores 1-2 are bad reviews (0), 3 is neutral (1), 4-5 are good reviews (2)
def to_sentiment(rating):
  """Convert a review score into a sentiment class index (0, 1 or 2)."""
  score = int(rating)
  if score == 3:
    return 1
  return 0 if score <= 2 else 2

# Derive the sentiment class column directly from the raw score column
df['sentiment'] = df['score'].apply(to_sentiment)

In [None]:
# Check the replacement: show the score next to the derived sentiment class
df.loc[:, ['score', 'sentiment']]

Unnamed: 0,score,sentiment
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
3495,5,2
3496,5,2
3497,5,2
3498,5,2


In [None]:
# Display names of the sentiment classes, in the order of the class ids 0, 1, 2
class_names = ["negative", "neutral", "positive"]

In [None]:
# AutoTokenizer picks the tokenizer class that belongs to the checkpoint.
# Calling BertTokenizer.from_pretrained on this DistilBERT checkpoint emits the
# "tokenizer class ... is not the same type" warning and may tokenize unexpectedly.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
# Fix the NumPy and PyTorch random seeds for repeatable runs
np.random.seed(1)
torch.manual_seed(1)

# Use the first CUDA device when one is available, otherwise stay on the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# First split: 70% train / 30% hold-out, stratified by sentiment class.
df_train, df_holdout = train_test_split(
    df, test_size=0.3, random_state=3, stratify=df['sentiment'])
# Second split of the hold-out part: 85% of it becomes the validation set and
# 15% of it becomes the test set, again stratified by sentiment class.
# NOTE(review): this leaves a very small test set (158 rows in the recorded
# evaluation output) - confirm that 0.15 is the intended proportion.
df_val, df_test = train_test_split(
    df_holdout, test_size=0.15, random_state=3, stratify=df_holdout['sentiment'])

In [None]:
# Upper bound passed to the tokenizer as max_length;
# a review is usually not longer than this number
MAX_LENGTH = 75

In [None]:
# The three splits are tokenized with identical settings: padding enabled,
# truncation to at most MAX_LENGTH tokens, and no token_type_ids in the result.
tokenize_kwargs = dict(
    padding=True,
    truncation=True,
    return_token_type_ids=False,
    max_length=MAX_LENGTH,
)
train_encodings = tokenizer(list(df_train.content), **tokenize_kwargs)
val_encodings = tokenizer(list(df_val.content), **tokenize_kwargs)
test_encodings = tokenizer(list(df_test.content), **tokenize_kwargs)

In [None]:
# Wrap each split's encodings together with its sentiment labels
train_dataset, val_dataset, test_dataset = (
    GPReviewDataset(split_encodings, list(split_frame.sentiment))
    for split_encodings, split_frame in (
        (train_encodings, df_train),
        (val_encodings, df_val),
        (test_encodings, df_test),
    )
)

## Подключение Trainer

In [None]:
from datasets import load_metric
# F1 metric object; the compute_metrics callback defined later in this notebook
# calls d.compute(...) on it.
# NOTE(review): load_metric is believed to be deprecated in later datasets
# releases in favour of the separate `evaluate` package - confirm against the
# installed datasets version.
d = load_metric("f1")

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [None]:
# Metric callback for the Trainer: micro-averaged F1 over predicted classes
from sklearn.metrics import f1_score 
def compute_metrics(pred):
    """Compute the micro-averaged F1 score for a batch of model predictions.

    Args:
        pred: a pair ``(predictions, labels)``. ``predictions`` is either the
            logits array with one row per sample, or a tuple whose first
            element is that array. The tuple form occurs when the model also
            returns hidden states / attentions and they were not removed
            through ``ignore_keys``.

    Returns:
        The result of ``d.compute`` for the F1 metric with ``average='micro'``.
    """
    predictions, labels = pred
    # Extra model outputs travel alongside the logits; only the logits
    # (first element) are relevant for the class decision.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)
    return d.compute(predictions=predictions, references=labels, average='micro')

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Settings of the training run, collected in one mapping
training_config = {
    'output_dir': './results',           # checkpoints are written below this directory
    'num_train_epochs': 8,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 16,
    'warmup_steps': 0,
    'weight_decay': 0.01,
    'logging_dir': './logging',
    'logging_steps': 30,                 # the training loss is reported every 30 steps
}
training_args = TrainingArguments(**training_config)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Задание №1

In [None]:
# Train the SentimentClassifier model from the course notebook
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes of the linear head.
    pretrained_name: name of the checkpoint loaded into ``BertModel``.
      It has to be a BERT checkpoint. Loading a DistilBERT checkpoint such as
      'distilbert-base-uncased' into ``BertModel`` triggers the transformers
      warning "You are using a model of type distilbert to instantiate a
      model of type bert", and the checkpoint weights do not map onto the
      BERT parameter names.
      NOTE(review): 'bert-base-uncased' is expected to share its vocabulary
      with the 'distilbert-base-uncased' tokenizer - confirm.
  """

  def __init__(self, n_classes, pretrained_name='bert-base-uncased'):
    super().__init__()
    # Hidden states and attentions are requested so that forward() can
    # return them together with the logits.
    self.bert = BertModel.from_pretrained(
      pretrained_name,
      output_attentions=True,
      output_hidden_states=True)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the encoder and the classification head.

    Args:
      input_ids: token ids passed to the encoder.
      attention_mask: attention mask passed to the encoder.
      labels: optional target class ids; when given, the cross-entropy loss
        is computed and returned as well.

    Returns:
      ``SequenceClassifierOutput`` with ``logits``, ``loss`` (``None`` when no
      labels were given), ``hidden_states`` and ``attentions``.
    """
    # With return_dict=False the encoder returns a tuple; its first element
    # is not used here, the second is the pooled output fed to the head.
    _, pooled_output, hidden_states, attentions = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False)
    logits = self.out(self.drop(pooled_output))

    loss = None
    if labels is not None:
      # The number of classes is taken from the head instead of a literal 3.
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.out.out_features), labels.view(-1))

    # Returning a SequenceClassifierOutput (including the hidden states) lets
    # this module be used with Trainer, for training as well as evaluation.
    return transformers.modeling_outputs.SequenceClassifierOutput(
        logits=logits,
        loss=loss,
        hidden_states=hidden_states,
        attentions=attentions
    )

In [None]:
# Build the classifier with one output per class name and move it to the selected device
model = SentimentClassifier(len(class_names)).to(device)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Model config BertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n_heads": 12,
  "n_layers": 6,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,


In [None]:
# Freeze the first 10 encoder layers to speed up training
for layer_id in range(10):
  model.bert.encoder.layer[layer_id].requires_grad_(False)

### Обучение

In [None]:
# Bring the model, the training arguments, the train/validation datasets and
# the metric callback together in a Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start training; as the last expression of the cell its return value is displayed
trainer.train()

***** Running training *****
  Num examples = 2450
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2456


Step,Training Loss
30,1.1749
60,1.0994
90,1.1577
120,1.1376
150,1.1285
180,1.1431
210,1.1177
240,1.1347
270,1.1193
300,1.0958


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2456, training_loss=0.5977438492184741, metrics={'train_runtime': 480.6937, 'train_samples_per_second': 40.774, 'train_steps_per_second': 5.109, 'total_flos': 0.0, 'train_loss': 0.5977438492184741, 'epoch': 8.0})

**Приведение результатов на тестовой выборке**

In [None]:
# These two model outputs have to be listed in ignore_keys so that they do not
# take part in the compute_metrics function
extra_output_keys = ["hidden_states", "attentions"]
trainer.evaluate(eval_dataset=test_dataset, ignore_keys=extra_output_keys)

***** Running Evaluation *****
  Num examples = 158
  Batch size = 16


{'epoch': 8.0,
 'eval_f1': 0.7215189873417721,
 'eval_loss': 1.2078289985656738,
 'eval_runtime': 1.6103,
 'eval_samples_per_second': 98.121,
 'eval_steps_per_second': 6.21}