# Домашнее задание #4



## Загрузка и обработка данных

In [None]:
!pip install -q -U watermark

In [None]:
!pip install transformers



In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.13
IPython version      : 5.5.0

numpy       : 1.21.5
pandas      : 1.3.5
torch       : 1.10.0+cu111
transformers: 4.17.0



In [None]:
pip install datasets



In [None]:
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

Downloading...
From: https://drive.google.com/uc?id=1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
To: /content/apps.csv
100% 134k/134k [00:00<00:00, 69.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
To: /content/reviews.csv
100% 7.17M/7.17M [00:00<00:00, 193MB/s]


In [None]:
import transformers
#задаём подгрузку Trainer
from transformers import BertModel, AutoTokenizer, BertTokenizer, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch.nn.functional as F

import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader


In [None]:
#урезаем выборку для более быстрого обучения
df = pd.read_csv("reviews.csv")[:6000]

In [None]:
def to_sentiment(rating):
  rating = int(rating)
  if rating <= 2:
    return 0
  elif rating == 3:
    return 1
  else: 
    return 2

df['sentiment'] = df.score.apply(to_sentiment)

In [None]:
class_names = ['negative', 'neutral', 'positive']

In [None]:
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
#уменьшим длину,чтобы повысить скорость обучения
MAX_LENGTH=100

In [None]:
train_encodings = tokenizer(list(df_train.content),padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)
val_encodings = tokenizer(list(df_val.content),padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)

In [None]:
test_encodings = tokenizer(list(df_test.content),padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)

In [None]:
class GPReviewDataset(torch.utils.data.Dataset):

  def __init__(self, encodings, labels):
      self.encodings = encodings
      self.labels = labels

  def __len__(self):
      return len(self.labels)
  
  def __getitem__(self, idx):
      #Тут подготавливаются данные для дальнейшей передачи в trainer
      item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

In [None]:
train_dataset = GPReviewDataset(train_encodings, list(df_train.sentiment))
val_dataset = GPReviewDataset(val_encodings, list(df_val.sentiment))

In [None]:
test_dataset = GPReviewDataset(test_encodings, list(df_test.sentiment))

In [None]:
#удаляем датасет в свяи с отсутсвием необходимости его использования
del df_test

## Подключение Trainer от Hugging Face и задание гиперпараметров(общее требование)

In [None]:
from datasets import load_metric

metric = load_metric("accuracy")

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

## Задание №1

In [None]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,output_attentions = True,output_hidden_states = True)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask,labels):
    _, pooled_output,hs,oa = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False)
    output = self.drop(pooled_output)
    logits = self.out(output)
    loss = None

    if labels is not None:
        loss_fct = nn.CrossEntropyLoss().to(device)
        loss = loss_fct(logits.view(-1, 3), labels.view(-1))

    #transformers.modeling_outputs.SequenceClassifierOutput используется как для обучения так
    #и для функции compute_metrics
    return transformers.modeling_outputs.SequenceClassifierOutput(
        logits=logits,
        loss=loss,
        hidden_states=hs,
        attentions=oa
    )

In [None]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.1.ffn.lin1.weight', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.v_lin.weight', 'distilbert.transformer.layer.2.ffn.lin1.bias', 'distilbert.transformer.layer.2.attention.k_lin.bias', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.5.ffn.lin2.bias', 'voc

In [None]:
#Замораживаем первые 10 слоёв,чтобы повысить скорость обучения
freeze_layers = list(range(10))
for layer_id in freeze_layers:
  for param in list(model.bert.encoder.layer[layer_id].parameters()):
    param.requires_grad = False

**Обучение**

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5400
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1350


Step,Training Loss
10,1.1908
20,1.1195
30,1.0896
40,1.1366
50,1.1118
60,1.1273
70,1.0935
80,1.1275
90,1.1017
100,1.1579


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1350, training_loss=0.93012563140304, metrics={'train_runtime': 337.6381, 'train_samples_per_second': 31.987, 'train_steps_per_second': 3.998, 'total_flos': 0.0, 'train_loss': 0.93012563140304, 'epoch': 2.0})

**Приведение результатов на тестовой выборке**

In [None]:
#необходимо добавить в ignore_keys два приведённых пункта,чтобы они не учавствовали в функции compute_metrics
trainer.evaluate(eval_dataset=test_dataset, ignore_keys =["hidden_states","attentions"], metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 300
  Batch size = 16


{'epoch': 2.0,
 'test_accuracy': 0.7,
 'test_loss': 0.7743759751319885,
 'test_runtime': 3.8306,
 'test_samples_per_second': 78.317,
 'test_steps_per_second': 4.96}

## Задание №2

In [None]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,output_attentions = True,output_hidden_states = True)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask,labels):
    #sequence_output ставим для получения эмбедингов всех токенов
    #в том числе cls
    sequence_output, pooled_output,hs,oa = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False)
    
    #Алгоритм следуюзи 7 строк следуюзий: берётся CLS из pooled_output(а точне CLS пройденый через линейный слой и tanh()) и эмбеддинг CLS из sequence_output
    #Затем вычисляется их среднее через torch.mean() таким образом получаем усреднённый вектор размерности (1,768)
    #Далее такие вектора складываем в общий массив final_means
    #Таким образом удовлетворяем требованию заданию,что помимо CLS из pooled_output,используется ещё CLS из sequence_output
    final_means = torch.zeros(1, sequence_output.shape[2]).to(device)
    for i in range(sequence_output.shape[0]):
        temp_tensor1 = sequence_output.clone().detach()[i][0].unsqueeze(0)
        temp_tensor2 = torch.cat((temp_tensor1,pooled_output.clone().detach()[i].unsqueeze(0)),0)
        mean=torch.mean(temp_tensor2, 0).unsqueeze(0)
        final_means = torch.cat((mean,final_means),0)
    final_means = final_means[:-1]

    output = self.drop(final_means)
    logits = self.out(output)
    loss = None

    if labels is not None:
        loss_fct = nn.CrossEntropyLoss().to(device)
        loss = loss_fct(logits.view(-1, 3), labels.view(-1))

    return transformers.modeling_outputs.SequenceClassifierOutput(
        logits=logits,
        loss=loss,
        hidden_states=hs,
        attentions=oa
    )

In [None]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Model config BertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n_heads": 12,
  "n_layers": 6,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,


In [None]:
#Замораживаем первые 10 слоёв,чтобы повысить скорость обучения
freeze_layers = list(range(10))
for layer_id in freeze_layers:
  for param in list(model.bert.encoder.layer[layer_id].parameters()):
    param.requires_grad = False

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5400
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1350


Step,Training Loss
10,1.1139
20,1.1327
30,1.1108
40,1.1641
50,1.1198
60,1.1472
70,1.0967
80,1.0931
90,1.1773
100,1.171


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1350, training_loss=1.1217960294087728, metrics={'train_runtime': 168.0832, 'train_samples_per_second': 64.254, 'train_steps_per_second': 8.032, 'total_flos': 0.0, 'train_loss': 1.1217960294087728, 'epoch': 2.0})

**Приведение результатов на тестовой выборке**

In [None]:
#необходимо добавить в ignore_keys два приведённых пункта,чтобы они не учавствовали в функции compute_metrics
trainer.evaluate(eval_dataset=test_dataset, ignore_keys =["hidden_states","attentions"], metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 300
  Batch size = 16


{'epoch': 2.0,
 'test_accuracy': 0.3233333333333333,
 'test_loss': 1.1012868881225586,
 'test_runtime': 3.8679,
 'test_samples_per_second': 77.561,
 'test_steps_per_second': 4.912}

##Задание №3 и №5

**Задание №3: Применяем к данным подготовленным выше модель DistilBertForSequenceClassification**

In [None]:
from transformers import DistilBertTokenizerFast

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accef

In [None]:
from transformers import DistilBertForSequenceClassification

In [None]:
#Внутри модели важно задать num_labels=3,поскольку у нас 3 класса и модель не знает об этом
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 3)
model = model.to(device)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-b

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5400
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1350


Step,Training Loss
10,1.1047
20,1.0884
30,1.0228
40,1.0478
50,0.9292
60,0.9038
70,0.7635
80,0.9499
90,0.8182
100,1.0593


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1350, training_loss=0.6321729435744109, metrics={'train_runtime': 260.0517, 'train_samples_per_second': 41.53, 'train_steps_per_second': 5.191, 'total_flos': 279428402160000.0, 'train_loss': 0.6321729435744109, 'epoch': 2.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 300
  Batch size = 16


{'epoch': 2.0,
 'test_accuracy': 0.8166666666666667,
 'test_loss': 0.5064780712127686,
 'test_runtime': 1.9527,
 'test_samples_per_second': 153.636,
 'test_steps_per_second': 9.73}

**Задание №5: Проводим предсказание на моделе DistilBertForSequenceClassification для 3 случайно найденных отзывов из GP**

In [None]:
negative = "Useless editor, a lot of advertising and zero functionality, I'm wasting my time"
neutral = "Good app bit i am facing a perineal problem. The option to select AM-PM is not working where the clock is shown. This needs to be fixed as every time i select the time is defaults to PM"
positive = "Developers, God bless you"

In [None]:
test_encodings = tokenizer([negative],padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)
test_dataset = GPReviewDataset(test_encodings, [0])

outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(0)

print(f'Review text: {negative}')
print(f'Sentiment  : {class_names[y_pred[0]]}')

***** Running Prediction *****
  Num examples = 1
  Batch size = 16


Review text: Useless editor, a lot of advertising and zero functionality, I'm wasting my time
Sentiment  : negative


In [None]:
test_encodings = tokenizer([neutral],padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)
test_dataset = GPReviewDataset(test_encodings, [1])

outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(1)

print(f'Review text: {neutral}')
print(f'Sentiment  : {class_names[y_pred[0]]}')

***** Running Prediction *****
  Num examples = 1
  Batch size = 16


Review text: Good app bit i am facing a perineal problem. The option to select AM-PM is not working where the clock is shown. This needs to be fixed as every time i select the time is defaults to PM
Sentiment  : neutral


In [None]:
test_encodings = tokenizer([positive],padding=True,truncation=True,return_token_type_ids=False,max_length=MAX_LENGTH)
test_dataset = GPReviewDataset(test_encodings, [2])

outputs = trainer.predict(test_dataset)
y_pred = outputs.predictions.argmax(1)

print(f'Review text: {positive}')
print(f'Sentiment  : {class_names[y_pred[0]]}')

***** Running Prediction *****
  Num examples = 1
  Batch size = 16


Review text: Developers, God bless you
Sentiment  : positive


## Задание №4

In [None]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME,output_attentions = True,output_hidden_states = True)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask,labels):
    #sequence_output ставим для получения эмбедингов всех токенов
    #в том числе cls
    sequence_output, pooled_output,hs,oa = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False)
    
    #Переведём hidden_states из [# layers, # batches, # tokens, # features] в 
    #[# batches, # layers, # tokens, # features] для удобства дальнейшего агрегироварния
    token_dim = torch.stack(hs, dim = 0)
    token_dim = token_dim.permute(1, 0, 2, 3)
    all_4l_aggrs = torch.zeros(1,768).to(device)
    for batch in token_dim:
        #Переведём hidden_states из [# layers, # tokens, # features] в 
        #[# tokens, # layers, # featuress] для удобства дальнейшего агрегироварния
        token_dim2 = batch.permute(1, 0, 2)
        #берём cls токен
        token = token_dim2[0]
        #создаём матрицу (4,768) состояющую из последних 4 слоёв для токена cls
        cat_vec = torch.cat((token[-1].unsqueeze(0), token[-2].unsqueeze(0), token[-3].unsqueeze(0), token[-4].unsqueeze(0)), dim = 0)
        #агрегируем
        aggr = torch.mean(cat_vec,0).unsqueeze(0)
        #отправляем в общий тензор
        all_4l_aggrs = torch.cat((all_4l_aggrs,aggr),0)
    all_4l_aggrs = all_4l_aggrs[1:]

    output = self.drop(all_4l_aggrs)
    logits = self.out(output)
    loss = None

    if labels is not None:
        loss_fct = nn.CrossEntropyLoss().to(device)
        loss = loss_fct(logits.view(-1, 3), labels.view(-1))

    return transformers.modeling_outputs.SequenceClassifierOutput(
        logits=logits,
        loss=loss,
        hidden_states=hs,
        attentions=oa
    )

In [None]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Model config BertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n_heads": 12,
  "n_layers": 6,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,


In [None]:
#Замораживаем первые 10 слоёв,чтобы повысить скорость обучения
freeze_layers = list(range(10))
for layer_id in freeze_layers:
  for param in list(model.bert.encoder.layer[layer_id].parameters()):
    param.requires_grad = False

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5400
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1350


Step,Training Loss
10,1.1833
20,1.1916
30,1.1373
40,1.1864
50,1.1717
60,1.0945
70,1.115
80,1.152
90,1.0984
100,1.1302


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1350, training_loss=0.9914702761614764, metrics={'train_runtime': 341.9123, 'train_samples_per_second': 31.587, 'train_steps_per_second': 3.948, 'total_flos': 0.0, 'train_loss': 0.9914702761614764, 'epoch': 2.0})

In [None]:
#необходимо добавить в ignore_keys два приведённых пункта,чтобы они не учавствовали в функции compute_metrics
trainer.evaluate(eval_dataset=test_dataset, ignore_keys =["hidden_states","attentions"], metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1
  Batch size = 16


{'epoch': 2.0,
 'test_accuracy': 1.0,
 'test_loss': 0.44419461488723755,
 'test_runtime': 0.0414,
 'test_samples_per_second': 24.181,
 'test_steps_per_second': 24.181}