<a href="https://colab.research.google.com/github/srRyzhov/Hackathon/blob/main/NLP_DeepPavlov7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [2]:
import pandas as pd
import numpy as np
import random
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import TrainingArguments
from transformers import Trainer

import transformers
import accelerate
transformers.__version__, accelerate.__version__


('4.33.1', '0.22.0')

Загрузить файл CRA_train_1200.xlsx !

In [3]:
train_df = pd.read_excel('/content/CRA_train_1200.xlsx', engine = 'openpyxl', index_col = 0)

In [None]:
# На 7
train_text, test_text, train_labels, test_labels = train_test_split(train_df['pr_txt'].astype('str'),train_df['Категория'].astype('str'), test_size=0.1, random_state=36)


Тексты для классификации - train_text

Метки классов, соответствующие текстам - train_labels

In [4]:
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
seed_all(42)

In [None]:
# Модель 'DeepPavlov/rubert-base-cased'  Внимание: 7 или 17 выбрать
model_name = 'DeepPavlov/rubert-base-cased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=7) # 7!!!!
tokenizer = BertTokenizer.from_pretrained(model_name)

In [6]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.values,
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

tokens_test = tokenizer.batch_encode_plus(
    test_text.values,
    max_length = 512,
    padding = 'max_length',
    truncation = True
)

In [7]:
# Создание и обучение кодировщика на тренировочных метках
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

# Преобразование тренировочных и тестовых меток в целочисленные значения
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Получение соответствия между исходными метками и их целочисленными значениями
label_mapping = {label: value for label, value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}


In [8]:
#оборачиваем токенизированные текстовые данные в torch Dataset:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item
    def __len__(self):
        return len(self.labels)

#train_dataset = Data(tokens_train, train_labels)
#test_dataset = Data(tokens_test, test_labels)

train_dataset = Data(tokens_train, train_labels_encoded)
test_dataset = Data(tokens_test, test_labels_encoded)

In [9]:
#расчет метрики - F1

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    # one of [None, 'micro', 'macro', 'weighted']
    return {'F1': f1}

In [19]:
#параметры для обучения:
training_args = TrainingArguments(
    output_dir = './results', #Выходной каталог
    num_train_epochs = 20, #Кол-во эпох для обучения
    per_device_train_batch_size = 12, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size = 12, #Размер пакета для каждого устройства во время валидации
    weight_decay =0.01, #Понижение весов
    logging_dir = './logs', #Каталог для хранения журналов
    load_best_model_at_end = True, #Загружать ли лучшую модель после обучения
    learning_rate = 1e-5, #Скорость обучения
    evaluation_strategy ='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy = 'epoch', #Логирование после каждой эпохи
    save_strategy = 'epoch', #Сохранение после каждой эпохи
    save_total_limit = 1,
    seed=42)

In [20]:
#Передача в trainer предообученной модели, tokenizer, данных для обучения, данных для валидации и способа расчета метрики
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics = compute_metrics)

In [21]:
#Запуск обучения модели
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3958,0.902231,0.707113
2,0.306,0.973443,0.719665
3,0.2179,1.004989,0.711297
4,0.1759,1.046953,0.728033
5,0.1549,1.108727,0.732218
6,0.1501,1.07633,0.782427
7,0.0801,1.262973,0.753138
8,0.0405,1.284056,0.761506
9,0.0289,1.40417,0.761506
10,0.023,1.36097,0.769874


TrainOutput(global_step=1620, training_loss=0.0806089817005911, metrics={'train_runtime': 2788.1364, 'train_samples_per_second': 6.893, 'train_steps_per_second': 0.581, 'total_flos': 5057221507584000.0, 'train_loss': 0.0806089817005911, 'epoch': 20.0})

In [22]:
#Сохранение обученной модели
model_path = "fine-tune-bert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('fine-tune-bert/tokenizer_config.json',
 'fine-tune-bert/special_tokens_map.json',
 'fine-tune-bert/vocab.txt',
 'fine-tune-bert/added_tokens.json')

In [23]:
#функция для получения предикта
def get_prediction():
    test_pred = trainer.predict(test_dataset)
    labels = np.argmax(test_pred.predictions, axis = -1)
    return labels
pred = get_prediction()

In [None]:
#проверка полученного результата

In [24]:
#оценки качества модели
print(classification_report(test_labels_encoded, pred))
print(f1_score(test_labels_encoded, pred, average='micro'))

              precision    recall  f1-score   support

           0       0.75      0.84      0.79        77
           1       0.76      0.70      0.73        40
           2       0.95      0.68      0.79        31
           3       0.00      0.00      0.00         8
           4       0.57      0.48      0.52        27
           5       0.60      0.78      0.68        54
           6       0.00      0.00      0.00         2

    accuracy                           0.71       239
   macro avg       0.52      0.50      0.50       239
weighted avg       0.69      0.71      0.69       239

0.707112970711297


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
