# Импорт библиотек

In [3]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm
from scipy.special import softmax

import transformers
from transformers import (
    AdamW, 
    get_linear_schedule_with_warmup, 
    BertModel, 
    BertTokenizer, 
    BertPreTrainedModel, 
    BertConfig, 
    AutoTokenizer
)

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from IPython.display import clear_output
import matplotlib.pyplot as plt
import re
import string
import warnings

warnings.filterwarnings("ignore")

device = 'cuda'

In [None]:
def preprocess_text(text):
    """
    Предобработка текста:
    - Удаление меток ID в начале строки
    - Удаление табуляций и переводов строк
    """
    if text.startswith('[id'):
        text = re.sub(r'\[.*?,', '', text)
    
    text = re.sub(r'[\t\n]', ' ', text)

    return text

In [None]:
def train(model, 
          optimizer: torch.optim, 
          scheduler: torch.optim, 
          train_loader: DataLoader, 
          test_loader: DataLoader, 
          criterion_isr: nn.Module, 
          criterion_pal: nn.Module, 
          epochs: int, 
          acc_max: float) -> None:


    train_losses = []
    train_acc = []

    for epoch in range(epochs):
        model.train()
        curr_train_acc = []
        curr_train_loss = []

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            labels_isr = batch['labels_isr'].long().to(device)
            labels_pal = batch['labels_pal'].long().to(device)

            # Прямой проход модели
            outputs_isr, outputs_pal = model(input_ids=input_ids, attention_mask=attention_mask)

            # Предсказания и вычисление потерь
            logits_isr = outputs_isr.detach().cpu().numpy()
            logits_pal = outputs_pal.detach().cpu().numpy()
            pred_tr_isr = np.argmax(logits_isr, axis=1)
            pred_tr_pal = np.argmax(logits_pal, axis=1)

            loss_isr = criterion_isr(outputs_isr, labels_isr)
            loss_pal = criterion_pal(outputs_pal, labels_pal)
            loss = loss_isr + loss_pal

            loss.backward()
            optimizer.step()
            scheduler.step()

            train_losses.append(loss.item())
            curr_train_loss.append(loss.item())


            # Вычисление точности
            acc = np.sum((pred_tr_isr == labels_isr.detach().cpu().numpy()) & 
                         (pred_tr_pal == labels_pal.detach().cpu().numpy())) / len(labels_isr)
            curr_train_acc.append(acc)
            train_acc.append(acc)

        print('Epoch: ', epoch + 1)
        print(f'Train loss: {sum(curr_train_loss) / len(train_loader)}')
        print(f'Train accuracy: {np.mean(curr_train_acc)}')

        model.eval()
        test_loss = []
        test_acc = []
        true_labels_isr = []
        pred_all_isr = []
        true_labels_pal = []
        pred_all_pal = []

        
        # Оценка модели на тестовой выборке
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels_isr = batch['labels_isr'].long().to(device)
                labels_pal = batch['labels_pal'].long().to(device)

                # Прямой проход модели
                outputs_isr, outputs_pal = model(input_ids, attention_mask)

                # Вычисление потерь
                loss_isr = criterion_isr(outputs_isr, labels_isr)
                loss_pal = criterion_pal(outputs_pal, labels_pal)
                loss = loss_isr + loss_pal
                test_loss.append(loss.item())

                logits_isr = outputs_isr.detach().cpu().numpy()
                pred_test_isr = np.argmax(logits_isr, axis=1)

                true_labels_isr += labels_isr.tolist()
                pred_all_isr += pred_test_isr.tolist()
                
                logits_pal = outputs_pal.detach().cpu().numpy()
                pred_test_pal = np.argmax(logits_pal, axis=1)

                true_labels_pal += labels_pal.tolist()
                pred_all_pal += pred_test_pal.tolist()

            avg_test_loss = sum(test_loss) / len(test_loader)
            avg_test_acc = np.mean((np.array(true_labels_isr) == np.array(pred_all_isr)) & 
                                   (np.array(true_labels_pal) == np.array(pred_all_pal)))


            print(f'\nTest loss: {avg_test_loss}')
            print(f'Test accuracy: {avg_test_acc}\n')
            
            if avg_test_acc > acc_max:
                print(classification_report(true_labels_isr, pred_all_isr), '\n')
                print(classification_report(true_labels_pal, pred_all_pal), '\n')
                return

            if epoch % 1 == 0:
                print(classification_report(true_labels_isr, pred_all_isr), '\n')
                print(classification_report(true_labels_pal, pred_all_pal), '\n')

In [5]:
def tokenize(data: pd.DataFrame, 
             tokenizer: AutoTokenizer, 
             loader=True, 
             batch_size=16, 
             shuffle=True):

    '''
    Токенизация текста
    - loader: если True, то вернуть формат DataLoader, в ином случае - список из словарей
    - shuffle: передается в DataLoader
    '''

    data_tokenized = []
    
    for i, row in tqdm(data.iterrows()):
        inputs = tokenizer.encode_plus(
                row['text'],
                add_special_tokens=True,
                truncation=True,
                max_length=256,
                padding='max_length',
                return_attention_mask = True,
                return_tensors = 'pt',
            )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        labels_isr = row['affect_isr']
        labels_pal = row['affect_pal']
        
        final = {
                'input_ids': ids.clone().detach(),
                'attention_mask': mask.clone().detach(),
                'labels_isr': torch.tensor(labels_isr, dtype=torch.float),
                'labels_pal': torch.tensor(labels_pal, dtype=torch.float)
            }

        data_tokenized.append(final)


        dataloader = DataLoader(data_tokenized, batch_size=16, shuffle=shuffle)
    
    if loader:
        return dataloader
    else:
        return data_tokenized

In [83]:
def train_test(df: pd.DataFrame, aug: pd.DataFrame, test_size: float, augs=None):

    '''
    Раздление на тренировочную и тестовую выборки, добавление аугментаций
    - augs: если True, то к тренировочную набору данных добавляются аугментации
    - aug: массив аугментированных данных
    '''
    
    X_train, X_test = train_test_split(df, test_size=test_size, stratify=df[['affect_isr', 'affect_pal']], random_state=1907)

    X_train['affect_isr'] = X_train['affect_isr'].astype(int)
    X_test['affect_isr'] = X_test['affect_isr'].astype(int)
    
    X_train['affect_pal'] = X_train['affect_pal'].astype(int)
    X_test['affect_pal'] = X_test['affect_pal'].astype(int)
    
    
    if augs:
        X_train = pd.concat((aug, X_train))

        
    return X_train, X_test

# Загрузка данных

# affect models

In [75]:
df = pd.read_csv('df.csv')
df['text'] = df['text'].apply(preprocess_text)

df_test = pd.read_csv('df_test.csv')
df_test['text'] = df_test['text'].apply(preprocess_text)

aug = pd.read_csv('aug.csv')
aug['text'] = aug['text'].apply(preprocess_text)

In [76]:
class BertForMultiTaskClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)


        self.classifier1 = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 3),
        )
        
        self.classifier2 = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 3),
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels1=None, labels2=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        pooled_output = outputs[1]

        logits1 = self.classifier1(pooled_output)
        logits2 = self.classifier2(pooled_output)

        return logits1, logits2

In [67]:
import gc

# очистка памяти 
model.cpu()
gc.collect()
torch.cuda.empty_cache()

## affect isr model

In [89]:
checkpoint = "DeepPavlov/rubert-base-cased-conversational"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

X_train, X_test = train_test(df.copy(), aug.copy(),  0.15, True)

X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')

train_dataloader = tokenize(X_train, tokenizer, batch_size=64, shuffle=True)
test_dataloader = tokenize(X_test,  tokenizer, batch_size=64, shuffle=False)

config = BertConfig.from_pretrained(checkpoint)
config.num_labels = 3  

model = BertForMultiTaskClassification.from_pretrained(checkpoint, config=config)
model = model.to(device)

criterion_isr = nn.CrossEntropyLoss()
criterion_pal = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr = 4e-5, no_deprecation_warning=True)
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1 * total_steps, num_training_steps=total_steps)


train(model, optimizer, scheduler, train_dataloader, test_dataloader, criterion_isr, criterion_pal, num_epochs, acc_max=0.73)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Some weights of BertForMultiTaskClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier1.0.bias', 'classifier1.0.weight', 'classifier1.2.bias', 'classifier1.2.weight', 'classifier2.0.bias', 'classifier2.0.weight', 'classifier2.2.bias', 'classifier2.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:  1
Train loss: 1.4939720804492633
Train accuracy: 0.4523467432950191

Test loss: 1.1956732564194257
Test accuracy: 0.5976505139500734

              precision    recall  f1-score   support

           0       0.60      0.41      0.49        58
           1       0.73      0.66      0.69       268
           2       0.73      0.82      0.77       355

    accuracy                           0.72       681
   macro avg       0.68      0.63      0.65       681
weighted avg       0.72      0.72      0.71       681
 

              precision    recall  f1-score   support

           0       0.62      0.19      0.29        85
           1       0.61      0.51      0.56        69
           2       0.84      0.95      0.89       527

    accuracy                           0.81       681
   macro avg       0.69      0.55      0.58       681
weighted avg       0.79      0.81      0.78       681
 

Epoch:  2
Train loss: 0.8805948005582409
Train accuracy: 0.7209650383141762

Test loss: 1.16

In [90]:
torch.save(model, 'model_FINAL')

# evaluating

In [91]:
def evaluate(test, model):
    logits_isr, logits_pal = [], []
    preds_isr, preds_pal = [], []

    for i in tqdm(test):
    
        ids = i['input_ids'].clone().detach().to(device)
        mask = i['attention_mask'].clone().detach().to(device)
        
        with torch.no_grad():
            output = model(ids, mask)
            logit_isr, logit_pal = output
            logit_isr, logit_pal = logit_isr.detach().cpu().numpy(), logit_pal.detach().cpu().numpy()
            
            logits_isr.append(logit_isr[0])
            logits_pal.append(logit_pal[0])
            preds_isr.append(np.argmax(logit_isr, axis=1))
            preds_pal.append(np.argmax(logit_pal, axis=1))

    return logits_isr, logits_pal, preds_isr, preds_pal   

In [92]:
# model1 = torch.load('model11')
# model2 = torch.load('model22')

test1 = tokenize(df_test, tokenizer, shuffle=False, loader=False)
logits_isr, logits_pal, preds_isr, preds_pal = evaluate(test1, model)
# logits2, preds2 = evaluate(test2, model2)

0it [00:00, ?it/s]

  0%|          | 0/501 [00:00<?, ?it/s]

In [93]:
def attitude_pred(x):
    if x['pred_ai'] == x['pred_ap']:
        return(x['pred_ap'][0])
    elif x['pred_ap'] == 2 or x['pred_ai'] == 2:
        return min(x['pred_ai'], x['pred_ap'])[0]
    else:
        return 2
            

df_test['pred_ai'] = preds_isr
df_test['pred_ap'] = preds_pal
df_test['pred_a'] = df_test.apply(attitude_pred, axis=1)

print(classification_report(df_test['attitude'], df_test['pred_a']))

              precision    recall  f1-score   support

           0       0.64      0.56      0.60        89
           1       0.75      0.79      0.77       242
           2       0.67      0.67      0.67       170

    accuracy                           0.71       501
   macro avg       0.69      0.67      0.68       501
weighted avg       0.71      0.71      0.71       501



In [123]:
df_test['pred_ai'] = [i[0] for i in df_test['pred_ai']]

In [124]:
df_test['pred_ap'] = [i[0] for i in df_test['pred_ap']]