In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/диплом

/content/drive/MyDrive/диплом


In [4]:
from typing import Dict, List
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score, accuracy_score, classification_report
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch.nn.functional as F
import sklearn
import sys
import random
import math
import os
import argparse
import logging
from sklearn.utils import class_weight
from torch.cuda.amp import autocast, GradScaler
from transformers import BertModel
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import Dataset, DataLoader

In [5]:
features_for_extra_layer = {
    'AP': ['mean_len_words', 'mean_freq_ipm', 'mean_freq_d', 'N_pronn_pers_first',
           'N_words', 'mean_Nwords_sent', 'min_freq_d'],
    'AM': ['mean_len_words', 'N_pronn_pers_first', 'min_freq_ipm', 'mean_freq_ipm',
           'N_most_repeated_word', 'mean_Nwords_sent', 'N_adj', 'mean_sent_depth', 'mean_freq_d'],
    'BP': ['N_words', 'N_adj', 'mean_len_words', 'N_repeated_words',
           'mean_freq_d', 'mean_freq_ipm', 'min_freq_ipm', 'N_acl:relcl', 'max_Nwords_sent', 'max_len_words',
           'max_synt_depth'],
    'BM': ['N_words', 'mean_len_words', 'N_repeated_words', 'N_adj', 'mean_freq_d', 'max_len_words',
           'mean_freq_ipm', 'min_freq_ipm', 'max_synt_depth', 'N_pronn_pers_first', 'mean_sent_depth'],
    'DP': ['mean_len_words', 'N_pronn_pers_first', 'N_repeated_words', 'mean_freq_d', 'N_words',
           'mean_freq_ipm', 'N_most_repeated_word', 'mean_sent_depth', 'max_synt_depth'],
    'DM': ['mean_len_words', 'N_pronn_pers_first', 'N_repeated_words', 'N_words', 'mean_freq_ipm',
           'mean_freq_d', 'max_len_words', 'min_freq_ipm', 'N_acl:relcl', 'N_most_repeated_word'],
    'GP': ['mean_len_words', 'mean_freq_ipm', 'mean_freq_d', 'min_freq_ipm', 'N_verb_tense_pres', 'N_adj',
           'mean_Nwords_sent'],
    'GM': ['mean_len_words', 'N_most_repeated_word', 'N_pronn_pers_first', 'max_Nwords_sent', 'mean_freq_d',
           'N_acl', 'max_len_words', 'mean_Nwords_sent'],
    'verb': ['N_words', 'N_adj', 'mean_len_words', 'max_len_words', 'max_synt_depth', 'mean_sent_depth',
             'mean_freq_ipm', 'min_freq_d', 'min_freq_ipm', 'N_relative', 'N_most_repeated_word',
             'mean_freq_d', 'N_acl:relcl'], 
    'raven': ['N_adj', 'N_words', 'mean_len_words', 'N_repeated_words', 
              'N_most_repeated_word', 'mean_freq_d', 'max_len_words', 'max_synt_depth', 'min_freq_ipm', 'min_freq_d',]
}

In [6]:
class DataPreparation(Dataset):
    
    def __init__(self, tokenizer, data, scale_init, intelligence='verb', max_length=None, if_scale=True):
        
        self.tokenizer = tokenizer
        self.data = data
        self.intell = intelligence
        self.scale = scale_init
        self.if_scale = if_scale
        
        if max_length == None:
            max_length_counted = data["text"].str.split(' ').str.len().max(axis=0)
            self.max_length = max_length_counted if max_length_counted < 512 else 512
        else:
            self.max_length = max_length


    def __len__(self):
        return len(self.data)


    def tokenize(self, text):

        tokens = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        return tokens


    def scaling(self, labels):

        scaled_target = self.scale.transform(np.array(labels))#.reshape(-1, 1))
        
        return scaled_target

     
    def __getitem__(self, index):
        
        source_text = self.data['text'].iloc[index]
        source = self.tokenize(source_text)

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        
        target_index = self.intell.split('_')[0]

        if self.if_scale:
            nlp_features_sc = self.scaling(self.data[features_for_extra_layer[target_index]])
            nlp_features = torch.from_numpy(nlp_features_sc[index])

        else:
            nlp_features = torch.from_numpy(self.data[features_for_extra_layer[target_index]].iloc[index].to_numpy())

        label = self.data[self.intell].iloc[index]
        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "labels":  label, 
            "nlp_feat": nlp_features.to(dtype=torch.float)
        }

In [7]:
def count_vocab_features(subset, target_index):
    rel_info = subset[features_for_extra_layer[target_index]].to_numpy()
    return len(np.unique(rel_info))

In [8]:
class NLP_FEATURES_Block(nn.Module):
    
    def __init__(self, input_size, out_features, inner_features=120):
        super().__init__()

        self.linear = nn.Linear(input_size, inner_features)
        self.bn = nn.BatchNorm1d(inner_features)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(inner_features, out_features)


    def forward(self, sequence_input):

        x = self.linear(sequence_input)
        x = self.bn(x)
        x = self.relu(x)
        x = self.linear2(x)

        return x

In [9]:
class Attention(nn.Module):
    def __init__(self,emb_size,hid_size=512, hid_size_out=512, dropout=0.3):

        super(Attention,self).__init__()
        
        self.softmax = torch.nn.Softmax(dim=1)
        self.fc1 = torch.nn.Linear(emb_size, hid_size_out)
        self.dropout = nn.Dropout(dropout)

        self.W_w = nn.Parameter(torch.Tensor(emb_size, emb_size))
        self.u_w = nn.Parameter(torch.Tensor(emb_size, 1))
        nn.init.uniform_(self.W_w, -0.1, 0.1)
        nn.init.uniform_(self.u_w, -0.1, 0.1)
  
    
    def forward(self, last_hidden_states):

        encoded_layers = self.dropout(last_hidden_states)
        score = torch.tanh(torch.matmul(encoded_layers, self.W_w))

        attention_weights = self.softmax(torch.matmul(score, self.u_w))

        scored_x = encoded_layers * attention_weights

        feat = torch.sum(scored_x, dim=1)
        out_lin_attent =  self.fc1(feat)

        return out_lin_attent

In [11]:
class ModelModule(nn.Module):
    
    def __init__(self, out_features, hidden_size=512, hidden_size_lin=256,
                 inner_features=128,
                 nlp_feat_size=18, nlp_feat_size_out=4,
                 pre_trained='DeepPavlov/rubert-base-cased-sentence'):
        super(ModelModule, self).__init__()

        self.bert = AutoModel.from_pretrained(pre_trained)

        self.nlp_features = NLP_FEATURES_Block(input_size=nlp_feat_size, out_features=nlp_feat_size_out)
        
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(num_features=32)

        self.fc2 = nn.Linear(hidden_size_lin+nlp_feat_size_out, inner_features)
        self.linear_modules = nn.ModuleList([torch.nn.Linear(inner_features, 32),
                                          torch.nn.Linear(32, 32),
                                          torch.nn.Linear(32, out_features)])

        self.attention = Attention(emb_size=self.bert.config.hidden_size,
                                   hid_size_out=hidden_size_lin, dropout=self.bert.config.hidden_dropout_prob)

        for param in self.bert.parameters():
            param.requires_grad = False
    
    def forward(self, input_ids, attention_mask, nlp_feat):

        all_outs_bert, _ = self.bert(input_ids, attention_mask = attention_mask, return_dict = False)
        out_lin_attent = self.attention(all_outs_bert)

        nlp_features_out_block = self.nlp_features(nlp_feat)

        concatenated_block_features = torch.cat([out_lin_attent, nlp_features_out_block], dim=1)
        
        x = self.fc2(concatenated_block_features)
        x = self.relu(x)

        h = []
        
        for lin in self.linear_modules[:-1]:
            x = lin(x)
            h.append(x)
            x = self.bn1(x)
            x = self.relu(x)
            x = self.dropout(x)
        
        x = self.relu(h[-1] + h[-2])
        x = self.linear_modules[-1](x)
        
        return x

In [12]:
def train(model, data_loader, device, optimizer, criterion, n_epoch):

    print('Epoch #{}\n'.format(n_epoch+1))

    train_losses = []
    train_labels = []
    train_predictions = []

    progress_bar = tqdm(total=math.ceil(len(data_loader.dataset)/data_loader.batch_size), desc='Epoch {}'.format(n_epoch + 1))

    model.train()

    for _, data in enumerate(data_loader, 0):


        input_ids = data["source_ids"].to(device)
        attention_mask = data["source_mask"].to(device)
        labels = data['labels'].to(device)
        nlp_feat = data['nlp_feat'].to(device)

        optimizer.zero_grad()

        pred = model(input_ids, attention_mask, nlp_feat)
        loss = criterion(pred, labels)
              
        loss.backward()      
        optimizer.step()
        
        lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])
        scheduler.step()
        
        predict = torch.log_softmax(pred, dim=1).argmax(dim=1)
        
        train_losses.append(loss.item())
        train_labels.extend(labels.cpu().detach().numpy())
        train_predictions.extend(predict.cpu().detach().numpy())

        progress_bar.set_postfix(loss=np.mean(train_losses))
        progress_bar.update(1)
    
    progress_bar.update(1)
    progress_bar.close()
    
    print('\n\nMean Loss after epoch #{0} - {1}'.format(str(n_epoch + 1), np.mean(train_losses)))
    print('F1 score after epoch #{0} on train - {1}\n'.format(str(n_epoch + 1), f1_score(train_labels, train_predictions, average='macro')))
    print('Accuracy score after epoch #{0} on train - {1}\n'.format(str(n_epoch + 1), accuracy_score(train_labels, train_predictions)))

    print(classification_report(train_labels, train_predictions))
    return train_labels, train_predictions


def validating(model, data_loader, device, criterion, n_epoch):

    val_losses, val_labels, val_predictions = [], [], []

    progress_bar = tqdm(total=math.ceil(len(data_loader.dataset)/data_loader.batch_size),
                        desc='Epoch {}'.format(n_epoch + 1))

    model.eval()

    for _, data in enumerate(data_loader, 0):
          
        input_ids = data["source_ids"].to(device)
        attention_mask = data["source_mask"].to(device)
        labels = data['labels'].to(device)
        nlp_feat = data['nlp_feat'].to(device)

        with torch.no_grad():
            pred = model(input_ids, attention_mask, nlp_feat)

            loss = criterion(pred, labels)
          
            predict = torch.log_softmax(pred, dim=1).argmax(dim=1)

        val_losses.append(loss.item())
        val_labels.extend(labels.cpu().detach().numpy())
        val_predictions.extend(predict.cpu().detach().numpy())

        progress_bar.set_postfix(loss=np.mean(val_losses))
        progress_bar.update(1)

    progress_bar.update(1)
    progress_bar.close()
    
    
    valid_stats.append(
        {
            'Val Loss': np.mean(val_losses)
        }
    )

    print('\n\nMean Loss after epoch #{0} - {1}'.format(str(n_epoch + 1), np.mean(val_losses)))
    print('F1 score after epoch #{0} on validation - {1}\n'.format(str(n_epoch + 1), f1_score(val_labels, val_predictions, average='macro')))
    print('Accuracy score after epoch #{0} on validation - {1}\n'.format(str(n_epoch + 1), accuracy_score(val_labels, val_predictions)))
    
    print(classification_report(val_labels, val_predictions))
    return valid_stats


def test(data_loader, device, id_name):
    
    model.load_state_dict(torch.load(f'results/model_resnet_attent_{id_name}.pth'))
    
    test_labels, test_predictions = [], []

    model.eval()

    for _, data in enumerate(data_loader, 0):
          
        input_ids = data["source_ids"].to(device)
        attention_mask = data["source_mask"].to(device)
        labels = data['labels'].to(device)
        nlp_feat = data['nlp_feat'].to(device)

        with torch.no_grad():
            pred = model(input_ids, attention_mask,nlp_feat)
          
        predict = torch.log_softmax(pred, dim=1).argmax(dim=1)

        test_labels.extend(labels.cpu().detach().numpy())
        test_predictions.extend(predict.cpu().detach().numpy())

    print('F1 macro score on test - {0}\n'.format(f1_score(test_labels, test_predictions, average='macro')))
    print('F1 score on test - {0}\n'.format(f1_score(test_labels, test_predictions, average='weighted')))
    print('Accuracy score on test - {0}\n'.format(accuracy_score(test_labels, test_predictions)))
    
    print(classification_report(test_labels, test_predictions))


def evaluate(model, train_dataset, val_dataset, device, epochs, target_value, weights, lr):
    
    model = model.to(device)
    lr = lr
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(weight=weights,reduction='mean').to(device)
    global lr_list
    lr_list = []
    global scheduler
    total_steps = len(train_dataset) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(total_steps * 0.1),
                                                num_training_steps=total_steps)
    global valid_stats
    valid_stats = []
    best_valid_loss = float('inf')

    for epoch in range(epochs):
        # train
        try:
            train(model, train_dataset, device, optimizer, criterion,  epoch)
            # # validate
            validating(model, val_dataset, device, criterion, epoch)

            if valid_stats[epoch]['Val Loss'] < best_valid_loss:
                best_valid_loss = valid_stats[epoch]['Val Loss']

                name_to_save = f'model_resnet_attent_{target_value}'
                if os.path.isfile('results/'+name_to_save+'.pth'):
                    os.remove('results/'+name_to_save+'.pth')
                    torch.save(model.state_dict(), 'results/'+name_to_save+'.pth')
                else:
                    if not os.path.isdir('results'):
                        os.mkdir('results')
                    torch.save(model.state_dict(), 'results/'+name_to_save+'.pth')
        except KeyboardInterrupt:
            break

In [13]:
def initialize_scaling(data_org, target_index):
    scale = StandardScaler().fit(np.array(data_org[features_for_extra_layer[target_index]]))
    return scale

def inverse_toorig(scaler, list_of_labels):
    inverse = scaler.inverse_transform(list_of_labels.reshape(-1, 1))
    return inverse

In [14]:
def seed_everything(seed_value=42):
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    return seed_value

In [None]:
path_to_data = 'dataset_plusextra_nlp_features_target_classes_no_naive.csv'
target_value = 'BP'
path_to_model = 'ai-forever/ruRoberta-large'
epochs = 10
seed_value = 42

_ = seed_everything(seed_value)
LR = 2e-4
maxlength = 120
minlength = 2
bsize = 16

dataset = pd.read_csv(path_to_data, sep='\t')

dataset = dataset[dataset.question_id != '129_Чтение текста - видео']

if target_value == 'raven':
            dataset = dataset[dataset["raven"] > 0]
if target_value == 'verb':
            dataset = dataset[dataset["verb"] > 0]
        
tokenizer = AutoTokenizer.from_pretrained(path_to_model)

intelligence = target_value+'_classes'

dataset = dataset[dataset['N_words'] > minlength]

dataset[intelligence] = dataset[intelligence].astype(int)

train_data, extra_data = train_test_split(dataset, test_size=0.22,
                                        stratify=dataset[intelligence],
                                        random_state=seed_value)

vaild_data, test_data = train_test_split(extra_data, test_size=0.45,
                                        stratify=extra_data[intelligence],
                                        random_state=seed_value)


global length_vocab_nlp_feat 
length_vocab_nlp_feat = count_vocab_features(train_data, target_value)

train_dataset_data = DataPreparation(
            tokenizer=tokenizer,
            data = train_data,
            scale_init = initialize_scaling(train_data, target_value),
            intelligence = intelligence,
            max_length = maxlength,
            if_scale = True
        )

val_dataset_data = DataPreparation(
            tokenizer=tokenizer,
            data = vaild_data,
            scale_init = initialize_scaling(vaild_data, target_value),
            intelligence = intelligence,
            max_length = maxlength,
            if_scale = True
        )

test_dataset_data = DataPreparation(
            tokenizer=tokenizer,
            data = test_data,
            scale_init = initialize_scaling(test_data, target_value),
            intelligence = intelligence,
            max_length = maxlength,
            if_scale = True
        )

weights = class_weight.compute_class_weight(class_weight='balanced',classes=np.unique(dataset[intelligence]), y=dataset[intelligence].to_numpy())

wights_tensor = torch.tensor(weights,dtype=torch.float)

train_dataset = DataLoader(train_dataset_data, batch_size=bsize, drop_last=True, shuffle=True)
val_dataset = DataLoader(val_dataset_data, batch_size=bsize)
test_dataset = DataLoader(test_dataset_data, batch_size=bsize)
      
model = ModelModule(pre_trained=path_to_model, out_features=len(dataset[intelligence].unique()),
                    nlp_feat_size=len(features_for_extra_layer[target_value]))


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
evaluate(model=model, train_dataset=train_dataset, val_dataset=val_dataset, device=device, epochs=epochs, target_value=target_value, weights=wights_tensor, lr=LR)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[intelligence] = dataset[intelligence].astype(int)
Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceCl

Epoch #1



Epoch 1:   0%|          | 0/316 [00:00<?, ?it/s]



Mean Loss after epoch #1 - 1.1223884077299209
F1 score after epoch #1 on train - 0.25892174663877904

Accuracy score after epoch #1 on train - 0.28849206349206347

              precision    recall  f1-score   support

           0       0.39      0.13      0.19      1672
           1       0.41      0.14      0.21      2099
           2       0.25      0.75      0.38      1269

    accuracy                           0.29      5040
   macro avg       0.35      0.34      0.26      5040
weighted avg       0.36      0.29      0.25      5040



Epoch 1:   0%|          | 0/49 [00:00<?, ?it/s]



Mean Loss after epoch #1 - 1.09118288877059
F1 score after epoch #1 on validation - 0.32538580874000145

Accuracy score after epoch #1 on validation - 0.40485312899106

              precision    recall  f1-score   support

           0       0.39      0.77      0.52       259
           1       0.47      0.33      0.39       327
           2       0.21      0.04      0.07       197

    accuracy                           0.40       783
   macro avg       0.36      0.38      0.33       783
weighted avg       0.38      0.40      0.35       783

Epoch #2



Epoch 2:   0%|          | 0/316 [00:00<?, ?it/s]



Mean Loss after epoch #2 - 1.0908643731995233
F1 score after epoch #2 on train - 0.37685045144999374

Accuracy score after epoch #2 on train - 0.38571428571428573

              precision    recall  f1-score   support

           0       0.40      0.51      0.45      1670
           1       0.45      0.34      0.38      2102
           2       0.29      0.31      0.30      1268

    accuracy                           0.39      5040
   macro avg       0.38      0.38      0.38      5040
weighted avg       0.39      0.39      0.38      5040



Epoch 2:   0%|          | 0/49 [00:00<?, ?it/s]



Mean Loss after epoch #2 - 1.0859696281199553
F1 score after epoch #2 on validation - 0.35109760741059204

Accuracy score after epoch #2 on validation - 0.4061302681992337

              precision    recall  f1-score   support

           0       0.39      0.84      0.53       259
           1       0.52      0.20      0.29       327
           2       0.36      0.17      0.23       197

    accuracy                           0.41       783
   macro avg       0.42      0.40      0.35       783
weighted avg       0.44      0.41      0.36       783

Epoch #3



Epoch 3:   0%|          | 0/316 [00:00<?, ?it/s]



Mean Loss after epoch #3 - 1.08147402434122
F1 score after epoch #3 on train - 0.3886264813176834

Accuracy score after epoch #3 on train - 0.40773809523809523

              precision    recall  f1-score   support

           0       0.41      0.60      0.49      1672
           1       0.47      0.35      0.40      2096
           2       0.31      0.25      0.28      1272

    accuracy                           0.41      5040
   macro avg       0.40      0.40      0.39      5040
weighted avg       0.41      0.41      0.40      5040



Epoch 3:   0%|          | 0/49 [00:00<?, ?it/s]



Mean Loss after epoch #3 - 1.0844943985647084
F1 score after epoch #3 on validation - 0.31730543236348746

Accuracy score after epoch #3 on validation - 0.4099616858237548

              precision    recall  f1-score   support

           0       0.38      0.88      0.53       259
           1       0.53      0.26      0.35       327
           2       0.40      0.04      0.07       197

    accuracy                           0.41       783
   macro avg       0.44      0.39      0.32       783
weighted avg       0.45      0.41      0.34       783

Epoch #4



Epoch 4:   0%|          | 0/316 [00:00<?, ?it/s]

In [None]:
test(train_dataset, device, target_value) 

In [None]:
test(val_dataset, device, target_value) 

In [None]:
test(test_dataset, device, target_value) 