In [1]:
from typing import Dict, List

import numpy as np
import os
import pandas as pd
import argparse
import torch
import math

from sklearn.metrics import f1_score
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.utils import class_weight
import torch.nn.functional as F
import sklearn
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torch.utils.data import Dataset, DataLoader

# Seed passed as `random_state` to the `train_test_split` calls in the data-preparation cell.
SEED = 227


class DataPreparation(Dataset):
    """Torch dataset that tokenizes the ``text`` column and pairs it with a label.

    Args:
        tokenizer: Callable invoked with Hugging Face-style keyword arguments
            (``max_length``, ``padding``, ``truncation``, ``return_tensors='pt'``...).
            Its result must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        data: pandas DataFrame holding a ``text`` column and the label column
            named by ``intelligence``.
        scale_init: Fitted scaler exposing ``transform``; only consulted when
            ``if_scale`` is true.
        intelligence: Name of the label column in ``data``.
        max_length: Padding/truncation length handed to the tokenizer. When
            ``None`` it is derived from the longest text, counted in
            space-separated words and capped at 512.
        if_scale: When true the label is passed through ``scale_init`` before
            being returned; otherwise the raw column value is returned.
    """

    def __init__(self, tokenizer, data, scale_init, intelligence='verb', max_length=None, if_scale=True):
        self.tokenizer = tokenizer
        self.data = data
        self.intell = intelligence
        self.scale = scale_init
        self.if_scale = if_scale

        if max_length is None:
            # NOTE(review): this counts space-separated words, not tokenizer
            # tokens, so long texts may still be truncated - confirm this is intended.
            longest = data["text"].str.split(' ').str.len().max(axis=0)
            # An empty frame yields NaN, for which the comparison is False -> 512.
            self.max_length = int(longest) if longest < 512 else 512
        else:
            self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def tokenize(self, text):
        """Tokenize one text, padded/truncated to ``self.max_length``."""
        tokens = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        return tokens

    def scaling(self, labels):
        """Run ``labels`` through the scaler as a single-feature column.

        Returns whatever ``self.scale.transform`` returns for an ``(n, 1)`` array.
        """
        scaled_target = self.scale.transform(np.array(labels).reshape(-1, 1))

        return scaled_target

    def __getitem__(self, index):
        """Return ``source_ids``, ``source_mask`` and ``labels`` for row ``index``."""
        encoded = self.tokenize(self.data['text'].iloc[index])

        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the sequence axis when max_length is 1.
        source_ids = encoded["input_ids"].squeeze(0)
        source_mask = encoded["attention_mask"].squeeze(0)

        raw_label = self.data[self.intell].iloc[index]
        if self.if_scale:
            # Scale just this sample; transforming the whole column on every
            # lookup made each item cost O(len(data)).
            label = self.scaling([raw_label])[0][0]
        else:
            label = raw_label

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "labels": label
        }

class BertBaseline(nn.Module):
    """Pretrained encoder followed by a three-layer fully connected head.

    Args:
        bert: Model name or path forwarded to ``AutoModel.from_pretrained``.
        output_neurons: Width of the final linear layer.
    """

    def __init__(self, bert, output_neurons):
        super().__init__()

        self.bert = AutoModel.from_pretrained(bert)

        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        # NOTE(review): 768 presumably equals the encoder's hidden size -
        # confirm before swapping in a different checkpoint.
        self.fc1 = nn.Linear(768, 400)
        self.fc2 = nn.Linear(400, 20)
        self.fc3 = nn.Linear(20, output_neurons)

    def forward(self, input_ids, attention_mask):
        """Return the output of ``fc3`` for a batch of token ids and masks."""
        # With return_dict=False the encoder result is unpacked as a pair and
        # only its second element feeds the head.
        _, pooled = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)

        hidden = pooled
        # Each hidden layer applies linear -> dropout -> ReLU, in that order.
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(self.dropout(layer(hidden)))

        return self.fc3(hidden)
    

def initialize_scaling(data_org, intell):
    """Fit a ``StandardScaler`` on column ``intell`` of ``data_org`` and return it."""
    # The scaler expects a 2-D input, so the column is reshaped to (n, 1).
    target_column = np.array(data_org[intell]).reshape(-1, 1)
    fitted = StandardScaler().fit(target_column)
    return fitted

def inverse_toorig(scaler, list_of_labels):
    """Map scaled labels back to the original scale.

    Args:
        scaler: Object exposing ``inverse_transform`` for an ``(n, 1)`` array.
        list_of_labels: Scaled labels as a list, tuple or array.

    Returns:
        Whatever ``scaler.inverse_transform`` returns for the ``(n, 1)`` column.
    """
    # np.asarray lets plain Python sequences through; calling .reshape directly
    # on a list raised AttributeError.
    column = np.asarray(list_of_labels).reshape(-1, 1)
    return scaler.inverse_transform(column)


def train(model, data_loader, device, optimizer, criterion, n_epoch):
    """Run one training epoch and print loss, F1, accuracy and a class report.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Loader yielding dicts with ``source_ids``, ``source_mask``
            and ``labels`` tensors.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(pred, labels)``.
        n_epoch: Zero-based epoch index, used only for the printed messages.

    Returns:
        Tuple ``(train_labels, train_predictions)`` with one entry per sample seen.
    """
    print('Epoch #{}\n'.format(n_epoch+1))

    train_losses = []
    train_labels = []
    train_predictions = []

    model.train()

    # len(data_loader) is the real number of batches (it honours drop_last),
    # so the bar ends exactly at its total without a padding update.
    with tqdm(total=len(data_loader), desc='Epoch {}'.format(n_epoch + 1)) as progress_bar:
        for data in data_loader:
            input_ids = data["source_ids"].to(device)
            attention_mask = data["source_mask"].to(device)
            labels = data['labels'].to(device)

            optimizer.zero_grad()

            pred = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(pred, labels)

            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest score along dim 1.
            predict = pred.detach().argmax(dim=1)

            train_losses.append(loss.item())
            train_labels.extend(labels.cpu().detach().numpy())
            train_predictions.extend(predict.cpu().numpy())

            progress_bar.set_postfix(loss=np.mean(train_losses))
            progress_bar.update(1)

    print('\n\nMean Loss after epoch #{0} - {1}'.format(str(n_epoch + 1), np.mean(train_losses)))
    print('F1 score after epoch #{0} on train - {1}\n'.format(str(n_epoch + 1), f1_score(train_labels, train_predictions, average='macro')))
    print('Accuracy score after epoch #{0} on train - {1}\n'.format(str(n_epoch + 1), accuracy_score(train_labels, train_predictions)))

    print(classification_report(train_labels, train_predictions))

    return train_labels, train_predictions


def validating(model, data_loader, device, criterion, n_epoch):
    """Evaluate ``model`` on ``data_loader`` and record the mean loss.

    Appends ``{'Val Loss': <mean loss>}`` to the module-level ``valid_stats``
    list and prints loss, macro F1, accuracy and a classification report.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Loader yielding dicts with ``source_ids``, ``source_mask``
            and ``labels`` tensors.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(pred, labels)``.
        n_epoch: Zero-based epoch index, used only for the printed messages.

    Returns:
        The module-level ``valid_stats`` list after the new entry was appended.
    """
    global valid_stats
    # evaluate() normally creates the list; create it here as well so the
    # function does not raise NameError when called on its own.
    if 'valid_stats' not in globals():
        valid_stats = []

    val_losses, val_labels, val_predictions = [], [], []

    model.eval()

    # len(data_loader) is the exact batch count, so no extra update is needed
    # after the loop (the previous one pushed the bar past its total).
    with tqdm(total=len(data_loader), desc='Epoch {}'.format(n_epoch + 1)) as progress_bar:
        for data in data_loader:
            input_ids = data["source_ids"].to(device)
            attention_mask = data["source_mask"].to(device)
            labels = data['labels'].to(device)

            # Neither the forward pass nor the loss needs gradients here.
            with torch.no_grad():
                pred = model(input_ids, attention_mask)
                loss = criterion(pred, labels)

            predict = pred.argmax(dim=1)

            val_losses.append(loss.item())
            val_labels.extend(labels.cpu().detach().numpy())
            val_predictions.extend(predict.cpu().numpy())

            progress_bar.set_postfix(loss=np.mean(val_losses))
            progress_bar.update(1)

    valid_stats.append(
        {
            'Val Loss': np.mean(val_losses)
        }
    )

    print('\n\nMean Loss after epoch #{0} - {1}'.format(str(n_epoch + 1), np.mean(val_losses)))
    print('F1 score after epoch #{0} on validation - {1}\n'.format(str(n_epoch + 1), f1_score(val_labels, val_predictions, average='macro')))
    print('Accuracy score after epoch #{0} on validation - {1}\n'.format(str(n_epoch + 1), accuracy_score(val_labels, val_predictions)))

    print(classification_report(val_labels, val_predictions))
    return valid_stats


def evaluate(model, train_dataset, val_dataset, device, epochs, target_value, weights):
    """Train for ``epochs`` epochs, checkpointing the lowest validation loss.

    After each epoch the validation loss recorded in the module-level
    ``valid_stats`` list is compared with the best one so far; on improvement
    the model's ``state_dict`` is written to
    ``results/model_baseline_basic_<target_value>.pth``. A ``KeyboardInterrupt``
    stops the loop early without raising.

    Args:
        model: Network to train; moved to ``device``.
        train_dataset: Loader handed to ``train``.
        val_dataset: Loader handed to ``validating``.
        device: Device used for the model, the loss and the batches.
        epochs: Maximum number of epochs.
        target_value: Suffix used in the checkpoint file name.
        weights: Per-class weight tensor for ``nn.CrossEntropyLoss``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss(weight=weights, reduction='mean').to(device)

    # validating() appends to this module-level list, one entry per epoch.
    global valid_stats
    valid_stats = []
    best_valid_loss = float('inf')

    checkpoint_dir = 'results'
    checkpoint_path = os.path.join(checkpoint_dir, f'model_baseline_basic_{target_value}.pth')

    for epoch in range(epochs):
        try:
            train(model, train_dataset, device, optimizer, criterion, epoch)
            validating(model, val_dataset, device, criterion, epoch)

            epoch_loss = valid_stats[epoch]['Val Loss']
            if epoch_loss < best_valid_loss:
                best_valid_loss = epoch_loss
                # torch.save overwrites an existing file, so no prior removal
                # is needed; exist_ok avoids the check-then-create race.
                os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(model.state_dict(), checkpoint_path)
        except KeyboardInterrupt:
            # Deliberate: let the user stop training and keep the best checkpoint.
            break



In [None]:
# --- Configuration ---------------------------------------------------------
path_to_data = '/kaggle/input/traits-no-naives/dataset_all_nlp_features_target_classes_no_naive (1).csv'
target_value = 'raven'
path_to_model = 'DeepPavlov/rubert-base-cased'
epochs = 15

# Seed the numpy and torch generators as well, so that layer initialisation,
# dropout and loader shuffling start from a fixed state (the splits below
# already use SEED through random_state).
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Load and filter -------------------------------------------------------
dataset = pd.read_csv(path_to_data, sep='\t')

dataset = dataset[dataset.question_id != '129_Чтение текста - видео']

# Keep only rows with a positive raw score for the selected target.
if target_value in ('raven', 'verb'):
    dataset = dataset[dataset[target_value] > 0]

tokenizer = AutoTokenizer.from_pretrained(path_to_model)

intelligence = target_value+'_classes'

# assign() returns a new frame; writing the column into the filtered slice
# in place triggers pandas' SettingWithCopyWarning.
dataset = dataset.assign(**{intelligence: dataset[intelligence].astype(int)})

dataset = dataset[dataset['N_words'] > 2]

# --- Split: 75% train, then the remaining 25% into 60% validation / 40% test
train_data, extra_data = train_test_split(dataset, test_size=0.25,
                                          stratify=dataset[intelligence],
                                          random_state=SEED)

vaild_data, test_data = train_test_split(extra_data, test_size=0.4,
                                         stratify=extra_data[intelligence],
                                         random_state=SEED)

# Fitted on the raw target of the training split; the datasets below are built
# with if_scale=False, so the scaler is passed along but not applied.
scaler = initialize_scaling(train_data, target_value)


def _make_dataset(frame):
    """Wrap one split in a DataPreparation dataset with the shared settings."""
    return DataPreparation(
        tokenizer=tokenizer,
        data=frame,
        scale_init=scaler,
        intelligence=intelligence,
        max_length=120,
        if_scale=False
    )


train_dataset_data = _make_dataset(train_data)
val_dataset_data = _make_dataset(vaild_data)
test_dataset_data = _make_dataset(test_data)

# --- Class weights, loaders, model -----------------------------------------
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(dataset[intelligence]),
                                            y=dataset[intelligence].to_numpy())
wights_tensor = torch.tensor(weights, dtype=torch.float)

train_dataset = DataLoader(train_dataset_data, batch_size=8, drop_last=True, shuffle=True)
val_dataset = DataLoader(val_dataset_data, batch_size=8)
test_dataset = DataLoader(test_dataset_data, batch_size=8)

# NOTE(review): one output neuron per distinct class value; this presumably
# relies on the classes being coded 0..K-1 - confirm against the data.
model = BertBaseline(bert=path_to_model, output_neurons=len(dataset[intelligence].unique()))
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

evaluate(model=model, train_dataset=train_dataset, val_dataset=val_dataset, device=device, epochs=epochs, target_value=target_value, weights=wights_tensor)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #1



Epoch 1:   0%|          | 0/593 [00:00<?, ?it/s]



Mean Loss after epoch #1 - 1.6097260967299745
F1 score after epoch #1 on train - 0.17688218116596705

Accuracy score after epoch #1 on train - 0.26013513513513514

              precision    recall  f1-score   support

           0       0.19      0.54      0.28       810
           1       0.16      0.04      0.07       899
           2       0.44      0.33      0.38      2079
           3       0.16      0.07      0.10       759
           4       0.06      0.08      0.07       189

    accuracy                           0.26      4736
   macro avg       0.20      0.21      0.18      4736
weighted avg       0.28      0.26      0.24      4736



Epoch 1:   0%|          | 0/119 [00:00<?, ?it/s]



Mean Loss after epoch #1 - 1.5937832814304769
F1 score after epoch #1 on validation - 0.17481095176010433

Accuracy score after epoch #1 on validation - 0.370253164556962

              precision    recall  f1-score   support

           0       0.23      0.61      0.34       162
           1       0.00      0.00      0.00       180
           2       0.48      0.61      0.54       416
           3       0.00      0.00      0.00       152
           4       0.00      0.00      0.00        38

    accuracy                           0.37       948
   macro avg       0.14      0.24      0.17       948
weighted avg       0.25      0.37      0.29       948



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch #2



Epoch 2:   0%|          | 0/593 [00:00<?, ?it/s]



Mean Loss after epoch #2 - 1.5908503399507419
F1 score after epoch #2 on train - 0.2035247309160174

Accuracy score after epoch #2 on train - 0.2782939189189189

              precision    recall  f1-score   support

           0       0.23      0.57      0.32       809
           1       0.16      0.05      0.07       900
           2       0.44      0.30      0.35      2080
           3       0.22      0.25      0.24       759
           4       0.03      0.03      0.03       188

    accuracy                           0.28      4736
   macro avg       0.22      0.24      0.20      4736
weighted avg       0.30      0.28      0.26      4736



Epoch 2:   0%|          | 0/119 [00:00<?, ?it/s]



Mean Loss after epoch #2 - 1.584333456864878
F1 score after epoch #2 on validation - 0.18651090058905692

Accuracy score after epoch #2 on validation - 0.399789029535865

              precision    recall  f1-score   support

           0       0.27      0.60      0.37       162
           1       0.00      0.00      0.00       180
           2       0.48      0.68      0.57       416
           3       0.00      0.00      0.00       152
           4       0.00      0.00      0.00        38

    accuracy                           0.40       948
   macro avg       0.15      0.26      0.19       948
weighted avg       0.26      0.40      0.31       948



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch #3



Epoch 3:   0%|          | 0/593 [00:00<?, ?it/s]



Mean Loss after epoch #3 - 1.5763913314487483
F1 score after epoch #3 on train - 0.22137422532106016

Accuracy score after epoch #3 on train - 0.30215371621621623

              precision    recall  f1-score   support

           0       0.24      0.56      0.34       809
           1       0.18      0.05      0.07       899
           2       0.44      0.36      0.40      2080
           3       0.22      0.22      0.22       759
           4       0.08      0.07      0.07       189

    accuracy                           0.30      4736
   macro avg       0.23      0.25      0.22      4736
weighted avg       0.31      0.30      0.29      4736



Epoch 3:   0%|          | 0/119 [00:00<?, ?it/s]



Mean Loss after epoch #3 - 1.5727597555192578
F1 score after epoch #3 on validation - 0.1652091257614076

Accuracy score after epoch #3 on validation - 0.2521097046413502

              precision    recall  f1-score   support

           0       0.22      0.82      0.35       162
           1       0.00      0.00      0.00       180
           2       0.48      0.12      0.19       416
           3       0.24      0.38      0.29       152
           4       0.00      0.00      0.00        38

    accuracy                           0.25       948
   macro avg       0.19      0.26      0.17       948
weighted avg       0.28      0.25      0.19       948



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch #4



Epoch 4:   0%|          | 0/593 [00:00<?, ?it/s]

In [3]:
def test(data_loader, device,
         checkpoint_path='/kaggle/working/results/model_baseline_basic_verb.pth',
         network=None):
    """Load a saved checkpoint and print classification metrics for a loader.

    Args:
        data_loader: Loader yielding dicts with ``source_ids``, ``source_mask``
            and ``labels`` tensors.
        device: Device the batches are moved to and the checkpoint is mapped onto.
        checkpoint_path: ``state_dict`` file to load. The default is the
            previously hard-coded path; pass the file written for the current
            ``target_value`` when evaluating another target.
        network: Model that receives the weights. Defaults to the module-level
            ``model`` created by the training cell.
    """
    net = model if network is None else network

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only kernel.
    net.load_state_dict(torch.load(checkpoint_path, map_location=device))

    test_labels, test_predictions = [], []

    net.eval()

    with torch.no_grad():
        for data in data_loader:
            input_ids = data["source_ids"].to(device)
            attention_mask = data["source_mask"].to(device)
            labels = data['labels'].to(device)

            pred = net(input_ids, attention_mask)
            # Predicted class = index of the largest score along dim 1.
            predict = pred.argmax(dim=1)

            test_labels.extend(labels.cpu().numpy())
            test_predictions.extend(predict.cpu().numpy())

    print('F1 macro score on test - {0}\n'.format(f1_score(test_labels, test_predictions, average='macro')))
    print('F1 score on test - {0}\n'.format(f1_score(test_labels, test_predictions, average='weighted')))
    print('Accuracy score on test - {0}\n'.format(accuracy_score(test_labels, test_predictions)))

    print(classification_report(test_labels, test_predictions))

In [4]:
# Scores the saved checkpoint on the training loader; the printed headings still say "on test".
test(train_dataset, device)

F1 macro score on test - 0.3283653928609622

F1 score on test - 0.2830607059035576

Accuracy score on test - 0.3951812191103789

              precision    recall  f1-score   support

           0       0.58      0.90      0.70       910
           1       0.26      0.64      0.37       742
           2       0.43      0.03      0.06      1231
           3       0.00      0.00      0.00      1211
           4       0.38      0.78      0.51       762

    accuracy                           0.40      4856
   macro avg       0.33      0.47      0.33      4856
weighted avg       0.32      0.40      0.28      4856



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Scores the saved checkpoint on the validation loader; the printed headings still say "on test".
test(val_dataset, device)

F1 macro score on test - 0.22011847140232113

F1 score on test - 0.1899953233399893

Accuracy score on test - 0.2623456790123457

              precision    recall  f1-score   support

           0       0.38      0.62      0.47       182
           1       0.17      0.44      0.24       148
           2       0.27      0.02      0.04       247
           3       0.00      0.00      0.00       243
           4       0.27      0.47      0.34       152

    accuracy                           0.26       972
   macro avg       0.22      0.31      0.22       972
weighted avg       0.21      0.26      0.19       972



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Scores the saved checkpoint on the held-out test loader.
test(test_dataset, device)

F1 macro score on test - 0.21739820120813533

F1 score on test - 0.18868241368802394

Accuracy score on test - 0.26080246913580246

              precision    recall  f1-score   support

           0       0.34      0.57      0.43       122
           1       0.17      0.40      0.24        99
           2       0.42      0.03      0.06       164
           3       0.00      0.00      0.00       161
           4       0.27      0.53      0.36       102

    accuracy                           0.26       648
   macro avg       0.24      0.31      0.22       648
weighted avg       0.24      0.26      0.19       648



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
