In [1]:
import math
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
import os
import gensim
import torch
import sentence_transformers
import matplotlib.pyplot as plt
from matplotlib import ticker
%matplotlib inline
import torch.nn as nn

from transformers import BertTokenizer, BertModel, pipeline, BartModel
from transformers import RobertaTokenizer, RobertaModel, AutoConfig

from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F

from collections import OrderedDict

import textstat

AttributeError: partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
model_name = 'roberta-base'

In [4]:
def config_model():
    configuration = AutoConfig.from_pretrained(model_name)

    configuration.hidden_dropout_prob = 0.0
    configuration.attention_probs_dropout_prob = 0.0
    configuration.classifier_dropout = 0.0
    
    return configuration

In [5]:
# ROBERTa model

tokenizer = RobertaTokenizer.from_pretrained(model_name)
embeddings_model = RobertaModel.from_pretrained(model_name, config=config_model())
summarizer = pipeline("summarization", model="bart-large-cnn")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Data

In [6]:
summaries_train_path = r"dataset\summaries_train.csv"
summaries_test_path = r"dataset\summaries_test.csv"
prompts_train_path = r"dataset\prompts_train.csv"
prompts_test_path = r"dataset\prompts_test.csv"

In [7]:
def norm_score(df: pd.DataFrame) -> pd.DataFrame:
    """Normilizing score to values 0 to 1"""
    df -= np.min(df)
    df /= df.max()
    print('Normilized' if df.min() == 0.0 and df.max() == 1.0 else 'NormError:wrong values')
    
    return df

def short_text(text, max_length=512) -> str:
    if len(text.split()) > 700:
        middle_point = text[len(text)//2:].index('.') + len(text)//2 + 1
        text_pt1 = text[:middle_point]
        text_pt2 = text[middle_point:]
        
        text = short_text(text_pt1) + short_text(text_pt2)
        
    summarized = summarizer(text, max_length=max_length, min_length=300, do_sample=False)
    
    return summarized[0]['summary_text']

def moving_average(array: np.array, betta=0.9) -> np.array:
    """
    Computing moving average with bias correction.
    """
    
    V = 0
    average_array = np.zeros(len(array))
    for i in range(len(array)):
        V = betta * V + (1 - betta) * array[i]
        average_array[i] = V/(1 - pow(betta, i+1))
        
    return average_array



def mean_pooling(outputs, batch) -> torch.tensor:
    
    attention_mask = batch['attention_mask']
    embeddings = outputs.last_hidden_state
    
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
    
    mean_pooled = summed/summed_mask
    
    return mean_pooled

def normilize(df_column: pd.DataFrame):
    
    df_column = (df_column - df_column.mean())/df_column.std()
    
    return df_column

def get_stat_features(df, text_col="text"):
    
    df["num_unique_words"] = normilize(df[text_col].apply(lambda x: len(set(x.split()))))
    df["num_words"] = normilize(df[text_col].apply(lambda x: len(x.split())))
    df["num_sentences"] = normilize(df[text_col].apply(lambda x: len(x.split('.'))))
    
    df["syntax_count"] = normilize(df[text_col].apply(lambda x: x.count(",") 
                                                      + x.count("-") + x.count(";") + x.count(":")))
    df['smog_index'] = normilize(df[text_col].apply(lambda x: textstat.smog_index(x)))
    
    
    return df

In [8]:
class SentenseData(Dataset):
    """
    :params: path to csv file with summaries, path to csv file with prompts, 'score' param\
    defines which score is used: content/wording
    """
    def __init__(self, summaries_path=summaries_train_path,
                 prompts_path=prompts_train_path,
                 score='wording', test=False, fold_test='ebad26'):
        self.test = test
        self.score_type = score
        self.fold_test = fold_test
        
        with open(summaries_path, encoding='utf-8') as f:
            summaries = pd.read_csv(f)
            summaries = get_stat_features(summaries)
            
            self.summaries = summaries
            
        with open(prompts_path, encoding='utf-8') as f:
            prompts = pd.read_csv(f)
#             prompts = prompts[prompts.prompt_id != '3b9047'].reset_index(drop=True)
            self.prompts = prompts
        
#         self.summaries.content = norm_score(self.summaries.content)
#         self.summaries.wording = norm_score(self.summaries.wording)
        

        
    def __len__(self):
        return len(self.__summaries)
    
    @property
    def summaries(self):
        return self.__summaries
    
    @summaries.setter
    def summaries(self, df):
        if self.test:
            self.__summaries = df[df.prompt_id == self.fold_test].reset_index(drop=True)
            
        else:
            self.__summaries = df[df.prompt_id != self.fold_test].reset_index(drop=True)
    
    @property
    def prompts(self):
        return self.__prompts
    
    @prompts.setter
    def prompts(self, file):
        self.__prompts = file
#         for i, text in enumerate(self.__prompts.prompt_text):
#             if len(text.split()) > 600:
#                 self.__prompts.prompt_text[i] = short_text(text)
    
    def get_batch_text(self, index):
        summary_text = self.summaries.text[index]
        
        if True:#self.score_type == 'wording':
            res = tokenizer(summary_text, padding='max_length', return_tensors='pt', truncation=True)
            res = {k:val.squeeze() for k, val in res.items()}
            res['features'] = torch.tensor([self.summaries['num_unique_words'][index], self.summaries['num_words'][index],
                                  self.summaries['num_sentences'][index],self.summaries['syntax_count'][index],
                                  self.summaries['smog_index'][index]], dtype=torch.float32)
            
            return res
            
        prompt_text = self.prompts.prompt_text[self.prompts.prompt_id ==
                                               self.summaries.prompt_id[index]].item().replace('\n','')
        
        return tokenizer([summary_text, prompt_text], padding='max_length', return_tensors='pt', truncation=True)
    
    def get_score(self, index):
        if self.score_type == 'content':
            score = self.summaries.content[index]
            
        elif self.score_type == 'wording':
            score = self.summaries.wording[index]
        
        return torch.tensor(score)
        
    def __getitem__(self, index) -> torch.tensor:

        batch_text = self.get_batch_text(index)
        batch_score = self.get_score(index)
        
        return batch_text, batch_score

In [94]:
data = SentenseData()

In [95]:
data[0];

In [96]:
train_loader = DataLoader(data,
                           batch_size=3,
                           shuffle=True, drop_last=True)

In [97]:
for batch in train_loader:
    shme = batch
    break

In [90]:
shme[0]['attention_mask'].dtype

torch.int64

In [91]:
output = embeddings_model(input_ids=shme[0]['input_ids'], attention_mask=shme[0]['attention_mask'])

In [75]:
output = mean_pooling(output, shme[0])

In [98]:
shme[0]['features']

tensor([[ 1.8927,  1.4800,  0.7691,  0.8581,  1.0899],
        [-0.2083, -0.4264, -0.2766, -0.0857,  1.0288],
        [-0.5471, -0.5946, -0.7995, -0.3216, -1.2527]])

In [77]:
output.dtype

torch.float32

In [78]:
torch.cat((output, shme[0]['features']), dim=1).dtype

torch.float64

## Model

In [9]:
class STSBertModel(nn.Module):
    """
    Sentence Semantic Similarity Bert model
    :param: seg_head=True for using segmentation head instead of cosine similarity
    :param: freeze_weights=True to freeze BERT model's weights and train only the segmentation head
    """
    def __init__(self, with_features=False, input_size=768):
        super(STSBertModel, self).__init__()
        
        self.word_embedding = RobertaModel.from_pretrained(model_name, config=config_model())
        self.cos_score = nn.CosineSimilarity(dim=0)
        self.identity = nn.Identity()
        self.input_size = input_size
        self.with_features = with_features
        
        if with_features:
            self.n_features = 5
            self.input_size += self.n_features
        
        self.attention = SelfAttention(self.input_size)
    
        self.FC_head = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(self.input_size, 512)),
            ('relu1', nn.ReLU()),
            ('fc2', nn.Linear(512, 512)),
            ('relu2', nn.ReLU()),
            ('fc3', nn.Linear(512, 256)),
            ('relu3', nn.ReLU()),
            ('fc4', nn.Linear(256, 64)),
            ('relu4', nn.ReLU()),
            ('fc5', nn.Linear(64, 1)),
            ('identity', nn.Identity())
            ]))
    
    def forward(self, x):
        output = self.word_embedding(input_ids=x['input_ids'], attention_mask=x['attention_mask'])
        output = mean_pooling(output, x)
        
        if self.with_features:            
            output = torch.cat((output, x['features']), dim=1)
        
        output = self.attention(output.unsqueeze(1))
        output = self.FC_head(output)
        
        return output

In [10]:
class SelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)
        
    def forward(self, x):
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)
        attention = self.softmax(scores)
        weighted = torch.bmm(attention, values)
        return weighted
    

In [10]:
# class STSBertModel(nn.Module):
#     """
#     Sentence Semantic Similarity Bert model
#     :param: seg_head=True for using segmentation head instead of cosine similarity
#     :param: freeze_weights=True to freeze BERT model's weights and train only the segmentation head
#     """
#     def __init__(self, seg_head=False, freeze_weights=False, input_size=768):
#         super(STSBertModel, self).__init__()
        
#         self.word_embedding = RobertaModel.from_pretrained('roberta-base', config=config_model())
#         self.cos_sim = nn.CosineSimilarity(dim=0)
#         self.identity = nn.Identity()
#         self.seg_head = seg_head
#         self.fc = nn.Sequential(OrderedDict([
#             ('fc1', nn.Linear(input_size*2, 1024)),
#             ('tanh', nn.Tanh()),
#             ('fc2', nn.Linear(1024, 256)),
#             ('tanh', nn.Tanh()),
#             ('output', nn.Linear(256, 1)),
#         ]))
        
        
#         if seg_head:
# #             self.FC_head = SbertHead(inputs=768)
#             self.FC_head = nn.Sequential(OrderedDict([
# #                 ('dropout1', nn.Dropout(0.2)),
#                 ('fc_input', nn.Linear(input_size, 1024)),
#                 ('relu1', nn.ReLU()),
#                 ('fc1', nn.Linear(1024,1024)),
# #                 ('dropout2', nn.Dropout(0.2)),
# #                 ('batch_norm1', nn.BatchNorm1d(1024)),
#                 ('relu2', nn.ReLU()),
#                 ('fc2', nn.Linear(1024, 512)),
#                 ('relu3', nn.ReLU()),
# #                 ('dropout2', nn.Dropout(0.2)),
#                 ('fc3', nn.Linear(512, 512)),
# #                 ('batch_norm2', nn.BatchNorm1d(512)),
#                 ('relu4', nn.ReLU()),
#                 ('fc4', nn.Linear(512, 256)),
# #                 ('batch_norm3', nn.BatchNorm1d(256)),
#                 ('relu5', nn.ReLU()),
# #                 ('dropout3', nn.Dropout(0.2)),
#                 ('fc5', nn.Linear(256, 64)),
#                 ('relu6', nn.ReLU()),
#                 ('fc_output', nn.Linear(64, 1))
# #                 ('activation', nn.Sigmoid())
#             ]))
            
#         if freeze_weights:
#             self.freeze()
    
#     def freeze(self):
#         for param in self.word_embedding.parameters():
#             param.requires_grad = False
    
#     @staticmethod
#     def mean_pooling(outputs, batch) -> torch.tensor:
        
#         attention_mask = batch['attention_mask']
#         embeddings = outputs.last_hidden_state
    
#         mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
#         masked_embeddings = embeddings * mask
    
#         summed = torch.sum(masked_embeddings, 1)
#         summed_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
    
#         mean_pooled = summed/summed_mask
    
#         return mean_pooled
    
#     def cos_score(self, x):
#         embedding_1 = x[0]
#         embedding_2 = x[1]
#         emb = torch.concat((embedding_1, embedding_2))
        
        
        
#         cos_score = self.fc(emb)
        
# #         cos_score = self.cos_sim(embedding_1, embedding_2)
# #         cos_score = torch.sigmoid(cos_score)
        
#         return cos_score
    
#     def forward(self, x):
#         output = self.word_embedding(**x)
#         output = self.mean_pooling(output, x)
# #         output = output[1]
        
#         if self.seg_head:
#             output = self.FC_head(output)
        
#         else:
#             output = self.cos_score(output)
        
#         return output

In [None]:
# score_type = 'wording'
# batch_size = 15

# test_data = SentenseData(test=True, score=score_type)
# train_data = SentenseData(score=score_type)

# train_loader = DataLoader(train_data,
#                            batch_size=batch_size,
#                            shuffle=True)

# test_loader = DataLoader(test_data,
#                           batch_size=batch_size,
#                           shuffle=True)

## Training

In [None]:
# model = STSBertModel(seg_head=True, freeze_weights=False)

# model.float()

# model.to(device)

In [None]:
# Loss_func = nn.MSELoss()

# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-06, weight_decay=2e-04)

# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2, gamma=0.5)

# scaler = torch.cuda.amp.GradScaler()

In [20]:
def validate_model(model, loss_func):

    losses_epoch_test = []
    
    model.eval()
    with torch.no_grad():
        for inputs, targets in test_loader:
            targets = targets.float().to(device)
            if True:
                inputs = {k:val.squeeze().to(device) for k, val in inputs.items()}
                
                with torch.no_grad():
                    outputs = model(inputs)
                    
            else:
                outputs = torch.stack([model(sentence.to(device)) for sentence in inputs])


            loss = torch.sqrt(loss_func(outputs.squeeze(), targets))

            
            losses_epoch_test.append(loss.item())
    
    return losses_epoch_test

def train_model(n_epochs, lr, weight_decay, n_fold):
    
    model = STSBertModel(with_features=True)
    
    model.float()
    model.to(device)
    
    params = [
        {'params': model.word_embedding.parameters()},
        {'params': model.attention.parameters(), 'lr': 1e-04},
        {'params': model.FC_head.parameters(), 'lr': 1e-04}
    ]
    
    loss_func = nn.MSELoss()
    optimizer = torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2, gamma=0.5)
    scaler = torch.cuda.amp.GradScaler()
    
    losses_train = []
    losses_test = []
    t0 = datetime.now()

    for i in range(n_epochs):
    
        losses_train_per_epoch = []
    
        for j, batch in enumerate(tqdm(train_loader, position=0, leave=True), 1):
            inputs, targets = batch
            targets = targets.float().to(device)
        
            model.train()
        
            optimizer.zero_grad()
        
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                inputs = {k:val.squeeze().to(device) for k, val in inputs.items()}
                outputs = model(inputs)
            
                loss = torch.sqrt(loss_func(outputs.squeeze(), targets))
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
        
            scaler.update()
        
            losses_train_per_epoch.append(loss.item())
        
            if j%15 == 0 or j == len(train_loader):
            
                losses_test_per_epoch = validate_model(model, loss_func)
                losses_test_mean = np.mean(losses_test_per_epoch)
                losses_train_mean = np.mean(losses_train_per_epoch)
            
                losses_train.append(losses_train_mean)
                losses_test.append(losses_test_mean)
            
                if losses_test_mean == np.min(losses_test):
                    best_score = losses_test_mean
                    n_iter = len(losses_train)
                    torch.save(model.state_dict(), r'models\cv\best__fold_{}.pt'.format(n_fold))
                    print(f'Best saved, loss: {best_score:.5f}')
                
                print(f'Fold# {n_fold}, Epoch: {i+1}/{n_epochs}, L_train: {losses_train_mean:.5f},\
                L_test: {losses_test_mean:.5f}') 
            
        torch.cuda.empty_cache()

    print(f"Time elapsed:{datetime.now()-t0}, Best score:{best_score:.5f}")
    
    return best_score, n_iter, losses_test, losses_train

## CV

In [21]:
def loaders_init(score_type, batch_size, fold):
    
    test_data = SentenseData(test=True, score=score_type, fold_test=fold)
    train_data = SentenseData(score=score_type, fold_test=fold)
    
    train_loader = DataLoader(train_data,
                           batch_size=batch_size,
                           shuffle=True, drop_last=True)

    test_loader = DataLoader(test_data,
                          batch_size=batch_size,
                          shuffle=True, drop_last=True)
    
    return train_loader, test_loader

In [22]:
def axis_set(ax):
    
    ax.plot(test_losses, label='test')
    ax.plot(train_losses, label='train')
    ax.scatter(n_iter - 1, best_score, label='min')
    ax.set_title(f'#{i}, min: {best_score:.5f}', loc='left')
    ax.xaxis.set_major_locator(ticker.LinearLocator(n_epochs + 1))
    ax.xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))
    
    return ax

In [23]:
# def cross_validation():
with open(prompts_train_path, encoding='utf-8') as f:
    prompts = pd.read_csv(f)
#     prompts = prompts[prompts.prompt_id != '3b9047'].reset_index(drop=True)
    
plt.figure(figsize=(12,4))

folds_list = list(prompts.prompt_id)
score_type = 'wording'
batch_size = 15
lr = 2e-06
weight_decay = 2e-02

n_epochs = 5

scores = []

fig, axs = plt.subplots(1, len(folds_list), figsize=(12,4), layout='tight')

for i, fold in enumerate(folds_list, 1):
    train_loader, test_loader = loaders_init(score_type, batch_size, fold)
    
    best_score, n_iter, test_losses, train_losses = train_model(n_epochs, lr, weight_decay, i)
    
    scores.append(best_score)
    
    axs[i-1] = axis_set(axs[i-1])
    
#     plt.subplot(1, 4, i)
#     plt.plot(test_losses, label='test')
#     plt.plot(train_losses, label='train')
#     plt.scatter(n_iter - 1, best_score, label='min')
#     plt.legend()
#     plt.title(f'#{i}, min: {best_score:.5f}')
#     plt.tight_layout()
    
mean_score = np.mean(scores)

fig.suptitle(f'{mean_score:.5f}')
fig.savefig('res_figure.png', dpi=200)

mean_score

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 29%|███████████████████████▌                                                        | 100/340 [00:37<21:34,  5.39s/it]

Best saved, loss: 0.59299
Fold# 1, Epoch: 1/5, L_train: 0.76454,                L_test: 0.59299


 59%|███████████████████████████████████████████████                                 | 200/340 [01:14<12:18,  5.27s/it]

Fold# 1, Epoch: 1/5, L_train: 0.70534,                L_test: 0.59735


 88%|██████████████████████████████████████████████████████████████████████▌         | 300/340 [01:50<03:31,  5.28s/it]

Fold# 1, Epoch: 1/5, L_train: 0.68006,                L_test: 0.61006


100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [02:16<00:00,  2.49it/s]


Best saved, loss: 0.58367
Fold# 1, Epoch: 1/5, L_train: 0.67486,                L_test: 0.58367


 29%|███████████████████████▌                                                        | 100/340 [00:36<21:07,  5.28s/it]

Fold# 1, Epoch: 2/5, L_train: 0.59780,                L_test: 0.60477


 59%|███████████████████████████████████████████████                                 | 200/340 [01:13<12:18,  5.28s/it]

Fold# 1, Epoch: 2/5, L_train: 0.59331,                L_test: 0.64144


 88%|██████████████████████████████████████████████████████████████████████▌         | 300/340 [01:50<03:32,  5.30s/it]

Fold# 1, Epoch: 2/5, L_train: 0.58479,                L_test: 0.63864


100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [02:15<00:00,  2.51it/s]


Fold# 1, Epoch: 2/5, L_train: 0.57894,                L_test: 0.62049


 29%|███████████████████████▌                                                        | 100/340 [00:36<21:06,  5.28s/it]

Fold# 1, Epoch: 3/5, L_train: 0.53232,                L_test: 0.63339


 59%|███████████████████████████████████████████████                                 | 200/340 [01:13<12:19,  5.28s/it]

Fold# 1, Epoch: 3/5, L_train: 0.53106,                L_test: 0.62436


 88%|██████████████████████████████████████████████████████████████████████▌         | 300/340 [01:50<03:31,  5.29s/it]

Fold# 1, Epoch: 3/5, L_train: 0.53512,                L_test: 0.62974


100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [02:15<00:00,  2.51it/s]


Fold# 1, Epoch: 3/5, L_train: 0.53474,                L_test: 0.64374


 29%|███████████████████████▌                                                        | 100/340 [00:36<21:07,  5.28s/it]

Fold# 1, Epoch: 4/5, L_train: 0.49416,                L_test: 0.67997


 59%|███████████████████████████████████████████████                                 | 200/340 [01:14<12:37,  5.41s/it]

Best saved, loss: 0.53988
Fold# 1, Epoch: 4/5, L_train: 0.50765,                L_test: 0.53988


 88%|██████████████████████████████████████████████████████████████████████▌         | 300/340 [01:50<03:31,  5.29s/it]

Fold# 1, Epoch: 4/5, L_train: 0.51016,                L_test: 0.73795


100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [02:15<00:00,  2.50it/s]


Fold# 1, Epoch: 4/5, L_train: 0.50802,                L_test: 0.59054


 29%|███████████████████████▌                                                        | 100/340 [00:36<21:09,  5.29s/it]

Fold# 1, Epoch: 5/5, L_train: 0.49540,                L_test: 0.72137


 59%|███████████████████████████████████████████████                                 | 200/340 [01:13<12:19,  5.28s/it]

Fold# 1, Epoch: 5/5, L_train: 0.47621,                L_test: 0.63909


 88%|██████████████████████████████████████████████████████████████████████▌         | 300/340 [01:50<03:31,  5.29s/it]

Fold# 1, Epoch: 5/5, L_train: 0.46993,                L_test: 0.71714


100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [02:15<00:00,  2.51it/s]

Fold# 1, Epoch: 5/5, L_train: 0.47096,                L_test: 0.69909
Time elapsed:0:11:19.141757, Best score:0.53988



  ax.xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 29%|███████████████████████▎                                                        | 100/343 [00:36<21:27,  5.30s/it]

Best saved, loss: 0.95058
Fold# 2, Epoch: 1/5, L_train: 0.69231,                L_test: 0.95058


 58%|██████████████████████████████████████████████▋                                 | 200/343 [01:13<12:22,  5.19s/it]

Fold# 2, Epoch: 1/5, L_train: 0.64834,                L_test: 0.95240


 87%|█████████████████████████████████████████████████████████████████████▉          | 300/343 [01:50<03:48,  5.32s/it]

Best saved, loss: 0.86919
Fold# 2, Epoch: 1/5, L_train: 0.62740,                L_test: 0.86919


100%|████████████████████████████████████████████████████████████████████████████████| 343/343 [02:15<00:00,  2.53it/s]


Best saved, loss: 0.80183
Fold# 2, Epoch: 1/5, L_train: 0.62268,                L_test: 0.80183


 29%|███████████████████████▎                                                        | 100/343 [00:36<21:00,  5.19s/it]

Fold# 2, Epoch: 2/5, L_train: 0.54030,                L_test: 0.94194


 58%|██████████████████████████████████████████████▋                                 | 200/343 [01:13<12:22,  5.19s/it]

Fold# 2, Epoch: 2/5, L_train: 0.54292,                L_test: 0.87150


 87%|█████████████████████████████████████████████████████████████████████▉          | 300/343 [01:49<03:42,  5.19s/it]

Fold# 2, Epoch: 2/5, L_train: 0.54124,                L_test: 0.92012


100%|████████████████████████████████████████████████████████████████████████████████| 343/343 [02:14<00:00,  2.55it/s]


Fold# 2, Epoch: 2/5, L_train: 0.53880,                L_test: 0.84681


 29%|███████████████████████▌                                                        | 101/343 [00:36<14:53,  3.69s/it]

Fold# 2, Epoch: 3/5, L_train: 0.50034,                L_test: 0.91416


 58%|██████████████████████████████████████████████▋                                 | 200/343 [01:13<12:21,  5.19s/it]

Fold# 2, Epoch: 3/5, L_train: 0.50820,                L_test: 0.86858


 87%|█████████████████████████████████████████████████████████████████████▉          | 300/343 [01:49<03:42,  5.19s/it]

Fold# 2, Epoch: 3/5, L_train: 0.50548,                L_test: 0.87782


100%|████████████████████████████████████████████████████████████████████████████████| 343/343 [02:14<00:00,  2.54it/s]


Fold# 2, Epoch: 3/5, L_train: 0.50333,                L_test: 0.84631


 29%|███████████████████████▎                                                        | 100/343 [00:37<21:21,  5.27s/it]

Fold# 2, Epoch: 4/5, L_train: 0.46322,                L_test: 0.84670


 58%|██████████████████████████████████████████████▋                                 | 200/343 [01:13<12:25,  5.21s/it]

Fold# 2, Epoch: 4/5, L_train: 0.47417,                L_test: 0.84333


 88%|██████████████████████████████████████████████████████████████████████▏         | 301/343 [01:50<02:35,  3.70s/it]

Fold# 2, Epoch: 4/5, L_train: 0.47616,                L_test: 0.87413


100%|████████████████████████████████████████████████████████████████████████████████| 343/343 [02:15<00:00,  2.53it/s]


Fold# 2, Epoch: 4/5, L_train: 0.47613,                L_test: 0.81361


 29%|███████████████████████▎                                                        | 100/343 [00:37<21:41,  5.36s/it]

Best saved, loss: 0.77862
Fold# 2, Epoch: 5/5, L_train: 0.43901,                L_test: 0.77862


 58%|██████████████████████████████████████████████▋                                 | 200/343 [01:14<12:41,  5.32s/it]

Best saved, loss: 0.76830
Fold# 2, Epoch: 5/5, L_train: 0.44085,                L_test: 0.76830


 87%|█████████████████████████████████████████████████████████████████████▉          | 300/343 [01:50<03:43,  5.20s/it]

Fold# 2, Epoch: 5/5, L_train: 0.43823,                L_test: 0.85116


100%|████████████████████████████████████████████████████████████████████████████████| 343/343 [02:16<00:00,  2.52it/s]

Best saved, loss: 0.76157
Fold# 2, Epoch: 5/5, L_train: 0.44069,                L_test: 0.76157
Time elapsed:0:11:17.460307, Best score:0.76157



  ax.xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 25%|███████████████████▊                                                            | 100/404 [00:29<15:27,  3.05s/it]

Best saved, loss: 0.85112
Fold# 3, Epoch: 1/5, L_train: 0.74153,                L_test: 0.85112


 50%|███████████████████████████████████████▌                                        | 200/404 [00:58<09:56,  2.92s/it]

Fold# 3, Epoch: 1/5, L_train: 0.67932,                L_test: 0.96075


 74%|███████████████████████████████████████████████████████████▍                    | 300/404 [01:27<05:17,  3.05s/it]

Best saved, loss: 0.84434
Fold# 3, Epoch: 1/5, L_train: 0.65834,                L_test: 0.84434


 99%|███████████████████████████████████████████████████████████████████████████████▏| 400/404 [01:57<00:11,  2.94s/it]

Fold# 3, Epoch: 1/5, L_train: 0.64043,                L_test: 0.90667


100%|████████████████████████████████████████████████████████████████████████████████| 404/404 [02:07<00:00,  3.17it/s]


Best saved, loss: 0.83474
Fold# 3, Epoch: 1/5, L_train: 0.64027,                L_test: 0.83474


 25%|███████████████████▊                                                            | 100/404 [00:29<15:32,  3.07s/it]

Best saved, loss: 0.79516
Fold# 3, Epoch: 2/5, L_train: 0.56441,                L_test: 0.79516


 50%|███████████████████████████████████████▌                                        | 200/404 [00:59<10:12,  3.00s/it]

Fold# 3, Epoch: 2/5, L_train: 0.54971,                L_test: 0.79546


 74%|███████████████████████████████████████████████████████████▍                    | 300/404 [01:28<05:06,  2.95s/it]

Fold# 3, Epoch: 2/5, L_train: 0.54517,                L_test: 0.83928


 99%|███████████████████████████████████████████████████████████████████████████████▏| 400/404 [01:58<00:12,  3.08s/it]

Best saved, loss: 0.77133
Fold# 3, Epoch: 2/5, L_train: 0.54379,                L_test: 0.77133


100%|████████████████████████████████████████████████████████████████████████████████| 404/404 [02:08<00:00,  3.15it/s]


Fold# 3, Epoch: 2/5, L_train: 0.54281,                L_test: 0.85319


 25%|███████████████████▊                                                            | 100/404 [00:29<15:31,  3.06s/it]

Best saved, loss: 0.75803
Fold# 3, Epoch: 3/5, L_train: 0.50700,                L_test: 0.75803


 50%|███████████████████████████████████████▌                                        | 200/404 [00:59<10:26,  3.07s/it]

Best saved, loss: 0.70137
Fold# 3, Epoch: 3/5, L_train: 0.50180,                L_test: 0.70137


 74%|███████████████████████████████████████████████████████████▍                    | 300/404 [01:28<05:09,  2.98s/it]

Fold# 3, Epoch: 3/5, L_train: 0.50535,                L_test: 0.78837


 99%|███████████████████████████████████████████████████████████████████████████████▏| 400/404 [01:57<00:11,  2.92s/it]

Fold# 3, Epoch: 3/5, L_train: 0.50864,                L_test: 0.77399


100%|████████████████████████████████████████████████████████████████████████████████| 404/404 [02:07<00:00,  3.17it/s]


Fold# 3, Epoch: 3/5, L_train: 0.50809,                L_test: 0.70294


 25%|████████████████████                                                            | 101/404 [00:29<11:04,  2.19s/it]

Best saved, loss: 0.69404
Fold# 3, Epoch: 4/5, L_train: 0.48043,                L_test: 0.69404


 50%|███████████████████████████████████████▌                                        | 200/404 [00:58<09:57,  2.93s/it]

Fold# 3, Epoch: 4/5, L_train: 0.48433,                L_test: 0.74438


 74%|███████████████████████████████████████████████████████████▍                    | 300/404 [01:27<05:19,  3.07s/it]

Best saved, loss: 0.69090
Fold# 3, Epoch: 4/5, L_train: 0.48192,                L_test: 0.69090


 99%|███████████████████████████████████████████████████████████████████████████████▏| 400/404 [01:57<00:11,  2.92s/it]

Fold# 3, Epoch: 4/5, L_train: 0.47749,                L_test: 0.74878


100%|████████████████████████████████████████████████████████████████████████████████| 404/404 [02:06<00:00,  3.18it/s]


Fold# 3, Epoch: 4/5, L_train: 0.47739,                L_test: 0.71954


 25%|███████████████████▊                                                            | 100/404 [00:29<14:46,  2.92s/it]

Fold# 3, Epoch: 5/5, L_train: 0.44186,                L_test: 0.69317


 50%|███████████████████████████████████████▌                                        | 200/404 [00:58<10:01,  2.95s/it]

Fold# 3, Epoch: 5/5, L_train: 0.45098,                L_test: 0.75706


 74%|███████████████████████████████████████████████████████████▍                    | 300/404 [01:27<05:04,  2.93s/it]

Fold# 3, Epoch: 5/5, L_train: 0.45176,                L_test: 0.70975


 99%|███████████████████████████████████████████████████████████████████████████████▏| 400/404 [01:56<00:11,  2.93s/it]

Fold# 3, Epoch: 5/5, L_train: 0.45168,                L_test: 0.70752


100%|████████████████████████████████████████████████████████████████████████████████| 404/404 [02:06<00:00,  3.20it/s]

Fold# 3, Epoch: 5/5, L_train: 0.45178,                L_test: 0.70371
Time elapsed:0:10:36.223476, Best score:0.69090



  ax.xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 29%|███████████████████████▎                                                        | 100/344 [00:37<21:39,  5.32s/it]

Best saved, loss: 0.67078
Fold# 4, Epoch: 1/5, L_train: 0.75498,                L_test: 0.67078


 58%|██████████████████████████████████████████████▌                                 | 200/344 [01:14<12:45,  5.32s/it]

Best saved, loss: 0.62521
Fold# 4, Epoch: 1/5, L_train: 0.71601,                L_test: 0.62521


 87%|█████████████████████████████████████████████████████████████████████▊          | 300/344 [01:51<03:54,  5.33s/it]

Best saved, loss: 0.61713
Fold# 4, Epoch: 1/5, L_train: 0.68285,                L_test: 0.61713


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:16<00:00,  2.52it/s]


Fold# 4, Epoch: 1/5, L_train: 0.67379,                L_test: 0.62227


 29%|███████████████████████▎                                                        | 100/344 [00:37<21:49,  5.37s/it]

Best saved, loss: 0.61640
Fold# 4, Epoch: 2/5, L_train: 0.59737,                L_test: 0.61640


 58%|██████████████████████████████████████████████▌                                 | 200/344 [01:13<12:28,  5.20s/it]

Fold# 4, Epoch: 2/5, L_train: 0.57960,                L_test: 0.62284


 87%|█████████████████████████████████████████████████████████████████████▊          | 300/344 [01:50<03:53,  5.31s/it]

Best saved, loss: 0.61522
Fold# 4, Epoch: 2/5, L_train: 0.59003,                L_test: 0.61522


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:16<00:00,  2.52it/s]


Fold# 4, Epoch: 2/5, L_train: 0.58900,                L_test: 0.64566


 29%|███████████████████████▎                                                        | 100/344 [00:36<21:08,  5.20s/it]

Fold# 4, Epoch: 3/5, L_train: 0.53542,                L_test: 0.61683


 58%|██████████████████████████████████████████████▌                                 | 200/344 [01:13<12:44,  5.31s/it]

Best saved, loss: 0.58757
Fold# 4, Epoch: 3/5, L_train: 0.54168,                L_test: 0.58757


 87%|█████████████████████████████████████████████████████████████████████▊          | 300/344 [01:49<03:47,  5.17s/it]

Fold# 4, Epoch: 3/5, L_train: 0.54662,                L_test: 0.61968


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:15<00:00,  2.54it/s]


Fold# 4, Epoch: 3/5, L_train: 0.54795,                L_test: 0.59979


 29%|███████████████████████▎                                                        | 100/344 [00:36<21:03,  5.18s/it]

Fold# 4, Epoch: 4/5, L_train: 0.49968,                L_test: 0.60405


 58%|██████████████████████████████████████████████▌                                 | 200/344 [01:13<12:26,  5.18s/it]

Fold# 4, Epoch: 4/5, L_train: 0.50949,                L_test: 0.63097


 87%|█████████████████████████████████████████████████████████████████████▊          | 300/344 [01:50<03:54,  5.34s/it]

Best saved, loss: 0.58463
Fold# 4, Epoch: 4/5, L_train: 0.51230,                L_test: 0.58463


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:15<00:00,  2.53it/s]


Fold# 4, Epoch: 4/5, L_train: 0.51270,                L_test: 0.59484


 29%|███████████████████████▎                                                        | 100/344 [00:36<21:03,  5.18s/it]

Fold# 4, Epoch: 5/5, L_train: 0.48952,                L_test: 0.59773


 58%|██████████████████████████████████████████████▌                                 | 200/344 [01:13<12:38,  5.27s/it]

Fold# 4, Epoch: 5/5, L_train: 0.49002,                L_test: 0.59655


 87%|█████████████████████████████████████████████████████████████████████▊          | 300/344 [01:50<03:49,  5.22s/it]

Fold# 4, Epoch: 5/5, L_train: 0.48588,                L_test: 0.62019


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:16<00:00,  2.52it/s]

Best saved, loss: 0.57768
Fold# 4, Epoch: 5/5, L_train: 0.48043,                L_test: 0.57768
Time elapsed:0:11:20.876812, Best score:0.57768
Unexpected exception formatting exception. Falling back to standard exception



  ax.xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))
Traceback (most recent call last):
  File "C:\Users\shmak\miniconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
  File "C:\Users\shmak\AppData\Local\Temp\ipykernel_25720\3764877995.py", line 39, in <module>
    fig.suptitle(f'{mean_score:.5f}', loc='left')
  File "C:\Users\shmak\miniconda3\envs\tf\lib\site-packages\matplotlib\figure.py", line 381, in suptitle
    return self._suplabels(t, info, **kwargs)
  File "C:\Users\shmak\miniconda3\envs\tf\lib\site-packages\matplotlib\figure.py", line 360, in _suplabels
    sup = self.text(x, y, t, **kwargs)
  File "C:\Users\shmak\miniconda3\envs\tf\lib\site-packages\matplotlib\figure.py", line 1166, in text
    text = Text(x=x, y=y, text=s, **effective_kwargs)
  File "C:\Users\shmak\miniconda3\envs\tf\lib\site-packages\matplotlib\_api\deprecation.py", line 454, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\shmak\minicon

<Figure size 1200x400 with 0 Axes>

In [24]:
fig.suptitle(f'{mean_score:.5f}')

In [26]:
fig.savefig('res_figure.png', dpi=200)

In [None]:
# fig, axs = plt.subplots(1, len(fold_list), figsize=(12,4), layout='tight')

# axs[i].plot(test_losses, label='test')
# axs[i].plot(train_losses, label='train')
# axs[i].scatter(n_iter - 1, best_score, label='min')
# axs[i].set_title(f'#{i}, min: {best_score:.5f}', loc=left)
# axs[i].xaxis.set_major_locator(ticker.LinearLocator(n_epochs + 1))
# axs[i].xaxis.set_major_formatter(ticker.FixedFormatter(range(n_epochs)))

In [1]:
scores

NameError: name 'scores' is not defined

In [14]:
torch.cuda.empty_cache()

In [13]:
folds_list

['39c16e', '814d6b', 'ebad26']