In [1]:
import math
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
import os
import gensim
import torch
import sentence_transformers
import matplotlib.pyplot as plt
%matplotlib inline
import torch.nn as nn

from transformers import BertTokenizer, BertModel, pipeline, BartModel
from transformers import RobertaTokenizer, RobertaModel, AutoConfig, RobertaConfig
from transformers import AutoTokenizer, AutoModel

from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F

from collections import OrderedDict

import textstat

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
model_name = 'roberta-large'

In [4]:
def config_model():
    configuration = AutoConfig.from_pretrained(model_name)

    configuration.hidden_dropout_prob = 0.0
    configuration.attention_probs_dropout_prob = 0.0
    configuration.classifier_dropout = 0.0

    return configuration

config = config_model()
size = config.hidden_size

tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Objects and functions

In [5]:
summaries_train_path = r"dataset\summaries_train.csv"
summaries_test_path = r"dataset\summaries_test.csv"
prompts_train_path = r"dataset\prompts_train.csv"
prompts_test_path = r"dataset\prompts_test.csv"

In [6]:
def norm_score(df: pd.DataFrame) -> pd.DataFrame:
    """Normilizing score to values 0 to 1"""
    df -= np.min(df)
    df /= df.max()
    print('Normilized' if df.min() == 0.0 and df.max() == 1.0 else 'NormError:wrong values')
    
    return df

def short_text(text, max_length=400) -> str:
    if len(text.split()) > 700:
        middle_point = text[len(text)//2:].index('.') + len(text)//2 + 1
        text_pt1 = text[:middle_point]
        text_pt2 = text[middle_point:]
        
        text = short_text(text_pt1) + short_text(text_pt2)
        
    summarized = summarizer(text, max_length=max_length, min_length=200, do_sample=False)
    
    return summarized[0]['summary_text']

def moving_average(array: np.array, betta=0.9) -> np.array:
    """
    Computing moving average with bias correction.
    """
    
    V = 0
    average_array = np.zeros(len(array))
    for i in range(len(array)):
        V = betta * V + (1 - betta) * array[i]
        average_array[i] = V/(1 - pow(betta, i+1))
        
    return average_array



def mean_pooling(outputs, batch) -> torch.tensor:
    
    attention_mask = batch['attention_mask']
    embeddings = outputs.last_hidden_state
    
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask
    
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
    
    mean_pooled = summed/summed_mask
    
    return mean_pooled

def normilize(df_column: pd.DataFrame):
    
    df_column = (df_column - df_column.mean())/df_column.std()
    
    return df_column

def get_stat_features(df, text_col="text"):
    
    df["num_unique_words"] = normilize(df[text_col].apply(lambda x: len(set(x.split()))))
    df["num_words"] = normilize(df[text_col].apply(lambda x: len(x.split())))
    df["num_sentences"] = normilize(df[text_col].apply(lambda x: len(x.split('.'))))
    
    df["syntax_count"] = normilize(df[text_col].apply(lambda x: x.count(",") 
                                                      + x.count("-") + x.count(";") + x.count(":")))
    df['smog_index'] = normilize(df[text_col].apply(lambda x: textstat.smog_index(x)))
    
    
    return df

In [7]:
# All data

class SentenseData(Dataset):
    """
    :params: path to csv file with summaries, path to csv file with prompts, 'score' param\
    defines which score is used: content/wording
    """
    def __init__(self, summaries_path=summaries_train_path,
                 prompts_path=prompts_train_path,
                 score='content'):
        
        with open(summaries_path, encoding='utf-8') as f:
            summaries = pd.read_csv(f)
            summaries = get_stat_features(summaries)
            
            self.summaries = summaries
            
        with open(prompts_path, encoding='utf-8') as f:
            self.prompts = pd.read_csv(f)
        
        self.score_type = score
        
    def __len__(self):
        return len(self.__summaries)
    
    @property
    def summaries(self):
        return self.__summaries
    
    @summaries.setter
    def summaries(self, df):
        # df = df[df.prompt_id != '3b9047'].reset_index(drop=True)
        self.__summaries = df

    @property
    def prompts(self):
        return self.__prompts
    
    @prompts.setter
    def prompts(self, file):
        # file = file[file.prompt_id != '3b9047'].reset_index(drop=True)
        self.__prompts = file
    
    def get_batch_text(self, index):
        summary_text = self.summaries.text[index]

        res = tokenizer(summary_text, padding='max_length', return_tensors='pt', truncation=True)
        res = {k:val.squeeze() for k, val in res.items()}
        res['features'] = torch.tensor([self.summaries['num_unique_words'][index], self.summaries['num_words'][index],
                                  self.summaries['num_sentences'][index],self.summaries['syntax_count'][index],
                                  self.summaries['smog_index'][index]], dtype=torch.float32)
            
        return res
        
    def get_score(self, index):
        if self.score_type == 'content':
            score = self.summaries.content[index]
            
        elif self.score_type == 'wording':
            score = self.summaries.wording[index]
        
        return torch.tensor(score)
        
    def __getitem__(self, index) -> torch.tensor:

        batch_text = self.get_batch_text(index)
        batch_score = self.get_score(index)
        
        return batch_text, batch_score

In [8]:
# class SentenseData(Dataset):
#     """
#     :params: path to csv file with summaries, path to csv file with prompts, 'score' param\
#     defines which score is used: content/wording
#     """
#     def __init__(self, summaries_path=summaries_train_path,
#                  prompts_path=prompts_train_path,
#                  score='content', test=False):
#         self.test = test
        
#         with open(summaries_path, encoding='utf-8') as f:
#             self.summaries = pd.read_csv(f)
            
#         with open(prompts_path, encoding='utf-8') as f:
#             self.prompts = pd.read_csv(f)
        
# #         self.summaries.content = norm_score(self.summaries.content)
# #         self.summaries.wording = norm_score(self.summaries.wording)
        
#         self.score_type = score
        
#     def __len__(self):
#         return len(self.__summaries)
    
#     @property
#     def summaries(self):
#         return self.__summaries
    
#     @summaries.setter
#     def summaries(self, df):
#         if self.test:
#             self.__summaries = df[df.prompt_id == '39c16e'].reset_index(drop=True)
            
#         else:
#             self.__summaries = df[df.prompt_id != '39c16e'].reset_index(drop=True)
    
#     @property
#     def prompts(self):
#         return self.__prompts
    
#     @prompts.setter
#     def prompts(self, file):
#         self.__prompts = file
# #         for i, text in enumerate(self.__prompts.prompt_text):
# #             if len(text.split()) > 600:
# #                 self.__prompts.prompt_text[i] = short_text(text)
    
#     def get_batch_text(self, index):
#         summary_text = self.summaries.text[index]
        
#         if True:#self.score_type == 'wording':
#             res = tokenizer(summary_text, padding='max_length', return_tensors='pt', truncation=True)
#             return {k:val.squeeze() for k, val in res.items()}
        
#         prompt_text = self.prompts.prompt_text[self.prompts.prompt_id ==
#                                                self.summaries.prompt_id[index]].item().replace('\n','')
        
#         return tokenizer([summary_text, prompt_text], padding='max_length', return_tensors='pt', truncation=True)
    
#     def get_score(self, index):
#         if self.score_type == 'content':
#             score = self.summaries.content[index]
            
#         elif self.score_type == 'wording':
#             score = self.summaries.wording[index]
        
#         return torch.tensor(score)
        
#     def __getitem__(self, index) -> torch.tensor:

#         batch_text = self.get_batch_text(index)
#         batch_score = self.get_score(index)
        
#         return batch_text, batch_score

In [9]:
# class CosineSimilarityLoss(nn.Module):
#     def __init__(self):
#         super(CosineSimilarityLoss, self).__init__()
#         self.loss_func = nn.MSELoss()
#         self.identity = nn.Identity()
#         self.cos_sim = nn.CosineSimilarity(dim=1)
        
#     def forward(self, x, y):
#         embedding_1 = torch.stack([sentence[0] for sentence in x])
#         embedding_2 = torch.stack([sentence[1] for sentence in x])
        
#         cos_score = self.cos_sim(embedding_1, embedding_2)
#         cos_score = self.identity(cos_score)
        
#         loss = self.loss_func(x, y)
        
#         return loss

In [10]:
# # Реализовать батч лоадер, возвращающий с каждой итерацией батч - лист с bs количеством словарей
# # Нужно, чтобы умел с шафлом
#
# class BatchLoader(DataLoader):
#     def __init__(self, dataset, batch_size=1, shuffle=False, drop=False):
# #         super(BatchLoader, self).__init__()
#
#         self.dataset = dataset
#         self.batch_size = batch_size
#         self.shuffle = shuffle
#         self.drop = drop
#         self.dataset_len = len(dataset)
#
#     def __len__(self):
#         return math.ceil(self.dataset_len/self.batch_size)
#
#     def __iter__(self):
#         self.index = list(range(self.dataset_len))
#         if self.shuffle:
#             np.random.shuffle(self.index)
#         return self
#
#     def __next__(self):
#         batch_index = []
#
#         if self.batch_size >= len(self.index):
#             if self.drop or not self.index:
#                 raise StopIteration
#             else:
#                 batch_index = self.index.copy()
#                 self.index.clear()
#         else:
#             for _ in range(self.batch_size):
#                 batch_index.append(self.index.pop())
#
#         batch = []
#         target = []
#
#         for i in batch_index:
#             batch.append(self.dataset[i][0])
#             target.append(self.dataset[i][1])
#
#         return batch, torch.tensor(target)
#
#     def __getitem__(self, index):
#         batch = self.dataset[index][0]
#         target = self.dataset[index][1]
#
#         return batch, target

In [11]:
class STSBertModel(nn.Module):
    """
    Sentence Semantic Similarity Bert model
    :param: seg_head=True for using segmentation head instead of cosine similarity
    :param: freeze_weights=True to freeze BERT model's weights and train only the segmentation head
    """
    def __init__(self, with_features=False, input_size=size):
        super(STSBertModel, self).__init__()

        self.word_embedding = RobertaModel.from_pretrained(model_name, config=config)
        self.cos_score = nn.CosineSimilarity(dim=0)
        self.identity = nn.Identity()
        self.input_size = input_size
        self.with_features = with_features

        if with_features:
            self.n_features = 5
            self.input_size += self.n_features

        # for param in self.word_embedding.parameters():
        #     param.requires_grad = False

        self.attention = SelfAttention(self.input_size)
        self.block = AttentionBlock(self.input_size)

        self.FC_head = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(self.input_size, self.input_size)),
            ('relu1', nn.ReLU()),
            # ('fc2', nn.Linear(1024, self.input_size)),
            # ('relu2', nn.ReLU())
            ]))

        self.FC_output = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(self.input_size, 1)),
            ]))

    def forward(self, x):
        output = self.word_embedding(input_ids=x['input_ids'], attention_mask=x['attention_mask'])
        output = mean_pooling(output, x)

        if self.with_features:
            output = torch.cat((output, x['features']), dim=1)

        output = output.unsqueeze(1)
        output = self.attention(output)
        output = self.FC_output(output)

        return output


class AttentionBlock(nn.Module):
    def __init__(self, input_dim):
        super(AttentionBlock, self).__init__()

        self.input_dim = input_dim
        self.fc = nn.Linear(input_dim, input_dim)
        self.attention = SelfAttention(input_dim)
        self.relu = nn.ReLU()
        self.norm = nn.LayerNorm(input_dim)

    def forward(self, x):
        identity = x

        x = self.attention(x)
        x = self.fc(x)

        x += identity
        x = self.norm(x)
        x = self.relu(x)

        return x


class SelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)
        attention = self.softmax(scores)
        weighted = torch.bmm(attention, values)
        return weighted


In [12]:
# # MOdel without features

# class STSBertModel(nn.Module):
#     """
#     Sentence Semantic Similarity Bert model
#     :param: seg_head=True for using segmentation head instead of cosine similarity
#     :param: freeze_weights=True to freeze BERT model's weights and train only the segmentation head
#     """
#     def __init__(self, seg_head=False, freeze_weights=False, input_size=768):
#         super(STSBertModel, self).__init__()
        
#         self.word_embedding = embeddings_model
#         self.cos_sim = nn.CosineSimilarity(dim=0)
#         self.identity = nn.Identity()
#         self.seg_head = seg_head
#         self.input_size = input_size
        
#         self.attention = SelfAttention(self.input_size)
#         self.FC_head = nn.Sequential(OrderedDict([
#             ('fc1', nn.Linear(self.input_size, 1))
#         ]))
            
#         if freeze_weights:
#             self.freeze()
    
#     def freeze(self):
#         for param in self.word_embedding.parameters():
#             param.requires_grad = False
    
#     def cos_score(self, x):
#         embedding_1 = x[0]
#         embedding_2 = x[1]
        
#         res = self.cos_sim(embedding_1, embedding_2)
# #         cos_score = torch.sigmoid(cos_score)

#         return res
    
#     def forward(self, x):
#         output = self.word_embedding(**x)
#         output = mean_pooling(output, x)
#         output = self.attention(output.unsqueeze(1))

# #         output = self.attention(output[0])
# #         output = torch.mean(output, dim=1)
# #         output = torch.max(output, dim=1).values
    
#         if self.seg_head:            
#             output = self.FC_head(output)
        
#         else:
#             output = self.cos_score(output)
#             output = (output * 6.1) - 1.8
        
#         return output

In [13]:
# class STSBertModel(nn.Module):
#     """
#     Sentence Semantic Similarity Bert model
#     :param: seg_head=True for using segmentation head instead of cosine similarity
#     :param: freeze_weights=True to freeze BERT model's weights and train only the segmentation head
#     """
#     def __init__(self, seg_head=False, freeze_weights=False, input_size=768):
#         super(STSBertModel, self).__init__()
        
#         self.word_embedding = embeddings_model
#         self.cos_sim = nn.CosineSimilarity(dim=0)
#         self.identity = nn.Identity()
#         self.seg_head = seg_head
#         self.fc = nn.Sequential(OrderedDict([
#             ('fc1', nn.Linear(input_size*2, 1024)),
#             ('tanh', nn.Tanh()),
#             ('fc2', nn.Linear(1024, 256)),
#             ('tanh', nn.Tanh()),
#             ('output', nn.Linear(256, 1)),
#         ]))
        
        
#         if seg_head:
# #             self.FC_head = SbertHead(inputs=768)
#             self.FC_head = nn.Sequential(OrderedDict([
# #                 ('dropout1', nn.Dropout(0.2)),
#                 ('fc_input', nn.Linear(input_size, 1024)),
#                 ('relu1', nn.ReLU()),
#                 ('fc1', nn.Linear(1024,1024)),
# #                 ('dropout2', nn.Dropout(0.2)),
# #                 ('batch_norm1', nn.BatchNorm1d(1024)),
#                 ('relu2', nn.ReLU()),
#                 ('fc2', nn.Linear(1024, 512)),
#                 ('relu3', nn.ReLU()),
# #                 ('dropout2', nn.Dropout(0.2)),
#                 ('fc3', nn.Linear(512, 512)),
# #                 ('batch_norm2', nn.BatchNorm1d(512)),
#                 ('relu4', nn.ReLU()),
#                 ('fc4', nn.Linear(512, 256)),
# #                 ('batch_norm3', nn.BatchNorm1d(256)),
#                 ('relu5', nn.ReLU()),
# #                 ('dropout3', nn.Dropout(0.2)),
#                 ('fc5', nn.Linear(256, 64)),
#                 ('relu6', nn.ReLU()),
#                 ('fc_output', nn.Linear(64, 1))
# #                 ('activation', nn.Sigmoid())
#             ]))
            
#         if freeze_weights:
#             self.freeze()
    
#     def freeze(self):
#         for param in self.word_embedding.parameters():
#             param.requires_grad = False
    
#     @staticmethod
#     def mean_pooling(outputs, batch) -> torch.tensor:
        
#         attention_mask = batch['attention_mask']
#         embeddings = outputs.last_hidden_state
    
#         mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
#         masked_embeddings = embeddings * mask
    
#         summed = torch.sum(masked_embeddings, 1)
#         summed_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
    
#         mean_pooled = summed/summed_mask
    
#         return mean_pooled
    
#     def cos_score(self, x):
#         embedding_1 = x[0]
#         embedding_2 = x[1]
#         emb = torch.concat((embedding_1, embedding_2))
        
        
        
#         cos_score = self.fc(emb)
        
# #         cos_score = self.cos_sim(embedding_1, embedding_2)
# #         cos_score = torch.sigmoid(cos_score)
        
#         return cos_score
    
#     def forward(self, x):
#         output = self.word_embedding(**x)
#         output = self.mean_pooling(output, x)
# #         output = output[1]
        
#         if self.seg_head:
#             output = self.FC_head(output)
        
#         else:
#             output = self.cos_score(output)
        
#         return output

In [14]:
# # Полносвязная модель - принимает тензор с эмбедингами текстов
# # Выдает 1 число [0, 1]

# class SbertHead(nn.Module):
#     def __init__(self, inputs=1024):
#         super(SbertHead, self).__init__()
        
#         self.FC_input = nn.Linear(inputs, 1024)
#         self.FC_hidden_0 = nn.Linear(1024, 1024)
#         self.FC_hidden_1 = nn.Linear(1024, 512)
#         self.FC_hidden_2 = nn.Linear(512, 512)
#         self.FC_hidden_3 = nn.Linear(512, 256)
#         self.FC_hidden_4 = nn.Linear(256, 64)
#         self.FC_output = nn.Linear(64, 1)
#         self.activation = nn.Sigmoid()
#         self.batch_norm = nn.BatchNorm1d(1024)
#         self.batch_norm_2 = nn.BatchNorm1d(512)
#         self.batch_norm_3 = nn.BatchNorm1d(64)

#     def forward(self, x: torch.tensor) -> torch.tensor:
        
#         x = F.relu(self.FC_input(x))
#         x = F.relu(self.FC_hidden_0(x))
#         x = self.batch_norm(x)
#         x = F.relu(self.FC_hidden_1(x))
#         x = F.relu(self.FC_hidden_2(x))
#         x = self.batch_norm_2(x)
#         x = F.relu(self.FC_hidden_3(x))
#         x = F.relu(self.FC_hidden_4(x))
#         x = self.batch_norm_3(x)
        
#         x = self.FC_output(x)
#         x = self.activation(x)

#         return x

# Data loading

In [15]:
score_type = 'content'

train_data = SentenseData(score=score_type)
# test_data = SentenseData(test=True, score=score_type)
# train_data = SentenseData(score=score_type)

In [16]:
batch_size = 5


train_loader = DataLoader(train_data,
                           batch_size=batch_size,
                           shuffle=True, drop_last=True)

# test_loader = DataLoader(test_data,
#                           batch_size=batch_size,
#                           shuffle=True)

In [17]:
# dataset = SentenseData(summaries_train_path, prompts_train_path)

In [18]:
# train_size = int(0.8*len(dataset))
# test_size = len(dataset) - train_size

# train_data, test_data = random_split(dataset, [train_size, test_size])

# # torch.save(train_data, 'dataset\TRAIN_DATA_split_1.pt')
# # torch.save(test_data, 'dataset\TEST_DATA_split_1.pt')

In [19]:
# train_data = torch.load('dataset\TRAIN_DATA_split_1.pt')
# test_data = torch.load('dataset\TEST_DATA_split_1.pt')

## Model Train

In [20]:
model = STSBertModel(with_features=True)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model.float()

model.to(device)

STSBertModel(
  (word_embedding): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNo

In [22]:
 lr_head = 5e-05

 params = [
        {'params': model.word_embedding.parameters()},
        {'params': model.attention.parameters(), 'lr': lr_head},
        {'params': model.block.parameters(), 'lr': lr_head},
        {'params': model.FC_head.parameters(), 'lr': lr_head},
        {'params': model.FC_output.parameters(), 'lr': lr_head}
    ]

Loss_func = nn.MSELoss()

optimizer = torch.optim.AdamW(params, lr=2e-06, weight_decay=1e-02)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.7)

scaler = torch.cuda.amp.GradScaler()

In [23]:
def validate_model():

    losses_epoch_test = []
    
    model.eval()
    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, position=0, leave=True):
            targets = targets.float().to(device)
            
            if True:#score_type == 'wording':
                inputs = {k:val.squeeze().to(device) for k, val in inputs.items()}
                outputs = model(inputs)
                
                loss = torch.sqrt(Loss_func(outputs.squeeze(), targets))
            else:
                outputs = torch.stack([model(sentence.to(device)) for sentence in inputs])              
                loss = torch.sqrt(Loss_func(outputs, targets))

            
            losses_epoch_test.append(loss.item())
    
    return losses_epoch_test


n_epochs = 3

losses_train = []
losses_test = []
t0 = datetime.now()

for i in range(n_epochs):
    
    losses_train_per_epoch = []
    
    for j, batch in enumerate(tqdm(train_loader, position=0, leave=True), 1):
        inputs, targets = batch
        targets = targets.float().to(device)
        
        model.train()
        
        optimizer.zero_grad()
        
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            if True:#score_type == 'wording':
                inputs = {k:val.squeeze().to(device) for k, val in inputs.items()}
                outputs = model(inputs)
                loss = torch.sqrt(Loss_func(outputs.squeeze(), targets))
            else:
                outputs = torch.stack([model(sentence.to(device)) for sentence in inputs])           
                loss = torch.sqrt(Loss_func(outputs, targets))
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        
        scaler.update()
        
        losses_train_per_epoch.append(loss.item())
        
#         if j%20 == 0 or j == len(train_loader):
            
#             losses_test_per_epoch = validate_model()
#             losses_test_mean = np.mean(losses_test_per_epoch)
#             losses_train_mean = np.mean(losses_train_per_epoch)
            
#             losses_train.append(losses_train_mean)
#             losses_test.append(losses_test_mean)
            
            
#             if losses_test_mean == np.min(losses_test):
#                 best_score = losses_test_mean
#                 n_iter = len(losses_train)
#                 torch.save(model.state_dict(), 'best__.pt')
#                 print(f'Best saved, loss: {best_score:.5f}')
    lr_scheduler.step()

    print(f'Epoch: {i+1}/{n_epochs}, Iter: {len(losses_train)}, L_train: {np.mean(losses_train_per_epoch):.5f}')#,\
            #L_test: {losses_test_mean:.5f}') 
            
    torch.cuda.empty_cache()

#print(f"Time elapsed:{datetime.now()-t0}, Best score:{best_score:.5f} at {n_iter} iteration")

100%|██████████| 1433/1433 [27:14<00:00,  1.14s/it]


Epoch: 1/3, Iter: 0, L_train: 0.45529


100%|██████████| 1433/1433 [24:02<00:00,  1.01s/it]


Epoch: 2/3, Iter: 0, L_train: 0.38083


100%|██████████| 1433/1433 [15:14<00:00,  1.57it/s]

Epoch: 3/3, Iter: 0, L_train: 0.34577





In [25]:
torch.save(model.state_dict(), 'models\Roberta_large_content_8.pt')