In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from torch.nn.functional import mse_loss
from transformers import AutoModel,AutoTokenizer,get_cosine_schedule_with_warmup, AutoConfig, AdamW

from colorama import Fore, Back, Style
# Short aliases for colorama ANSI colour codes, interpolated into printed log messages.
r_ = Fore.RED
b_ = Fore.BLUE
g_ = Fore.GREEN
# Style reset, appended after a coloured span.
sr_ = Style.RESET_ALL

In [None]:
# Training and validation frames are read from separate CSV files.
train_df = pd.read_csv('../input/train-val-split/train.csv')
val_df = pd.read_csv('../input/train-val-split/val.csv')

# kfold_df = pd.read_csv('../input/train-val-split/kfold_data.csv')
# Auxiliary texts, indexed by the CSV's 'index' column. Every 'aux_text' cell is parsed
# with the built-in `eval`, i.e. the cell content is executed as a Python expression.
# NOTE(review): `eval` runs arbitrary code from the file; if the cells are plain list
# literals (presumably they are — confirm), `ast.literal_eval` is the safe equivalent.
aux_df = pd.read_csv('../input/clrauxdata/aux_data_embed.csv', index_col='index', converters={'aux_text': eval})


In [None]:
class Config:
    """Notebook-wide hyper-parameters and paths, read as class attributes."""
    # Name used to load the tokenizer and the architecture config.
    model_name = 'roberta-base'
    # Directory holding the weights the encoder is initialised from.
    pretrained_model_path = '../input/clrp-roberta-pretrain/clrp_roberta_base'
    output_hidden_states = True
    epochs = 3
    # Validation is run every `evaluate_interval` training steps.
    evaluate_interval = 10
    batch_size = 8
    # Fall back to CPU when no GPU is visible instead of failing on the first `.to('cuda')`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    seed = 42
    # Token length every excerpt is padded / truncated to.
    max_len = 256
    lr = 2e-5
    wd = 0.01


def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and all GPUs).
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED. Setting it here only affects
    # processes spawned afterwards, not the interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it has to be
    # off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, using the notebook-wide seed from Config.
seed_everything(seed=Config.seed)



In [None]:

def convert_examples_to_features(text, tokenizer, max_len):
    """Encode a single text with `tokenizer.encode_plus`.

    The tokenizer is asked to truncate to `max_len` and to pad up to `max_len`.

    Args:
        text: The raw text to encode.
        tokenizer: Object exposing an `encode_plus` method.
        max_len: Value forwarded as `max_length`.

    Returns:
        Whatever `tokenizer.encode_plus` returns.
    """
    encode_options = dict(
        max_length=max_len,
        truncation=True,
        padding='max_length',
    )
    return tokenizer.encode_plus(text, **encode_options)


class CLRPDataset(Dataset):
    """Torch dataset that yields tokenized excerpts (plus a target when training).

    Args:
        data: DataFrame with an `excerpt` column; outside test mode it also needs
            `target`, and `id` when `aux_data` is given.
        tokenizer: Tokenizer forwarded to `convert_examples_to_features`.
        aux_data: Optional DataFrame indexed by sample id whose `aux_text` column holds
            a list of alternative texts. Ignored in test mode.
        is_test: When True no label is produced and no augmentation is applied.
    """

    def __init__(self, data, tokenizer, aux_data=None, is_test=False):
        self.data = data
        # Always define both attributes so the object has the same shape in either mode.
        self.targets = None
        self.aux_data = None
        if not is_test:
            self.targets = self.data.target.tolist()
            self.aux_data = aux_data

        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = Config.max_len

    def __len__(self):
        return len(self.data)

    def _pick_excerpt(self, item):
        """Return the text for position `item`, possibly swapped for an auxiliary text."""
        # Positional access: samplers hand out positions 0..len-1, and `targets` is a
        # plain list, so label-based lookup would drift on a non-default index.
        original = self.data['excerpt'].iloc[item]
        if self.is_test or self.aux_data is None:
            return original

        row_index = self.data['id'].iloc[item]
        if row_index not in self.aux_data.index:
            return original
        # Choose uniformly among the auxiliary texts and the original excerpt.
        return random.choice(self.aux_data.loc[row_index].aux_text + [original])

    def __getitem__(self, item):
        """Return a dict with `input_ids`, `attention_mask` and, unless in test mode, `label`."""
        features = convert_examples_to_features(
            self._pick_excerpt(item), self.tokenizer, self.max_len
        )
        sample = {
            'input_ids': torch.tensor(features['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(features['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

In [None]:
class AttentionHead(nn.Module):
    """Pools a sequence of feature vectors into one vector with learned weights.

    A score is computed per position as `V(tanh(W(features)))`, the scores are
    soft-maxed along dim 1, and the features are summed along dim 1 with those weights.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        # `num_targets` is accepted but not used by this module.
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        """Return the weighted sum of `features` over dim 1."""
        scores = self.V(self.W(features).tanh())
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Regression model: encoder -> attention pooling -> dropout -> single linear output.

    Args:
        transformer: Encoder module called as `transformer(input_ids, attention_mask)`;
            element 0 of its output is fed to the pooling head.
        config: Object exposing `hidden_size`.
    """

    def __init__(self, transformer, config):
        super().__init__()
        hidden = config.hidden_size
        self.h_size = hidden
        self.transformer = transformer
        self.head = AttentionHead(hidden, hidden, 1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden, 1)

    def forward(self, input_ids, attention_mask):
        """Return the output of the final linear layer for the given batch."""
        token_states = self.transformer(input_ids, attention_mask)[0]
        pooled = self.dropout(self.head(token_states))
        return self.linear(pooled)
    

In [None]:
class AvgCounter:
    """Accumulates a sample-weighted sum of losses and reports its mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the accumulated sum and the sample count."""
        self.loss, self.n_samples = 0, 0

    def update(self, loss, n_samples):
        """Add `loss`, weighted by the `n_samples` it was averaged over."""
        weighted = loss * n_samples
        self.loss += weighted
        self.n_samples += n_samples

    def avg(self):
        """Return accumulated loss divided by accumulated sample count."""
        total, count = self.loss, self.n_samples
        return total / count

class LossesRecorder:
    """Tracks the best validation loss and logs train/val losses to TensorBoard.

    Args:
        suffix: Forwarded to `SummaryWriter` as `filename_suffix`.
    """

    def __init__(self, suffix=''):
        self.best_val_loss = float('inf')
        self.tb = SummaryWriter(filename_suffix=suffix)

    def update_train_loss(self, loss, step):
        """Log `loss` under the "train loss" tag at `step` and flush immediately."""
        self.tb.add_scalar("train loss", loss, step)
        self.tb.flush()

    def update_val_loss(self, loss, step):
        """Log `loss` under the "val loss" tag at `step` and flush immediately."""
        self.tb.add_scalar("val loss", loss, step)
        self.tb.flush()

    def close(self):
        """Close the underlying writer."""
        # The writer lives on the instance; the bare name `tb` was never defined.
        self.tb.close()
        
def make_dataloader(data, tokenizer, aux_data=None, is_train=True):
    """Wrap `data` in a `CLRPDataset` and return a `DataLoader` over it.

    Args:
        data: DataFrame handed to `CLRPDataset`.
        tokenizer: Tokenizer handed to `CLRPDataset`.
        aux_data: Optional auxiliary-text frame, forwarded to `CLRPDataset`
            (previously it was accepted but always replaced by None).
        is_train: Random sampling order when True, sequential order otherwise.

    Returns:
        A `DataLoader` with batch size `Config.batch_size` that keeps the last partial batch.
    """
    dataset = CLRPDataset(data, tokenizer=tokenizer, aux_data=aux_data)
    # Ordering is controlled solely through the sampler.
    sampler = RandomSampler(dataset) if is_train else SequentialSampler(dataset)
    return DataLoader(
        dataset,
        sampler=sampler,
        batch_size=Config.batch_size,
        pin_memory=True,
        drop_last=False,
    )
                   
            
class Trainer:
    """Runs the training loop with periodic validation and best-model checkpointing.

    Args:
        train_dl: Iterable of training batches (dicts with `input_ids`,
            `attention_mask` and `label` tensors).
        val_dl: Iterable of validation batches with the same keys.
        model: Module called as `model(input_ids, attention_mask)`.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        criterion: Callable `(preds, labels) -> scalar loss tensor`.
    """

    def __init__(self, train_dl, val_dl, model, optimizer, scheduler, criterion):
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = Config.device
        self.batches_per_epoch = len(self.train_dl)
        self.criterion = criterion

    def run(self):
        """Train for `Config.epochs` epochs.

        Validation runs whenever `step % Config.evaluate_interval == 0` and on the last
        step of every epoch; the whole model is saved to `best_model.pt` each time the
        validation loss improves.
        """
        losses_recorder = LossesRecorder()
        train_loss_counter = AvgCounter()

        for epoch in range(Config.epochs):
            print(f'{r_}Epoch: {epoch+1}/{Config.epochs}{sr_}')
            for step, batch in enumerate(self.train_dl):
                global_step = epoch * self.batches_per_epoch + step + 1
                # Work with a plain float so the running average does not keep
                # device tensors (and their autograd history) alive.
                train_loss = self.train(batch).item()

                # `batch` is a dict, so len(batch) counts its keys; weight by the
                # real number of samples instead.
                train_loss_counter.update(train_loss, len(batch['label']))
                losses_recorder.update_train_loss(train_loss, global_step)

                is_last_step = (step + 1) == self.batches_per_epoch
                if step % Config.evaluate_interval != 0 and not is_last_step:
                    continue

                val_loss = float(self.evaluate())

                losses_recorder.update_val_loss(val_loss, global_step)
                print(f'\t{epoch+1}#[{step+1}/{self.batches_per_epoch}]: train loss - {train_loss_counter.avg()} | val loss - {val_loss}',)
                train_loss_counter.reset()

                if val_loss < losses_recorder.best_val_loss:
                    print(f"\t\t{g_}Val loss decreased from {losses_recorder.best_val_loss} to {val_loss}{sr_}")
                    losses_recorder.best_val_loss = val_loss
                    torch.save(self.model, 'best_model.pt')

    def train(self, batch):
        """Run one optimisation step on `batch`.

        Returns:
            The batch loss as a tensor detached from the autograd graph.
        """
        self.model.train()
        sent_id = batch['input_ids'].to(self.device)
        mask = batch['attention_mask'].to(self.device)
        labels = batch['label'].to(self.device)

        self.model.zero_grad()
        preds = self.model(sent_id, mask)
        # Predictions carry a trailing dim of size 1, so the labels get one as well.
        train_loss = self.criterion(preds, labels.unsqueeze(1))
        train_loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        # Detach so callers that hold on to the value do not pin the graph in memory.
        return train_loss.detach()

    def evaluate(self):
        """Return the sample-weighted mean of the per-batch criterion over `val_dl`."""
        self.model.eval()
        val_loss_counter = AvgCounter()

        with torch.no_grad():
            for batch in self.val_dl:
                sent_id = batch['input_ids'].to(self.device)
                mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                preds = self.model(sent_id, mask)
                loss = self.criterion(preds, labels.unsqueeze(1))
                val_loss_counter.update(loss, len(labels))
        return val_loss_counter.avg()
    
    
def rmse_loss(y_true,y_pred):
    """Root-mean-squared error: the square root of `mse_loss(y_true, y_pred)`."""
    mean_sq_err = nn.functional.mse_loss(y_true, y_pred)
    return mean_sq_err.sqrt()

In [None]:


# Tokenizer and architecture config are loaded by model name; the encoder weights come
# from the separately stored checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
train_dl = make_dataloader(train_df, tokenizer)
val_dl = make_dataloader(val_df, tokenizer, is_train=False)
config = AutoConfig.from_pretrained(Config.model_name)
transformer = AutoModel.from_pretrained(
    Config.pretrained_model_path,
    output_hidden_states=Config.output_hidden_states,
)

model = CLRPModel(transformer, config)
# Move the model to its device before the optimizer captures the parameters.
model = model.to(Config.device)
optimizer = optim.AdamW(model.parameters(), lr=Config.lr, weight_decay=Config.wd)
# The scheduler is stepped once per batch, so the total step count is
# epochs * batches-per-epoch. It was hard-coded to 10 epochs while training runs
# Config.epochs, which left the cosine decay unfinished.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=Config.epochs * len(train_dl),
)

criterion = rmse_loss


In [None]:
# Build the trainer from the objects created in the previous cell and run the full loop.
trainer = Trainer(train_dl, val_dl, model, optimizer, scheduler, criterion)
trainer.run()
# IPython shell escape: write the current date/time into the file `execution_time`.
!date '+%A %W %Y %X' > execution_time