# Introduction
* I fine-tune a pretrained RoBERTa-base model on the downstream regression task.
* I train two models, each with a different head on top of RoBERTa: mean pooling and attention pooling. Each model is trained on 5 folds.
* To make the final predictions, I blend the predictions of every model from every fold.

## Inference Notebook
### RoBERTa-large 5-fold single model (MeanPooling): 
https://www.kaggle.com/jcesquiveld/roberta-large-5-fold-single-model-meanpooling
### CLRP: Pytorch Roberta Finetune
https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune

Please upvote this notebook and the notebooks linked above if you find them useful!

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the setup script for build 20210331 and install the listed apt packages.
!python pytorch-xla-env-setup.py --version 20210331 --apt-packages libomp5 libopenblas-dev
# Remove leftover wheel files from the working directory.
!rm -rf /kaggle/working/*.whl
# NOTE(review): as a shell glob, `*.py+` only matches names ending in the literal
# characters ".py+"; presumably this was meant to remove the downloaded setup
# script -- confirm the intended pattern.
!rm -rf /kaggle/working/*.py+
# Install the `accelerate` package, which is imported in the next cell.
!pip install accelerate

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

from accelerate import Accelerator
from transformers import (AutoModel, AutoTokenizer,AutoConfig,
                          get_cosine_schedule_with_warmup)



# Short aliases for colorama colour codes, used to colourise console messages.
from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL  # appended after a coloured message to reset the style

In [None]:
# Load the competition train/test tables and the sample submission file.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Strip newline characters from every training excerpt.
# NOTE(review): newlines are replaced with '' rather than ' ', so two words
# separated only by a line break are fused into one token -- confirm this is
# intended, and that any inference code applies the same preprocessing.
train_data['excerpt'] = train_data['excerpt'].apply(lambda x: x.replace('\n',''))

# Number of bins = floor(1 + log2(n_rows)) (a Sturges-style rule).
num_bins = int(np.floor(1 + np.log2(len(train_data))))
# Discretise the continuous target into integer bin indices (labels=False).
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# `bins` is the stratification label for the fold split further down.
bins = train_data.bins.to_numpy()
target = train_data.target.to_numpy()


In [None]:
# Hyper-parameters shared by every fold and both model variants.
config = {
    'lr': 2e-5,  # NOTE(review): not read by the optimizer code visible in this file, which hard-codes its learning rates
    'wd':0.01,  # NOTE(review): likewise not read; weight decay is hard-coded where the optimizer is built
    'batch_size':16,  # batch size of the training and validation DataLoaders
    'valid_step':50,  # validation runs every `valid_step` training steps
    'max_len':300,  # tokenizer max_length; excerpts are padded/truncated to it
    'epochs':10,  # training epochs per fold
    'nfolds':5,  # number of cross-validation folds
    'seed':42,  # seed for the RNGs and for the fold split
    
}

# Create one directory per fold in the working directory.
# NOTE(review): the training code visible in this file saves checkpoints under
# ./{model_name}/model{fold} instead, so these top-level directories look
# unused here -- confirm before removing.
for i in range(config['nfolds']):
    os.makedirs(f'model{i}',exist_ok=True)

def seed_everything(seed=42):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), sets
    ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED; the previous spelling "PYTHONASSEED" set
    # an environment variable that nothing reads. It is consulted at
    # interpreter start-up, so this only influences processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the deterministic flag set just above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before anything random happens (fold split, model init).
seed_everything(seed=config['seed'])

# Assign each training row to exactly one validation fold. The split is
# stratified on the binned target (`bins`); -1 marks "not yet assigned".
train_data['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=bins)):
    # Rows in the k-th validation split get fold id k.
    train_data.loc[valid_idx,'Fold'] = k

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
class CLRPDataset(Dataset):
    """Dataset yielding ``(tokenized excerpt, target)`` pairs.

    Args:
        df: Table with an ``excerpt`` column and a ``target`` column.
        tokenizer: Callable applied to one excerpt at a time; it is invoked
            with ``return_tensors='pt'``, ``max_length``, ``padding`` and
            ``truncation`` keyword arguments.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['excerpt'].to_numpy()
        self.targets = df['target'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        # Tokenize lazily, one excerpt per lookup, at a fixed length.
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encoded = self.tokenizer(self.excerpt[idx], **tokenizer_kwargs)
        label = torch.tensor(self.targets[idx], dtype=torch.float)
        # `encoded` is whatever the tokenizer returns (for a Hugging Face
        # tokenizer, presumably a mapping with 'input_ids' and 'attention_mask').
        return encoded, label

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling along dimension 1 of the input.

    Every position is scored with ``V(tanh(W(x)))``; the scores are
    normalised with a softmax over dim 1 and used to form a weighted sum of
    the input features along that same dimension.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate projection ``W``.
    """

    def __init__(self, in_features, hidden_dim):
        super(AttentionHead, self).__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        # W is built before V so parameter initialisation order is unchanged.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # assumes features is (batch, seq_len, in_features) -- TODO confirm
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        # Weighted sum over dim 1 collapses that dimension.
        return (weights * features).sum(dim=1)

In [None]:
class AttentionModel(nn.Module):
    """RoBERTa encoder followed by attention pooling and an MLP regressor.

    ``forward`` feeds its keyword arguments to the encoder, pools the first
    element of the encoder output with ``AttentionHead`` and maps the pooled
    vector to a single value per example.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
        self.config = AutoConfig.from_pretrained('roberta-base')
        hidden_size = self.config.hidden_size

        # Attention pooling over the encoder's per-token outputs.
        self.head = AttentionHead(hidden_size, hidden_size)

        regressor_layers = [
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 1),
        ]
        self.mlp = nn.Sequential(*regressor_layers)

    def forward(self, **xb):
        token_states = self.roberta(**xb)[0]  # first output: last_hidden_state
        pooled = self.head(token_states)
        return self.mlp(pooled)
    

class MeanPoolingModel(nn.Module):
    """RoBERTa encoder with masked mean pooling and a linear regressor.

    ``forward`` averages the encoder's per-token outputs over the positions
    where ``attention_mask`` is non-zero, applies layer normalisation and a
    linear layer, and returns a flat float tensor with one value per example.
    """

    def __init__(self):
        super().__init__()

        self.config = AutoConfig.from_pretrained('roberta-base')
        self.roberta = AutoModel.from_pretrained('roberta-base')
        hidden_size = self.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, **xb):
        mask = xb['attention_mask']
        token_states = self.roberta(**xb)[0]  # first output: last_hidden_state

        # Broadcast the mask over the hidden dimension so masked-out
        # positions contribute nothing to the sum.
        mask = mask.unsqueeze(-1).expand(token_states.size()).float()
        summed = (token_states * mask).sum(1)
        # Clamp the token count to avoid dividing by zero on an all-zero mask.
        counts = mask.sum(1).clamp(min=1e-9)

        pooled = self.layer_norm(summed / counts)
        logits = self.linear(pooled)

        # Flatten the trailing size-1 dimension to one value per example.
        return logits.view(-1).float()

In [None]:
def create_optimizer(model, model_name):
    """Build an AdamW optimizer with per-parameter groups.

    The task-specific head modules come first (two groups, selected by
    ``model_name``) and carry no explicit settings, so they use the
    optimizer's defaults. Every parameter tensor of ``model.roberta`` then
    gets its own group whose learning rate grows with its position in
    ``named_parameters()`` and whose weight decay is disabled when the
    parameter name contains ``"bias"``.

    Args:
        model: Module exposing ``roberta`` plus ``head``/``mlp`` (for
            ``'attention'``) or ``layer_norm``/``linear`` (for ``'mean'``).
        model_name: ``'attention'`` or ``'mean'``; any other value adds no
            head groups.

    Returns:
        The configured ``optim.AdamW`` instance.
    """
    # Attribute names of the head modules for each model variant.
    head_modules = {
        'attention': ('head', 'mlp'),
        'mean': ('layer_norm', 'linear'),
    }
    param_groups = [
        {"params": list(getattr(model, attr).parameters())}
        for attr in head_modules.get(model_name, ())
    ]

    # `index` counts parameter tensors, not transformer layers.
    # NOTE(review): 69 and 133 presumably line up with encoder-layer
    # boundaries of roberta-base -- confirm if the backbone changes.
    for index, (name, param) in enumerate(model.roberta.named_parameters()):
        if index >= 133:
            lr = 1e-4
        elif index >= 69:
            lr = 5e-5
        else:
            lr = 2e-5

        param_groups.append({
            "params": param,
            "weight_decay": 0.0 if "bias" in name else 0.01,
            "lr": lr,
        })

    return optim.AdamW(param_groups)

def run(fold, model_name, verbose=True):
    """Train one model variant on one cross-validation fold.

    Rows of the module-level ``train_data`` whose ``Fold`` column equals
    ``fold`` form the validation split; all other rows are used for training.
    Whenever the validation loss matches or beats the best value seen so far,
    the model weights and the tokenizer are written to
    ``./{model_name}/model{fold}/``.

    Args:
        fold: Index of the fold held out for validation.
        model_name: ``'attention'`` (AttentionModel) or ``'mean'``
            (MeanPoolingModel).
        verbose: When true, print a message for every validation improvement.

    Returns:
        The lowest validation loss observed during training.

    Raises:
        ValueError: If ``model_name`` is not one of the supported variants.
    """
    if model_name not in ('attention', 'mean'):
        # An unknown name used to leave ``model`` as None and fail much later
        # with an unrelated AttributeError; fail early with a clear message.
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected 'attention' or 'mean'"
        )

    def loss_fn(outputs,targets):
        """Return the RMSE between flattened predictions and targets."""
        outputs = outputs.view(-1)
        targets = targets.view(-1)
        return torch.sqrt(nn.MSELoss()(outputs,targets))

    def train_and_evaluate_loop(model_name, train_loader,valid_loader,model, loss_fn,optimizer,epoch,fold,best_loss,valid_step=20,lr_scheduler=None):
        """Run one training epoch with periodic validation and checkpointing.

        Returns the best validation loss seen so far (carried across epochs
        through ``best_loss``).
        """
        train_loss = 0

        for i, (inputs1,targets1) in enumerate(train_loader):

            model.train()
            optimizer.zero_grad()
            # The dataset tokenizes with return_tensors='pt', so each tensor
            # has an extra dimension after batching; flatten to (batch, -1).
            inputs1 = {key:val.reshape(val.shape[0],-1) for key,val in inputs1.items()}
            outputs1 = model(**inputs1)
            loss1 = loss_fn(outputs1,targets1)
            # Backpropagate through the Accelerator so any device- or
            # precision-specific handling it owns is applied.
            accelerator.backward(loss1)
            optimizer.step()
            gc.collect()
            train_loss += loss1.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

            # Validate every `valid_step` steps and on the epoch's last step.
            if (i % valid_step == 0) or (i == (len(train_loader)-1)):
                model.eval()
                valid_loss = 0
                with torch.no_grad():
                    for j, (inputs2,targets2) in enumerate(valid_loader):
                        inputs2 = {key:val.reshape(val.shape[0],-1) for key,val in inputs2.items()}
                        outputs2 = model(**inputs2)
                        loss2 = loss_fn(outputs2,targets2)
                        valid_loss += loss2.item()

                    # NOTE(review): this is the mean of per-batch RMSE values,
                    # not the RMSE over the whole validation split (a smaller
                    # final batch is weighted equally) -- confirm this is
                    # acceptable for checkpoint selection.
                    valid_loss /= len(valid_loader)
                    if valid_loss <= best_loss:
                        if verbose:
                            xm.master_print(f"epoch:{epoch} | Train Loss:{train_loss/(i+1)} | Validation loss:{valid_loss}")
                            xm.master_print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")
                        best_loss = valid_loss
                        xm.save(model.state_dict(),f'./{model_name}/model{fold}/model{fold}.bin')
                        tokenizer.save_pretrained(f'./{model_name}/model{fold}')

        return best_loss

    accelerator = Accelerator()
    xm.master_print(f"{accelerator.device} is used")
    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    # Make sure the checkpoint directory exists rather than relying on the
    # caller to have created it.
    os.makedirs(f'./{model_name}/model{fold}', exist_ok=True)

    if model_name == 'attention':
        model = AttentionModel()
    else:
        model = MeanPoolingModel()
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    optimizer = create_optimizer(model, model_name)

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    # The cosine schedule spans every optimizer step of every epoch.
    num_training_steps = config['epochs'] * len(train_dl)
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=200,
                                                   num_training_steps =num_training_steps)

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    xm.master_print(f"Fold: {fold}")
    best_loss = 9999  # sentinel larger than any expected validation loss

    for epoch in range(config["epochs"]):
        xm.master_print(f"Epoch Started:{epoch}")

        best_loss = train_and_evaluate_loop(model_name, train_dl,valid_dl,model,loss_fn,optimizer,epoch,fold,best_loss,
                                            valid_step=config['valid_step'],lr_scheduler=lr_scheduler)
    return best_loss

In [None]:
import gc

# Best validation loss of every fold, keyed by model variant.
performance = {
    'mean': [],
    'attention': []
}
for model_name in ['mean', 'attention']:
    # Directories need the execute (search) bit to be usable; the previous
    # mode=0o666 omitted it, and the blanket `except OSError: pass` also hid
    # real failures such as permission errors. exist_ok=True still tolerates
    # re-running the cell.
    os.makedirs(f'./{model_name}', exist_ok=True)
    for f in range(config['nfolds']):
        os.makedirs(f'./{model_name}/model{f}', exist_ok=True)
        fold_err = run(f, model_name)
        performance[model_name].append(fold_err)
        # Collect garbage between folds to release the finished fold's objects.
        gc.collect()