**Solution Overview:**

Train Roberta-Base and Roberta-Large models on the contest data along with supplemental sources similar to that data.  Fine-tune the models using cross-validation folds. Inference weights all 10 models (two trained models * five fine-tuned models [five folds] per model) equally.

**Notebook Sequence:**
* [Train Roberta Base Model](https://www.kaggle.com/charliezimmerman/clrp-train-robertabase-maskedlm-model)
* [Train Roberta Large Model](https://www.kaggle.com/charliezimmerman/clrp-train-robertalarge-masked-lm-model/)
* [Fine Tune Trained Roberta-Base Model -- **This Notebook**](https://www.kaggle.com/charliezimmerman/clrp-finetune-trained-robertabase)
* [Fine Tune Trained Roberta Large Model](https://www.kaggle.com/charliezimmerman/clrp-finetune-trained-robertalarge)
* [Inference Notebook](https://www.kaggle.com/charliezimmerman/clrp-inference-robertabase-robertalarge-ensemble)

**This notebook was influenced by:**

* [https://www.kaggle.com/chamecall/clrp-finetune-single-roberta-base?scriptVersionId=68893027](https://www.kaggle.com/chamecall/clrp-finetune-single-roberta-base?scriptVersionId=68893027)
* [https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune](https://www.kaggle.com/maunish/clrp-pytorch-roberta-finetune)

In [None]:
import transformers
import pandas as pd
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.functional import mse_loss
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler#, Sampler
from transformers import AutoModel,AutoTokenizer,get_cosine_schedule_with_warmup, AutoConfig, AdamW
from time import time
from tqdm import tqdm
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

In [None]:
class configuration:
    """Static settings shared by the cells of this fine-tuning notebook.

    The class is used purely as a namespace: attributes are read as
    ``configuration.<name>`` and the class is never instantiated.
    """

    # --- Input / output locations ---
    tokenizer_path = '../input/roberta-base'
    clrp_data_path = '../input/commonlitreadabilityprize'
    pretrained_model_path = '../input/clrp-trained-robertabase/robertabase_clrp_model'
    # Fixed: this line previously ended with a stray ')' (a SyntaxError).
    output_path = '/kaggle/working/clrp-robertabase-modelweights'

    # --- Model / training loop ---
    output_hidden_states = True  # not referenced elsewhere in the visible notebook
    epochs = 3
    evaluate_interval = 10       # not referenced elsewhere in the visible notebook
    batch_size = 16              # batch size used by make_dataloader
    device = 'cuda'
    seed = 42                    # not referenced elsewhere in the visible notebook
    max_len = 256                # token length used by make_dataloader
    lr = 2e-5                    # base learning rate read by create_optimizer
    wd = 0.01

    # Pairs of (validation-loss threshold, evaluation interval in steps),
    # ordered from the highest threshold to the lowest; consumed by
    # EvaluationScheduler.
    eval_schedule = [(float('inf'), 16), (0.5, 8), (0.49, 4), (0.48, 2), (0.47, 1), (0, 0)]

    # --- Cross-validation ---
    num_folds = 5
    base_seed = 1000             # default seed of seed_everything
    # NOTE(review): 4071 appears twice in this list - confirm this is intended.
    fold_seeds = [9183, 4309, 4071, 98, 4071]

    # --- Settings read only by get_data_loaders ---
    max_length = 300
    train_batch_size = 8
    val_batch_size = 32

    num_warmup_steps = 50        # warm-up steps of the cosine LR schedule
   

In [None]:
# Gradient scaler object; it is created here but not used by the training
# code visible in this notebook.
scaler = torch.cuda.amp.GradScaler()
# Use the configured device only when CUDA is actually available.
DEVICE = torch.device(configuration.device if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
# Show the GPU name as the cell output. get_device_name(0) raises when no GPU
# is present, so fall back to a plain label in that case (DEVICE above already
# allows for a CPU-only machine).
torch.cuda.get_device_name(0) if n_gpu > 0 else 'cpu'


In [None]:
# Load the competition train and test tables from the CLRP input directory.
train = pd.read_csv(f"{configuration.clrp_data_path}/train.csv")
test = pd.read_csv(f"{configuration.clrp_data_path}/test.csv")

In [None]:
# Directory that receives the fine-tuned model files written by CLRPTrainer.
models_dir = Path(configuration.output_path)
# parents=True so the call also succeeds when the parent directory does not
# exist yet; exist_ok=True keeps re-running the cell harmless.
models_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def seed_everything(seed=configuration.base_seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also configures cuDNN for reproducible behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to
            ``configuration.base_seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Fixed: benchmark mode lets cuDNN auto-tune and pick kernels at run time,
    # which can differ between runs; it must be off for the deterministic
    # setting above to be meaningful.
    torch.backends.cudnn.benchmark = False
    
def get_data_loaders(data, fold):
    """Build pre-tokenized train/validation loaders for one CV fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows form the training set. Excerpts are tokenized up front with
    the tokenizer at ``configuration.tokenizer_path``, padded/truncated to
    ``configuration.max_length``.

    Args:
        data: DataFrame with ``excerpt``, ``target`` and ``fold`` columns.
        fold: Fold id held out for validation.

    Returns:
        ``(dataloader_train, dataloader_val)``. Each batch is a tuple
        ``(input_ids, attention_mask, target)``. Training batches are drawn
        with a ``RandomSampler``, validation batches sequentially.
    """
    # Fixed: TensorDataset is not among the notebook's top-level imports, so
    # the original raised NameError on first call. Imported locally here.
    from torch.utils.data import TensorDataset

    tokenizer = AutoTokenizer.from_pretrained(configuration.tokenizer_path)

    def _to_dataset(rows):
        # Tokenize the excerpts of `rows` and pair them with their targets.
        encoded = tokenizer.batch_encode_plus(
            rows['excerpt'].tolist(),
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=configuration.max_length,
            return_tensors='pt',
        )
        return TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(rows['target'].values),
        )

    dataset_train = _to_dataset(data.loc[data.fold != fold])
    dataset_val = _to_dataset(data.loc[data.fold == fold])

    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=configuration.train_batch_size,
    )
    dataloader_val = DataLoader(
        dataset_val,
        sampler=SequentialSampler(dataset_val),
        batch_size=configuration.val_batch_size,
    )

    return dataloader_train, dataloader_val

In [None]:
# Create the cross-validation folds.
# The statement order matters: the RNGs are seeded first, then the row labels
# are shuffled, and the shuffled values are bucketed by pd.cut into
# `configuration.num_folds` bins whose integer codes become the fold ids.
seed = 1000
seed_everything(seed=seed)
x = train.index.tolist()
rand_idx = random.sample(x, k=len(x))
train.loc[:, 'fold'] = pd.cut(
    rand_idx,
    bins=configuration.num_folds,
    labels=False,
)
target = train['target'].to_numpy()


In [None]:
def convert_examples_to_features(text, tokenizer, max_len):
    """Encode one text with ``tokenizer.encode_plus``.

    The text is truncated and padded to exactly ``max_len`` tokens.

    Args:
        text: The string to encode.
        tokenizer: Object exposing an ``encode_plus`` method.
        max_len: Target sequence length.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these settings.
    """
    encode_options = {
        'max_length': max_len,
        'truncation': True,
        'padding': 'max_length',
    }
    return tokenizer.encode_plus(text, **encode_options)


class CLRPDataset(Dataset):
    """Dataset that tokenizes CLRP excerpts on access.

    Args:
        data: DataFrame with an ``excerpt`` column and, unless ``is_test`` is
            true, a ``target`` column.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        max_len: Sequence length every excerpt is padded/truncated to.
        is_test: When true, no targets are read and items carry no label.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.excerpts = data.excerpt.tolist()
        if not is_test:
            self.targets = data.target.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Return the tensors for row ``item``.

        The result holds ``input_ids`` and ``attention_mask`` (long tensors)
        and, for non-test datasets, ``label`` (float tensor).
        """
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer, self.max_len
        )
        sample = {
            key: torch.tensor(features[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of the input into a weighted sum.

    Args:
        h_size: Size of the last dimension of the incoming features.
        hidden_dim: Width of the intermediate tanh projection.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1.

        Assumes ``features`` is laid out as (batch, sequence, h_size), which
        is how CLRPModel calls it - TODO confirm for other callers.
        """
        # One scalar score per position, normalized across dim 1.
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        # The weights broadcast over the feature dimension.
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Transformer encoder followed by attention pooling and a linear output.

    Args:
        transformer: Encoder module whose output exposes ``last_hidden_state``.
        config: Object providing ``hidden_size``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        self.head = AttentionHead(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one value per input sequence (last dimension of size 1)."""
        hidden_states = self.transformer(input_ids, attention_mask).last_hidden_state
        pooled = self.head(hidden_states)
        return self.linear(pooled)


In [None]:
def create_optimizer(model):
    """Create an AdamW optimizer with layer-wise learning-rate decay.

    Encoder layers 11 down to 0 each get their own parameter group. Layer 11
    uses ``configuration.lr`` and each lower layer uses the previous rate
    multiplied by 0.99. The regression head is trained at the undecayed base
    rate.

    Args:
        model: Module whose parameter names contain ``encoder.layer.<i>.``
            for the encoder layers. Head parameters are selected by name.

    Returns:
        A ``torch.optim.AdamW`` instance over the selected parameter groups.
    """
    named_params = list(model.named_parameters())
    parameters = []
    lr = configuration.lr
    multiplier = .990
    classifier_lr = lr

    # Hard-coded for a 12-layer encoder (layers 11..0), top layer first.
    for layer in range(11, -1, -1):
        layer_tag = f'encoder.layer.{layer}.'
        parameters.append({
            'params': [p for n, p in named_params if layer_tag in n],
            'lr': lr,
        })
        lr *= multiplier

    # Fixed: the original filter ('layer_norm' / 'linear' / 'pooling') never
    # matched the attention-pooling parameters of CLRPModel (`head.W.*`,
    # `head.V.*`), so they were left out of the optimizer and stayed at their
    # random initialization. They are now trained with the classifier group.
    # NOTE(review): parameters matching none of the groups (e.g. the
    # embeddings) are still not optimized, as in the original - confirm that
    # this is intended.
    def _is_head_param(name):
        return (
            'layer_norm' in name
            or 'linear' in name
            or 'pooling' in name
            or name.startswith('head.')
        )

    parameters.append({
        'params': [p for n, p in named_params if _is_head_param(n)],
        'lr': classifier_lr,
    })

    # configuration.wd (0.01) equals AdamW's default weight decay; passing it
    # explicitly makes the configured value the one actually in effect.
    return optim.AdamW(parameters, weight_decay=configuration.wd)

In [None]:
   # Build a CLRPModel from the further-pretrained checkpoint with hidden
   # dropout disabled and a smaller layer-norm epsilon, then move it to the
   # configured device.
   model_config = AutoConfig.from_pretrained(configuration.pretrained_model_path)
   model_config.update({"hidden_dropout_prob": 0.0, "layer_norm_eps": 1e-7})

   transformer = AutoModel.from_pretrained(
       configuration.pretrained_model_path,
       config=model_config,
   )
   model = CLRPModel(transformer, model_config).to(configuration.device)

In [None]:
class AvgCounter:
    """Running sample-weighted average of a loss value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.loss, self.n_samples = 0, 0

    def update(self, loss, n_samples):
        """Add a loss that was averaged over ``n_samples`` samples."""
        self.n_samples += n_samples
        self.loss += loss * n_samples

    def avg(self):
        """Return the weighted mean of all losses since the last reset."""
        return self.loss / self.n_samples

class EvaluationScheduler:
    """Decide when to run validation, evaluating more often as loss drops.

    Args:
        evaluation_schedule: Sequence of ``(loss_threshold, interval)`` pairs
            ordered from the highest threshold to the lowest. A loss in
            ``[schedule[i + 1].threshold, schedule[i].threshold)`` selects
            ``schedule[i].interval``. The interval of the last pair is never
            selected; that pair only supplies the lowest bound.
        penalize_factor: Stored on the instance; not read by any method of
            this class.
        max_penalty: Stored on the instance; not read by any method of this
            class.
    """

    def __init__(self, evaluation_schedule, penalize_factor=1, max_penalty=8):
        self.evaluation_schedule = evaluation_schedule
        # Start with the interval of the first (highest-loss) bucket.
        self.evaluation_interval = self.evaluation_schedule[0][1]
        self.last_evaluation_step = 0
        self.prev_loss = float('inf')
        self.penalize_factor = penalize_factor
        self.penalty = 0
        self.prev_interval = -1
        self.max_penalty = max_penalty

    def step(self, step):
        """Return True when an evaluation is due at training step ``step``.

        An evaluation is due once at least ``evaluation_interval`` steps have
        passed since the last one; in that case ``step`` is recorded as the
        new last evaluation step.
        """
        if step >= self.last_evaluation_step + self.evaluation_interval:
            self.last_evaluation_step = step
            return True
        return False

    def update_evaluation_interval(self, last_loss):
        """Select the evaluation interval matching ``last_loss``.

        When no bucket contains ``last_loss`` (for example NaN or infinity)
        the current interval is kept and ``prev_interval`` is set to -1.
        """
        cur_interval = -1
        schedule = self.evaluation_schedule
        for i, ((upper, interval), (lower, _)) in enumerate(zip(schedule, schedule[1:])):
            # Fixed: buckets are half-open [lower, upper). With the original
            # strict comparison on both sides, a loss exactly equal to a
            # threshold matched no bucket and left a stale interval in place.
            if lower <= last_loss < upper:
                self.evaluation_interval = interval
                cur_interval = i
                break

        self.prev_loss = last_loss
        self.prev_interval = cur_interval
        
          
        
def make_dataloader(data, tokenizer, is_train=True):
    """Wrap ``data`` in a CLRPDataset and return a DataLoader over it.

    Args:
        data: DataFrame passed to ``CLRPDataset``.
        tokenizer: Tokenizer passed to ``CLRPDataset``.
        is_train: Sample randomly when true, sequentially otherwise.

    Returns:
        A ``DataLoader`` with batch size ``configuration.batch_size`` and
        pinned memory enabled.
    """
    dataset = CLRPDataset(data, tokenizer=tokenizer, max_len=configuration.max_len)
    sampler_cls = RandomSampler if is_train else SequentialSampler
    return DataLoader(
        dataset,
        sampler=sampler_cls(dataset),
        batch_size=configuration.batch_size,
        pin_memory=True,
    )
                   
            
class CLRPTrainer:
    """Fine-tune one model and save it whenever validation loss improves.

    Args:
        train_dl: Loader yielding dict batches with ``input_ids``,
            ``attention_mask`` and ``label``.
        val_dl: Loader of the same batch format used for validation.
        model: Module called as ``model(input_ids, attention_mask)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        criterion: Callable ``criterion(preds, labels)`` returning a scalar
            loss. The trainer takes its square root for reporting, so it is
            expected to be a mean squared error.
        model_num: Index used in the saved file name ``model_<model_num>.bin``.
    """

    def __init__(self, train_dl, val_dl, model, optimizer, scheduler, criterion, model_num):
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = configuration.device
        self.batches_per_epoch = len(self.train_dl)
        self.criterion = criterion
        self.model_num = model_num

    def run(self):
        """Train for ``configuration.epochs`` epochs.

        Validation runs whenever the EvaluationScheduler says it is due.
        Each time the validation loss improves, the whole model object is
        saved under ``configuration.output_path``.

        Returns:
            ``(record_info, best_val_loss)`` where ``record_info`` maps
            ``'train_loss'`` and ``'val_loss'`` to lists of ``(step, value)``
            tuples.
        """
        record_info = {
            'train_loss': [],
            'val_loss': [],
        }

        best_val_loss = float('inf')
        evaluation_scheduler = EvaluationScheduler(configuration.eval_schedule)
        train_loss_counter = AvgCounter()
        step = 0

        for epoch in range(configuration.epochs):

            print(f'Epoch: {epoch+1}/{configuration.epochs}')
            start_epoch_time = time()

            for batch_num, batch in enumerate(self.train_dl):
                train_loss = self.train(batch)
                # Fixed: `batch` is a dict, so len(batch) is its number of
                # keys, not the number of examples. Weight by the real batch
                # size instead.
                train_loss_counter.update(train_loss, len(batch['label']))
                record_info['train_loss'].append((step, train_loss.item()))

                if evaluation_scheduler.step(step):
                    val_loss = self.evaluate()

                    record_info['val_loss'].append((step, val_loss.item()))
                    print(f'\t\t{epoch+1}#[{batch_num+1}/{self.batches_per_epoch}]: train loss - {train_loss_counter.avg()} | val loss - {val_loss}',)
                    train_loss_counter.reset()

                    if val_loss < best_val_loss:
                        # Fixed: report before overwriting the previous best;
                        # the original printed the new value twice.
                        print(f"Val loss decreased from {best_val_loss} to {val_loss}")
                        best_val_loss = val_loss
                        torch.save(self.model, f'{configuration.output_path}/model_{self.model_num}.bin')

                    evaluation_scheduler.update_evaluation_interval(val_loss.item())

                step += 1
            end_epoch_time = time()
            print(f'The epoch took {end_epoch_time - start_epoch_time} sec..')

        return record_info, best_val_loss

    def train(self, batch):
        """Run one optimization step on ``batch`` and return its RMSE."""
        self.model.train()
        sent_id = batch['input_ids'].to(self.device)
        mask = batch['attention_mask'].to(self.device)
        labels = batch['label'].to(self.device)

        self.model.zero_grad()
        preds = self.model(sent_id, mask)
        train_loss = self.criterion(preds, labels.unsqueeze(1))

        train_loss.backward()
        self.optimizer.step()
        self.scheduler.step()
        # Fixed: detach so the returned value (which the caller accumulates)
        # does not keep this step's autograd graph alive.
        return torch.sqrt(train_loss.detach())

    def evaluate(self):
        """Return the RMSE over the whole validation loader as a tensor."""
        self.model.eval()
        val_loss_counter = AvgCounter()

        with torch.no_grad():
            for batch in self.val_dl:
                sent_id = batch['input_ids'].to(self.device)
                mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                preds = self.model(sent_id, mask)
                loss = self.criterion(preds, labels.unsqueeze(1))
                # Fixed: accumulate the squared error and take one square
                # root at the end. Averaging per-batch RMSE values (as the
                # original did) is not the RMSE of the validation set.
                val_loss_counter.update(loss, len(labels))
        return torch.sqrt(val_loss_counter.avg())
    
    
def mse_loss(y_true, y_pred):
    """Return the mean squared error between the two tensors.

    Thin wrapper around ``torch.nn.functional.mse_loss`` with its default
    settings; ``y_true`` is passed as the first argument and ``y_pred`` as
    the second.
    """
    squared_error = nn.functional.mse_loss(y_true, y_pred)
    return squared_error

In [None]:
  # Fine-tune one model per cross-validation fold; fold `model_num` is the
  # validation split for model `model_num`.
  seed_everything(seed)
  for model_num in range(configuration.num_folds):
    seed = configuration.fold_seeds[model_num]
    print(f'seed={seed} , Model#{model_num+1}')
    # Fixed: the per-fold seed was only printed, never applied. Seed before
    # anything random for this fold is created (model head, samplers).
    seed_everything(seed)

    tokenizer = AutoTokenizer.from_pretrained(configuration.tokenizer_path)
    config = AutoConfig.from_pretrained(configuration.pretrained_model_path)
    config.update({
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
            })

    train_dl = make_dataloader(train[train.fold!=model_num], tokenizer)
    val_dl = make_dataloader(train[train.fold==model_num], tokenizer, is_train=False)

    # A fresh model is loaded from the further-pretrained checkpoint for
    # every fold.
    transformer = AutoModel.from_pretrained(configuration.pretrained_model_path, config=config)
    model = CLRPModel(transformer, config)
    model = model.to(configuration.device)
    optimizer = create_optimizer(model)
    # The schedule spans every optimizer step of this fold's training run.
    scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_training_steps=configuration.epochs * len(train_dl),
            num_warmup_steps=configuration.num_warmup_steps)

    criterion = mse_loss

    clrp_trainer = CLRPTrainer(train_dl, val_dl, model, optimizer, scheduler, criterion, model_num)

    record_info, best_val_loss = clrp_trainer.run()
    # Split the recorded (step, loss) pairs into parallel sequences.
    steps, train_losses = list(zip(*record_info['train_loss']))
    steps, val_losses = list(zip(*record_info['val_loss']))

# IPython shell escape: write the formatted output of the `date` command to
# the file `execution_time` in the current working directory.
!date '+%A %W %Y %X' > execution_time
