In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_constant_schedule

import gc
# Make sure automatic garbage collection is switched on (CPython enables it by default,
# so this only matters if something disabled it earlier in the session).
gc.enable()

In [None]:
# --- Training hyper-parameters ---------------------------------------------
NUM_EPOCHS = 5    # default number of passes over each fold's training split
BATCH_SIZE = 16   # batch size shared by every DataLoader in this notebook
MAX_LEN = 256     # token length each excerpt is padded / truncated to

# (val_rmse threshold, training steps between evaluations) pairs. train() walks
# the list in order and adopts the period of the first pair whose threshold the
# current val_rmse meets or exceeds.
EVAL_SCHEDULE = [
    (0.50, 16),
    (0.48, 8),
    (0.465, 2),
    (-1., 1),
]

# --- Runtime device and pretrained artefact locations ----------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

MODEL_PATH = "../input/robertapretrained30/roberta-base-pretrained"
TOKENIZER_PATH = "../input/robertapretrained30/roberta-base-pretrained"

In [None]:
def set_random_seed(random_seed):
    """Seed every random number source this notebook relies on.

    Covers the ``PYTHONHASHSEED`` environment variable, Python's ``random``
    module, NumPy's global generator and PyTorch (CPU plus all CUDA devices),
    then switches cuDNN to deterministic mode.
    """
    # Python-level sources
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch sources: CPU generator, current CUDA device, every CUDA device
    torch_seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in torch_seeders:
        apply_seed(random_seed)

    torch.backends.cudnn.deterministic = True

In [None]:
# Load the three competition CSVs (train, test, sample submission), in that order.
train_df, test_df, submission_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
]

In [None]:
class LitDataset(Dataset):
    """PyTorch ``Dataset`` that tokenizes readability excerpts once, up front.

    All texts in ``df.excerpt`` are encoded in ``__init__`` (padded and
    truncated to a fixed token length); ``__getitem__`` then serves single
    samples as tensors.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless
            ``inference_only`` is set, a ``target`` column.
        tokenizer: object exposing ``batch_encode_plus`` whose result is
            indexable by ``'input_ids'`` and ``'attention_mask'`` (the notebook
            passes a Hugging Face tokenizer).
        inference_only: when True, targets are not read and each item is an
            ``(input_ids, attention_mask)`` pair.
        max_len: token length to pad / truncate to. ``None`` (the default)
            falls back to the module-level ``MAX_LEN``, which is what every
            existing caller gets.
    """

    def __init__(self, df, tokenizer, inference_only=False, max_len=None):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()
        # Resolve the global lazily so callers can override it per dataset.
        self.max_len = MAX_LEN if max_len is None else max_len

        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True
        )

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask[, target])`` for row ``index``."""
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])

        if self.inference_only:
            return (input_ids, attention_mask)

        return (input_ids, attention_mask, self.target[index])

In [None]:
class LitModel(nn.Module):
    """Pretrained encoder with an attention-pooling layer and a linear regression head.

    The encoder and its config are loaded from ``MODEL_PATH`` with
    ``output_hidden_states`` enabled. A small feed-forward "attention" block
    scores every token position, the scores are soft-maxed along the sequence
    axis, and the weighted sum of the last hidden layer is fed to a single
    linear unit that yields one value per sample.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(MODEL_PATH)
        config.update({"output_hidden_states": True})

        self.roberta = AutoModel.from_pretrained(MODEL_PATH, config=config)

        # Take the encoder width from its config rather than hard-coding 768,
        # so the heads stay shape-compatible if MODEL_PATH points at an encoder
        # of a different size. For roberta-base this is still 768.
        hidden_size = config.hidden_size

        # Submodule names and ordering are kept as-is so existing checkpoints load.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)  # normalise over the sequence (token) axis
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, 1)`` tensor of predictions.

        Args:
            input_ids: token id tensor, forwarded to the encoder.
            attention_mask: mask tensor, forwarded to the encoder.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # weights:                  batch x seq_len x 1
        # last_layer_hidden_states: batch x seq_len x hidden_size
        # NOTE(review): attention_mask is only passed to the encoder; the softmax
        # here runs over every position, padding included.
        weights = self.attention(last_layer_hidden_states)

        # context_vector: batch x hidden_size
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
        return self.regressor(context_vector)

In [None]:
def eval_mse(model, data_loader):
    """Return the mean squared error of |model| over every sample in |data_loader|.

    The model is put into eval mode and gradients are disabled. Squared errors
    are summed batch by batch and divided by ``len(data_loader.dataset)`` at
    the end, so a smaller final batch is weighted correctly.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="sum")
    squared_error_total = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            predictions = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_error = criterion(predictions.flatten(), target.to(DEVICE))
            squared_error_total += batch_error.item()

    return squared_error_total / len(data_loader.dataset)

In [None]:
def predict(model, data_loader):
    """Run |model| over |data_loader| and return its predictions as an np.array.

    The output has one entry per element of ``data_loader.dataset`` and is
    filled in loader order. The model is put into eval mode and gradients are
    disabled for the whole pass.
    """
    model.eval()

    outputs = np.zeros(len(data_loader.dataset))
    cursor = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_len = batch_pred.shape[0]

            # Copy this batch's values into the next free slice of the buffer.
            outputs[cursor:cursor + batch_len] = batch_pred.flatten().cpu().numpy()
            cursor += batch_len

    return outputs

In [None]:
def train(model, train_loader, val_loader,
          optimizer, model_path, scheduler=None, num_epochs=NUM_EPOCHS):
    """Fine-tune |model| and checkpoint the state with the best validation RMSE.

    Each training step runs a forward pass, an MSE loss, back-propagation and
    an optimizer (and optional scheduler) step. Validation happens every
    ``eval_period`` steps; after each evaluation the period is re-read from
    ``EVAL_SCHEDULE`` (first entry whose threshold is <= the current val RMSE),
    so evaluation gets denser as the model improves. Whenever the validation
    RMSE improves, ``model.state_dict()`` is saved to ``model_path``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        train_loader: yields ``(input_ids, attention_mask, target)`` batches.
        val_loader: loader handed to ``eval_mse``.
        optimizer: optimizer stepped once per batch.
        model_path: file the best ``state_dict`` is written to.
        scheduler: optional LR scheduler, stepped once per batch.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        ``(val_rmse, best_val_rmse)``: the RMSE of the most recent evaluation
        and the lowest RMSE observed.
    """
    criterion = nn.MSELoss(reduction="mean")

    best_val_rmse = None
    best_epoch = 0
    val_rmse = None  # stays None until the first evaluation has run
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]

    for epoch in range(num_epochs):

        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() leaves the model in eval mode, so switch back every step.
            model.train()
            pred = model(input_ids, attention_mask)
            mse = criterion(pred.flatten(), target)
            mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # selective evaluation part
            if step >= last_eval_step + eval_period:
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))
                # The loss is an MSE; take the root so the logged number matches its label.
                train_rmse = math.sqrt(mse.item())

                print(f"Epoch: {epoch} batch_num: {batch_num} train_rmse: {train_rmse:0.4}",
                      f"val_rmse: {val_rmse:0.4}")

                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # Compare against None explicitly: a best RMSE of 0.0 is a valid value.
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

            step += 1

    if val_rmse is None:
        # Training was too short to reach the first scheduled evaluation.
        # Evaluate once now so the caller gets real numbers and a checkpoint
        # exists at model_path, instead of failing on an unbound variable.
        val_rmse = math.sqrt(eval_mse(model, val_loader))
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")

    return val_rmse, best_val_rmse

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with one parameter group per named parameter.

    Groups are created in ``model.named_parameters()`` order. Parameters whose
    name contains "bias" get no weight decay, all others 0.01. The learning
    rate depends on the parameter's position in that order:
    2e-5 below index 69, 5e-5 for indices 69-132 and 1e-4 from index 133 on.
    """

    def lr_for(position):
        # Later parameters (further along named_parameters()) train faster.
        if position >= 133:
            return 1e-4
        if position >= 69:
            return 5e-5
        return 2e-5

    param_groups = [
        {
            "params": tensor,
            "weight_decay": 0.0 if "bias" in param_name else 0.01,
            "lr": lr_for(position),
        }
        for position, (param_name, tensor) in enumerate(model.named_parameters())
    ]
    return AdamW(param_groups)

In [None]:
from transformers import get_cosine_schedule_with_warmup
from sklearn import model_selection

gc.collect()

SEED = 1666

set_random_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

list_val_rmse = []
list_best_val_rmse = []
num_splits = 5

# NOTE(review): the "kfold" column is written here but not read anywhere in this cell.
train_df["kfold"] = -1
# Shuffle the rows (uses NumPy's global RNG, seeded just above) and renumber them
# 0..n-1 so the positional indices from StratifiedKFold are valid .loc labels.
train_df = train_df.sample(frac=1).reset_index(drop=True)

# StratifiedKFold needs discrete classes, so bucket the continuous target into
# floor(1 + log2(n)) equal-width bins (Sturges' rule) and stratify on the bin id.
num_bins = int(np.floor(1 + np.log2(len(train_df))))
train_df.loc[:, "bins"] = pd.cut(
    train_df["target"], bins=num_bins, labels=False
)

kf = model_selection.StratifiedKFold(n_splits=num_splits)
for fold, (train_indices, val_indices) in enumerate(kf.split(X=train_df, y=train_df.bins.values)):
    model_path = f"model_{fold+1}.pth"
    print("FOLD",fold+1)
    # Re-seed per fold so every fold starts from the same RNG state.
    set_random_seed(SEED)

    train_dataset = LitDataset(train_df.loc[train_indices], tokenizer)
    val_dataset = LitDataset(train_df.loc[val_indices], tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)

    model = LitModel().to(DEVICE)

    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=30)

    val_rmse, best_val_rmse = train(model, train_loader, val_loader, optimizer, model_path, scheduler=scheduler)

    # The optimizer (and the scheduler through it) still references the model's
    # parameters and holds its own per-parameter state, so deleting `model`
    # alone would keep this fold's tensors alive while the next fold's model is
    # being built. Drop all three, then return cached blocks to the GPU.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    list_val_rmse.append(val_rmse)
    list_best_val_rmse.append(best_val_rmse)
    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())
    print(list_best_val_rmse)
    print("Best Mean:", np.array(list_best_val_rmse).mean())

In [None]:
# Predict the test set with every fold's best checkpoint and average the results.
all_predictions = []

test_dataset = LitDataset(test_df, tokenizer, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

# Iterate over exactly the folds that were trained (num_splits) instead of a literal 5.
for fold in range(num_splits):
    model = LitModel()
    # map_location remaps the saved tensors onto the current device, so a
    # checkpoint written on a GPU also loads on a CPU-only machine.
    model.load_state_dict(torch.load(f"model_{fold+1}.pth", map_location=DEVICE))
    model.to(DEVICE)

    all_predictions.append(predict(model, test_loader))

    del model
    gc.collect()
    # Hand the freed model's cached blocks back before loading the next fold.
    torch.cuda.empty_cache()

# Element-wise mean across folds -> one prediction per test row.
predictions = np.array(all_predictions).mean(axis=0)

In [None]:
# Use bracket assignment: attribute-style assignment only updates a column when
# one named "target" already exists, otherwise it would set a plain attribute and
# the CSV would be written without the predictions.
submission_df["target"] = predictions
print(submission_df)
submission_df.to_csv("submission.csv", index=False)