# Overview
My Idea: 2D attention


Apply attention over the token hidden states of several layers, producing one context vector per layer from the attention weights.
Then apply a second attention over those per-layer context vectors to combine them into a single vector.

and 


Acknowledgments: some ideas were taken from kernels by [Torch](https://www.kaggle.com/rhtsingh), [Maunish](https://www.kaggle.com/maunish), and [Andrey Tuganov](https://www.kaggle.com/andretugan).

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [None]:
# Global configuration shared by the dataset, model, and training cells.
NUM_FOLDS = 5  # number of KFold splits used for cross-validation
NUM_EPOCHS = 3  # default epoch count for train() and length of the LR schedule
BATCH_SIZE = 10  # batch size for both the training and validation DataLoaders
MAX_LEN = 248  # every excerpt is padded/truncated to this many tokens
# (rmse_threshold, eval_period) pairs scanned in order by train(): the first
# pair whose threshold is <= the current validation RMSE sets how many steps
# to wait before the next evaluation. The final (-1., 1) entry always matches.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
ROBERTA_PATH = "roberta-large"  # checkpoint passed to AutoConfig / AutoModel
TOKENIZER_PATH = "roberta-large"  # checkpoint passed to AutoTokenizer
# Tensors and the model are moved to this device; falls back to CPU without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator used here and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and configures cuDNN for repeatable kernels.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Only affects hash randomization of interpreters started after this call
    # (e.g. subprocesses); the running interpreter has already fixed its hash seed.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # Autotuning may select different convolution algorithms from run to run,
    # which defeats the deterministic flag above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False

In [None]:
# Load the competition files: training excerpts, test excerpts, and the
# submission template.
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

# Rows whose target and standard_error are both exactly zero are treated as
# incomplete entries; keep everything else and renumber the index from 0.
incomplete_rows = (train_df.target == 0) & (train_df.standard_error == 0)
train_df = train_df[~incomplete_rows].reset_index(drop=True)

In [None]:
# Module-level tokenizer shared by every LitDataset instance (read as a global there).
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [None]:
class LitDataset(Dataset):
    """Dataset of excerpts tokenized once at construction time.

    Relies on the module-level ``tokenizer`` and ``MAX_LEN``. Each item is a
    tuple ``(input_ids, attention_mask)`` and, unless ``inference_only`` is
    set, a third element holding the float32 target.

    Args:
        df: DataFrame with an ``excerpt`` column, plus a ``target`` column
            when ``inference_only`` is False.
        inference_only: When True, targets are neither read nor returned.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Encode everything up front so __getitem__ only has to index.
        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )
        self.encoded = tokenizer.batch_encode_plus(self.text, **encode_options)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = tuple(
            torch.tensor(self.encoded[field][index])
            for field in ('input_ids', 'attention_mask')
        )
        if self.inference_only:
            return features
        return features + (self.target[index],)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [None]:
class LitModel(nn.Module):
    """Backbone from ``ROBERTA_PATH`` with two-level attention pooling and a linear regressor.

    Level 1: six attention heads, each pooling the token hidden states of one
    backbone layer into a context vector. Level 2: ``attention_sm`` weights
    those six context vectors and sums them into a single vector, which
    ``regressor`` maps to one score per example.

    NOTE(review): the numeric suffixes of ``attention4`` and ``attention8`` do
    not match the layers they read (``hidden_states[-3]`` and
    ``hidden_states[-4]``). The attribute names are kept unchanged so that
    state-dict keys and parameter order stay stable.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Level 1: one token-attention head per tapped layer. Creation order
        # is significant: it fixes both parameter order and the order in which
        # the random initializer is consumed.
        self.attention1 = self._make_attention_head(256)
        self.attention2 = self._make_attention_head(256)
        self.attention4 = self._make_attention_head(256)
        self.attention8 = self._make_attention_head(256)
        self.attention16 = self._make_attention_head(256)
        self.attention20 = self._make_attention_head(256)

        # Level 2: attention across the per-layer context vectors.
        self.attention_sm = self._make_attention_head(6)

        self.regressor = nn.Sequential(
            nn.Linear(1024, 1)
        )

    @staticmethod
    def _make_attention_head(bottleneck):
        """Build Linear(1024 -> bottleneck) -> Tanh -> Linear(-> 1) -> Softmax over dim 1."""
        return nn.Sequential(
            nn.Linear(1024, bottleneck),
            nn.Tanh(),
            nn.Linear(bottleneck, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch of tokenized excerpts.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.

        Returns:
            Output of ``self.regressor`` applied to the pooled vector
            (presumably shaped (batch, 1) -- callers flatten it).
        """
        hidden_states = self.roberta(input_ids=input_ids,
                                     attention_mask=attention_mask).hidden_states

        # hidden_states is indexed from the end; the -16 and -20 taps require
        # at least 20 entries. Assumes roberta-large exposes 25 (embeddings +
        # 24 encoder layers) -- TODO confirm against the checkpoint config.
        layer_taps = (
            (self.attention1, -1),
            (self.attention2, -2),
            (self.attention4, -3),
            (self.attention8, -4),
            (self.attention16, -16),
            (self.attention20, -20),
        )

        context_vectors = []
        for head, layer_index in layer_taps:
            layer_states = hidden_states[layer_index]
            token_weights = head(layer_states)
            # Weighted sum over dim 1 collapses the layer into one vector per example.
            context_vectors.append(torch.sum(token_weights * layer_states, dim=1))

        # Stack the six per-layer vectors along dim 1, then attend across them.
        stacked_contexts = torch.stack(context_vectors, dim=1)
        layer_weights = self.attention_sm(stacked_contexts)
        pooled = torch.sum(layer_weights * stacked_contexts, dim=1)

        # Reduce the pooled context vector to the prediction score.
        return self.regressor(pooled)

In [None]:
def eval_mse(model, data_loader):
    """Return the mean squared error of ``model`` over every example in ``data_loader``.

    Puts the model in eval mode, sums the squared errors batch by batch with
    gradients disabled, and divides by the dataset size.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="sum")
    total_squared_error = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            prediction = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_error = criterion(prediction.flatten(), target.to(DEVICE))
            total_squared_error += batch_error.item()

    return total_squared_error / len(data_loader.dataset)

In [None]:
def predict(model, data_loader):
    """Return an np.array holding the predictions of ``model`` for ``data_loader``.

    The loader must yield ``(input_ids, attention_mask)`` pairs; predictions
    are written into the result in the order the batches arrive.
    """
    model.eval()
    predictions = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_len = batch_pred.shape[0]
            predictions[offset : offset + batch_len] = batch_pred.flatten().to("cpu")
            offset += batch_len

    return predictions

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
    """Train ``model`` and checkpoint the weights with the best validation RMSE.

    Validation runs every ``eval_period`` optimizer steps. After each
    evaluation the period is re-read from ``EVAL_SCHEDULE``, so evaluation
    becomes more frequent as the validation RMSE drops.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        model_path: Destination passed to ``torch.save`` for the best state dict.
        train_loader: Yields ``(input_ids, attention_mask, target)`` batches.
        val_loader: Loader handed to ``eval_mse``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Optional LR scheduler stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The best validation RMSE observed, or ``None`` if training finished
        before the first scheduled evaluation.
    """
    criterion = nn.MSELoss(reduction="mean")
    best_val_rmse = None
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]

    for _ in range(num_epochs):
        for input_ids, attention_mask, target in train_loader:
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()
            # eval_mse() leaves the model in eval mode, so re-enable training
            # mode before every forward pass.
            model.train()
            pred = model(input_ids, attention_mask)
            mse = criterion(pred.flatten(), target)
            mse.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                last_eval_step = step
                val_rmse = math.sqrt(eval_mse(model, val_loader))

                # First schedule entry whose threshold the RMSE still meets
                # decides how long to wait until the next evaluation.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # Compare against None explicitly: a truthiness test would
                # treat a best RMSE of 0.0 as "not set yet".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")

            step += 1

    return best_val_rmse

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with depth-dependent learning rates for the backbone.

    Parameters are selected by name prefix rather than by hard-coded positions
    in ``named_parameters()``, so the grouping follows the model definition:

    * ``attention*`` heads and all other non-backbone parameters (the
      regressor) use the optimizer's default hyper-parameters.
    * ``roberta.*`` parameters each get their own group. Weight decay is 0.0
      when "bias" appears in the name and 0.01 otherwise. The learning rate is
      1.2e-5, rising to 3e-5 from backbone parameter index 133 and to 7.5e-5
      from index 261.
    * ``roberta.pooler.*`` is left out of the optimizer; the model's forward
      pass only reads ``hidden_states``.

    NOTE(review): the previous index-based slices skipped entries 389-390 and
    labelled everything from 395 onward "regressor". For roberta-large those
    two entries are expected to be the pooler weights, so this selection
    should cover the same parameters -- confirm if the backbone changes.

    Args:
        model: Module whose backbone is registered under the ``roberta`` attribute.

    Returns:
        The configured AdamW optimizer.
    """
    named_parameters = list(model.named_parameters())

    roberta_parameters = [
        (name, params) for name, params in named_parameters
        if name.startswith("roberta.") and not name.startswith("roberta.pooler.")
    ]
    head_parameters = [
        (name, params) for name, params in named_parameters
        if not name.startswith("roberta.")
    ]

    attention_group = [params for name, params in head_parameters
                       if name.startswith("attention")]
    regressor_group = [params for name, params in head_parameters
                       if not name.startswith("attention")]

    parameters = [
        {"params": attention_group},
        {"params": regressor_group},
    ]

    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if "bias" in name else 0.01

        # Later (deeper) backbone parameters are trained with larger steps.
        if layer_num >= 261:
            lr = 7.5e-5
        elif layer_num >= 133:
            lr = 3e-5
        else:
            lr = 1.2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

In [None]:
# Manual cleanup cell: run the garbage collector and release cached CUDA
# memory, e.g. before re-running training in the same session.
# del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Cross-validation driver: trains one fresh model per fold and records the
# best validation RMSE that train() reports for each.
gc.collect()

SEED = 1000
list_val_rmse = []

kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    # Best checkpoint of this fold is written here by train().
    model_path = f"model_{fold + 1}.pth"
        
    # Seed before the loaders are built ...
    set_random_seed(SEED + fold)
    
    # kfold.split yields positions; .loc works here because train_df was
    # re-indexed 0..n-1 after the incomplete rows were removed.
    train_dataset = LitDataset(train_df.loc[train_indices])    
    val_dataset = LitDataset(train_df.loc[val_indices])    
        
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)    
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)    
        
    # ... and again right before the model is created, so the head
    # initialization starts from the same RNG state for a given fold.
    set_random_seed(SEED + fold)    
    
    model = LitModel().to(DEVICE)
    
    optimizer = create_optimizer(model)                        
    # Cosine decay over all training steps of this fold, after 50 warmup steps.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)    
    
    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, scheduler=scheduler))

    # Free the model and cached GPU memory before the next fold starts.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    
print("\nPerformance estimates:")
print(list_val_rmse)
print("Mean:", np.array(list_val_rmse).mean())

In [None]:
# 0.474