# Context:
In the previous notebook, we used cross-validation scores to investigate which components improve the performance of my transformer. In this notebook, we verify whether the leaderboard (LB) score improves as well.

Trainer notebook: https://www.kaggle.com/vigneshbaskaran/commonlit-making-my-transformer-good-enough  
Story of how I improved my transformer: https://www.kaggle.com/vigneshbaskaran/commonlit-halftime-recap-of-my-transformer-journey  

In [None]:
import gc
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers.file_utils import ModelOutput

# Prediction Dataset and Dataloader

In [None]:
class PredictionDataset(Dataset):
    """Map-style dataset that serves text excerpts for inference.

    Each item is a one-key dict, ``{'text_excerpt': <excerpt>}``; no target
    value is attached.
    """

    def __init__(self, text_excerpts):
        # Indexable, sized collection of excerpts (the caller passes a list).
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of stored excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return the excerpt at position ``idx`` wrapped in a dict."""
        return {'text_excerpt': self.text_excerpts[idx]}

In [None]:
def create_prediction_dataloader(data, batch_size, num_workers=4):
    """Build an order-preserving DataLoader over the ``'excerpt'`` column.

    Args:
        data: Object indexed with ``'excerpt'`` whose result supports
            ``.tolist()`` (the notebook passes a pandas DataFrame).
        batch_size: Number of excerpts per batch.
        num_workers: Worker process count forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``PredictionDataset``. Shuffling is off and
        the last, possibly smaller, batch is kept.
    """
    excerpts = data['excerpt'].tolist()
    return DataLoader(
        PredictionDataset(text_excerpts=excerpts),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
        pin_memory=True,
    )

In [None]:
def clear_cuda():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    # Collect first so that anything freed by the collector is already gone
    # when the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
def predict(dataloader, model, tokenizer, padding, max_length, device):
    """Run inference over ``dataloader`` and return the stacked logits.

    Args:
        dataloader: Iterable of batches. Each batch is indexed with
            ``'text_excerpt'`` and that value is handed to ``tokenizer``.
        model: Module that is switched to eval mode, moved to ``device`` and
            called with the tokenizer output as keyword arguments. Its return
            value must expose a ``logits`` tensor.
        tokenizer: Callable invoked with ``padding``, ``truncation=True``,
            ``max_length`` and ``return_tensors="pt"``; must return a mapping
            of tensors.
        padding: Padding strategy forwarded to ``tokenizer``.
        max_length: Maximum length forwarded to ``tokenizer``.
        device: Device the model and the tokenized inputs are moved to.

    Returns:
        The per-batch logits stacked vertically with ``np.vstack``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    clear_cuda()
    model.eval()
    model.to(device)
    predictions = []
    # No gradients are needed for inference, so the whole loop runs under a
    # single no_grad context instead of re-entering it for every batch.
    with torch.no_grad():
        for batch in dataloader:
            # Forward Propagation
            inputs = tokenizer(batch['text_excerpt'], padding=padding, truncation=True,
                               max_length=max_length, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = model(**inputs)
            predictions.append(outputs.logits.detach().cpu().numpy())
    if not predictions:
        # np.vstack([]) would raise a ValueError with an opaque message; keep
        # the exception type but say what actually went wrong.
        raise ValueError('predict() received a dataloader that yielded no batches')
    return np.vstack(predictions)

# Model

In [None]:
@dataclass
class RegressorOutput(ModelOutput):
    """Output container returned by the regressor models in this notebook.

    ``ModelOutput`` subclasses are meant to be dataclasses with annotated
    fields; declaring it that way keeps keyword construction and attribute
    access (``outputs.logits``) working across transformers releases.
    """

    # Training loss; the models set it to None when no labels are passed.
    loss: Optional[torch.Tensor] = None
    # Raw regression output of the final linear layer.
    logits: Optional[torch.Tensor] = None
    # Never populated by the models in this notebook.
    # NOTE(review): element type left loose on purpose - confirm if ever used.
    hidden_states: Optional[tuple] = None
    attentions: Optional[tuple] = None

In [None]:
class RobertaPoolerRegressor(nn.Module):
    """Regressor that feeds the backbone's ``pooler_output`` to a linear layer.

    Args:
        model_path: Path or identifier passed to ``AutoModel.from_pretrained``.
        apply_sqrt_to_loss: When true, the square root of the MSE loss is
            returned instead of the plain MSE.
    """

    def __init__(self, model_path, apply_sqrt_to_loss):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_path)
        backbone_config = self.roberta.config
        self.dropout = nn.Dropout(backbone_config.hidden_dropout_prob)
        self.regressor = nn.Linear(backbone_config.hidden_size, 1)
        self.loss_fn = nn.MSELoss()
        self.apply_sqrt_to_loss = apply_sqrt_to_loss

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute the logits and, when ``labels`` is given, the loss.

        Returns:
            ``RegressorOutput`` with ``logits`` always set and ``loss`` set
            only when ``labels`` is not None.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded['pooler_output'])
        logits = self.regressor(pooled)

        loss = None
        if labels is not None:
            loss = self.loss_fn(labels, logits)
            if self.apply_sqrt_to_loss:
                loss = torch.sqrt(loss)
        return RegressorOutput(loss=loss, logits=logits)

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of the input into a weighted sum.

    A score is computed per position as ``V(tanh(W(x)))``, normalised with a
    softmax over dim 1, and used to weight ``x`` before summing over dim 1.

    Args:
        hidden_dim: Size of the last dimension of the input.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.W = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the attention-weighted sum of ``x`` over dim 1.

        ``x`` is presumably (batch, seq_len, hidden_dim) - verify against the
        caller; the result then has shape (batch, hidden_dim).
        """
        weights = torch.softmax(self.V(torch.tanh(self.W(x))), dim=1)
        return (weights * x).sum(dim=1)

In [None]:
# NOTE: this class is declared twice in the notebook; this later declaration
# is the one bound to the name when the cells run in order.
class RobertaPoolerRegressor(nn.Module):
    """Backbone + dropout + linear layer over the ``pooler_output``.

    Args:
        model_path: Path or identifier passed to ``AutoModel.from_pretrained``.
        apply_sqrt_to_loss: If true, ``forward`` returns sqrt(MSE) as the
            loss; otherwise it returns the MSE itself.
    """

    def __init__(self, model_path, apply_sqrt_to_loss):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_path)
        config = self.roberta.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.loss_fn = nn.MSELoss()
        self.apply_sqrt_to_loss = apply_sqrt_to_loss

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return a ``RegressorOutput``; ``loss`` is None without ``labels``."""
        backbone_out = self.roberta(input_ids=input_ids,
                                    attention_mask=attention_mask)
        features = self.dropout(backbone_out['pooler_output'])
        logits = self.regressor(features)

        if labels is None:
            return RegressorOutput(loss=None, logits=logits)

        loss = self.loss_fn(labels, logits)
        if self.apply_sqrt_to_loss:
            loss = torch.sqrt(loss)
        return RegressorOutput(loss=loss, logits=logits)

In [None]:
class RobertaLastHiddenStateRegressor(nn.Module):
    """Regressor that attention-pools the backbone's ``last_hidden_state``.

    The pooled vector goes through dropout and a linear layer with a single
    output. The loss, when labels are given, is the square root of the MSE.

    Args:
        model_path: Path or identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_path)
        config = self.roberta.config
        self.head = AttentionHead(config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return a ``RegressorOutput``; ``loss`` is None without ``labels``."""
        backbone_out = self.roberta(input_ids=input_ids,
                                    attention_mask=attention_mask)
        pooled = self.head(backbone_out['last_hidden_state'])
        logits = self.regressor(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss = torch.sqrt(self.loss_fn(labels, logits))
        return RegressorOutput(loss=loss, logits=logits)

In [None]:
# Load the test set, pick the GPU when one is available, and build the
# dataloader that every fold model below is run against.
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_dataloader = create_prediction_dataloader(test_data, batch_size=8)

In [None]:
# Experiment label; not read by any of the cells shown here (the inference
# loops below hard-code their experiment directories).
EXPERIMENT_NAME = 'experiment_7'

In [None]:
# Experiment 1: RobertaLastHiddenStateRegressor, one fine-tuned checkpoint per
# fold, with inputs padded to a fixed length of 256 tokens.
# Paths, tokenizer and tokenization settings are identical for every fold, so
# they are set up once instead of on each loop iteration.
pretrained_model_path = '../input/maunish-clrp-model/clrp_roberta_base'
tokenizer_path = '../input/commonlit-data-download/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
padding = 'max_length'
max_length = 256

experiment_1_fold_predictions = []
for fold in range(5):
    print(f'Inferring fold: {fold}')
    model = RobertaLastHiddenStateRegressor(model_path=pretrained_model_path)
    finetuned_model_path = f'../input/commonlit-making-my-transformer-good-enough/experiment_1/fold_{fold}/model.pth'
    # Weights are loaded on the CPU; predict() moves the model to `device`.
    model.load_state_dict(torch.load(finetuned_model_path, map_location=torch.device('cpu')))
    experiment_1_fold_predictions.append(predict(test_dataloader, model, tokenizer, padding, max_length, device))
# Place the per-fold prediction arrays side by side.
experiment_1_fold_predictions = np.hstack(experiment_1_fold_predictions)

In [None]:
# Experiment 7: RobertaPoolerRegressor, one fine-tuned checkpoint per fold.
# padding=True and max_length=None are forwarded to the tokenizer by predict().
# Paths, tokenizer and tokenization settings are identical for every fold, so
# they are set up once instead of on each loop iteration.
pretrained_model_path = '../input/commonlit-data-download/roberta-base'
tokenizer_path = '../input/commonlit-data-download/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
padding = True
max_length = None

experiment_7_fold_predictions = []
for fold in range(5):
    print(f'Inferring fold: {fold}')
    model = RobertaPoolerRegressor(model_path=pretrained_model_path, apply_sqrt_to_loss=False)
    finetuned_model_path = f'../input/commonlit-making-my-transformer-good-enough/experiment_7/fold_{fold}/model.pth'
    # Weights are loaded on the CPU; predict() moves the model to `device`.
    model.load_state_dict(torch.load(finetuned_model_path, map_location=torch.device('cpu')))
    experiment_7_fold_predictions.append(predict(test_dataloader, model, tokenizer, padding, max_length, device))
# Place the per-fold prediction arrays side by side.
experiment_7_fold_predictions = np.hstack(experiment_7_fold_predictions)

In [None]:
# Ensemble: put the fold predictions of both experiments side by side, average
# across them for every test row, and write the submission file.
fold_predictions = np.hstack([experiment_1_fold_predictions, experiment_7_fold_predictions])
mean_predictions = fold_predictions.mean(axis=1)
test_data['target'] = mean_predictions
test_data.to_csv('submission.csv', columns=['id', 'target'], index=False)

In [None]:
# Show the two submitted columns as this cell's output.
test_data[['id','target']]