# How This Notebook Works
This notebook takes my own pretrained and fine-tuned `roberta-large` and `roberta-base` models and uses them to generate predictions for the competition data.  
The parameters as well as the attention head class were taken from **Maunish Dave's** fantastic notebook [here](https://www.kaggle.com/maunish/clrp-pytorch-roberta-inference)  

### Roberta Large
[My pretraining notebook](https://www.kaggle.com/bumjunkoo/comlit-pretrain-rob-lrg)  
[My finetuning notebook](https://www.kaggle.com/bumjunkoo/comlit-finetune-rob-lrg)

### Roberta Base
[My pretraining notebook](https://www.kaggle.com/bumjunkoo/comlit-pretrainer)  
[My finetuning notebook](https://www.kaggle.com/bumjunkoo/comlit-finetune)  

The output is a weighted sum of the RoBERTa-base and RoBERTa-large results, with more weight given to RoBERTa-large (0.7 × large + 0.3 × base).

## Imports

In [None]:
# transformers
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, DataCollatorWithPadding, get_scheduler, AdamW
from transformers import logging
logging.set_verbosity(50) # prevents the model warning from popping up

# sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from collections import defaultdict
import random, time, sys, os, gc, math

from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours, plus the style reset code.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
sr_ = Style.RESET_ALL

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

## Data

In [None]:
# Competition inputs: `test` holds the excerpts that get scored below and
# `sample` is the submission frame whose `target` column is filled in at the end.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
sample = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED (the old key was misspelled
    # and had no effect). Setting it at runtime only influences interpreters
    # started afterwards, e.g. worker subprocesses.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
class CommonLitDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output with optional labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per example. It must contain ``"input_ids"``, which defines
            the dataset length.
        labels: Optional indexable collection of targets (list, NumPy array or
            tensor). ``None`` or an empty collection means "no labels".
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` if set."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Truth-testing a NumPy array or multi-element tensor raises
        # "truth value ... is ambiguous", so check for presence explicitly.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])


def data_loader(text, tokenizer, labels=None, batch=32, shuffle=False, max_length=256):
    """Tokenize ``text`` and wrap the result in a ``DataLoader``.

    Args:
        text: Input passed straight to ``tokenizer`` (the caller in this
            notebook passes a list of excerpt strings).
        tokenizer: Callable accepting ``max_length``, ``padding`` and
            ``truncation`` keyword arguments.
        labels: Optional targets forwarded to ``CommonLitDataset``.
        batch: Batch size.
        shuffle: Whether the loader shuffles examples.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256, the value previously hard-coded here.

    Returns:
        A ``DataLoader`` over a ``CommonLitDataset`` of the tokenized text.
    """
    tokens = tokenizer(text, max_length=max_length, padding='max_length', truncation=True)
    dataset = CommonLitDataset(tokens, labels)
    # Pinned host memory only helps host-to-GPU copies, so request it only
    # when CUDA is actually present.
    return DataLoader(
        dataset,
        shuffle=shuffle,
        batch_size=batch,
        pin_memory=torch.cuda.is_available(),
    )

# Model

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling that collapses dimension 1 of its input.

    Each position is scored with ``V(tanh(W(x)))``; the scores are
    softmax-normalised along dimension 1 and used to form a weighted sum of
    the input features.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate projection ``W``.
        num_targets: Accepted for interface compatibility; not used here.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        # Attribute names W and V are the keys stored in saved checkpoints.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Pool ``features`` over dimension 1.

        Assumes ``features`` is (batch, seq_len, in_features) -- TODO confirm
        against callers. The result drops dimension 1.
        """
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

class CLModel(nn.Module):
    """Pretrained backbone followed by attention pooling and a linear output.

    Args:
        model_or_path: Identifier or directory handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_or_path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_or_path)
        # Read the width from the loaded config so the head matches whichever
        # backbone was loaded.
        self.hidden_size = self.roberta.config.hidden_size
        self.head = AttentionHead(self.hidden_size, self.hidden_size, 1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the linear layer's output for a batch.

        ``labels`` is accepted so a batch dict can be splatted in, but it is
        ignored.
        """
        backbone_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.head(backbone_out['last_hidden_state'])
        return self.linear(self.dropout(pooled))

In [None]:
class MeanPoolingModel(nn.Module):
    """Pretrained backbone with masked mean pooling, LayerNorm and a linear output.

    Args:
        model_name: Identifier or directory handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        width = self.model.config.hidden_size
        self.layer_norm = nn.LayerNorm(width)
        self.linear = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return one prediction per input row.

        ``labels`` is accepted so a batch dict can be splatted in, but it is
        ignored.
        """
        hidden = self.model(input_ids, attention_mask)['last_hidden_state']

        # Broadcast the mask over the feature axis so positions where the
        # mask is 0 contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand_as(hidden).float()
        token_sum = (hidden * mask).sum(dim=1)
        # The clamp guards against division by zero for an all-zero mask row.
        token_count = mask.sum(dim=1).clamp(min=1e-9)

        pooled = self.layer_norm(token_sum / token_count)
        scores = self.linear(pooled)

        # The first squeeze drops the size-1 output axis; the second also
        # collapses the batch axis when the batch holds a single row.
        return scores.squeeze(-1).squeeze(-1)

# Inference

In [None]:
def get_prediction(df, model, path, model_path, device=None):
    """Load a checkpoint into ``model`` and predict a score for every excerpt.

    Args:
        df: DataFrame with an ``excerpt`` column.
        model: Module whose ``state_dict`` matches the checkpoint at ``path``.
            It is modified in place (weights loaded, moved to ``device``,
            switched to eval mode).
        path: Checkpoint file passed to ``torch.load``.
        model_path: Identifier or directory passed to
            ``AutoTokenizer.from_pretrained``.
        device: Device to run on. ``None`` (default) selects ``'cuda'`` when
            available and ``'cpu'`` otherwise.

    Returns:
        1-D ``np.ndarray`` of predictions in the row order of ``df``.
    """
    if device is None:
        # Falling back to the CPU keeps the notebook runnable without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    print("Tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    dataloader = data_loader(df.excerpt.tolist(), tokenizer, batch=16)

    predictions = []
    print("Predicting...")
    # Inference only: without no_grad every forward pass keeps its autograd
    # graph alive, wasting memory.
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions.extend(outputs.detach().cpu().numpy().ravel().tolist())
    print("Prediction Finished")
    torch.cuda.empty_cache()
    return np.array(predictions)

In [None]:
# roberta large models: one CLModel instance is reused, with each of the five
# checkpoints loaded into it in turn.
model_path = '../input/robertalarge'
cl = CLModel(model_path)
large_predictions = []
print('-' * 30)
for i in range(1, 6):  # checkpoint directories are numbered 1..5
    print(f"Model {i}")
    print('-' * 30)
    path = f'../input/comlit-finetune-rob-lrg/model_{i}/model_{i}.bin'
    large_predictions.append(get_prediction(test[['excerpt']], cl, path, model_path))
    print('-' * 30)

In [None]:
# roberta base models: one MeanPoolingModel instance is reused, with each of
# the five checkpoints loaded into it in turn.
model_path = '../input/roberta-base'
mp = MeanPoolingModel(model_path)
base_predictions = []
print('-' * 30)
for i in range(1, 6):  # checkpoint directories are numbered 1..5
    print(f"Model {i}")
    print('-' * 30)
    path = f'../input/cl-base-finetune-mean-pool/model_{i}/model_{i}.bin'
    base_predictions.append(get_prediction(test[['excerpt']], mp, path, model_path))
    print('-' * 30)

In [None]:
# Average the per-checkpoint predictions of each architecture. Dividing by the
# actual number of prediction arrays keeps the mean correct if the number of
# checkpoints ever changes.
sum_base_preds = sum(base_predictions) / len(base_predictions)
sum_large_preds = sum(large_predictions) / len(large_predictions)

# Blend the two architectures, giving roberta-large the larger weight.
weighted = (sum_large_preds * 0.7) + (sum_base_preds * 0.3)

sample['target'] = weighted
sample.to_csv('submission.csv', index=False)
sample