**Solution Overview:**

Train RoBERTa-base and RoBERTa-large models on the contest data along with supplemental sources similar to that data. Fine-tune each trained model using five cross-validation folds. Inference weights all 10 models (two trained models × five fine-tuned fold models per trained model) equally.

**Notebook Sequence:**
* [Train Roberta Base Model](https://www.kaggle.com/charliezimmerman/clrp-train-robertabase-maskedlm-model)
* [Train Roberta Large Model](https://www.kaggle.com/charliezimmerman/clrp-train-robertalarge-masked-lm-model/)
* [Fine Tune Trained Roberta Base Model](https://www.kaggle.com/charliezimmerman/clrp-finetune-trained-robertabase)
* [Fine Tune Trained Roberta Large Model](https://www.kaggle.com/charliezimmerman/clrp-finetune-trained-robertalarge)
* [Inference Notebook  -- **This Notebook**](https://www.kaggle.com/charliezimmerman/clrp-inference-robertabase-robertalarge-ensemble)

In [None]:
import transformers
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import SequentialSampler
from transformers import AutoTokenizer

In [None]:
class configuration:
    """Static settings for the inference run: input locations and batching."""

    # CSV holding the test excerpts to score.
    contest_data = "../input/commonlitreadabilityprize/test.csv"

    # Directories the fine-tuned checkpoints (model_<n>.bin) are read from.
    finetuned_robertabase_path = "../input/robertabasemodelweights/clrp-robertabase-modelweights"
    finetuned_robertalarge_path = "../input/robertalargemodelweights"

    # Directories the two tokenizers are loaded from.
    robertabase_tokenizer = "../input/roberta-base"
    robertalarge_tokenizer = "../input/robertalarge"

    # Number of checkpoints loaded per architecture.
    model_count = 5

    # Tokenization length and DataLoader batch size.
    max_len = 256
    batch_size = 16

    # Preferred device string.
    device = 'cuda'

In [None]:
# Not referenced by any code visible in this notebook; retained so the
# module-level name `scaler` still exists for anything that expects it.
scaler = torch.cuda.amp.GradScaler()

# Use the configured device when CUDA is present, otherwise fall back to CPU.
device = torch.device(configuration.device if torch.cuda.is_available() else 'cpu')

# torch.cuda.get_device_name() raises when CUDA is unavailable, so only
# query it for a CUDA device; report 'cpu' otherwise.
device_name = torch.cuda.get_device_name(device) if device.type == 'cuda' else 'cpu'
print(f"using device {device_name}")

In [None]:
# Load the contest test set; its `excerpt` and `id` columns are read further down.
test = pd.read_csv(filepath_or_buffer=configuration.contest_data)

In [None]:
def convert_examples_to_features(text, tokenizer, max_len):
    """Tokenize one excerpt into fixed-length model inputs.

    Args:
        text: The raw string to encode.
        tokenizer: Object exposing ``encode_plus`` (the Hugging Face
            tokenizers loaded in this notebook).
        max_len: Length the encoding is truncated or padded to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns; callers index it with
        ``'input_ids'`` and ``'attention_mask'``.
    """
    # Honour the caller-supplied max_len; the previous version ignored the
    # parameter and always read the global configuration.max_len.
    return tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
    )


class CLRPDataset(Dataset):
    """Dataset over a DataFrame of excerpts that tokenizes each row on access.

    Args:
        data: DataFrame with an ``excerpt`` column and, unless ``is_test`` is
            true, a ``target`` column.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        max_len: Sequence length handed to ``convert_examples_to_features``.
        is_test: When true, no targets are read and items carry no ``label``.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.excerpts = data.excerpt.tolist()
        # Targets are only read for labelled data.
        if not is_test:
            self.targets = data.target.tolist()

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the tensors for row ``item``.

        The dict always holds ``input_ids`` and ``attention_mask`` (long
        tensors); a float ``label`` tensor is added when not in test mode.
        """
        encoded = convert_examples_to_features(
            self.excerpts[item], self.tokenizer, self.max_len
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of ``features`` into a weighted sum.

    No ``__init__`` is defined here, so ``self.W`` and ``self.V`` must already
    be set on the instance before ``forward`` is called.
    NOTE(review): presumably they are restored when a pickled model is loaded
    with ``torch.load`` -- confirm against the fine-tuning notebooks.
    """

    def forward(self, features):
        """Return the softmax-weighted sum of ``features`` over dim 1."""
        # One score per position along dim 1, normalised with a softmax.
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Regression model: transformer, then ``head``, then ``linear``.

    No ``__init__`` is defined here, so ``self.transformer``, ``self.head``
    and ``self.linear`` must already be set on the instance.
    NOTE(review): presumably they are restored when a pickled model is loaded
    with ``torch.load`` -- confirm against the fine-tuning notebooks.
    """

    def forward(self, input_ids, attention_mask):
        """Run the three stages in order and return the ``linear`` output."""
        hidden_states = self.transformer(input_ids, attention_mask).last_hidden_state
        pooled = self.head(hidden_states)
        return self.linear(pooled)

In [None]:
# One tokenizer per architecture, each loaded from its configured directory.
robertabase_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=configuration.robertabase_tokenizer
)
robertalarge_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=configuration.robertalarge_tokenizer
)

In [None]:
# Score the test set with every fine-tuned RoBERTa-base checkpoint; each
# entry appended below is one checkpoint's list of per-row predictions.
roberta_base_predictions = []

# The dataset and loader do not depend on the checkpoint, so build them once.
test_ds = CLRPDataset(data=test, tokenizer=robertabase_tokenizer, max_len=configuration.max_len, is_test=True)
test_sampler = SequentialSampler(test_ds)  # keeps predictions in the row order of `test`
test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=configuration.batch_size)

for model_num in range(configuration.model_count):
    print(f'Model #{model_num+1}/{configuration.model_count}')
    # NOTE(review): torch.load unpickles a whole model object -- only point
    # this at trusted checkpoint files.
    # Use the resolved `device` (not the hard-coded configuration.device) and
    # map_location so a CPU-only session can load and run the checkpoint too.
    model = torch.load(
        f'{configuration.finetuned_robertabase_path}/model_{model_num}.bin',
        map_location=device,
    ).to(device)
    model.eval()

    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            sent_id = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            preds = model(sent_id, mask)
            all_preds.extend(preds.flatten().cpu().tolist())

    roberta_base_predictions.append(all_preds)



In [None]:

# Score the test set with every fine-tuned RoBERTa-large checkpoint; each
# entry appended below is one checkpoint's list of per-row predictions.
roberta_large_predictions = []

# The dataset and loader do not depend on the checkpoint, so build them once.
test_ds = CLRPDataset(data=test, tokenizer=robertalarge_tokenizer, max_len=configuration.max_len, is_test=True)
test_sampler = SequentialSampler(test_ds)  # keeps predictions in the row order of `test`
test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=configuration.batch_size)

for model_num in range(configuration.model_count):
    print(f'Model #{model_num+1}/{configuration.model_count}')
    # NOTE(review): torch.load unpickles a whole model object -- only point
    # this at trusted checkpoint files.
    # Use the resolved `device` (not the hard-coded configuration.device) and
    # map_location so a CPU-only session can load and run the checkpoint too.
    model = torch.load(
        f'{configuration.finetuned_robertalarge_path}/model_{model_num}.bin',
        map_location=device,
    ).to(device)
    model.eval()

    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            sent_id = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            preds = model(sent_id, mask)
            all_preds.extend(preds.flatten().cpu().tolist())

    roberta_large_predictions.append(all_preds)

In [None]:
# Convert the per-checkpoint prediction lists to arrays (one row per checkpoint).
roberta_base_predictions = np.array(roberta_base_predictions)
roberta_large_predictions = np.array(roberta_large_predictions)

# Each model weighted equally: stack every checkpoint's predictions and take
# the mean across checkpoints. Unlike the hand-written 0.10-weighted sum over
# indices 0-4, this stays correct when configuration.model_count changes.
allpreds = np.concatenate(
    [roberta_large_predictions, roberta_base_predictions], axis=0
).mean(axis=0)



In [None]:
# Pair each test id with its ensembled prediction, echo the table, and write
# it out as the submission file.
result_df = pd.DataFrame({'id': test.id, 'target': allpreds})

print(result_df)
result_df.to_csv(path_or_buf='submission.csv', index=False)