# CommonLit Fine-tuned RoBERTa Inference
A pretrained RoBERTa-base transformer is fine-tuned on the competition dataset. This inference notebook is based on the training notebook here: https://www.kaggle.com/vigneshbaskaran/commonlit-easy-transformer-finetuner

In [None]:
import gc
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, AutoTokenizer, RobertaModel
from transformers.file_utils import ModelOutput
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [None]:
# Directory holding the competition input files.
COMPETITION_DATA_PATH = Path('../input/commonlitreadabilityprize')
# CSV with the excerpts to score; read later to build the submission.
TEST_DATA_PATH = COMPETITION_DATA_PATH.joinpath('test.csv')

# Define Dataset and Dataloader

In [None]:
class PredictionDataset(Dataset):
    """Map-style dataset over raw text excerpts, used for inference only.

    Each item is a dict with the single key ``'text_excerpt'``; no
    tokenization happens here.
    """

    def __init__(self, text_excerpts):
        # Indexable collection of excerpts (the caller in this file passes a list).
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of excerpts held by the dataset."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return the excerpt at position ``idx`` wrapped in a dict."""
        excerpt = self.text_excerpts[idx]
        return {'text_excerpt': excerpt}
    
def create_prediction_dataloader(data, batch_size):
    """Build an unshuffled DataLoader over the ``'excerpt'`` column of ``data``.

    Args:
        data: Table-like object whose ``data['excerpt']`` supports ``.tolist()``
            (a pandas DataFrame in this file).
        batch_size: Number of excerpts per batch.

    Returns:
        A ``DataLoader`` yielding batches from a ``PredictionDataset`` in the
        original row order (``shuffle=False``), so predictions line up with
        the input rows.
    """
    excerpts = data['excerpt'].tolist()
    return DataLoader(
        PredictionDataset(text_excerpts=excerpts),
        batch_size=batch_size,
        shuffle=False,
    )

# Define Model

In [None]:
@dataclass
class RegressorOutput(ModelOutput):
    """Output container returned by the regression model's ``forward``.

    ``ModelOutput`` subclasses are meant to be dataclasses: the transformers
    base class derives its key/attribute view from the dataclass fields, and
    recent transformers releases raise ``TypeError`` when a subclass is not
    decorated with ``@dataclass``. The fields therefore carry annotations and
    ``None`` defaults.
    """

    # MSE loss; None when no labels were supplied.
    loss: Optional[torch.FloatTensor] = None
    # Raw regression output of the model head.
    logits: Optional[torch.FloatTensor] = None
    # Not populated by the model in this file; kept for interface parity.
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

In [None]:
class RobertaPoolerRegressor(RobertaPreTrainedModel):
    """RoBERTa encoder with a single-output linear head on the pooler output.

    The attribute names (``roberta``, ``dropout``, ``regressor``) determine
    the parameter names in the model's state dict, so they must stay as they
    are for saved checkpoints to load.
    """

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.loss_fct = nn.MSELoss()
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and regression head.

        Args:
            input_ids: Token ids forwarded to the RoBERTa encoder.
            attention_mask: Attention mask forwarded to the RoBERTa encoder.
            labels: Optional regression targets, one value per example.
                When given, an MSE loss is computed.

        Returns:
            RegressorOutput with ``logits`` (the linear head's output, last
            dimension 1) and ``loss`` (``None`` when ``labels`` is ``None``).
        """
        encoder_outputs = self.roberta(input_ids=input_ids,
                                       attention_mask=attention_mask)
        pooled = self.dropout(encoder_outputs['pooler_output'])
        logits = self.regressor(pooled)

        loss = None
        if labels is not None:
            # Flatten both sides so (batch,) labels are compared element-wise
            # with the (batch, 1) logits instead of broadcasting to
            # (batch, batch); arguments follow MSELoss's (input, target) order.
            loss = self.loss_fct(logits.reshape(-1), labels.reshape(-1))
        return RegressorOutput(loss=loss, logits=logits)

# Define prediction loop

In [None]:
def predict(dataloader, model, tokenizer, device):
    """Run ``model`` over every batch of ``dataloader`` and stack the logits.

    Args:
        dataloader: Iterable of batches; each batch is a mapping whose
            ``'text_excerpt'`` entry is passed to the tokenizer.
        model: Callable model with an ``eval()`` method whose output exposes
            a ``logits`` tensor.
        tokenizer: Callable returning a mapping of tensors when invoked with
            ``padding=True, truncation=True, return_tensors="pt"``.
        device: Device the tokenized tensors are moved to before the forward
            pass.

    Returns:
        A NumPy array with the per-batch logits stacked vertically, in
        dataloader order.
    """
    model.eval()
    stacked_parts = []
    # Gradients are never needed for inference, so disable them for the whole pass.
    with torch.no_grad():
        for batch in dataloader:
            encoded = tokenizer(batch['text_excerpt'], padding=True, truncation=True, return_tensors="pt")
            on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
            logits = model(**on_device).logits
            stacked_parts.append(logits.detach().cpu().numpy())
    return np.vstack(stacked_parts)

In [None]:
# Inference configuration.
BATCH_SIZE = 16
NUM_FOLDS = 5
TOKENIZER_PATH = '../input/commonlit-data-download/roberta-base'
MODEL_DIR = '../input/commonlit-easy-transformer-finetuner/additional-pretrained-roberta-base-pooler-regressor'

test_data = pd.read_csv(TEST_DATA_PATH)
# Use the declared BATCH_SIZE; previously a literal 4 was passed here and the
# constant was never read.
test_dataloader = create_prediction_dataloader(test_data, batch_size=BATCH_SIZE)

# The device and tokenizer are identical for every fold, so build them once
# instead of on each loop iteration.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

predictions = []
for fold_num in range(NUM_FOLDS):
    # Each fold's checkpoint lives in a sub-directory named after its index.
    model_path = f'{MODEL_DIR}/{fold_num}'
    model = RobertaPoolerRegressor.from_pretrained(model_path)

    # Rebinding `model` above dropped the previous fold's model; reclaim that
    # memory before moving the new model to the device.
    gc.collect()
    torch.cuda.empty_cache()
    model.to(device)

    fold_predictions = predict(test_dataloader, model, tokenizer, device)
    predictions.append(fold_predictions)

# Place the per-fold prediction arrays side by side and average across folds.
predictions = np.hstack(predictions)
mean_predictions = np.mean(predictions, axis=1)
test_data['target'] = mean_predictions
test_data[['id', 'target']].to_csv('submission.csv', index=False)