Inference with a simple RoBERTa model.

The notebook used to train the model is available here: https://www.kaggle.com/hannes82/commonlit-readability-roberta-simple-baseline

In [None]:
import numpy as np 
import pandas as pd 

import os
from transformers import *

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import random

import torch
import torch.nn as nn

from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

In [None]:
# Choose the compute device once; later cells move the model and the
# tokenized batches onto it.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print("GPU is available" if use_gpu else "GPU not available, CPU used")

In [None]:
# Read the test split; later cells use its `excerpt` and `id` columns.
test_csv_path = '../input/commonlitreadabilityprize/test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
# Bare expression: the notebook renders the loaded frame as the cell output.
test_df

In [None]:
class Data(Dataset):
    """Map-style dataset that yields raw excerpt strings from a table.

    Tokenization is not done here; each item is the untouched text so the
    consumer can tokenize a whole batch at once.

    Args:
        data: Table with an ``excerpt`` column holding the texts.
    """

    def __init__(self, data: pd.DataFrame):
        super().__init__()
        self.data = data

    def __len__(self) -> int:
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx: int) -> str:
        """Return the excerpt stored at row *position* ``idx``."""
        # Use positional access: samplers hand out 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        # A filtered, shuffled or re-indexed frame would otherwise raise
        # KeyError or silently return the wrong row.
        return self.data["excerpt"].iloc[idx]

In [None]:
# Wrap the test frame and iterate it in fixed order (no shuffling) so the
# predictions line up row-for-row with `test_df`.
test_data = Data(test_df)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
class ReadabilityModel(PreTrainedModel):
    """RoBERTa encoder followed by a mean-pooled linear regression head.

    The attribute names ``roberta``, ``drop_out`` and ``l1`` are part of the
    checkpoint format (they are the state-dict key prefixes) and must not be
    renamed.

    Args:
        conf: Model configuration used to build the ``RobertaModel`` encoder;
            its ``hidden_size`` also sets the input width of the head.
    """

    def __init__(self, conf):
        super().__init__(conf)
        self.roberta = RobertaModel(config=conf)
        self.drop_out = nn.Dropout(0.1)
        # Take the head width from the config instead of hard-coding 768 so
        # the class also works with encoders of a different hidden size. The
        # default RobertaConfig has hidden_size == 768, so existing
        # checkpoints load unchanged.
        self.l1 = nn.Linear(conf.hidden_size, 1)
        torch.nn.init.normal_(self.l1.weight, std=0.02)

    def forward(self, ids, mask):
        """Predict one score per sequence.

        Args:
            ids: Token-id tensor passed to the encoder as ``input_ids``.
            mask: Attention-mask tensor passed as ``attention_mask``.

        Returns:
            Tensor of predictions with the two trailing singleton
            dimensions removed (one value per input sequence).
        """
        encoder_out = self.roberta(
            input_ids=ids,
            attention_mask=mask
        )
        # Element 0 of the encoder output is the last layer's hidden states,
        # the same tensor as ``hidden_states[-1]``, but available regardless
        # of ``output_hidden_states`` / ``return_dict`` settings.
        token_states = encoder_out[0]
        token_states = self.drop_out(token_states)
        # NOTE(review): the mean runs over every sequence position, padding
        # included (the mask only reaches the encoder's attention). Left
        # as-is so inference matches the saved weights; presumably training
        # pooled the same way, confirm against the training notebook.
        pooled = torch.mean(token_states, 1, True)

        preds = self.l1(pooled)

        # (batch, 1, 1) -> (batch,)
        preds = preds.squeeze(-1).squeeze(-1)

        return preds

In [None]:
# RoBERTa numbers positions starting at padding_idx + 1 (= 2), so a table of
# 514 position embeddings can only address sequences of up to 512 tokens.
# Capping the tokenizer at 512 makes `truncation=True` cut over-long excerpts
# to a length the model can embed; a 514 limit would let 513/514-token inputs
# through and overflow the position-embedding lookup.
tokenizer = RobertaTokenizerFast.from_pretrained('../input/robertabase', model_max_length=512)

# Architecture settings must match the saved checkpoint's tensor shapes.
model_config = RobertaConfig()
model_config.output_hidden_states = True
model_config.max_position_embeddings = 514
model_config.vocab_size = 50265
model_config.type_vocab_size = 1

model = ReadabilityModel(model_config)
# The freshly built model lives on the CPU, so always deserialize the weights
# there (works on GPU and CPU-only machines alike) and move the model to the
# target device once afterwards.
state_dict = torch.load("../input/base-train-commonlit/roberta_baseline.bin", map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(device)

In [None]:
# Batched inference over the test loader. Predictions are gathered per batch
# and joined once at the end, which avoids re-copying the growing array on
# every iteration and keeps `preds_test` defined even for an empty loader.
model.eval()
batch_preds = []
with torch.no_grad():
    for excerpts in tqdm(test_loader):
        # NOTE(review): add_special_tokens=False means no <s>/</s> markers are
        # added; presumably this mirrors the training notebook, confirm.
        batch = tokenizer(list(excerpts), truncation=True, padding=True, return_tensors='pt', add_special_tokens=False)
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

        preds = model(input_ids, attention_mask)
        batch_preds.append(preds.detach().cpu().numpy())

# axis=None flattens, so the result is always a 1-D array of scores in loader
# order.
if batch_preds:
    preds_test = np.concatenate(batch_preds, axis=None)
else:
    preds_test = np.empty(0, dtype=np.float32)

In [None]:
# Pair every test id with its prediction: start from the id column and attach
# the predictions as `target`.
submission_df = test_df[['id']].assign(target=preds_test)

In [None]:
# Write the submission file without the frame's index column.
submission_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_path, index=False)

In [None]:
# Bare expression: the notebook renders the submission frame as the cell output.
submission_df