Just a very simple baseline applying a RoBERTa model to predict the readability of excerpts. If you have any questions, please let me know.

The notebook for inference can be found here: https://www.kaggle.com/hannes82/commonlit-readability-roberta-inference/

In [None]:
import numpy as np 
import pandas as pd 

import os
from transformers import *

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import random

import torch
import torch.nn as nn

from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

In [None]:
def set_seed(seed = 0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and the current CUDA device), switches cuDNN to deterministic mode
    with benchmarking disabled, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        A fresh ``np.random.RandomState`` initialised with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    return np.random.RandomState(seed)


seed = 82
random_state = set_seed(seed)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
# Load the training CSV into a DataFrame; later cells read its `excerpt`
# (text) and `target` (regression label) columns.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

In [None]:
# Bare expression: in a notebook this renders the DataFrame as the cell output.
train_df

In [None]:
# Whitespace-delimited word count of every excerpt (a word count, not a
# tokenizer token count).
train_df['excerpt_len'] = train_df.excerpt.apply(lambda x: len(x.split()))

In [None]:
# Length, in whitespace-delimited words, of the longest excerpt.
print(train_df.excerpt_len.max())

In [None]:
# Assign every row to one of five validation folds. KFold is used without
# shuffling, so the folds are consecutive slices of the file.
# (The splitter keeps its original name `gkf` although it is a plain KFold.)
gkf = KFold(n_splits=5)

# KFold yields *positional* indices, so the fold ids are written into a
# positional array instead of going through label-based `.loc`; this stays
# correct even if `train_df` does not carry a default 0..n-1 index.
fold_ids = np.full(len(train_df), -1, dtype=np.int64)
for fold_idx, (_, val_idx) in enumerate(gkf.split(train_df.excerpt, train_df.target)):
    fold_ids[val_idx] = fold_idx
train_df['fold'] = fold_ids

# Fold used as the hold-out set; change this single value to train on a
# different split.
fold = 0
validation_df = train_df[train_df.fold == fold].reset_index(drop=True)
train_df = train_df[train_df.fold != fold].reset_index(drop=True)

In [None]:
# Sanity check: compare the mean target of the training and validation splits.
print(train_df.target.mean(), validation_df.target.mean())

In [None]:
class Data(Dataset):
    """Map-style dataset yielding ``(excerpt, target)`` pairs from a DataFrame.

    Args:
        data: DataFrame with an ``excerpt`` column and a ``target`` column.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(excerpt, target)`` pair stored at row position ``idx``.

        Rows are addressed by position (``.iloc``) rather than by index label,
        so the dataset also works on a DataFrame whose index is not 0..n-1
        (for example a filtered frame that was not re-indexed).
        """
        excerpt = self.data['excerpt'].iloc[idx]
        target = self.data['target'].iloc[idx]
        return excerpt, target

In [None]:
# Training batches are small (8) and reshuffled; validation batches are
# larger (64) and keep the row order.
train_data = Data(train_df)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)

val_data = Data(validation_df)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
class ReadabilityModel(PreTrainedModel):
    """RoBERTa encoder followed by a mean-pooled linear regression head.

    Args:
        conf: Model configuration passed to both ``PreTrainedModel`` and the
            wrapped ``RobertaModel``. Its ``hidden_size`` sets the input width
            of the regression head.
    """

    def __init__(self, conf):
        super(ReadabilityModel, self).__init__(conf)
        # NOTE: `model_name` is a notebook-level global; it must be defined
        # before this class is instantiated.
        self.roberta = RobertaModel.from_pretrained(model_name, config=conf)
        self.drop_out = nn.Dropout(0.1)
        # Size the head from the config instead of hard-coding 768 so that
        # checkpoints with a different hidden width also work.
        self.l1 = nn.Linear(conf.hidden_size, 1)
        torch.nn.init.normal_(self.l1.weight, std=0.02)

    def forward(self, ids, mask):
        """Predict one score per sequence.

        Args:
            ids: Token id tensor fed to the encoder as ``input_ids``.
            mask: Attention mask tensor fed to the encoder.

        Returns:
            Tensor with one prediction per input sequence.
        """
        out = self.roberta(
            input_ids=ids,
            attention_mask=mask
        )
        # The first encoder output is the last layer's hidden states, i.e. the
        # same tensor as `hidden_states[-1]`, but it is available whether or
        # not `output_hidden_states` is enabled in the config.
        hidden = out[0]
        hidden = self.drop_out(hidden)
        # NOTE(review): the mean runs over every position, padding included,
        # so a prediction depends on how much its batch was padded. Kept
        # as-is so existing checkpoints and inference code stay consistent.
        pooled = hidden.mean(dim=1, keepdim=True)

        preds = self.l1(pooled)

        # Drop the two singleton dimensions left by the keepdim pooling and
        # the one-unit head.
        return preds.squeeze(-1).squeeze(-1)

In [None]:
# Pretrained checkpoint shared by the tokenizer, the config and the encoder.
model_name = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

model_config = RobertaConfig.from_pretrained(model_name)
model_config.output_hidden_states = True

model = ReadabilityModel(model_config)
model = model.to(device)

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# `eps=1e-6` reproduces the default of the transformers version (PyTorch's own
# default is 1e-8); learning rate and weight decay are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01, eps=1e-6)
# Linear warm-up over the first 100 optimizer steps, constant rate afterwards.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)

loss_fct = nn.MSELoss()

epochs = 3

In [None]:
def _encode_batch(excerpts, targets):
    """Tokenize a batch of excerpts and move inputs and targets to `device`.

    Returns:
        Tuple ``(input_ids, attention_mask, targets)`` of tensors on `device`.
    """
    batch = tokenizer(list(excerpts), truncation=True, padding=True, return_tensors='pt', add_special_tokens=True)
    input_ids = batch['input_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    # The loader normally hands the targets over as a tensor already;
    # `as_tensor` avoids the copy-construct warning that `torch.tensor(tensor)`
    # raises while still accepting plain sequences.
    targets = torch.as_tensor(targets).to(device, dtype=torch.float)
    return input_ids, attention_mask, targets


for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0
    train_batches = 0
    for excerpts, targets in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, targets = _encode_batch(excerpts, targets)

        preds = model(input_ids, attention_mask)

        # Optimise the RMSE of the batch directly.
        loss = torch.sqrt(loss_fct(preds, targets))

        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss_sum += loss.item()
        train_batches += 1
    loss_train = train_loss_sum / train_batches

    # ---- validation pass ----
    model.eval()
    val_loss_sum = 0.0
    val_batches = 0
    preds_chunks = []
    targets_chunks = []
    with torch.no_grad():
        for excerpts, targets in tqdm(val_loader):
            input_ids, attention_mask, targets = _encode_batch(excerpts, targets)

            preds = model(input_ids, attention_mask)

            val_loss_sum += torch.sqrt(loss_fct(preds, targets)).item()
            val_batches += 1

            preds_chunks.append(preds.cpu().numpy())
            targets_chunks.append(targets.cpu().numpy())

    # Mean of the per-batch RMSE values (not the RMSE over the whole split).
    loss_val = val_loss_sum / val_batches
    preds_val = np.concatenate(preds_chunks, axis=None)
    targets_val = np.concatenate(targets_chunks, axis=None)
    # RMSE over the whole validation split. Taking the square root explicitly
    # avoids the `squared=False` argument, which newer scikit-learn releases
    # deprecate.
    rms_val = np.sqrt(mean_squared_error(targets_val, preds_val))
    print('Epoch: {} - Loss: {:.6f} - Loss val: {:.6f} - RMSE: {:.3f}'.format(
        epoch + 1, loss_train, loss_val, rms_val))

In [None]:
# Persist only the model weights (its state_dict), not the module object itself.
torch.save(model.state_dict(), 'roberta_baseline_2.bin')