# Objective
The objective of this notebook is to provide a simple, fast starting point for fine-tuning a transformer model and making a submission. It contains only the minimum code needed, written transparently and without external trainer functions. Although the notebook is simple, it includes all the components needed to train a model, such as:
1. Early stopping
2. Model Saver
3. Kfold Cross-validation

Inference notebook: https://www.kaggle.com/vigneshbaskaran/commonlit-easy-finetuner-inference

# Plan
1. Define model
2. Define Dataset and DataLoader
3. Define training and evaluation loop
4. Create cross-validation folds
5. For each fold: Train -> Save best model
6. Make predictions and submit

In [None]:
import copy
import gc
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import KFold
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, AutoTokenizer, AutoModel
from transformers import BertModel, BertPreTrainedModel, RobertaModel
from transformers.file_utils import ModelOutput
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [None]:
# Directory holding the competition CSV files, and the two files read by this notebook.
COMPETITION_DATA_PATH = Path('../input/commonlitreadabilityprize')
TRAIN_DATA_PATH = COMPETITION_DATA_PATH.joinpath('train.csv')
TEST_DATA_PATH = COMPETITION_DATA_PATH.joinpath('test.csv')

# Define Dataset and DataLoader

In [None]:
class TrainingDataset(Dataset):
    """Map-style dataset pairing each text excerpt with its target.

    Both sequences are indexed with the same position, so they are expected
    to be aligned and of equal length (the length reported is that of
    ``text_excerpts``).
    """

    def __init__(self, text_excerpts, targets):
        self.targets = targets
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return ``{'text_excerpt': ..., 'target': ...}`` for position ``idx``."""
        return dict(text_excerpt=self.text_excerpts[idx],
                    target=self.targets[idx])
    
class PredictionDataset(Dataset):
    """Map-style dataset of unlabelled text excerpts used at inference time."""

    def __init__(self, text_excerpts):
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return ``{'text_excerpt': ...}`` for position ``idx``."""
        return dict(text_excerpt=self.text_excerpts[idx])

In [None]:
def transform_targets(targets):
    """Convert regression targets into a float32 column vector.

    Args:
        targets: Any array-like of numbers (NumPy array, list, pandas
            Series, ...). Previously only NumPy arrays were accepted.

    Returns:
        A new ``np.float32`` array of shape ``(n, 1)``. The input is always
        copied, so the result never aliases the caller's data.
    """
    # np.array (rather than np.asarray) keeps the copy semantics of ``astype``.
    return np.array(targets, dtype=np.float32).reshape(-1, 1)

In [None]:
def create_training_dataloader(data, batch_size, shuffle):
    """Build a DataLoader of excerpt/target samples from a labelled table.

    Args:
        data: Table with an ``'excerpt'`` column and a ``'target'`` column
            (accessed via ``.tolist()`` / ``.to_numpy()``).
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples on each pass.

    Returns:
        A ``DataLoader`` over a ``TrainingDataset``.
    """
    dataset = TrainingDataset(
        text_excerpts=data['excerpt'].tolist(),
        targets=transform_targets(data['target'].to_numpy()),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
def create_prediction_dataloader(data, batch_size):
    """Build an ordered (non-shuffled) DataLoader over unlabelled excerpts.

    Args:
        data: Table with an ``'excerpt'`` column.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` over a ``PredictionDataset``; ``shuffle`` is off so
        outputs stay aligned with the rows of ``data``.
    """
    dataset = PredictionDataset(text_excerpts=data['excerpt'].tolist())
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
def split_into_kfolds(data, n_splits, shuffle, random_state):
    """Yield ``(train, valid)`` row subsets of ``data`` for each K-fold split.

    Args:
        data: Table supporting positional row selection through ``.iloc``.
        n_splits: Number of folds.
        shuffle: Whether rows are shuffled before being split into folds.
        random_state: Seed used for shuffling. Ignored when ``shuffle`` is
            false.

    Yields:
        Tuples ``(data.iloc[train_indices], data.iloc[valid_indices])``.
    """
    # The seed only has meaning when shuffling; recent scikit-learn releases
    # raise a ValueError if it is set while shuffle=False, so drop it then.
    effective_seed = random_state if shuffle else None
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=effective_seed)
    for train_indices, valid_indices in kf.split(data):
        yield data.iloc[train_indices], data.iloc[valid_indices]

# Define Metric, Monitor (early stopping + model saver) and KfoldMonitor

In [None]:
class Metric:
    """Streaming RMSE accumulator.

    Batches are fed through :meth:`update`; :meth:`get_rmse` reports the
    root-mean-square error over everything seen so far.
    """

    def __init__(self):
        self.sse = 0          # running sum of squared errors
        self.num_samples = 0  # running count of samples (rows of ``targets``)

    def update(self, targets, predictions):
        """Accumulate the squared error of one batch.

        Args:
            targets: Array-like of ground-truth values.
            predictions: Array-like of predicted values with the same number
                of elements as ``targets``.
        """
        targets = np.asarray(targets)
        predictions = np.asarray(predictions)
        # Same element count but different layout, e.g. (N, 1) vs (N,):
        # align the shapes so the subtraction cannot broadcast to (N, N)
        # and silently inflate the error.
        if predictions.shape != targets.shape and predictions.size == targets.size:
            predictions = predictions.reshape(targets.shape)
        self.sse += np.sum(np.square(targets - predictions))
        self.num_samples += len(targets)

    def get_rmse(self):
        """Return the RMSE over all accumulated samples.

        Returns ``nan`` when no samples have been accumulated, because the
        RMSE of an empty set is undefined.
        """
        if self.num_samples == 0:
            return np.nan
        return np.sqrt(self.sse / self.num_samples)

In [None]:
class Monitor:
    """Track the best validation score of one training run.

    Combines best-model bookkeeping, checkpoint saving and an early-stopping
    test. Lower scores are better.
    """

    def __init__(self, num_patient_epochs):
        # Number of epochs without improvement tolerated before stopping.
        self.num_patient_epochs = num_patient_epochs
        # Epoch index of the best score so far; None until a score improves
        # on the initial ``np.inf``.
        self.best_epoch_num = None
        self.best_score = np.inf
        # Snapshot of the model taken at the best epoch (None until then).
        self.best_model = None

    def early_stopping(self, current_epoch_num):
        """Return True once patience has run out.

        Stops when ``current_epoch_num`` is more than ``num_patient_epochs``
        past the best epoch. If no epoch has produced an improvement yet
        (for example every score so far was NaN), patience is counted from
        just before the first epoch instead of failing on ``None``.
        """
        reference_epoch = -1 if self.best_epoch_num is None else self.best_epoch_num
        return current_epoch_num > reference_epoch + self.num_patient_epochs

    def update_best_model(self, current_epoch_num, score, model, tokenizer, save_name):
        """Record and save the model if ``score`` beats the best so far.

        Args:
            current_epoch_num: Index of the epoch that produced ``score``.
            score: Validation score for this epoch (lower is better).
            model: Model exposing ``save_pretrained``.
            tokenizer: Tokenizer exposing ``save_pretrained``.
            save_name: Directory the model and tokenizer are written to.
        """
        if score < self.best_score:
            self.best_epoch_num = current_epoch_num
            self.best_score = score
            # Keep an independent copy: the caller goes on training ``model``
            # in place, so storing the reference would leave ``best_model``
            # holding the weights of the last epoch, not the best one.
            # The copy is parked on the CPU so it does not occupy GPU memory.
            self.best_model = copy.deepcopy(model).cpu()
            model.save_pretrained(save_name)
            tokenizer.save_pretrained(save_name)

In [None]:
class KfoldMonitor:
    """Registry mapping each fold identifier to the monitor recorded for it."""

    def __init__(self):
        self.fold_monitor = dict()

    def update(self, fold, monitor):
        """Store (or overwrite) the monitor associated with ``fold``."""
        self.fold_monitor.update({fold: monitor})

# Define training, validation and testing loops

In [None]:
def train(dataloader, model, tokenizer, optimizer, device):
    """Run one training epoch and return the model with its mean batch loss.

    Args:
        dataloader: Yields batches with ``'text_excerpt'`` and ``'target'``.
        model: Called as ``model(**inputs, labels=targets)``; its output must
            expose ``.loss``.
        tokenizer: Callable turning a batch of texts into tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(model, average_epoch_loss)`` where the average is taken over the
        number of batches.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Forward pass: tokenize the raw text, then move everything to device.
        encoded = tokenizer(batch['text_excerpt'], padding=True, truncation=True, return_tensors="pt")
        model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        labels = batch['target'].to(device)
        loss = model(**model_inputs, labels=labels).loss
        running_loss += loss.item()
        # Backward pass and parameter update; gradients are cleared afterwards.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return model, running_loss / len(dataloader)

In [None]:
def evaluate(dataloader, model, tokenizer, device):
    """Evaluate the model on a labelled dataloader without gradient tracking.

    Args:
        dataloader: Yields batches with ``'text_excerpt'`` and ``'target'``.
        model: Called as ``model(**inputs, labels=targets)``; its output must
            expose ``.loss`` and ``.logits``.
        tokenizer: Callable turning a batch of texts into tensors.
        device: Device the inputs and targets are moved to.

    Returns:
        ``(average_epoch_loss, metric)``: the mean of the per-batch losses
        and a ``Metric`` that has accumulated every batch.
    """
    model.eval()
    rmse_tracker = Metric()
    total_loss = 0
    for batch in dataloader:
        encoded = tokenizer(batch['text_excerpt'], padding=True, truncation=True, return_tensors="pt")
        model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        labels = batch['target'].to(device)
        with torch.no_grad():
            outputs = model(**model_inputs, labels=labels)
        total_loss += outputs.loss.item()
        rmse_tracker.update(
            targets=labels.detach().cpu().numpy(),
            predictions=outputs.logits.detach().cpu().numpy(),
        )
    return total_loss / len(dataloader), rmse_tracker

In [None]:
def predict(dataloader, model, tokenizer, device):
    """Run inference over an unlabelled dataloader.

    Args:
        dataloader: Yields batches with a ``'text_excerpt'`` entry.
        model: Called as ``model(**inputs)``; its output must expose
            ``.logits``.
        tokenizer: Callable turning a batch of texts into tensors.
        device: Device the inputs are moved to.

    Returns:
        A NumPy array made by vertically stacking the logits of every batch,
        in dataloader order.
    """
    model.eval()
    per_batch = []
    for batch in dataloader:
        encoded = tokenizer(batch['text_excerpt'], padding=True, truncation=True, return_tensors="pt")
        model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            outputs = model(**model_inputs)
        per_batch.append(outputs.logits.detach().cpu().numpy())
    return np.vstack(per_batch)

# Define Model

In [None]:
@dataclass
class RegressorOutput(ModelOutput):
    """Output container returned by the regressors in this notebook.

    ``ModelOutput`` subclasses are expected to be dataclasses whose fields
    all default to ``None``; declaring the fields as plain class attributes
    is rejected by recent transformers releases.
    """

    # Regression loss; None when the model is called without labels.
    loss: Optional[torch.FloatTensor] = None
    # Raw output of the regression head.
    logits: Optional[torch.FloatTensor] = None
    # Declared for parity with the usual output layout; the regressors in
    # this notebook only ever pass ``loss`` and ``logits``.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [None]:
class BertPoolerRegressor(BertPreTrainedModel):
    """BERT encoder followed by dropout and a single-output linear head.

    The head is applied to the encoder's ``pooler_output`` and trained with
    mean-squared-error loss.
    """

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.loss_fct = nn.MSELoss()
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Compute one regression output per sequence.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded unchanged to
                the BERT encoder.
            labels: Optional regression targets, one per sequence. When
                given, the MSE loss is computed as well.

        Returns:
            ``RegressorOutput`` with ``logits`` and, if ``labels`` was given,
            ``loss`` (otherwise ``loss`` is ``None``).
        """
        encoder_outputs = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids)
        pooled = self.dropout(encoder_outputs['pooler_output'])
        logits = self.regressor(pooled)
        loss = None
        if labels is not None:
            # MSELoss takes (input, target). Reshaping the labels to the
            # logits' shape stops 1-D labels from broadcasting against the
            # (batch, 1) logits and producing a wrong loss.
            loss = self.loss_fct(logits, labels.reshape(logits.shape))
        return RegressorOutput(loss=loss, logits=logits)

In [None]:
class RobertaPoolerRegressor(RobertaPreTrainedModel):
    """RoBERTa encoder followed by dropout and a single-output linear head.

    The head is applied to the encoder's ``pooler_output`` and trained with
    mean-squared-error loss.
    """

    def __init__(self, config):
        super().__init__(config)

        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.loss_fct = nn.MSELoss()
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute one regression output per sequence.

        Args:
            input_ids, attention_mask: Forwarded unchanged to the RoBERTa
                encoder.
            labels: Optional regression targets, one per sequence. When
                given, the MSE loss is computed as well.

        Returns:
            ``RegressorOutput`` with ``logits`` and, if ``labels`` was given,
            ``loss`` (otherwise ``loss`` is ``None``).
        """
        roberta_outputs = self.roberta(input_ids=input_ids,
                                       attention_mask=attention_mask)
        pooled = self.dropout(roberta_outputs['pooler_output'])
        logits = self.regressor(pooled)
        loss = None
        if labels is not None:
            # MSELoss takes (input, target). Reshaping the labels to the
            # logits' shape stops 1-D labels from broadcasting against the
            # (batch, 1) logits and producing a wrong loss.
            loss = self.loss_fct(logits, labels.reshape(logits.shape))
        return RegressorOutput(loss=loss, logits=logits)

# Experiments

In [None]:
# Batch size of the training loader; the validation loader below uses 4x this value.
BATCH_SIZE = 16
# Seed passed to the shuffled K-fold split.
RANDOM_STATE = 41
# Root output directory; each fold's best model and tokenizer are saved under SAVE_NAME/<fold>.
SAVE_NAME = Path('roberta-base-pooler-regressor')

In [None]:
train_data = pd.read_csv(TRAIN_DATA_PATH)
#### Remove before submission #####
# train_data = train_data.sort_values(by='excerpt', key=lambda x: x.str.len())[:len(train_data)//4]
# train_data = train_data[:20]
###################################

# Fold-independent setup, done once instead of once per fold.
MODEL_PATH = '../input/commonlit-data-download/roberta-base'
num_epochs = 20
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

kfolf_monitor = KfoldMonitor()
folds = split_into_kfolds(train_data, n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# The per-fold subsets get their own names so the full ``train_data`` frame
# is not overwritten by the loop.
for fold, (fold_train_data, fold_valid_data) in enumerate(folds):
    gc.collect()
    torch.cuda.empty_cache()
    print(f'Length of train data: {len(fold_train_data)}, valid data: {len(fold_valid_data)}')
    train_dataloader = create_training_dataloader(data=fold_train_data, batch_size=BATCH_SIZE, shuffle=True)
    valid_dataloader = create_training_dataloader(data=fold_valid_data, batch_size=BATCH_SIZE * 4, shuffle=False)
    monitor = Monitor(num_patient_epochs=3)
    # Every fold starts from the same pretrained weights with a fresh optimizer.
    model = RobertaPoolerRegressor.from_pretrained(MODEL_PATH)
    model.to(device)
    optimizer = AdamW(params=model.parameters(), lr=2e-5)
    for epoch_num in range(num_epochs):
        model, train_loss = train(train_dataloader, model, tokenizer, optimizer, device)
        valid_loss, valid_metric = evaluate(valid_dataloader, model, tokenizer, device)
        valid_rmse = valid_metric.get_rmse()
        # Saves the model and tokenizer under SAVE_NAME/<fold> when the RMSE improves.
        monitor.update_best_model(current_epoch_num=epoch_num, score=valid_rmse, model=model,
                                  tokenizer=tokenizer, save_name=SAVE_NAME/str(fold))
        print(f'Epoch num: {epoch_num} Train epoch loss: {train_loss}')
        print(f'Epoch num: {epoch_num} Valid epoch loss: {valid_loss}, RMSE: {valid_rmse}')
        if monitor.early_stopping(current_epoch_num=epoch_num):
            print(f'Exiting at epoch_num {epoch_num} due to early stopping')
            break
    kfolf_monitor.update(fold=fold, monitor=monitor)
    print(2*'--------------------------------------')

# Average of each fold's best validation RMSE.
mean_cross_validation_score = np.mean([fold_monitor.best_score for fold_monitor in kfolf_monitor.fold_monitor.values()])
print(f'Mean cross validation score: {mean_cross_validation_score}')

# Make submission

In [None]:
test_data = pd.read_csv(TEST_DATA_PATH)
test_dataloader = create_prediction_dataloader(test_data, batch_size=4)
MODEL_PATH = '../input/commonlit-data-download/roberta-base'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Predict the test set with the model recorded for every fold.
predictions = []
for monitor in kfolf_monitor.fold_monitor.values():
    gc.collect()
    torch.cuda.empty_cache()
    model = monitor.best_model
    model.to(device)
    fold_predictions = predict(test_dataloader, model, tokenizer, device)
    predictions.append(fold_predictions)
    # The monitor keeps a reference to this model, so it would otherwise stay
    # on the GPU and the cache clearing above could never reclaim it.
    # Move it back to the CPU before loading the next fold's model.
    model.to('cpu')

# Place the per-fold predictions side by side and average across folds.
mean_predictions = np.mean(np.hstack(predictions), axis=1)
test_data['target'] = mean_predictions
test_data[['id','target']].to_csv('submission.csv', index=False)