# CommonLit Readability - DistilBERT fine tuning and data augmentation
This notebook fine-tunes DistilBERT for the CommonLit Readability competition, using k-fold cross-validation.

Most of the code comes from S Canisir's notebooks : https://www.kaggle.com/snnclsr/commonlit-pytorch-distilbert-training  
With the help of V Baskaran's work as well : https://www.kaggle.com/vigneshbaskaran/commonlit-easy-transformer-finetuner  

My first attempts used DistilBERT followed by a few linear and dropout layers, without freezing any parameters. The full model therefore had around 66 million parameters to update. With so little data available, it overfitted heavily, even with the dropout layer: the loss decreased a lot on the training set but not on the validation set.  
  
I then tried freezing the 66 million parameters of the DistilBERT model (see the `TextRegressionModel` class, where the freezing code is currently commented out), to make it possible to train the model with the amount of data provided.  
  
I also tried augmenting the texts (see the `augment_text` and `augment_ratio` parameters in the `CFG` class).  
Be aware that generating the augmented data takes a long time, which is why only 10% of the data is augmented at each epoch.  
Note: to use the nlpaug library offline, it has to be added to the notebook as a dataset.


# Imports

In [None]:
import os
import time
import math
import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import KFold
from sklearn import metrics

from transformers import DistilBertTokenizer
from transformers import DistilBertModel

from logging import getLogger
from logging import INFO
from logging import FileHandler
from logging import Formatter
from logging import StreamHandler
from pathlib import Path

# text augmentation library
!cp ../input/nlpaug-114/nlpaug-1.1.4-py3-none-any.whl /kaggle/working/
!pip install --no-index --find-links /kaggle/working/ nlpaug
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [None]:
# clean memory: run Python's garbage collector first, then ask PyTorch
# to empty its CUDA memory cache
gc.collect()
torch.cuda.empty_cache()

In [None]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = (torch.device("cuda") if torch.cuda.is_available()
          else torch.device("cpu"))
device

# Data Loading

In [None]:
# directory where the training log is written
OUTPUT_DIR = "./"
# folder holding the competition files (train.csv, test.csv, ...)
BASE_DATA_PATH = Path("../input") / "commonlitreadabilityprize"

!ls {BASE_DATA_PATH}

In [None]:
df_train = pd.read_csv(BASE_DATA_PATH / "train.csv")
display(df_train.head(3))

# explore excerpt: length of each text, in words and in characters
df_train['nb_words'] = df_train.excerpt.apply(lambda x: len(x.split()))
df_train['nb_chars'] = df_train.excerpt.apply(len)

# plot histograms (with density estimate) of both length measures
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(data=df_train, x="nb_chars", bins=100, ax=ax[0],
             color='orange', edgecolor=None, stat='density')
sns.kdeplot(data=df_train, x="nb_chars", color='red', ax=ax[0])
# vertical line at the median; axvline's ymin/ymax are fractions of the
# axes height (0 to 1), so the defaults already span the whole plot
ax[0].axvline(np.median(df_train.nb_chars))
ax[0].set_title('Text length', fontsize=10)

sns.histplot(data=df_train, x="nb_words", bins=100, ax=ax[1],
             color='salmon', edgecolor=None, stat='density')
sns.kdeplot(data=df_train, x="nb_words", color='red', ax=ax[1])
ax[1].axvline(np.median(df_train.nb_words))
ax[1].set_title('Number of words', fontsize=10)

plt.show()

In [None]:
# target labels: print their range, then plot their distribution
labels = df_train['target']
print("Values of the target : {} -> {}".format(labels.min(), labels.max()))

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(data=df_train, x="target", bins=100, ax=ax[0],
             color='lightgreen', edgecolor=None, stat='density')
sns.kdeplot(data=df_train, x="target", ax=ax[0], color='forestgreen')
# vertical line at the median; axvline's ymin/ymax are fractions of the
# axes height (0 to 1), so the defaults already span the whole plot
ax[0].axvline(np.median(df_train.target))
ax[0].set_title('Targets histogram', fontsize=10)

sns.boxplot(x=df_train["target"], ax=ax[1], color='lightgreen')
ax[1].set_title('Targets boxplot', fontsize=10)
plt.show()

# Utils

In [None]:
class AverageMeter:
    """Keeps a weighted running average of a quantity.

    Attributes:
        cur_val: the value passed to the latest `update` call
        sum: the weighted sum of all values seen so far
        count: the total weight (number of increments) seen so far
        avg: sum / count
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Puts every statistic back to zero."""
        self.cur_val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Records `val`, counted `n` times, and refreshes the average."""
        self.cur_val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class Metric:
    """Accumulates squared residuals (target - prediction) over a period
    (typically an epoch) and reports the RMSE over everything seen."""

    def __init__(self):
        self.sse = 0
        self.num_samples = 0

    def update(self, targets, predictions):
        """Adds one batch: `predictions` is flattened with `flatten`
        before being compared with `targets`."""
        residuals = targets - flatten(predictions)
        self.sse += np.sum(np.square(residuals))
        self.num_samples += len(targets)

    def get_rmse(self):
        """Returns the root of the mean squared residual seen so far."""
        return np.sqrt(self.sse / self.num_samples)


def flatten(array):
    """Keeps the first element of every row of `array`.

    Returns the selected values as a 1-D numpy array."""

    return np.array([row[0] for row in array])


def asMinutes(s):
    """Formats a duration given in seconds as '<minutes>m <seconds>s'."""

    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Reports the time elapsed since `since` and an estimate of the
    time remaining.

    since : start timestamp, as returned by time.time()
    percent : fraction of the total work already done (must be non zero)"""

    elapsed = time.time() - since
    # extrapolate the total duration from the progress made so far
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Builds the module logger: INFO level, bare messages sent both
    to the stream handler and to `log_file`."""
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    handlers = []
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        handlers.append(handler)

    # drop handlers left by a previous call so messages are not duplicated
    logger.handlers.clear()
    for handler in handlers:
        logger.addHandler(handler)

    return logger


def seed_torch(seed=42):
    """Seeds every random number generator used by the notebook
    (python, numpy, torch CPU and CUDA) and asks cuDNN to be
    deterministic."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # python and numpy generators
    random.seed(seed)
    np.random.seed(seed)

    # torch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def get_scheduler(optimizer):
    """Instantiates the learning-rate scheduler named by CFG.scheduler.

    optimizer : the optimizer whose learning rate will be scheduled

    Supported values are 'CosineAnnealingLR' (configured with CFG.T_max
    and CFG.min_lr) and 'StepLR' (step_size=30, gamma=0.1).
    Raises ValueError for any other value."""
    if CFG.scheduler == 'CosineAnnealingLR':
        return optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                    T_max=CFG.T_max,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
    if CFG.scheduler == 'StepLR':
        return optim.lr_scheduler.StepLR(optimizer,
                                         step_size=30,
                                         gamma=0.1)
    # fail with a clear message instead of an unbound local variable
    raise ValueError("Unsupported CFG.scheduler: {!r} (expected "
                     "'CosineAnnealingLR' or 'StepLR')"
                     .format(CFG.scheduler))


def get_optimizer(model):
    """Instantiates the optimizer named by CFG.optimizer.

    model : the module whose parameters will be optimized

    The optimizer is built with lr=CFG.lr and weight_decay=CFG.weight_decay
    (weight_decay is for L2 regularization).
    Supported values are 'Adam', 'RMSprop' and 'SGD'.
    Raises ValueError for any other value."""
    optimizers = {
        'Adam': optim.Adam,
        'RMSprop': optim.RMSprop,
        'SGD': optim.SGD,
    }
    try:
        optimizer_class = optimizers[CFG.optimizer]
    except KeyError:
        # fail with a clear message instead of an unbound local variable
        raise ValueError("Unsupported CFG.optimizer: {!r} (expected one "
                         "of {})".format(CFG.optimizer,
                                         sorted(optimizers))) from None
    return optimizer_class(model.parameters(), lr=CFG.lr,
                           weight_decay=CFG.weight_decay)

# Configuration

In [None]:
class CFG:
    """Settings and hyper-parameters shared by the whole notebook."""

    # pretrained model: name used in output file names, path used to load it
    model_name = "distilbert-base-uncased"
    model_path = "../input/distilbertbaseuncased"

    # tokenization and data loading
    max_length = 256
    batch_size = 32
    num_workers = 4

    # regression head
    dropout_p = 0.4

    # optimizer and learning-rate schedule
    optimizer = "Adam"
    lr = 3e-4
    weight_decay = 1e-6
    scheduler = "CosineAnnealingLR"
    T_max = 10
    min_lr = 1e-6

    # training loop
    n_epochs = 10
    n_folds = 5
    seed = 28
    print_freq = 50

    # text augmentation
    augment_text = False
    augment_ratio = 0.1


# Dataset

In [None]:
def augment_sentences(text, action='switch'):
    """Randomly switches two sentences of the text, or deletes one.

    text : the text to augment
    action : 'switch' (swap two random sentences) or 'delete' (remove
             one random sentence)

    Returns the augmented text, rebuilt as 'sentence. sentence.'.
    Warning : the original punctuation marks are lost (only dots remain).
    When the text has fewer than two sentences, or the action is unknown,
    nothing is switched or deleted and the text is returned with its
    sentence separators normalised to dots."""
    sep = '.'
    for char in ('!', '?', '...'):
        text = text.replace(char, sep)

    # strip the fragments so that whitespace-only ones are dropped and
    # the re-joined text does not contain doubled spaces
    sentences = [part.strip() for part in text.split(sep)]
    sentences = [part for part in sentences if part]
    nb_sentences = len(sentences)

    # both actions need at least two sentences: deleting the only
    # sentence would leave a text reduced to '.'
    if nb_sentences < 2 or action not in ('switch', 'delete'):
        return text

    if action == 'switch':
        first, second = random.sample(range(nb_sentences), 2)
        sentences[first], sentences[second] = (sentences[second],
                                               sentences[first])
    else:
        del sentences[random.randrange(nb_sentences)]

    return '. '.join(sentences) + '.'


def augment_words(text, action='delete'):
    """Applies one random word-level augmentation with the NLPAug library.

    text : the text to augment
    action : 'swap', 'delete' or 'crop' (random word augmenter), or
             'insert' (contextual insertion with the model stored at
             CFG.model_path)

    Returns the output of the augmenter's `augment` method.
    Raises ValueError when the action is not supported."""
    if action in ('delete', 'crop', 'swap'):
        aug = naw.RandomWordAug(action=action, aug_p=0.3,
                                aug_min=5, aug_max=50)
    elif action == 'insert':
        aug = naw.ContextualWordEmbsAug(model_path=CFG.model_path,
                                        action=action)
    else:
        # fail with a clear message instead of an unbound local variable
        raise ValueError("Unsupported word augmentation action: {!r}"
                         .format(action))

    return aug.augment(text)


def apply_augment(text):
    """Augments a text twice: one randomly chosen sentence-level
    transformation, followed by one randomly chosen word-level one."""

    sentence_actions = ['switch', 'delete']
    word_actions = ['delete', 'crop', 'swap', 'insert']

    # draw both transformations up front (sentence one first)
    sentence_pick = random.sample(range(len(sentence_actions)), 1)[0]
    word_pick = random.sample(range(len(word_actions)), 1)[0]

    switched = augment_sentences(text, action=sentence_actions[sentence_pick])
    return augment_words(switched, action=word_actions[word_pick])

In [None]:
class CommonLitDataset(Dataset):
    """Dataset yielding tokenized excerpts together with their target.

    df : dataframe with an 'excerpt' and a 'target' column
    tokenizer : callable tokenizing one text
    max_length : length the texts are padded / truncated to"""

    def __init__(self, df, tokenizer, max_length):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]
        encoding = self.tokenizer(sample.excerpt,
                                  return_tensors="pt",
                                  max_length=self.max_length,
                                  padding="max_length",
                                  truncation=True)

        # the tokenizer output has a leading batch axis of size 1: drop it
        item = {"ids": encoding["input_ids"][0],
                "masks": encoding["attention_mask"][0]}
        item["targets"] = torch.tensor(sample.target).float()
        return item

# Model

In [None]:
class TextRegressionModel(nn.Module):
    """DistilBERT followed by a small regression head
    (linear 768 -> 200, ReLU, dropout, linear 200 -> 1)."""

    def __init__(self, model_name, dropout_p=0.3):
        super().__init__()

        # pretrained DistilBERT, loaded from CFG.model_path
        # (the model_name argument is not used here)
        self.model = DistilBertModel.from_pretrained(CFG.model_path)
        # to freeze the pretrained weights, uncomment:
        # for param in self.model.parameters():
        #     param.requires_grad = False

        # regression head
        self.features = nn.Linear(768, 200)
        self.dropout = nn.Dropout(dropout_p)
        self.regressor = nn.Linear(200, 1)

        # layers and their display names, used by the reporting helpers
        self.list_layers = [self.model, self.features,
                            self.dropout, self.regressor]
        self.list_layers_names = ['DistilBERT', 'Features',
                                  'Dropout', 'Regressor']

    def forward(self, input_ids, attention_mask):
        encoded = self.model(input_ids=input_ids,
                             attention_mask=attention_mask)
        # keep the hidden state at the first position of each sequence
        # (presumably the [CLS] token - to be confirmed)
        first_token = encoded.last_hidden_state[:, 0]
        hidden = self.dropout(F.relu(self.features(first_token)))
        return self.regressor(hidden)

    def count_parameters(self):
        """Returns the total number of trainable parameters."""
        return sum(p.numel()
                   for layer in self.list_layers
                   for p in layer.parameters()
                   if p.requires_grad)

    def details_parameters(self):
        """Displays, layer by layer, a table of the trainable
        parameters and their sizes."""

        for layer_name, layer in zip(self.list_layers_names,
                                     self.list_layers):
            table = pd.DataFrame(columns=['module', 'parameters'])
            for name, parameter in layer.named_parameters():
                if parameter.requires_grad:
                    table.loc[len(table)] = {'module': name,
                                             'parameters': parameter.numel()}
            print(layer_name, 'layer :')
            display(table)

# Train and eval Functions

In [None]:
def train_step(model, criterion, optimizer, data_loader, epoch, device=device):
    """Trains the model for one epoch over `data_loader`.

    Returns the average loss (weighted by batch size) and the RMSE
    computed over all the samples of the epoch."""

    batch_time, data_time, train_loss = (AverageMeter(), AverageMeter(),
                                         AverageMeter())
    metric = Metric()
    model.train()

    progress_template = ('Epoch {0} [{1}/{2}]: '
                         'Elapsed {remain:s} '
                         'Loss: {loss.cur_val:.4f} (avg {loss.avg:.4f}) ')

    start = end = time.time()

    for step, batch in enumerate(data_loader):
        # time spent waiting for the batch
        data_time.update(time.time() - end)

        # move the current batch to the device
        input_ids = batch["ids"].to(device)
        attention_masks = batch["masks"].to(device)
        targets = batch["targets"].to(device)
        batch_size = input_ids.size(0)

        # forward pass; loss.item() is the mean MSE over the batch
        output = model(input_ids, attention_masks)
        loss = criterion(output.squeeze(1), targets)
        train_loss.update(loss.item(), batch_size)

        # accumulate the residuals for the epoch-level RMSE
        metric.update(targets=targets.detach().cpu().numpy(),
                      predictions=output.detach().cpu().numpy())

        # backward pass, weights update, then gradients reset
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # time spent on the whole batch
        batch_time.update(time.time() - end)
        end = time.time()

        # report every CFG.print_freq steps and on the last step
        is_last_step = step == (len(data_loader) - 1)
        if step % CFG.print_freq == 0 or is_last_step:
            progress = float(step + 1) / len(data_loader)
            print(progress_template.format(
                epoch+1, step, len(data_loader),
                loss=train_loss,
                remain=timeSince(start, progress)))

    return train_loss.avg, metric.get_rmse()


def eval_step(model, criterion, data_loader, epoch, device=device):
    """Evaluates the model over `data_loader` without updating it.

    Returns the average loss (weighted by batch size) and the RMSE
    computed over all the samples."""

    batch_time, data_time, eval_loss = (AverageMeter(), AverageMeter(),
                                        AverageMeter())
    metric = Metric()
    model.eval()

    progress_template = ('EVAL - Epoch {0} [{1}/{2}]: '
                         'Loss: {loss.cur_val:.4f} (avg {loss.avg:.4f}) ')

    start = end = time.time()

    for step, batch in enumerate(data_loader):
        data_time.update(time.time() - end)

        # move the current batch to the device
        input_ids = batch["ids"].to(device)
        attention_masks = batch["masks"].to(device)
        targets = batch["targets"].to(device)
        batch_size = input_ids.size(0)

        # forward pass only: no gradient is tracked
        with torch.no_grad():
            output = model(input_ids, attention_masks)
        loss = criterion(output.squeeze(1), targets)
        eval_loss.update(loss.item(), batch_size)

        batch_time.update(time.time() - end)
        end = time.time()

        # accumulate the residuals for the RMSE
        metric.update(targets=targets.detach().cpu().numpy(),
                      predictions=output.detach().cpu().numpy())

        # report every CFG.print_freq steps and on the last step
        is_last_step = step == (len(data_loader) - 1)
        if step % CFG.print_freq == 0 or is_last_step:
            print(progress_template.format(epoch+1, step, len(data_loader),
                                           loss=eval_loss))

    return eval_loss.avg, metric.get_rmse()

In [None]:
def train_loop(folds, fold):
    """Trains and validates one model on a single cross-validation fold.

    folds : dataframe of the observations, with a 'fold' column holding
            the fold number of each row
    fold : number of the fold used for validation; all the other rows
           are used for training

    For each epoch, the losses and RMSE are logged and recorded with
    `save_training`; the model, optimizer and scheduler states are saved
    to '<CFG.model_name>_fold_<fold>_best.pth' every time the validation
    loss improves. The tokenizer is saved to '<CFG.model_name>_tokenizer'.
    """
    print('\n*** FOLD ', fold)

    # get indexes tagged for the current fold
    train_index = folds[folds["fold"] != fold].index
    valid_index = folds[folds["fold"] == fold].index

    # get data
    train_folds = folds.loc[train_index].reset_index(drop=True)
    valid_folds = folds.loc[valid_index].reset_index(drop=True)

    # create tokenizer and keep a copy of it next to the models
    tokenizer = DistilBertTokenizer.from_pretrained(CFG.model_path)
    tokenizer.save_pretrained(f"{CFG.model_name}_tokenizer")

    # create validation data loader
    valid_dataset = CommonLitDataset(df=valid_folds, tokenizer=tokenizer,
                                     max_length=CFG.max_length)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=CFG.batch_size,
                                   shuffle=False,
                                   num_workers=CFG.num_workers,
                                   pin_memory=True)

    # define model, optimizer, scheduler, loss
    model = TextRegressionModel(model_name=CFG.model_name,
                                dropout_p=CFG.dropout_p)
    model.to(device)
    if fold == 0:
        # the architecture is the same for every fold: describe it once
        print('Model instanciated. Number of trainable parameters : {}\n'
              .format(model.count_parameters()))
        model.details_parameters()

    optimizer = get_optimizer(model)
    scheduler = get_scheduler(optimizer)

    criterion = nn.MSELoss().to(device)
    best_loss = np.inf  # initiate with positive infinity

    # iterate through epochs
    for epoch in range(CFG.n_epochs):

        # augment part of the data except for the first epoch
        # (the augmented excerpts replace the current ones in train_folds,
        # so augmentations accumulate from one epoch to the next)
        if CFG.augment_text and epoch != 0:
            print("Epoch {} - Augmenting data...".format(epoch + 1))
            start = time.time()
            ratio = CFG.augment_ratio
            idx_aug = random.sample(range(len(train_folds)),
                                    int(len(train_folds) * ratio))
            for idx in idx_aug:
                cur_excerpt = train_folds.loc[idx, 'excerpt']
                train_folds.loc[idx, 'excerpt'] = apply_augment(cur_excerpt)
            print("... done ({} sec)".format(round(time.time() - start)))

        # build train data loader (rebuilt at each epoch, after augmentation)
        train_dataset = CommonLitDataset(df=train_folds, tokenizer=tokenizer,
                                         max_length=CFG.max_length)
        train_data_loader = DataLoader(train_dataset,
                                       batch_size=CFG.batch_size,
                                       shuffle=True,
                                       num_workers=CFG.num_workers,
                                       pin_memory=True)

        # train and evaluate
        start_time = time.time()
        train_loss, train_rmse = train_step(model, criterion, optimizer,
                                            train_data_loader, epoch)
        eval_loss, eval_rmse = eval_step(model, criterion,
                                         valid_data_loader, epoch)
        scheduler.step()
        elapsed = time.time() - start_time

        # every part of the message needs its own f prefix, otherwise
        # the placeholders of the second part are logged literally
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}'
                    f' time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - avg_eval_loss: {eval_loss:.4f} '
                    f'- avg_rmse: {eval_rmse:.4f}')
        save_training(fold, epoch, train_loss, eval_loss,
                      train_rmse, eval_rmse)

        # save best loss and model
        if eval_loss < best_loss:
            best_loss = eval_loss
            file_name = f"{CFG.model_name}_fold_{fold}_best.pth"
            print('  -> save model as', file_name)
            torch.save({
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict()
            }, file_name)

In [None]:
def save_training(fold, epoch, train_loss, val_loss, train_rmse, val_rmse):
    """Appends the loss and RMSE values of one epoch as a new row of the
    global `training_evals` dataframe, and returns that dataframe."""
    record = dict(fold=fold, epoch=epoch,
                  train_loss=train_loss, val_loss=val_loss,
                  train_rmse=train_rmse, val_rmse=val_rmse)
    next_row = len(training_evals)
    training_evals.loc[next_row] = record
    return training_evals

# Main

#### CV split

In [None]:
# separate the observations in CFG.n_folds folds
# the fold number is recorded in the 'fold' column of folds df
# (work on a copy so that df_train itself is left untouched)
folds = df_train.copy()
Fold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
# each row is tagged with the number of the split
# in which it belongs to the validation part
for n, (train_idx, valid_idx) in enumerate(Fold.split(folds)):
    folds.loc[valid_idx, "fold"] = int(n)

# make sure the fold numbers are stored as integers
folds["fold"] = folds["fold"].astype(int)
print('Number of observations per fold :\n')
print(folds.groupby(["fold"]).size())

In [None]:
def main():
    """Runs the training loop once per cross-validation fold,
    cleaning memory before each of them."""
    n_folds = CFG.n_folds
    for fold_id in range(n_folds):
        # garbage collection first, then the CUDA memory cache
        gc.collect()
        torch.cuda.empty_cache()
        train_loop(folds, fold_id)

In [None]:
# inits: seed the random generators, create the logger used by train_loop,
# and the dataframe that save_training fills with one row per fold and epoch
seed_torch(seed=CFG.seed)
LOGGER = init_logger()
training_evals = pd.DataFrame(columns=['fold', 'epoch',
                                       'train_loss', 'val_loss',
                                       'train_rmse', 'val_rmse'])

# run training (one model per fold)
main()

In [None]:
# print results
epochs = range(1, CFG.n_epochs + 1)
colors = ['mediumpurple', 'deepskyblue', 'mediumseagreen', 'gold', 'tomato']

# one figure per recorded quantity:
# (suffix of the training_evals columns, name in legend/title, y label)
for suffix, name, y_label in (('loss', 'loss', 'Loss'),
                              ('rmse', 'RMSE', 'RMSE')):
    fig = plt.figure(figsize=(8, 8))
    max_y = 0
    for f in range(CFG.n_folds):
        fold_evals = training_evals.loc[training_evals.fold == f]
        train_curve = fold_evals['train_' + suffix]
        val_curve = fold_evals['val_' + suffix]
        max_y = max(max_y, max(train_curve), max(val_curve))
        # cycle through the colors so that more than 5 folds can be drawn
        color = colors[f % len(colors)]
        plt.plot(epochs, train_curve, c=color,
                 label='Training {} fold {}'.format(name, f))
        plt.plot(epochs, val_curve, c=color, linestyle='dashed',
                 label='validation {} fold {}'.format(name, f))

    plt.title('Training and Validation {}'.format(name))
    plt.xticks(range(0, CFG.n_epochs + 1))
    plt.xlim(0.5, CFG.n_epochs + 0.5)
    plt.ylim(0, max_y)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend(bbox_to_anchor=(1, 1, 0, 0))
    plt.show()

# Inference

In [None]:
class CommonLitDataset_test(Dataset):
    """Dataset yielding tokenized excerpts only (no target), for inference.

    df : dataframe with an 'excerpt' column
    tokenizer : callable tokenizing one text
    max_length : length the texts are padded / truncated to"""

    def __init__(self, df, tokenizer, max_length):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]
        encoding = self.tokenizer(sample.excerpt,
                                  return_tensors="pt",
                                  max_length=self.max_length,
                                  padding="max_length",
                                  truncation=True)

        # the tokenizer output has a leading batch axis of size 1: drop it
        ids = encoding["input_ids"][0]
        masks = encoding["attention_mask"][0]
        return {"ids": ids, "masks": masks}

In [None]:
def inference(model, states, data_loader, device=device):
    """Predicts the whole `data_loader` once per state (cf fold).

    Returns the list of predictions obtained with each state, and
    their mean over the states."""
    results = []
    with torch.no_grad():
        for n_state, state in enumerate(states, start=1):
            print('State {}/{}'.format(n_state, len(states)))

            # load the weights of the current state
            model.load_state_dict(state)
            model.to(device)
            model.eval()

            state_results = []
            for batch in data_loader:
                output = model(batch["ids"].to(device),
                               batch["masks"].to(device))
                state_results.extend(flatten(output.detach().cpu().numpy()))

            results.append(state_results)

    mean_results = np.mean(results, axis=0)
    return results, mean_results

In [None]:
# get previous states from k fold
# - the file names are the ones written by train_loop
# - map_location lets checkpoints saved on GPU be loaded on a CPU-only machine
states = [torch.load(f"{CFG.model_name}_fold_{f}_best.pth",
                     map_location=device)["model"]
          for f in range(CFG.n_folds)]

# build model (its weights are replaced by each state during inference)
model_inf = TextRegressionModel(CFG.model_name, CFG.dropout_p)

### On full dataset 
(**Warning**: the RMSE should look very good here, since this data was used for training.)

In [None]:
# build dataset with no target
tokenizer = DistilBertTokenizer.from_pretrained(CFG.model_path)

# reload the training file and wrap it in the target-less dataset
df_train = pd.read_csv(BASE_DATA_PATH / "train.csv")
test_dataset = CommonLitDataset_test(df_train, tokenizer, CFG.max_length)

# no shuffling: predictions must stay aligned with the rows of df_train
data_loader_test = DataLoader(test_dataset,
                              batch_size=CFG.batch_size, shuffle=False)

In [None]:
# predict outputs: res_states holds the predictions of each fold model,
# mean_res their mean over the fold models
res_states, mean_res = inference(model_inf, states, data_loader_test, device)

In [None]:
# compare the mean predictions with the true targets of the training set
y_pred = mean_res
y_true = df_train.target

# compute RMSE on training set
rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
print('RMSE on train dataset :', round(rmse, 4))

# plot results
fig, axes = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)
# left plot: predictions against true targets, with the y = x line
xmin = np.min(y_true) - 1
xmax = np.max(y_true) + 1
axes[0].scatter(y_true, y_pred, color='yellowgreen', s=1)
axes[0].plot([xmin, xmax], [xmin, xmax],
             color='darkolivegreen', linewidth=1)
axes[0].set_xlabel('True target')
axes[0].set_ylabel('Predicted target')
axes[0].set_title('Predicted vs true targets')

# residuals and their mean (shown in the legend of the right plot)
residuals = y_true - y_pred
moy_residuals = np.mean(residuals)
lab = 'Mean residuals({})'.format(np.round(moy_residuals, decimals=2))

# right plot: residuals against predictions, with the zero line (grey)
# and the mean residual line (brown)
xmin = np.min(y_pred) - 1
xmax = np.max(y_pred) + 1
axes[1].scatter(y_pred, residuals, color='burlywood', s=1.5)
axes[1].plot([xmin, xmax], [0, 0], color='grey', alpha=0.5, linewidth=0.5)
axes[1].plot([xmin, xmax], [moy_residuals, moy_residuals],
             color='saddlebrown', linewidth=1, label=lab)
axes[1].set_xlabel('Predicted target')
axes[1].set_ylabel('Residuals')
axes[1].legend()
axes[1].set_title('Residuals')

plt.show()

### On test set (7 entries)

In [None]:
# build test dataset
tokenizer = DistilBertTokenizer.from_pretrained(CFG.model_path)

df_test = pd.read_csv(BASE_DATA_PATH / "test.csv")
test_dataset = CommonLitDataset_test(df_test, tokenizer, CFG.max_length)

# no shuffling: predictions must stay aligned with the rows of df_test
data_loader_test = DataLoader(test_dataset,
                              batch_size=CFG.batch_size, shuffle=False)

# predict outputs: out_states holds the predictions of each fold model,
# out_mean their mean over the fold models
out_states, out_mean = inference(model_inf, states, data_loader_test, device)

In [None]:
# build submission file: start from the sample submission of the
# competition folder and fill its target column with the mean predictions
df_sub = pd.read_csv(BASE_DATA_PATH / "sample_submission.csv")
df_sub["target"] = out_mean
df_sub.to_csv("submission.csv", index=False)
df_sub