In [None]:
%%html

<div style="background: linear-gradient(320deg, rgb(62, 0, 42), rgb(2, 2, 106), rgb(36, 0, 181)); 
color: #fff; border-radius: 10px;">
<div style="color: #fff; padding-top: 20px; padding-left: 10px; padding-bottom: 5px; font-size: 24px; line-height: 25px">CommonLit Readability Prize (with pretraining)</div>
<div style="padding: 10px; color: #fff;">By <strong>Kauvin Lucas</strong> in Kaggle</div></div>

### Background
I used this notebook to generate a submission for the CommonLit Readability Prize competition. I joined the competition without any prior knowledge of attention-based models or Transformers. Any feedback or questions are appreciated.

### The purpose of this notebook
In this notebook I try to build a simple Transformer model for the competition. It consists of pretraining, fine-tuning and inference steps, and it takes around 40 minutes on a GPU to run everything. Unlike many other submissions, I did not attempt to build (and ensemble) more than one model (although I should have, to get a better score).

### The problem
A readability score measures the effort and speed required to read a written text, and may take into account several different metrics, each with its own advantages and drawbacks. This makes choosing between formulas difficult, as many of these readability tests may lack construct and theoretical validity. In addition, commercially available formulas may lack transparency and can be cost-prohibitive.

For that reason, the task is to build an NLP model that rates the complexity of text passages for grade 3-12 students and that is trained on pairwise comparisons made by teachers.

In [None]:
import transformers
from transformers import (AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM, 
                          Trainer, TrainingArguments, DataCollatorForLanguageModeling, 
                          RobertaForSequenceClassification, AdamW,
                          get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import (dataset, TensorDataset, DataLoader, RandomSampler, 
                                      SequentialSampler, random_split)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from typing import Dict
from transformers.tokenization_utils import PreTrainedTokenizer
from tqdm.notebook import tqdm
import os
import random
import time
import datetime

# Set verbosity to the error level
transformers.logging.set_verbosity_error()

### 1 - Load datasets and base models

In [None]:
class LineByLineTextDataset(dataset.Dataset):
    """Tokenised excerpts for masked-language-model pretraining.

    Every non-blank entry of ``data["excerpt"]`` becomes one example: a dict
    holding a 1-D ``torch.long`` tensor of token ids under ``"input_ids"``.
    Sequences are truncated to ``block_size`` tokens; no padding is applied
    here.

    Args:
        data: Object indexable by ``"excerpt"`` that yields the text strings
            (the notebook passes a pandas DataFrame).
        tokenizer: Tokenizer called once on the full list of excerpts.
        block_size: Maximum number of tokens kept per excerpt.
    """

    def __init__(self, data, tokenizer:PreTrainedTokenizer, block_size: int):
        # Skip empty / whitespace-only excerpts.
        lines = [line for line in data["excerpt"] if (len(line) > 0 and not line.isspace())]
        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = [
            {"input_ids": torch.tensor(ids, dtype=torch.long)}
            for ids in batch_encoding["input_ids"]
        ]

    def __len__(self):
        """Return the number of tokenised excerpts."""
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``{"input_ids": tensor}`` example stored at position ``i``."""
        return self.examples[i]

In [None]:
# --- Input locations (relative to the notebook's working directory) ---
ROBERTA_MODEL = "../input/roberta-base"
TRAINING_FILE = "../input/commonlitreadabilityprize/train.csv"
TEST_FILE = "../input/commonlitreadabilityprize/test.csv"
SAMPLE_FILE = "../input/commonlitreadabilityprize/sample_submission.csv"

# --- Where the MLM-pretrained checkpoint is written and later re-loaded from ---
MODEL_PATH = "Models/clrp_roberta-pretrained"

# --- Base tokenizer and masked-LM model, both loaded from ROBERTA_MODEL ---
TOKENIZER = AutoTokenizer.from_pretrained(ROBERTA_MODEL, do_lower_case=True)
MODEL = AutoModelForMaskedLM.from_pretrained(ROBERTA_MODEL)

In [None]:
# Raw competition frames.
train_data = pd.read_csv(TRAINING_FILE)
test_data = pd.read_csv(TEST_FILE)

# Tokenise each frame for MLM pretraining (at most 256 tokens per excerpt).
# NOTE: valid_dataset is built from the same frame as train_dataset, so the
# evaluation loss reported during pretraining is measured on training text.
train_dataset, valid_dataset, test_dataset = (
    LineByLineTextDataset(frame, tokenizer=TOKENIZER, block_size=256)
    for frame in (train_data, train_data, test_data)
)

# Collator configured for masked-LM batches with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=TOKENIZER,
    mlm=True,
    mlm_probability=0.15,
)

## 1 - Pretraining

#### 1.1 - Pretraining arguments

In [None]:
# Schedule length and evaluation cadence.
EPOCHS = 5
EVAL_STEPS = 200

# Per-device batch sizes.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16

# Optimisation settings.
LEARNING_RATE = 5e-5
WARMUP_STEPS = 0
WEIGHT_DECAY = 0

In [None]:
# Arguments for the masked-LM pretraining run.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    # LEARNING_RATE was previously defined but never forwarded, so editing it
    # had no effect on the run.
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy='steps',
    eval_steps=EVAL_STEPS,
    # load_best_model_at_end needs a checkpoint at every evaluation point:
    # keep the save interval aligned with the evaluation interval.
    save_steps=EVAL_STEPS,
    save_total_limit=2,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to="none")

#### 1.2 - Trainer configuration

In [None]:
# Wire the masked-LM model, the MLM collator and both datasets into a Trainer.
trainer = Trainer(
    model=MODEL,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

#### 1.3 - Train and save pretrained model

In [None]:
# Run the masked-LM pretraining loop with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Persist the trainer's model to MODEL_PATH; the fine-tuning stage loads it from there.
trainer.save_model(MODEL_PATH)
# Drop the trainer and ask PyTorch to release its cached GPU memory.
# NOTE(review): MODEL is still bound at module level, so its weights presumably
# stay allocated until that name is released as well — confirm.
del trainer
torch.cuda.empty_cache()

## 2 - Fine-tuning

#### 2.1 - Define fine-tuning parameters

In [None]:
# Training schedule.
folds = 5
epochs = 6
lr = 5e-5

# Data handling: tokenised sequence length, mini-batch size and the share of
# samples kept for training (the remainder is used for validation).
max_length = 160
batch_size = 32
train_size = 0.9

#### 2.2 - Define CLRP model

In [None]:
class CLRP_Model(nn.Module):
    """RoBERTa encoder followed by an attention-pooling regression head.

    The last hidden layer of the encoder is reduced to a single vector with a
    learned attention over the sequence dimension, passed through dropout and
    projected to one scalar per sample.

    Args:
        path: Directory or model id handed to ``from_pretrained`` for both the
            config and the encoder weights.
    """

    def __init__(self,path):
        super().__init__()
        self.config = AutoConfig.from_pretrained(path)
        self.config.update({"output_hidden_states":True,
                            "hidden_dropout_prob": 0.0,
                            "layer_norm_eps": 1e-7})
        self.roberta = RobertaForSequenceClassification.from_pretrained(path, config = self.config)
        # Size the head from the config instead of hard-coding 768 so the class
        # also works with encoders of a different width.
        hidden_size = self.config.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)  # normalise the scores over the sequence dimension
        )
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return one regression value per sample, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: 1 for real tokens, 0 for padding. Besides being
                forwarded to the encoder it is used to exclude padded
                positions from the attention pooling. ``None`` disables the
                exclusion.
            token_type_ids: Optional segment ids forwarded to the encoder.
        """
        outputs = self.roberta(input_ids=input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids)
        last_hidden = outputs.hidden_states[-1]
        weights = self.attention(last_hidden)
        if attention_mask is not None:
            # The softmax above also hands probability mass to padded positions.
            # Zero those weights and renormalise, which is equivalent to a
            # softmax restricted to the real tokens.
            mask = attention_mask.unsqueeze(-1).to(weights.dtype)
            weights = weights * mask
            normaliser = weights.sum(dim=1, keepdim=True)
            weights = weights / normaliser.clamp_min(torch.finfo(weights.dtype).tiny)
        pooled = torch.sum(weights * last_hidden, dim=1)
        pooled = self.dropout(pooled)
        return self.linear(pooled)

#### 2.2 - Define Finetune class

In [None]:
class Finetune():
    """Fine-tune a regression model on (sentence, label) pairs.

    Typical use: ``load_data`` -> ``optimizer`` -> ``train_and_evaluate``.

    Args:
        sentences: Iterable of raw text strings.
        labels: Target values aligned with ``sentences``.
        model: ``nn.Module`` called as
            ``model(input_ids, attention_mask=..., token_type_ids=...)``.
        base_path: Location the tokenizer is loaded from.
        seed: Seed applied to ``random``, NumPy and torch when training starts.
        batch_size: Mini-batch size of both dataloaders.
        epochs: Number of passes over the training split.
        train_size: Fraction of the samples used for training; the remainder
            forms the validation split.
    """

    def __init__(self, sentences, labels, model, base_path, seed = 42, batch_size = 32, epochs = 4, train_size = 0.9):
        self.model = model
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(base_path, do_lower_case=True)
        # Fall back to the CPU when no GPU is present instead of failing later.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.sentences = sentences
        self.labels = labels
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_size = train_size
        self.seed = seed

    def load_data(self, max_len):
        """Tokenise all sentences and build the train/validation dataloaders.

        Every sentence is padded or truncated to ``max_len`` tokens. The split
        is drawn with ``random_split`` from torch's current global RNG state;
        ``self.seed`` is only applied later, in ``train_and_evaluate``.

        Returns:
            ``(train_dataloader, validation_dataloader)``; the first shuffles,
            the second iterates in order.
        """
        input_ids = []
        attention_masks = []
        token_type_ids = []
        for sent in self.sentences:
            encoded_dict = self.tokenizer.encode_plus(
                sent,
                add_special_tokens = True,
                max_length = max_len,
                padding = 'max_length',
                return_attention_mask = True,
                return_token_type_ids = True,
                return_tensors = 'pt',
                truncation = True
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
            token_type_ids.append(encoded_dict['token_type_ids'])
        # Each encoding has a leading batch dimension of 1; stack them along it.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        token_type_ids = torch.cat(token_type_ids, dim=0)
        labels = torch.tensor(self.labels, dtype=torch.float)
        dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
        train_size = int(self.train_size * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = self.batch_size
        )
        validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = self.batch_size
        )
        return train_dataloader, validation_dataloader

    def optimizer(self, train_dataloader, lr = 2e-5, eps = 1e-8, wd = 1e-2):
        """Create AdamW plus a linear-decay schedule without warmup.

        Weight decay ``wd`` is applied to every parameter except biases and
        LayerNorm weights. The groups previously used the key
        ``'weight_decay_rate'``, which the optimizer does not read, so the
        exclusion never took effect and every parameter was decayed with ``wd``.

        Returns:
            ``(optimizer, scheduler)``; the schedule spans
            ``len(train_dataloader) * self.epochs`` steps.
        """
        total_steps = len(train_dataloader) * self.epochs
        named_params = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
             'weight_decay': wd},
            {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr = lr,
                          eps = eps,
                          weight_decay = wd
                        )
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = 0,
                                                    num_training_steps = total_steps)
        return optimizer, scheduler

    def train_and_evaluate(self, train_dataloader, validation_dataloader, optimizer, scheduler, tqdm):
        """Train for ``self.epochs`` epochs, validating after each one.

        The loss is the RMSE of each mini-batch; the reported epoch losses are
        the mean of those per-batch values.

        Args:
            train_dataloader: Yields ``(input_ids, mask, token_type_ids, labels)``.
            validation_dataloader: Same layout, used without gradient tracking.
            optimizer: Stepped once per training batch.
            scheduler: Stepped once per training batch, after the optimizer.
            tqdm: Progress object; ``update(1)`` is called per training batch.

        Returns:
            ``(model, training_stats)`` where ``training_stats`` holds one dict
            per epoch with the keys ``'epoch'``, ``'Training Loss'``,
            ``'Valid. Loss'``, ``'Training Time'`` and ``'Validation Time'``.
        """
        self.model.to(self.device)
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
        mse = nn.MSELoss()  # stateless, so one instance serves every batch
        training_stats = []
        for epoch_i in range(0, self.epochs):
            # ---- training pass ----
            t0 = time.time()
            total_train_loss = 0
            self.model.train()
            for batch in train_dataloader:
                b_input_ids, b_input_mask, b_input_token_type_ids, b_labels = (
                    tensor.to(self.device) for tensor in batch
                )
                self.model.zero_grad()
                optimizer.zero_grad()
                result = self.model(b_input_ids,
                                    attention_mask=b_input_mask,
                                    token_type_ids = b_input_token_type_ids)
                loss = torch.sqrt(mse(result.flatten(), b_labels.view(-1)))
                total_train_loss += loss.item()
                loss.backward()
                # Cap the global gradient norm at 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                tqdm.update(1)
            avg_train_loss = total_train_loss / len(train_dataloader)
            training_time = str(datetime.timedelta(seconds=int(round((time.time() - t0)))))

            # ---- validation pass ----
            t0 = time.time()
            self.model.eval()
            total_eval_loss = 0
            for batch in validation_dataloader:
                b_input_ids, b_input_mask, b_input_token_type_ids, b_labels = (
                    tensor.to(self.device) for tensor in batch
                )
                with torch.no_grad():
                    result = self.model(b_input_ids,
                                        attention_mask=b_input_mask,
                                        token_type_ids = b_input_token_type_ids)
                    loss = torch.sqrt(mse(result.flatten(), b_labels.view(-1)))
                total_eval_loss += loss.item()
            avg_val_loss = total_eval_loss / len(validation_dataloader)
            validation_time = str(datetime.timedelta(seconds=int(round((time.time() - t0)))))

            training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                    'Training Time': training_time,
                    'Validation Time': validation_time
                }
            )
        return self.model, training_stats

#### 2.3 - Train and save folds

In [None]:
sentences = train_data.excerpt.values
labels = train_data.target.values

# Optimiser steps per epoch: load_data() keeps int(train_size * N) samples for
# training and batches them in groups of batch_size.
steps_per_epoch = int(np.ceil(int(train_size * len(sentences)) / batch_size))
progress_bar = tqdm(total = epochs * folds * steps_per_epoch)
progress_bar.set_description("Finetuning progress over {} folds".format(folds))

# Create the checkpoint directory once, up front.
output_dir = './Models/'
os.makedirs(output_dir, exist_ok=True)

fold_stats = []
for fold in range(1, folds + 1):
    # Every fold starts again from the MLM-pretrained checkpoint.
    model = CLRP_Model(MODEL_PATH)
    # Forward the configured batch_size / train_size explicitly so the values
    # used for the progress-bar total are the ones actually used for training.
    finetune = Finetune(sentences,
                        labels,
                        model,
                        ROBERTA_MODEL,
                        batch_size = batch_size,
                        epochs = epochs,
                        train_size = train_size)
    train_dataloader, validation_dataloader = finetune.load_data(max_length)
    optimizer, scheduler = finetune.optimizer(train_dataloader, lr = lr)
    model, training_stats = finetune.train_and_evaluate(train_dataloader,
                                                        validation_dataloader,
                                                        optimizer,
                                                        scheduler,
                                                        progress_bar)
    fold_df = pd.DataFrame(data=training_stats)
    fold_df["Fold"] = fold
    fold_stats.append(fold_df)
    torch.save(model.state_dict(), f"./Models/model{fold}.bin")
    del model

# Assemble the per-epoch statistics once, in fold order. DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
df = pd.concat(fold_stats)

# Delete finetune class
del finetune
torch.cuda.empty_cache()

In [None]:
# Display loss history.
# Use the fully-qualified option name: the bare 'precision' pattern is not
# accepted by current pandas releases.
pd.set_option('display.precision', 2)
df_stats = df.set_index('Fold')
df_stats

## 3 - Inference

In [None]:
def base_model(base_path, num_labels=1):
    """Build a ``CLRP_Model`` whose config and encoder are loaded from ``base_path``.

    Args:
        base_path: Location handed to ``CLRP_Model``.
        num_labels: Kept for backward compatibility. ``CLRP_Model`` builds its
            own config and has a fixed single-output head, so this value has
            never influenced the returned model.

    Returns:
        A freshly constructed ``CLRP_Model``.
    """
    # The previous version also loaded and updated a separate AutoConfig that
    # was then discarded; that dead work is dropped.
    return CLRP_Model(base_path)
class Inference():
    """Run a fine-tuned model over a list of sentences.

    Args:
        sentences: Iterable of raw text strings to score.
        model_path: The model object to run. Despite the name, ``predict``
            moves it to the device and calls it, so an ``nn.Module`` is
            expected; the parameter name is kept for backward compatibility.
        base_path: Location the tokenizer is loaded from.
        seed: Accepted for interface compatibility; not used.
        batch_size: Mini-batch size of the prediction dataloader.
    """

    def __init__(self, sentences, model_path, base_path, seed = 42, batch_size = 16):
        # Use the argument that was passed in. The previous code read a global
        # named ``model`` and only worked when the caller happened to have one.
        self.model = model_path
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(base_path, do_lower_case=True)
        # Fall back to the CPU when no GPU is present instead of failing later.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.sentences = sentences
        self.batch_size = batch_size

    def load_data(self, max_len):
        """Tokenise all sentences to ``max_len`` tokens and wrap them in a dataloader.

        Returns:
            A sequential ``DataLoader`` yielding
            ``(input_ids, attention_mask, token_type_ids)`` batches.
        """
        input_ids = []
        attention_masks = []
        token_type_ids = []
        for sent in self.sentences:
            encoded_dict = self.tokenizer.encode_plus(
                sent,
                add_special_tokens = True,
                max_length = max_len,
                padding = 'max_length',
                return_attention_mask = True,
                return_token_type_ids = True,
                return_tensors = 'pt',
                truncation = True
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
            token_type_ids.append(encoded_dict['token_type_ids'])
        # Each encoding has a leading batch dimension of 1; stack them along it.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        token_type_ids = torch.cat(token_type_ids, dim=0)
        test_dataset = TensorDataset(input_ids, attention_masks, token_type_ids)
        test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = self.batch_size,
            pin_memory = False,
            drop_last = False,
            num_workers = 0
        )
        return test_dataloader

    def predict(self, test_dataloader):
        """Return one prediction per dataset row as a 1-D NumPy array.

        The model is moved to ``self.device``, switched to eval mode and run
        without gradient tracking; outputs are written in dataloader order.
        """
        self.model.to(self.device)
        self.model.eval()
        result = np.zeros(len(test_dataloader.dataset))
        index = 0
        with torch.no_grad():
            for batch_data in test_dataloader:
                input_ids = batch_data[0].to(self.device)
                attention_mask = batch_data[1].to(self.device)
                token_type_ids = batch_data[2].to(self.device)
                pred = self.model(input_ids, attention_mask, token_type_ids)
                count = pred.shape[0]
                result[index : index + count] = pred.flatten().cpu().numpy()
                index += count
        return result

In [None]:
# One row of predictions per fold model; the rows are averaged for the submission.
all_predictions = np.zeros((folds, len(test_data)))
sentences = test_data.excerpt.values
submission_df = pd.read_csv(SAMPLE_FILE)
for fold in range(folds):
    model_path = f"./Models/model{fold+1}.bin"
    model = base_model(ROBERTA_MODEL)
    # Load onto the CPU first; Inference.predict moves the model to its device.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    inference = Inference(sentences, model, ROBERTA_MODEL)
    # Tokenise with the same sequence length used for fine-tuning (this was a
    # hard-coded 144, which did not match max_length).
    test_dataloader = inference.load_data(max_length)
    result = inference.predict(test_dataloader)
    all_predictions[fold] = result
predictions = all_predictions.mean(axis=0)
submission_df["target"] = predictions
submission_df

In [None]:
# Write the submission frame to submission.csv without the index column.
submission_df.to_csv("submission.csv", index=False)