### Install Huggingface datasets library

* The Datasets library is required to package the training data (both inputs and targets) into a dataset compatible with the Hugging Face Trainer API

In [None]:
!pip install datasets

### Import libraries

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import torch

from datasets.arrow_dataset import Dataset

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2021-09-17 03:34:47.796777: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


### Seed everything for reproducibility

* Considerable amount of randomness during model training would lead to different results each time the notebook is executed

* Sources of randomness include dropout and weight initialisation

In [3]:
seed = 3

# Seed every random number generator the notebook relies on
random.seed(seed)        # Python standard library
np.random.seed(seed)     # NumPy
torch.manual_seed(seed)  # PyTorch

# Seed all CUDA devices as well when a GPU is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

## Data preparation

### Load training and prediction data

* Utilise pandas library to read in csv files for the training and test sets
* Columns are dropped in the train DataFrame to contain only useful data 

In [4]:
source_dir = '/kaggle/input/commonlitreadabilityprize/'

# Paths of the competition files: labelled excerpts and the excerpts to predict on
train_csv = source_dir + 'train.csv'
test_csv = source_dir + 'test.csv'

# Keep only the text and its score; the target column is renamed to "labels" (required for Trainer)
dropped_columns = ["id", "url_legal", "license", "standard_error"]
train_data = (
    pd.read_csv(train_csv)
    .drop(columns = dropped_columns)
    .rename(columns = {"target": "labels"})
)

preds_data = pd.read_csv(test_csv)

train_data.head()

Unnamed: 0,excerpt,labels
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197


### Replace new line indicators with spaces to remove unnecessary tokens

In [5]:
# Replace literal newline characters with spaces using the vectorised string accessor.
# regex=False treats '\n' as a plain substring; missing excerpts are passed through
# as missing values instead of raising inside a lambda.
train_data['excerpt'] = train_data['excerpt'].str.replace('\n', ' ', regex = False)
preds_data['excerpt'] = preds_data['excerpt'].str.replace('\n', ' ', regex = False)

# Show the first cleaned excerpt (positional lookup, independent of the index labels)
train_data['excerpt'].iloc[0]

'When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape. The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches. At each end of the room, on the wall, hung a beautiful bear-skin rug. These rugs were for prizes, one for the girls and one for the boys. And this was the game. The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole. This would have been an easy matter, but each traveller was obliged to wear snowshoes.'

### Split train.csv into training and validation sets to evaluate model training

**Using scikit-learn's train_test_split:**
<br>
* Specify evaluation size to be 20% of the entire train set
* The data is shuffled before splitting (the default behaviour); the random_state argument makes that shuffling reproducible

In [6]:
# Hold out 20% of the labelled rows for validation; the fixed random_state keeps the split repeatable
train_set, eval_set = train_test_split(
    train_data,
    test_size = 0.2,
    random_state = 42,
)

train_set.head()

Unnamed: 0,excerpt,labels
2743,The building of rotary presses for printing il...,-1.51835
2347,The idea of a trip on Bob's yacht suited every...,-0.548807
2387,"Seeing the front door wide open, the enchanter...",-0.193262
2202,"The widow she cried over me, and called me a p...",-1.033799
786,"Jacobitism was (and, to a much smaller extent,...",-1.725606


### Convert pandas DataFrame to Dataset object

* This is done in order to convert DataFrame into a type which is compatible with the in-built Huggingface Trainer

In [7]:
# Wrap each pandas split in a Dataset object, training split first
train_dataset, eval_dataset = (
    Dataset.from_pandas(df = frame) for frame in (train_set, eval_set)
)

train_dataset

Dataset({
    features: ['excerpt', 'labels', '__index_level_0__'],
    num_rows: 2267
})

### Load pretrained RoBERTa tokenizer

* The tokenizer has a pretrained vocabulary and assigns an integer ID to each token (a word or sub-word piece) in a passage
* Convert text into numerical form to pass into the transformer

In [8]:
# Fetch the pretrained tokenizer and keep a copy next to the model output
pretrained_name = "roberta-base"
tokenizer_dir = './Commonlit-RoBERTa-Base/tokenizer'

tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
tokenizer.save_pretrained(tokenizer_dir)

tokenizer

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

### Map tokenized text excerpts to Dataset

**1.** Define a function which returns sequences of tokens, taking the texts as input in order to prepare the data for the model
    * The sequences are padded to the maximum length defined by the max_length argument
    * Truncation of the sequence occurs when the excerpt is too long
    * Sequences will have an attention mask with them, which 'masks' padded tokens and tells the model to disregard them
    * Token ID 1 corresponds to the padding token (`<pad>`)<br><br>
**2.** Utilise map method on the Datasets to execute the 'tokenize_data' function on each excerpt stored in the Dataset<br><br>
**3.** Convert data to PyTorch tensors

Reference: https://huggingface.co/transformers/training.html

In [14]:
MAX_LENGTH = 256

def tokenize_data(dataset):
    """Tokenize the "excerpt" entry of `dataset` with the notebook's tokenizer.

    Each sequence is padded to MAX_LENGTH tokens and truncated when it is
    longer than that.

    Args:
        dataset: mapping that provides the text(s) under the "excerpt" key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    return tokenizer(
        dataset["excerpt"],
        truncation = True,
        padding = "max_length",
        max_length = MAX_LENGTH,
    )

# Columns that are no longer needed once the text has been tokenized
non_model_columns = ["__index_level_0__", "excerpt"]

# Tokenize each split in batches, then drop the columns listed above
tokenized_train_dataset = train_dataset.map(tokenize_data, batched = True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(non_model_columns)

tokenized_eval_dataset = eval_dataset.map(tokenize_data, batched = True)
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(non_model_columns)

# Have both datasets return PyTorch tensors
for tokenized_split in (tokenized_train_dataset, tokenized_eval_dataset):
    tokenized_split.set_format("torch")

tokenized_train_dataset['input_ids'][0]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

tensor([    0,   133,   745,     9, 13351,  1766, 34578,    13, 12168, 22827,
         6665,    21,  3751,    25,   419,    25,   504,  5243,    50,   504,
         2545,    11,   928,     6,    30,     5,  1513,     6,    53,  4100,
          396,  1282,     6,    25,   117,   285,  4521,    34,   655,    57,
          156,     9,   143,  9879,   898,     4,    20, 32130,   368,     9,
            5,   928, 25596,   491,  4756,   357,   775,     4,    96,   504,
         4718,    41, 22827, 20879,  2225,     6,    41,    66, 14596,     9,
           39,   372,  8812,     6,    21, 11118,  2115,    10, 13351,  1766,
         1228,    61,    21,     6,   309,     7,    39,   445,     6, 11236,
           30,    10, 20399,   179,   661,  1440,  2367,  1054,     4,    20,
           78,    65,     6,   959,     6,   222,    45,    23,    70,   972,
            5,   723,  4501,     9, 22827,   675,  3569, 12168,     6,     8,
            6,   150,   277,  3563, 11236,    15,     5,   276, 

In [15]:
tokenized_train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 2267
})

## Model selection

### Load RoBERTa-Base model

* The RoBERTa transformer was pretrained on a larger dataset than BERT and generally performs better
* Attention mechanism is one factor which makes transformers more effective compared to RNN or LSTM, as it enables the model to model long-term dependencies more effectively and weight each word in terms of its significance
* Set **num_labels** to 1 to indicate a regression problem is involved

In [16]:
# Load the pretrained checkpoint from Huggingface with a single-output head
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 1,
)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

### Send the model to GPU memory

* Running the model on the GPU drastically speeds up training

In [17]:
%%capture

# Use the GPU when one is available, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

### Set up Trainer instance to train model

* Train for 5 epochs, saving a checkpoint of the model at the end of each epoch
* Best model is loaded at the end and determined by the evaluation loss

In [18]:
batch_size = 16

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the lowest evaluation loss when training finishes.
training_args = TrainingArguments(
    output_dir="./Commonlit-RoBERTa-Base-CP", # Select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    evaluation_strategy = 'epoch',
    save_strategy = "epoch", # Save checkpoint at end of each epoch
    metric_for_best_model = 'eval_loss',
    greater_is_better = False, # Lower loss is better
    load_best_model_at_end = True,
    report_to = "none",
    seed = seed) # Reuse the notebook-level seed instead of repeating the literal

# Create Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_eval_dataset)

In [19]:
# Train the model with the arguments configured on the Trainer;
# the returned TrainOutput is displayed as the cell's result
trainer.train()

***** Running training *****
  Num examples = 2267
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,0.408047
2,No log,0.544274
3,No log,0.444234
4,0.296400,0.277034
5,0.296400,0.333592


***** Running Evaluation *****
  Num examples = 567
  Batch size = 16
Saving model checkpoint to ./Commonlit-RoBERTa-Base-CP/checkpoint-142
Configuration saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-142/config.json
Model weights saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-142/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 567
  Batch size = 16
Saving model checkpoint to ./Commonlit-RoBERTa-Base-CP/checkpoint-284
Configuration saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-284/config.json
Model weights saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-284/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 567
  Batch size = 16
Saving model checkpoint to ./Commonlit-RoBERTa-Base-CP/checkpoint-426
Configuration saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-426/config.json
Model weights saved in ./Commonlit-RoBERTa-Base-CP/checkpoint-426/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 567
  Batch size = 16
Saving model checkpoint 

TrainOutput(global_step=710, training_loss=0.22970777431004483, metrics={'train_runtime': 354.8342, 'train_samples_per_second': 31.944, 'train_steps_per_second': 2.001, 'total_flos': 1491168517532160.0, 'train_loss': 0.22970777431004483, 'epoch': 5.0})

### Convert model to evaluation mode

* Required for making predictions; otherwise the model would still behave as if it were being trained and apply dropout, leading to inconsistent predictions

In [20]:
%%capture
# Switch the model to evaluation mode before predicting
# (turns off training-only behaviour such as dropout)
model.eval()

### Save fine-tuned model

In [21]:
# Write the Trainer's model (configuration and weights) to ./Commonlit-RoBERTa-Base
trainer.save_model('./Commonlit-RoBERTa-Base')

Saving model checkpoint to ./Commonlit-RoBERTa-Base
Configuration saved in ./Commonlit-RoBERTa-Base/config.json
Model weights saved in ./Commonlit-RoBERTa-Base/pytorch_model.bin


## Evaluation

### Make predictions

* Tokenize data in the test dataset and pass it through model to obtain the scores
* Add scores to list and export the scores as csv file

In [22]:
preds_targets = []

# Make sure dropout is disabled even when this cell is run on its own
model.eval()

# Inference only: disable gradient tracking so no autograd graph is built
# (and held in memory) for every excerpt
with torch.no_grad():
    for excerpt in preds_data['excerpt']:
        # Tokenize with the same padding/truncation settings used for training
        token_seq = tokenizer(excerpt, padding = "max_length", max_length = MAX_LENGTH, truncation = True, return_tensors = "pt")
        token_seq = token_seq.to(device)
        pred = model(**token_seq) # Unpack token sequences tensor
        # One excerpt per forward pass, so the score is the first (only) row of the logits
        preds_targets.append(pred.logits[0].item())

preds_targets

[-0.007161587942391634,
 -0.3461619019508362,
 -0.18076074123382568,
 -2.6938998699188232,
 -1.7281757593154907,
 -0.8599075675010681,
 0.5470359325408936]

In [24]:
# Pair every test id with its predicted score
submission_columns = {
    'id': preds_data['id'],
    'target': preds_targets,
}
submission_df = pd.DataFrame(submission_columns)

submission_df

Unnamed: 0,id,target
0,c0f722661,-0.007162
1,f0953f0a5,-0.346162
2,0df072751,-0.180761
3,04caf4e0c,-2.6939
4,0e63f8bea,-1.728176
5,12537fe78,-0.859908
6,965e592c0,0.547036


In [25]:
# Export the predictions to submission.csv without writing the DataFrame index
submission_df.to_csv("submission.csv", index = False)