In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os 
import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch 
import random
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# try to install transformers package.
try:
  from transformers import (
      AutoModelForSequenceClassification, 
      AutoTokenizer, 
      EvalPrediction,
      Trainer, 
      TrainingArguments,
      set_seed
  )
except ImportError as e:
    !pip install transformers

    from transformers import (
      AutoModelForSequenceClassification, 
      AutoTokenizer, 
      EvalPrediction,
      Trainer, 
      TrainingArguments,
      set_seed
    )

try:
  import optuna
except ImportError as e:
    !pip install optuna
    import optuna 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Walk the competition input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input/commonlitreadabilityprize/')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Set up for reproducibility
The cell below sets the random seed of the `numpy`, `random`, `torch`, and `transformers` packages to ensure reproducibility across executions.

In [None]:
# Identifier of the pre-trained DistilBERT checkpoint that gets fine-tuned.
PRETRAINED_MODEL = 'distilbert-base-uncased'

# Directory used for temporary working files.
WORK_OUTPUT_DIR = '/kaggle/working/'

# Single seed value shared by every generator seeded below.
RANDOM_SEED = 2021

# Seed numpy, the stdlib `random` module, torch and transformers, in that order.
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed, set_seed):
    _seed_fn(RANDOM_SEED)
print('Notebook was set up.')

# Load and split train dataset
The cells below load the provided train dataset. This dataset is then split into train and valid datasets.

In [None]:
# Read the training CSV and expose the excerpt/target columns as text/label.
train_dataframe = (
    pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv', doublequote=True)
    .rename(columns={'excerpt': 'text', 'target': 'label'})
)
train_dataframe.head()

In [None]:
# Hold out 25% of the rows for validation; the split is seeded for repeatability.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_dataframe['text'].tolist(),
    train_dataframe['label'].tolist(),
    test_size=0.25,
    random_state=RANDOM_SEED,
)
print(f"Train set has {len(train_texts)} rows, and valid set has {len(valid_texts)} rows")

# Basic Exploratory Data Analysis

In [None]:
# Number of whitespace-separated words in each excerpt.
words_counter = train_dataframe['text'].map(lambda excerpt: len(excerpt.split()))

# Distribution of the word counts, with a KDE overlay.
sns.histplot(data=words_counter, kde=True)

# Summary statistics of the word counts.
print(f"Max. count words: {max(words_counter)}")
print(f"Min. count words: {min(words_counter)}")
print(f"Median count words: {np.median(words_counter)}")

# Tokenize in the BERT word embeddings
The cell below tokenizes the excerpt field of the train and valid datasets using the DistilBERT tokenizer.

In [None]:
# Load the tokenizer that matches the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# Tokenize the train texts first, then the valid texts, with identical settings.
train_encodings, valid_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, valid_texts)
)
print('Train and valid sets tokenized')

# Format train and test datasets in Pytorch format
The cell below wraps the tokenized train and valid encodings, together with their labels, in a PyTorch `Dataset` so they can be fed to the `Trainer`.

In [None]:
# Create torch dataset
class CommonLitDataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: mapping from feature name to a per-example indexable
            sequence of values; it must contain the key ``"input_ids"``,
            which determines the dataset length.
        labels: optional per-example targets (list, numpy array, ...).
            Pass ``None`` (the default) for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding key, plus a ``'labels'``
        tensor when labels were provided.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None / emptiness explicitly: truth-testing the container
        # itself raises for multi-element numpy arrays and tensors.
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        """Return the number of examples (length of the ``input_ids`` entry)."""
        return len(self.encodings["input_ids"])
               
# Wrap each split's encodings and labels in a torch dataset.
train_dataset = CommonLitDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = CommonLitDataset(encodings=valid_encodings, labels=valid_labels)
print('Torch dataset created.')

# Train the regression model
The cells below train the regression model that predicts readability.

In [None]:
# Define computing metrics Trainer.
from transformers import EarlyStoppingCallback

def model_init():
    """Create a fresh single-output model from the pre-trained checkpoint.

    Passed to `Trainer` as its `model_init` factory.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL,
        return_dict=True,
        num_labels=1,
    )
    return model

def compute_metrics(p: EvalPrediction):
    """Compute the root mean squared error of a batch of predictions.

    Args:
        p: evaluation output carrying `predictions` (an array, or a tuple
            whose first element is the array) and `label_ids`.

    Returns:
        A dict with the single key ``"rmse"`` mapped to a Python float.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Flatten instead of squeezing: squeeze would turn a one-sample batch into
    # a 0-d array, which the metric cannot consume.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(p.label_ids).reshape(-1)
    # Take the square root explicitly; the `squared=False` shortcut is
    # deprecated and no longer accepted by newer scikit-learn releases.
    mse = mean_squared_error(y_true=labels, y_pred=preds)
    return {"rmse": float(np.sqrt(mse))}

def get_objective_metric(metrics):
    """Return the evaluation RMSE stored under ``'eval_rmse'`` in ``metrics``."""
    objective_value = metrics['eval_rmse']
    return objective_value

# Define Trainer parameters
# Disable the Weights & Biases integration.
os.environ["WANDB_DISABLED"] = "true"
batch_size = 16
training_args = TrainingArguments(
    # The output directory where the model predictions
    # and checkpoints will be written.
    output_dir='/kaggle/working/output',

    # Overwrite the content of the output directory.
    overwrite_output_dir=True,

    # Whether to run training or not.
    do_train=True,

    # Whether to run evaluation on the dev set or not.
    do_eval=True,

    # Batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=batch_size,

    # Batch size per GPU/TPU core/CPU for evaluation.
    per_device_eval_batch_size=batch_size * 4,

    # Evaluation strategy to adopt during training.
    evaluation_strategy="steps",

    # How often to show logs.
    logging_steps=100,

    # Number of update steps between two evaluations
    # if evaluation_strategy="steps". Will default to
    # the same value as logging_steps if not set.
    eval_steps=100,

    # Set prediction loss to 'True' in order to
    # return loss for perplexity calculation
    # prediction_loss_only=True,

    # The initial learning rate for Adam.
    # Defaults to 5e-5.
    learning_rate=2.5e-5,

    # The weight decay to apply (if not zero).
    weight_decay=1.5,

    # Epsilon for the Adam optimizer.
    # Defaults to 1e-8.
    adam_epsilon=3.890824499297403e-10,

    # Maximum gradient norm (for gradient clipping).
    # The library default is 1.0; 0 turns clipping off.
    max_grad_norm=0,

    # Total number of training epochs to perform
    # (if not an integer, will perform the
    # decimal part percents of
    # the last epoch before stopping training).
    num_train_epochs=5,

    # Number of update steps between two checkpoint saves.
    # Kept equal to eval_steps so that every evaluated model is
    # checkpointed and can be restored by load_best_model_at_end;
    # with the default (500) only every fifth evaluation would be.
    save_steps=100,

    # Keep at most this many checkpoints on disk to bound the
    # space used by the more frequent saves.
    save_total_limit=2,

    # Number of steps used for a linear warmup from 0 to learning_rate.
    warmup_steps=500,

    # Use in conjunction with load_best_model_at_end to specify
    # the metric to use to compare two different models.
    metric_for_best_model="eval_rmse",

    # Whether or not to load the best model found during training
    # at the end of training.
    load_best_model_at_end=True,

    # Use in conjunction with load_best_model_at_end and metric_for_best_model
    # to specify if better models should have a greater metric or not.
    greater_is_better=False,

    # TensorBoard log directory.
    logging_dir='/kaggle/working/logs',

    # Random seed that will be set at the beginning of training.
    seed=18,

    # The list of integrations to report the results and logs to.
    # NOTE(review): None appears to select the library's default
    # integrations rather than disabling reporting; pass "none" to
    # disable all of them - confirm for the installed version.
    report_to=None
)

print('Train arguments set up.')

In [None]:
print("Loading `Trainer`...")
# Wire the model factory, data, metric function and early stopping
# (patience of 3) into a single Trainer instance.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
print("`Trainer` loaded.")

In [None]:
# Fine-tune the pre-trained model; skipped when `do_train` is disabled.
if training_args.do_train:
    # Keep whatever `trainer.train()` returns for later inspection.
    train_results = trainer.train()

# Evaluate the trained model.
In the cells below, the trained model is evaluated using loss, perplexity and RMSE.

In [None]:
import math

# Training and evaluation losses, in the order the trainer logged them.
loss_history = {'train_loss': [], 'eval_loss': []}

# exp(loss) for the same log entries.
perplexity_history = {'train_perplexity': [], 'eval_perplexity': []}

for entry in trainer.state.log_history:
    # An entry is a training log if it has 'loss', otherwise an evaluation
    # log if it has 'eval_loss'; anything else is ignored.
    if 'loss' in entry:
        loss_key, perplexity_key, loss_value = 'train_loss', 'train_perplexity', entry['loss']
    elif 'eval_loss' in entry:
        loss_key, perplexity_key, loss_value = 'eval_loss', 'eval_perplexity', entry['eval_loss']
    else:
        continue
    loss_history[loss_key].append(loss_value)
    perplexity_history[perplexity_key].append(math.exp(loss_value))
print('Metrics collected.')

## Plot the loss chart

In [None]:
# Training vs. evaluation loss, one point per collected log entry.
sns.set_style("whitegrid")
ax = sns.lineplot(data=loss_history)
# Label the figure so it can be read on its own; the trailing `;`
# suppresses the repr of the value returned by `set`.
ax.set(title="Loss history", xlabel="Log entry", ylabel="Loss");

## Plot the perplexity chart

In [None]:
# Training vs. evaluation perplexity, one point per collected log entry.
sns.set_style("whitegrid")
ax = sns.lineplot(data=perplexity_history)
# Label the figure so it can be read on its own; the trailing `;`
# suppresses the repr of the value returned by `set`.
ax.set(title="Perplexity history", xlabel="Log entry", ylabel="Perplexity");

## Calculate the RMSE and Perplexity

In [None]:
# Report perplexity (exp of the evaluation loss) and RMSE, unless
# evaluation has been switched off in the training arguments.
if not training_args.do_eval:
    print('No evaluation needed. No evaluation data provided, `do_eval=False`!')
else:
    eval_output = trainer.evaluate()
    perplexity = math.exp(eval_output['eval_loss'])
    rmse = eval_output['eval_rmse']
    print(f'\nEvaluate Perplexity: {perplexity:3,.3f}')
    print(f'Evaluate RMSE: {rmse:3,.3f}')

## Evaluate the trained model
Use the trained model to generate predictions for the test dataset.

In [None]:
# Read the competition test set and tokenize its excerpts.
test_dataframe = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv', doublequote=True)
test_texts = test_dataframe['excerpt'].to_list()
test_encodings = tokenizer(test_texts, padding=True, truncation=True)

# Wrap the encodings in an unlabeled torch dataset and run the model on it.
test_dataset = CommonLitDataset(test_encodings)
y_pred = trainer.predict(test_dataset)

print('Prediction done')

# Submit to competition

In [None]:
# Save test predictions to file
# Only affects how numpy arrays are printed, not the CSV written below.
np.set_printoptions(precision=1)
output = pd.DataFrame({'id': test_dataframe['id'],
                       'target': y_pred.predictions[:, 0]})
# Write the predictions at full precision: rounding them to one decimal
# place before submission would add avoidable error to the RMSE metric
# this notebook optimises.
output.to_csv('submission.csv', index=False)
print(output)
print('Test evaluation submitted.')