# **CommonLit Readability Assessment**
## Determining Performance with RoBERTa Base

**_Sections:_**
- _Required Packages & Helpers_
- _Configuration_
- _Data Preparation_
- _Modeling_
- _Evaluation_
- _Submission_

**_References (My Earlier Related Work):_**
1. [*Exploratory Data Analysis (EDA)*](https://www.kaggle.com/pradipkumardas/1-commonlit-readability-eda)
2. [*Baselining Model Performance with 1D ConvNet*](https://www.kaggle.com/pradipkumardas/2-commonlit-readability-baseline-perf-1dconvnet)
3. [*Simple Model with BERT*](https://www.kaggle.com/pradipkumardas/3-commonlit-readability-simple-model-with-bert)

_**Note:** This notebook simply fine-tunes the pretrained RoBERTa (base) model with cross validation, and tries to find out whether RoBERTa (base) performs better than BERT (base, uncased), which was experimented with in the previous notebook (though without cross validation). Other advanced options and techniques based on these findings will be explored and shared soon._

## Required Packages & Helpers

In [None]:
# Imports required packages

import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

import matplotlib.pyplot as plt
import seaborn as sns

import gc

## Configurations

In [None]:
# Data-side settings shared by the binning and cross-validation cells

data_config = {
    # Number of equal-width target intervals used to bin the labels
    "n_bins": 20,
    # Number of cross-validation folds
    "n_splits": 5,
}

In [None]:
# Model-side settings: where the pretrained weights live and how training is run

model_config = {
    # Local directory both the tokenizer and the encoder are loaded from
    "model_name": "../input/huggingface-roberta-variants/roberta-base/roberta-base",
    # File the best weights of the fold being trained are checkpointed to
    "model_path": "model.h5",
    # Size of the model's output head (a single value for this regression task)
    "num_labels": 1,
    # Step size handed to the Adam optimizer
    "learning_rate": 5e-5,
    # Number of samples per batch in every tf.data pipeline
    "batch_size": 32,
    # Token length every excerpt is truncated / padded to
    "max_length": 256,
    # Upper bound on training epochs per fold (early stopping may end sooner)
    "epochs": 30,
}

In [None]:
# Seeds every random number generator in use so experiment results can be reproduced
random.seed(42)
np.random.seed(42)
# TensorFlow keeps its own global seed, which the Python and NumPy seeds above do
# not touch; without it, TensorFlow-side randomness (e.g. dataset shuffling)
# would differ from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
tf.random.set_seed(42)

# Setting initialization for the theme of the plots
sns.set_theme(style="whitegrid")

## Data Preparation

In [None]:
# Reads the competition's training, test and sample-submission tables (in that order)

train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
)

**Segmenting Labels (Distributing Labels into Discrete Intervals)**: As the target is an interval variable, the labels should be segmented so that a nearly equal number of samples from each segment can be selected while training the model.

In [None]:
# Tags every training sample with the index of the equal-width target interval it
# falls into; the resulting "bin" column is what the folds are stratified on

train["bin"] = pd.cut(
    train["target"],
    bins=data_config["n_bins"],
    labels=list(range(data_config["n_bins"])),
)

## Modeling

In [None]:
# One Keras history dict per fold: training and validation metrics for every epoch
training_history = []

# One array of test-set predictions per fold, averaged later for the submission
prediction_history = []

In [None]:
# Creates tokenizer to prepare data for model training.
# It is loaded from the same location (model_config["model_name"]) that the
# encoder is loaded from, so the token ids it produces match that checkpoint.

tokenizer = AutoTokenizer.from_pretrained(model_config["model_name"])

In [None]:
# Tokenizes the test excerpts once and wraps them in a batched tf.data pipeline.
# The pipeline carries inputs only (no targets), since it is used for prediction.

test_encodings = tokenizer(
    test["excerpt"].tolist(),
    max_length=model_config["max_length"],
    truncation=True,
    padding="max_length",
    return_tensors="tf",
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        {name: test_encodings[name] for name in ("input_ids", "attention_mask")}
    )
    .batch(model_config["batch_size"])
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Performs cross validation: for every stratified fold a fresh RoBERTa regressor
# is fine-tuned, its per-epoch metrics are recorded in `training_history`, and
# its predictions on the test set are recorded in `prediction_history`.


def _encode_excerpts(excerpts):
    """Tokenizes a list of excerpt strings into fixed-length TensorFlow tensors."""
    return tokenizer(
        excerpts,
        max_length=model_config["max_length"],
        truncation=True,
        padding="max_length",
        return_tensors="tf")


def _make_dataset(encodings, targets, shuffle=False):
    """Builds a batched, prefetched dataset of (model inputs, targets) pairs.

    Args:
        encodings: Tokenizer output providing "input_ids" and "attention_mask".
        targets: Regression targets aligned with the encoded excerpts.
        shuffle: Whether to shuffle samples (with a 1024-element buffer)
            before batching; meant for the training split only.
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"]},
        targets))
    if shuffle:
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(model_config["batch_size"])
    return dataset.prefetch(tf.data.AUTOTUNE)


def _build_model():
    """Creates and compiles a new model so that no weights carry over between folds."""
    # Creates encoder from Transformer
    encoder = TFAutoModelForSequenceClassification.from_pretrained(
        model_config["model_name"], num_labels=model_config["num_labels"])

    # Creates multi inputs for model; both shapes are one-element tuples
    input_ids = layers.Input(shape=(model_config["max_length"],), dtype=tf.int32, name="input_ids")
    attention_mask = layers.Input(shape=(model_config["max_length"],), dtype=tf.int32, name="attention_mask")

    # Sets model output
    outputs = encoder({"input_ids": input_ids, "attention_mask": attention_mask})

    # Wraps all layers within a model object
    model = Model(inputs=[input_ids, attention_mask], outputs=outputs)

    # Trains on MSE and reports RMSE, the metric monitored by the callbacks
    model.compile(
        optimizer=keras.optimizers.Adam(model_config["learning_rate"]),
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.RootMeanSquaredError()])
    return model


# Applies stratified cross validation to have equal distribution for different target bins.
# The explicit random_state keeps the fold assignment identical across runs,
# independent of how much of NumPy's global random state was consumed earlier.

cv = StratifiedKFold(data_config["n_splits"], shuffle=True, random_state=42)

cv_generator = cv.split(train, y=train.bin)

for fold, (idx_train, idx_val) in enumerate(cv_generator):

    print(f"FOLD {fold+1}...")

    fold_train = train.iloc[idx_train]
    fold_val = train.iloc[idx_val]

    # Encodes both splits and wraps them in tf.data pipelines (only training is shuffled)
    train_dataset = _make_dataset(
        _encode_excerpts(fold_train.excerpt.tolist()), fold_train.target, shuffle=True)
    val_dataset = _make_dataset(
        _encode_excerpts(fold_val.excerpt.tolist()), fold_val.target)

    model = _build_model()

    # Configures monitor with rules for model training to stop if criterion match
    early_stopping_monitor = EarlyStopping(
        monitor="val_root_mean_squared_error", mode="min", patience=5, restore_best_weights=True, verbose=1)

    # Configures rules to store model parameters (only weights) at its best during training
    checkpoint = ModelCheckpoint(
        model_config["model_path"], monitor="val_root_mean_squared_error", mode="min", save_best_only=True, save_weights_only=True)

    # Fits the model
    history = model.fit(
        x=train_dataset,
        validation_data=val_dataset,
        callbacks=[early_stopping_monitor, checkpoint],
        epochs=model_config["epochs"],
        verbose=2).history

    # Adds the model training history into list for later analysis
    training_history.append(history)
    best_epoch = np.argmin(history["val_root_mean_squared_error"])
    print(f"\nBest Validation Performance: {history['val_root_mean_squared_error'][best_epoch]} (RMSE) at epoch {best_epoch + 1}")

    # Reloads the best weights written by the checkpoint callback. Depending on
    # the Keras version, restore_best_weights may only take effect when early
    # stopping actually fires, so a fold that runs through all epochs could
    # otherwise predict with its last-epoch weights.
    model.load_weights(model_config["model_path"])

    # Predicts on test data and appends the prediction into list to average later
    predictions = model.predict(test_dataset)["logits"]
    prediction_history.append(predictions)
    print("\nPerformed predictions using current model on test dataset and values were recorded.\n")

    # Frees resources; clear_session releases the global state Keras accumulates
    # when models are created repeatedly in a loop
    del checkpoint, early_stopping_monitor, model
    del val_dataset, train_dataset
    keras.backend.clear_session()
    gc.collect()

## Evaluation

In [None]:
# Plots the model's cross validation performance, one panel per configured fold.
# The panel count follows data_config["n_splits"] rather than a fixed 5, so the
# figure stays correct when the number of folds is changed.

n_panels = data_config["n_splits"]

# squeeze=False always yields a 2-D axes array, so indexing works even for one panel
fig, axes = plt.subplots(1, n_panels, sharey=True, figsize=(4 * n_panels, 5), squeeze=False)
fig.suptitle("Cross Validation Performance")
for fold, (ax, history) in enumerate(zip(axes[0], training_history)):
    train_rmse = history["root_mean_squared_error"]
    val_rmse = history["val_root_mean_squared_error"]
    ax.plot(range(1, len(train_rmse) + 1), train_rmse, "bo", label="Training Loss")
    ax.plot(range(1, len(val_rmse) + 1), val_rmse, "b", label="Validation Loss")
    ax.set_title(f"FOLD {fold+1}")
    ax.set_xlabel("Epoch")
    ax.legend()
    # The y axis is shared, so only the left-most panel needs the label
    if fold == 0:
        ax.set_ylabel("Loss (RMS)")

## Submission

In [None]:
# Averages the per-fold test predictions into one prediction per test excerpt

# The mean over axis 0 collapses the fold axis. Each fold's array presumably has
# shape (n_samples, 1) because the model has a single output, so the result is
# flattened to 1-D to match a single DataFrame column.
mean_predictions = np.mean(prediction_history, axis=0).reshape(-1)

In [None]:
# Writes the fold-averaged predictions into the submission frame's target column

submission["target"] = mean_predictions

In [None]:
# Submitting by saving predictions into submission file.
# index=False keeps the DataFrame's row index out of the CSV, so the file
# contains only the submission's own columns.

submission.to_csv("submission.csv", index=False)

In [None]:
# Displays the final submission frame as the cell output for a visual sanity check
submission