## Import libraries

In [None]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.layers import BatchNormalization
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.layers import Conv1D, Flatten, Dense
from tensorflow.keras.layers import Input, Dropout, Activation

from transformers import RobertaTokenizer, TFRobertaModel, RobertaConfig
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel, XLMRobertaConfig

In [None]:
! mkdir "./Roberta-Base"
! mkdir "./XLM-Roberta-Base"
! mkdir "./DistilRoberta-Base"

## Load source datasets

In [None]:
# Read the training split and record the whitespace-delimited word count of each excerpt.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
train_df["excerpt_wordlen"] = train_df["excerpt"].apply(lambda text: len(str(text).split()))

# Discard the columns that are not used later on and index the rows by excerpt id.
train_df = (
    train_df
    .drop(columns=['url_legal', 'license', 'standard_error'])
    .set_index("id")
)

print(f"train_df: {train_df.shape}\n")
train_df.head()

In [None]:
# Read the test split and record the whitespace-delimited word count of each excerpt.
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
test_df["excerpt_wordlen"] = test_df["excerpt"].apply(lambda text: len(str(text).split()))

# Discard the columns that are not used later on and index the rows by excerpt id.
test_df = (
    test_df
    .drop(columns=['url_legal', 'license'])
    .set_index("id")
)

print(f"test_df: {test_df.shape}\n")
test_df.head()

## Extract target label

In [None]:
# Raw regression target, kept as a NumPy array for fold slicing and scoring.
Ytrain = train_df['target'].values

# Five equal-frequency bins of the target (labelled 0..4); these labels are what the
# stratified K-fold splitter receives, so every fold covers the full target range.
Ytrain_strat = pd.qcut(Ytrain, q=5, labels=range(0,5))

# The target must not stay among the features.
train_df.drop(columns=['target'], inplace=True)

print(f"Ytrain: {Ytrain.shape}")

## Model Hyperparameters

In [None]:
FOLD = 5                # number of stratified folds per seed
NUM_SEED = 1            # number of different fold-shuffling seeds
VERBOSE = 1             # verbosity passed to Keras fit() and callbacks
MINI_BATCH_SIZE = 16
NUM_EPOCH = 20          # upper bound; early stopping may end training sooner

# Upper bound on the sequence length accepted by the encoders used below.
# RoBERTa-family checkpoints are pretrained with a 512-token limit; longer inputs
# would index past the position-embedding table.
MAX_MODEL_TOKENS = 512

# Sequence length used for padding/truncation: longest excerpt (in whitespace-separated
# words) plus a margin of 11, capped at the encoder limit and cast to a plain Python int.
# NOTE(review): word counts underestimate subword token counts, so long excerpts can
# still be truncated by the tokenizer (sent_encode uses truncation=True).
MAX_LEN = min(
    int(max(train_df['excerpt_wordlen'].max(),
            test_df['excerpt_wordlen'].max())) + 11,
    MAX_MODEL_TOKENS,
)

ROBERTA_BASE = "../input/huggingface-roberta-variants/roberta-base/roberta-base"
XLM_ROBERTA_BASE = "../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base"
DISTILROBERTA_BASE = "../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base"

## Helper Functions

In [None]:
def sent_encode(texts, tokenizer):
    """Tokenize each text to a fixed length and collect the encodings as arrays.

    Every text is encoded on its own with ``tokenizer.encode_plus``, padded or
    truncated to ``MAX_LEN`` with special tokens added.

    Args:
        texts: iterable of texts to encode (a progress bar is shown while iterating).
        tokenizer: object exposing ``encode_plus`` and returning a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` of NumPy arrays, each
        stacking the per-text encodings in input order. Callers in this notebook
        reshape them from three dimensions down to two.
    """
    keys = ('input_ids', 'attention_mask', 'token_type_ids')
    collected = {key: [] for key in keys}

    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='tf',
        )
        for key in keys:
            collected[key].append(encoded[key])

    return tuple(np.array(collected[key]) for key in keys)

In [None]:
def rmse_loss(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Both arguments are cast to float32; the squared differences are averaged over
    all elements before the square root is taken, so the result is a scalar.
    """
    error = tf.cast(y_true, dtype=tf.float32) - tf.cast(y_pred, dtype=tf.float32)
    mean_squared = tf.math.reduce_mean(tf.math.pow(error, 2))
    return tf.math.sqrt(mean_squared)

In [None]:
def commonlit_model(transformer_model, use_tokens_type_ids=True):
    """Assemble the regression network: transformer encoder plus a Conv1D head.

    Args:
        transformer_model: encoder called on the token ids; the first element of
            its output is fed to the head.
        use_tokens_type_ids: when True the token-type ids are forwarded to the
            encoder; when False they remain a model input but are not passed on.

    Returns:
        A Keras ``Model`` named ``CommonLit_Readability_Model`` taking
        ``[input_ids, attention_mask, token_type_ids]`` (each of length
        ``MAX_LEN``) and ending in a single-unit dense layer.
    """
    input_id = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_id = Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")

    # Keyword arguments for the encoder call; token-type ids are optional.
    encoder_kwargs = {"token_type_ids": token_type_id} if use_tokens_type_ids else {}
    encoder_kwargs["attention_mask"] = attention_mask
    sequence_output = transformer_model(input_id, **encoder_kwargs)[0]

    x = LayerNormalization()(sequence_output)

    # Two identical stages that differ only in width: weight-normalised strided
    # convolution -> layer norm -> ReLU -> spatial dropout.
    for n_filters in (384, 192):
        conv = Conv1D(filters=n_filters, kernel_size=5,
                      strides=2, padding='same',
                      kernel_regularizer=l2(0.0001),
                      kernel_initializer='he_uniform')
        x = WeightNormalization(conv)(x)
        x = LayerNormalization()(x)
        x = Activation('relu')(x)
        x = SpatialDropout1D(rate=0.25)(x)

    # Regression head: flatten, regularise, project to one value.
    x = Flatten()(x)
    x = Dropout(rate=0.5)(x)
    prediction = Dense(units=1, kernel_initializer='lecun_normal')(x)

    return Model(inputs=[input_id, attention_mask, token_type_id],
                 outputs=prediction,
                 name='CommonLit_Readability_Model')

## Roberta-Base Model

### Generate word tokens and attention masks

In [None]:
# Load the tokenizer stored under ROBERTA_BASE; the next cells use it to encode
# both the training and the test excerpts.
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_BASE)

In [None]:
# Encode the training excerpts, then collapse each returned 3-D array to
# (n_samples, sequence_length) by dropping its middle axis.
Xtrain_id, Xtrain_mask, Xtrain_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(train_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

In [None]:
# Encode the test excerpts, then collapse each returned 3-D array to
# (n_samples, sequence_length) by dropping its middle axis.
Xtest_id, Xtest_mask, Xtest_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(test_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

### Initialize the Roberta-Base model

In [None]:
# Load the configuration stored under ROBERTA_BASE and switch off the
# hidden-states output flag.
config = RobertaConfig.from_pretrained(ROBERTA_BASE)
config.output_hidden_states = False

# Instantiate the TensorFlow encoder from the same location with that configuration.
transformer_model = TFRobertaModel.from_pretrained(ROBERTA_BASE, config=config)

In [None]:
# Build the network once around the loaded encoder and print its layer summary.
# The K-fold cell below builds its own model for every fold.
model = commonlit_model(transformer_model)
model.summary()

### Fit the model with K-Fold validation

In [None]:
# Stratified K-fold fine-tuning of Roberta-Base, repeated for NUM_SEED shuffling seeds.
# Every (seed, fold) pair trains a fresh model, stores its best weights on disk, scores
# the held-out fold and adds its test predictions to a running sum that is averaged at
# the end.
np.random.seed(23)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0          # number of (seed, fold) models trained so far
oof_score = 0        # running sum of per-fold validation RMSE
y_pred_final1 = 0    # running sum of test-set predictions


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    # Folds are stratified on the binned target, not on the raw regression value.
    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        # Start every fold from the pretrained checkpoint. Reusing one encoder instance
        # across folds would carry over weights already fine-tuned on rows that belong
        # to this fold's validation split, leaking them into the out-of-fold score.
        tf.keras.backend.clear_session()
        tf.random.set_seed(seed)

        transformer_model = TFRobertaModel.from_pretrained(ROBERTA_BASE, config=config)
        model = commonlit_model(transformer_model)
        gc.collect()  # release the previous fold's model now that nothing references it
        
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(learning_rate=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./Roberta-Base/CLRP_Roberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        # Score and predict with the best checkpoint written during this fold.
        model.load_weights(f'./Roberta-Base/CLRP_Roberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final1 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test predictions and fold scores over all trained models.
y_pred_final1 = y_pred_final1 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

## XLM-Roberta-Base Model

### Generate word tokens and attention masks

In [None]:
# Load the tokenizer stored under XLM_ROBERTA_BASE; it replaces the previous
# `tokenizer` and is used to re-encode the training and test excerpts.
tokenizer = XLMRobertaTokenizer.from_pretrained(XLM_ROBERTA_BASE)

In [None]:
# Re-encode the training excerpts with the current tokenizer, then collapse each
# returned 3-D array to (n_samples, sequence_length) by dropping its middle axis.
Xtrain_id, Xtrain_mask, Xtrain_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(train_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

In [None]:
# Re-encode the test excerpts with the current tokenizer, then collapse each
# returned 3-D array to (n_samples, sequence_length) by dropping its middle axis.
Xtest_id, Xtest_mask, Xtest_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(test_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

### Initialize the XLM-Roberta-Base model

In [None]:
# Load the configuration stored under XLM_ROBERTA_BASE and switch off the
# hidden-states output flag.
config = XLMRobertaConfig.from_pretrained(XLM_ROBERTA_BASE)
config.output_hidden_states = False

# Instantiate the TensorFlow encoder from the same location with that configuration.
transformer_model = TFXLMRobertaModel.from_pretrained(XLM_ROBERTA_BASE, config=config)

In [None]:
# Build the network once around the loaded encoder and print its layer summary.
# The K-fold cell below builds its own model for every fold.
model = commonlit_model(transformer_model)
model.summary()

### Fit the model with K-Fold validation

In [None]:
# Stratified K-fold fine-tuning of XLM-Roberta-Base, repeated for NUM_SEED shuffling
# seeds. Every (seed, fold) pair trains a fresh model, stores its best weights on disk,
# scores the held-out fold and adds its test predictions to a running sum that is
# averaged at the end.
np.random.seed(29)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0          # number of (seed, fold) models trained so far
oof_score = 0        # running sum of per-fold validation RMSE
y_pred_final2 = 0    # running sum of test-set predictions


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    # Folds are stratified on the binned target, not on the raw regression value.
    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        # Start every fold from the pretrained checkpoint. Reusing one encoder instance
        # across folds would carry over weights already fine-tuned on rows that belong
        # to this fold's validation split, leaking them into the out-of-fold score.
        tf.keras.backend.clear_session()
        tf.random.set_seed(seed)

        transformer_model = TFXLMRobertaModel.from_pretrained(XLM_ROBERTA_BASE, config=config)
        model = commonlit_model(transformer_model)
        gc.collect()  # release the previous fold's model now that nothing references it
        
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(learning_rate=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./XLM-Roberta-Base/CLRP_XLMRoberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        # Score and predict with the best checkpoint written during this fold.
        model.load_weights(f'./XLM-Roberta-Base/CLRP_XLMRoberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final2 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test predictions and fold scores over all trained models.
y_pred_final2 = y_pred_final2 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

## DistilRoberta-Base Model

### Generate word tokens and attention masks

In [None]:
# Load the tokenizer stored under DISTILROBERTA_BASE; it replaces the previous
# `tokenizer` and is used to re-encode the training and test excerpts.
tokenizer = RobertaTokenizer.from_pretrained(DISTILROBERTA_BASE)

In [None]:
# Re-encode the training excerpts with the current tokenizer, then collapse each
# returned 3-D array to (n_samples, sequence_length) by dropping its middle axis.
Xtrain_id, Xtrain_mask, Xtrain_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(train_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtrain_id.shape} \nAttention Mask: {Xtrain_mask.shape} \nToken-type-ids: {Xtrain_token.shape}")

In [None]:
# Re-encode the test excerpts with the current tokenizer, then collapse each
# returned 3-D array to (n_samples, sequence_length) by dropping its middle axis.
Xtest_id, Xtest_mask, Xtest_token = (
    encoded.reshape((encoded.shape[0], encoded.shape[2]))
    for encoded in sent_encode(test_df['excerpt'].values, tokenizer)
)

print(f"Input-ids: {Xtest_id.shape} \nAttention Mask: {Xtest_mask.shape} \nToken-type-ids: {Xtest_token.shape}")

### Initialize the DistilRoberta-Base model

In [None]:
# Load the configuration stored under DISTILROBERTA_BASE and switch off the
# hidden-states output flag.
config = RobertaConfig.from_pretrained(DISTILROBERTA_BASE)
config.output_hidden_states = False

# Instantiate the TensorFlow encoder from the same location with that configuration.
transformer_model = TFRobertaModel.from_pretrained(DISTILROBERTA_BASE, config=config)

In [None]:
# Build the network once around the loaded encoder (token-type ids are not forwarded
# to it) and print its layer summary. The K-fold cell below builds its own model for
# every fold.
model = commonlit_model(transformer_model, use_tokens_type_ids=False)
model.summary()

### Fit the model with K-Fold validation

In [None]:
# Stratified K-fold fine-tuning of DistilRoberta-Base, repeated for NUM_SEED shuffling
# seeds. Every (seed, fold) pair trains a fresh model, stores its best weights on disk,
# scores the held-out fold and adds its test predictions to a running sum that is
# averaged at the end.
np.random.seed(31)
seeds = np.random.randint(0, 100, size=NUM_SEED)

counter = 0          # number of (seed, fold) models trained so far
oof_score = 0        # running sum of per-fold validation RMSE
y_pred_final3 = 0    # running sum of test-set predictions


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    # Folds are stratified on the binned target, not on the raw regression value.
    for idx, (train, val) in enumerate(kfold.split(Xtrain_id, Ytrain_strat)):
        counter += 1

        train_x_id, train_x_mask, train_x_token = Xtrain_id[train], Xtrain_mask[train], Xtrain_token[train]
        val_x_id, val_x_mask, val_x_token = Xtrain_id[val], Xtrain_mask[val], Xtrain_token[val]
        train_y, val_y = Ytrain[train], Ytrain[val]
        
        # Start every fold from the pretrained checkpoint. Reusing one encoder instance
        # across folds would carry over weights already fine-tuned on rows that belong
        # to this fold's validation split, leaking them into the out-of-fold score.
        tf.keras.backend.clear_session()
        tf.random.set_seed(seed)

        transformer_model = TFRobertaModel.from_pretrained(DISTILROBERTA_BASE, config=config)
        model = commonlit_model(transformer_model, use_tokens_type_ids=False)
        gc.collect()  # release the previous fold's model now that nothing references it
        
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        model.compile(loss=rmse_loss,
                      metrics=[RootMeanSquaredError(name='rmse')],
                      optimizer=Adam(learning_rate=8e-5))

        early = EarlyStopping(monitor="val_rmse", mode="min", 
                              restore_best_weights=True, 
                              patience=5, verbose=VERBOSE)
        
        reduce_lr = ReduceLROnPlateau(monitor="val_rmse", factor=0.5, 
                                      min_lr=1e-7, patience=2, 
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(f'./DistilRoberta-Base/CLRP_DistilRoberta_Base_{counter}C.h5', 
                                    monitor='val_rmse', verbose=VERBOSE, 
                                    save_best_only=True, mode='min',
                                    save_weights_only=True)
        
        history = model.fit(
            [train_x_id, train_x_mask, train_x_token], train_y, 
            batch_size=MINI_BATCH_SIZE,
            epochs=NUM_EPOCH, 
            verbose=VERBOSE, 
            callbacks=[reduce_lr, early, chk_point], 
            validation_data=([val_x_id, val_x_mask, val_x_token], val_y)
        )
        
        # Score and predict with the best checkpoint written during this fold.
        model.load_weights(f'./DistilRoberta-Base/CLRP_DistilRoberta_Base_{counter}C.h5')
        
        y_pred = model.predict([val_x_id, val_x_mask, val_x_token])
        y_pred_final3 += model.predict([Xtest_id, Xtest_mask, Xtest_token])
        
        score = np.sqrt(mean_squared_error(val_y, y_pred))
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Average the accumulated test predictions and fold scores over all trained models.
y_pred_final3 = y_pred_final3 / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

## Create submission file

In [None]:
# Equal-weight blend of the averaged test predictions of the three backbones.
y_pred_final = (y_pred_final1 + y_pred_final2 + y_pred_final3) / 3.0

submit_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

# The blend keeps the shape of the models' prediction output (the networks end in a
# single-unit dense layer), so flatten it explicitly instead of relying on pandas to
# coerce a column-shaped array, and make sure there is exactly one value per row.
# NOTE(review): rows are matched by position; this assumes sample_submission.csv
# lists the ids in the same order as test.csv.
flat_predictions = y_pred_final.reshape(-1)
assert len(flat_predictions) == len(submit_df), "prediction count does not match submission rows"

submit_df['target'] = flat_predictions
submit_df.to_csv("./submission.csv", index=False)
submit_df.head()