In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from transformers import RobertaTokenizer, TFRobertaModel

from tqdm.auto import tqdm
tqdm.pandas()

from matplotlib import pyplot as plt

In [None]:
# Load the training CSV, print its (rows, columns) shape and preview the first rows.
df_train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
print(df_train.shape)
df_train.head()

In [None]:
# Load the test CSV, print its (rows, columns) shape and preview the first rows.
df_test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
print(df_test.shape)
df_test.head()

In [None]:
def show_scatter_plot(x, y, x_label, y_label, plot_color='blue'):
    """Draw a semi-transparent scatter plot of ``y`` against ``x`` and show it.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        x_label: Label of the horizontal axis; also the first half of the title.
        y_label: Label of the vertical axis; also the second half of the title.
        plot_color: Marker colour passed to ``scatter`` (default ``'blue'``).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    figure = plt.figure()
    # Axes rectangle is [left, bottom, width, height] in figure fractions,
    # i.e. the axes span the whole figure.
    axes = figure.add_axes([0, 0, 1, 1])
    axes.scatter(x, y, color=plot_color, alpha=0.3)
    axes.set(
        xlabel=x_label,
        ylabel=y_label,
        title="{} vs {}".format(x_label, y_label),
    )
    plt.show()

In [None]:
# Scatter every excerpt's target score against its standard error.
target = df_train['target'].to_numpy()
standard_err = df_train['standard_error'].to_numpy()
show_scatter_plot(target, standard_err, "Target", "Standard Error")

In [None]:
# Rows flagged as outliers from the scatter plot: standard_error below 0.4.
outlier_scatplot = df_train.query('standard_error < 0.4')
outlier_scatplot.head()

In [None]:
# Remove outlier
# Filter on the outlier condition itself instead of dropping previously
# captured index labels: after the index is reset those labels point at
# different (valid) rows, so re-running a label-based drop would delete good
# data. This form is idempotent. Keep the 0.4 threshold in sync with the
# outlier-inspection cell.
df_train = df_train.loc[~(df_train['standard_error'] < 0.4)].reset_index(drop=True)
df_train.shape

In [None]:
# Path handed to `from_pretrained` for both the tokenizer and the encoder.
BASE_MODEL = '../input/huggingface-roberta/roberta-base'

In [None]:
def custom_standardization(text):
    """Normalise one excerpt before tokenization.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, and leading/trailing whitespace is stripped.

    NOTE(review): lower-casing is meant for uncased encoders; confirm that the
    encoder in use is actually uncased.

    Args:
        text: Raw excerpt string.

    Returns:
        The cleaned string.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(drop_punctuation).strip()

def get_dataset(pandas_df, tokenizer, labeled=True, ordered=False, repeated=False,
                batch_size=32, seq_len=128):
    """
    Return a batched Tensorflow dataset ready for training or inference.

    Every value of the ``excerpt`` column is cleaned with
    ``custom_standardization`` and tokenized, truncated or padded to exactly
    ``seq_len`` tokens. Only ``input_ids`` is emitted as a feature; the
    attention mask produced by the tokenizer is not passed on.

    Args:
        pandas_df: Frame with an ``excerpt`` column, plus a ``target`` column
            when ``labeled`` is True.
        tokenizer: Callable tokenizer accepting ``max_length``, ``truncation``,
            ``padding`` and ``return_tensors`` and returning a mapping with an
            ``input_ids`` entry.
        labeled: If True, elements are ``(features, target)`` pairs; otherwise
            elements are the feature dict alone.
        ordered: If True, keep the frame's row order (no shuffling).
        repeated: If True, repeat the dataset indefinitely.
        batch_size: Number of examples per batch.
        seq_len: Fixed token length every excerpt is padded/truncated to.

    Returns:
        A ``tf.data.Dataset`` that is (optionally) shuffled and repeated, then
        batched and prefetched.
    """
    excerpts = [custom_standardization(excerpt) for excerpt in pandas_df['excerpt']]

    # Tokenize inputs
    tokenized_inputs = tokenizer(excerpts, max_length=seq_len, truncation=True,
                                 padding='max_length', return_tensors='tf')
    features = {'input_ids': tokenized_inputs['input_ids']}

    if labeled:
        dataset = tf.data.Dataset.from_tensor_slices((features, pandas_df['target']))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(features)

    # Shuffle BEFORE repeating so that each pass over the data is a complete,
    # independently reshuffled epoch; repeating first lets examples from
    # neighbouring epochs mix inside the shuffle buffer. The buffer covers the
    # whole frame so the shuffle is uniform rather than a sliding window.
    if not ordered:
        dataset = dataset.shuffle(max(len(pandas_df), 1))
    if repeated:
        dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [None]:
def base_model(encoder, seq_len=256):
    """Build the regression network on top of a transformer encoder.

    Architecture, in order: encoder ``last_hidden_state`` -> LSTM(32, returning
    the full sequence) -> Dropout(0.3) -> Dense(16) -> Dropout(0.5) -> Flatten
    -> Dense(1, linear).

    Only ``input_ids`` is fed to the encoder; no attention mask is passed.

    Args:
        encoder: Callable transformer layer/model that accepts a dict with an
            ``input_ids`` entry and returns an object exposing
            ``last_hidden_state``.
        seq_len: Length of the ``input_ids`` input (default 256).

    Returns:
        An uncompiled Keras ``Model`` with a single ``input_ids`` input and a
        single scalar output per example.
    """
    token_ids = Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')

    hidden_states = encoder({'input_ids': token_ids}).last_hidden_state

    # Sequence head: each stage consumes the previous stage's output.
    x = LSTM(32, return_sequences=True, name="lstm_layer")(hidden_states)
    x = Dropout(0.3, name="dropout_layer1")(x)
    x = Dense(16, name="dense_layer")(x)
    x = Dropout(0.5, name="dropout_layer2")(x)
    x = Flatten(name="flatten_layer")(x)

    prediction = Dense(1, activation="linear", name="output_layer")(x)

    return Model(inputs=[token_ids], outputs=prediction)

In [None]:
# TPU or GPU detection
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Stable API; tf.distribute.experimental.TPUStrategy is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Fall back to the default strategy of the current context.
    strategy = tf.distribute.get_strategy()

# Number of replicas the strategy keeps in sync; used to scale the batch size.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
from sklearn.model_selection import KFold

tokenizer = RobertaTokenizer.from_pretrained(BASE_MODEL)

# Token length shared by the datasets and the model input.
SEQ_LEN = 256
# Global batch size: 8 examples per replica.
BATCH_SIZE = 8 * REPLICAS

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1-based fold counter, also used in the checkpoint file name.
fold = 1

# One [fold, best val_loss, train loss at that same epoch] row per fold.
best_fold = []

for train_idx, val_idx in kf.split(df_train):

    # KFold yields positional indices, so select rows by position; this stays
    # correct whatever the frame's index labels are.
    fold_train = df_train.iloc[train_idx]
    fold_val = df_train.iloc[val_idx]

    dataset_train = get_dataset(fold_train, tokenizer, repeated=True,
                                batch_size=BATCH_SIZE, seq_len=SEQ_LEN)

    dataset_val = get_dataset(fold_val, tokenizer, ordered=True,
                              batch_size=BATCH_SIZE, seq_len=SEQ_LEN)

    # A fresh encoder and head are created for every fold inside the strategy scope.
    with strategy.scope():
        encoder = TFRobertaModel.from_pretrained(BASE_MODEL)
        # Pass SEQ_LEN explicitly so the model input length always matches the
        # length the datasets were tokenized to.
        model = base_model(encoder, seq_len=SEQ_LEN)
        model.compile(optimizer=SGD(learning_rate=0.001),
                      loss='mse',
                      metrics=[tf.keras.metrics.RootMeanSquaredError()])

    # No early stopping: all epochs run and the checkpoint keeps only the
    # weights of the epoch with the lowest val_loss.
    mc = ModelCheckpoint('model_fold{}.h5'.format(fold), monitor='val_loss', mode='min',
                         save_weights_only=True, save_best_only=True, verbose=1)

    # The datasets are already batched, so `batch_size` is not passed to fit.
    # The training dataset repeats indefinitely, hence the explicit
    # steps_per_epoch.
    history = model.fit(
        dataset_train,
        validation_data=dataset_val,
        steps_per_epoch=len(fold_train) // BATCH_SIZE,
        epochs=50,
        verbose=1,
        callbacks=[mc],
    )

    # Learning curves for this fold.
    plt.figure()
    for curve in ('loss', 'val_loss',
                  'root_mean_squared_error', 'val_root_mean_squared_error'):
        plt.plot(history.history[curve])
    plt.ylabel('loss/error')
    plt.xlabel('epochs')
    plt.title("Training Loss and Error")
    plt.legend(['train_loss', 'val_loss', 'train_rmse', 'val_rmse'], loc='upper right')
    plt.show()

    # Record the best validation loss and the training loss of the same epoch.
    lowest_idx = np.argmin(history.history['val_loss'])
    best_fold.append([fold, history.history['val_loss'][lowest_idx],
                      history.history['loss'][lowest_idx]])

    fold += 1

In [None]:
# Per-fold summary table, best (lowest) validation loss first.
df_result = (
    pd.DataFrame(best_fold, columns=['fold', 'val_loss', 'train_loss'])
    .sort_values('val_loss')
)
df_result