In [None]:
import random, os, warnings, math
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from transformers import TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer

Implementation

In [None]:
# Directory prefix for the competition CSV files. File names are concatenated
# onto it directly, so the trailing slash is required.
PATH = '../input/commonlitreadabilityprize/'
# Alternative BERT checkpoint location, currently disabled.
#BASE_MODEL = '../input/huggingface-bert/bert-base-uncased/'


In [None]:
MAX_SEQUENCE_LENGTH = 512

# Read the three competition tables (train, test, sample submission) in order.
df_train, df_test, df_sub = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Text column fed to the model and the regression target column.
input_categories = df_train['excerpt']
output_categories = df_train['target']
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

In [None]:
def fix_length(tokens, max_sequence_length=512):
    """Trim ``tokens`` so that one position stays free for a leading special token.

    The result never holds more than ``max_sequence_length - 1`` items. The
    previous version let a sequence of exactly ``max_sequence_length`` tokens
    through untrimmed, so the bound depended on whether the input was
    "exactly full" or "over full".

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of strings).
        max_sequence_length: Total length budget including the reserved slot.

    Returns:
        A slice of ``tokens`` with at most ``max_sequence_length - 1`` items
        (empty when the budget is 0 or 1).
    """
    # Clamp at zero: a negative stop index would drop items from the end
    # instead of returning an empty sequence.
    keep = max(max_sequence_length - 1, 0)
    return tokens[:keep]

# function for tokenizing the input data for transformer.
def transformer_inputs(text,tokenizer,MAX_SEQUENCE_LENGTH = 512):
    """Encode one text into fixed-length input ids and an attention mask.

    Args:
        text: Raw text; it is passed through ``str()`` before tokenizing.
        tokenizer: Tokenizer object providing ``tokenize``,
            ``convert_tokens_to_ids`` and ``pad_token_id``; its ``cls_token``
            is used when set.
        MAX_SEQUENCE_LENGTH: Length of both returned lists.

    Returns:
        ``(padded_ids, attention_mask)``: two lists of length
        ``MAX_SEQUENCE_LENGTH``. The mask is 1 over real tokens and 0 over
        padding.
    """
    # Ask the tokenizer for its own start-of-sequence token. A literal "[CLS]"
    # only exists in BERT-style vocabularies; for other vocabularies it would be
    # mapped to the unknown-token id. Fall back to "[CLS]" if none is defined.
    cls_token = getattr(tokenizer, 'cls_token', None) or "[CLS]"

    text_tokens = tokenizer.tokenize(str(text))
    # Honour the caller's length budget instead of fix_length's default of 512.
    text_tokens = fix_length(text_tokens, MAX_SEQUENCE_LENGTH)
    # NOTE(review): no end-of-sequence / separator token is appended here.
    ids_q = tokenizer.convert_tokens_to_ids([cls_token] + text_tokens)

    pad_count = max(MAX_SEQUENCE_LENGTH - len(ids_q), 0)
    # The final slice guards against ids_q being one item over the budget.
    padded_ids = (ids_q + [tokenizer.pad_token_id] * pad_count)[:MAX_SEQUENCE_LENGTH]
    attention_mask = ([1] * len(ids_q) + [0] * pad_count)[:MAX_SEQUENCE_LENGTH]

    return padded_ids,attention_mask

# Build the input-id and attention-mask arrays for every excerpt in a frame.
def input_for_model(df, tokenizer):
    """Tokenize ``df['excerpt']`` row by row and stack the results.

    Args:
        df: DataFrame with an ``excerpt`` column.
        tokenizer: Tokenizer forwarded to ``transformer_inputs``.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of ``int32`` NumPy arrays, one
        row per excerpt.
    """
    print('generating input for transformer...')
    encoded_rows = [
        transformer_inputs(excerpt, tokenizer)
        for excerpt in df['excerpt'].values
    ]
    id_rows = [row[0] for row in encoded_rows]
    mask_rows = [row[1] for row in encoded_rows]

    ids_array = np.asarray(id_rows, dtype=np.int32)
    masks_array = np.asarray(mask_rows, dtype=np.int32)
    return ids_array, masks_array

def compute_output_arrays(df, columns):
    """Return the selected column(s) of ``df`` as a NumPy array.

    Args:
        df: Source DataFrame.
        columns: Column label or list of labels used to index ``df``.

    Returns:
        ``numpy.ndarray`` holding the selected values.
    """
    selected = df[columns]
    return np.asarray(selected)

In [None]:
def model_fn(encoder, seq_len):
    """Wrap ``encoder`` in a compiled Keras model fed by ids and attention mask.

    Args:
        encoder: Callable model that accepts a dict with ``input_ids`` and
            ``attention_mask`` tensors.
        seq_len: Fixed token length of both inputs.

    Returns:
        A compiled ``Model`` whose output is whatever ``encoder`` returns,
        trained with mean squared error and reporting root mean squared error.
        The learning rate is read from the module-level ``LEARNING_RATE`` at
        call time.
    """
    input_ids = L.Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
    input_attention_mask = L.Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')

    outputs = encoder({'input_ids': input_ids,
                       'attention_mask': input_attention_mask})

    model = Model(inputs=[input_ids, input_attention_mask], outputs=outputs)

    # `lr` is a deprecated alias that newer Keras releases no longer accept;
    # `learning_rate` is the supported keyword.
    optimizer = optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer,
                  loss=losses.MeanSquaredError(),
                  metrics=[metrics.RootMeanSquaredError()])

    return model

In [None]:
# Training configuration.
# NOTE(review): BATCH_SIZE, EPOCHS and PATIENCE look like fit/scheduler
# settings -- confirm they are actually consumed where training runs.
BATCH_SIZE = 8 
# Learning rate given to the Adam optimizer inside model_fn.
LEARNING_RATE = 1e-5 
EPOCHS = 35
# Patience of the EarlyStopping callback.
ES_PATIENCE = 7
PATIENCE = 2
# Number of KFold splits.
N_FOLDS = 5
# Sequence length passed to model_fn when building each fold's model.
SEQ_LEN = 512 #300
# Location of the pretrained checkpoint; the tokenizer and the
# sequence-classification encoder are both loaded from it.
BASE_MODEL = '/kaggle/input/huggingface-roberta/roberta-base/'
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [None]:
# Build one model only to print its architecture. Use SEQ_LEN rather than a
# literal 512 so the summary always matches the models built for training.
encoder = TFAutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
model = model_fn(encoder, SEQ_LEN)
model.summary()

In [None]:
import tqdm

# Regression targets for the training rows.
outputs = compute_output_arrays(df_train, 'target')

# Tokenize the train excerpts first, then the test excerpts; each result is an
# (input_ids, attention_mask) pair of arrays.
inputs, test_inputs = (
    input_for_model(frame, tokenizer)
    for frame in (df_train, df_test)
)

In [None]:
# K-fold training: one freshly loaded model per fold, early stopping on the
# validation RMSE, then out-of-fold and test predictions from the best weights.
seed = 42
skf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
valid_preds = []
valid_labels = []
history_list = []
test_preds = []


for fold, (train_idx, valid_idx) in enumerate(skf.split(df_train)):
    print(f'\nFOLD: {fold+1}')
    print(f'TRAIN: {len(train_idx)} VALID: {len(valid_idx)}')

    # Reset Keras state left over from the previous fold before rebuilding.
    K.clear_session()

    encoder = TFAutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
    model = model_fn(encoder, SEQ_LEN)

    model_path = f'model_{fold}.h5'

    es = EarlyStopping(monitor='val_root_mean_squared_error', mode='min',
                       patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
    checkpoint = ModelCheckpoint(model_path, monitor='val_root_mean_squared_error', mode='min',
                                 save_best_only=True, save_weights_only=True)

    # `inputs` holds the (input_ids, attention_mask) arrays; slice both per fold.
    train_inputs = [inputs[i][train_idx] for i in range(2)]
    train_outputs = outputs[train_idx].reshape(-1)

    valid_inputs = [inputs[i][valid_idx] for i in range(2)]
    valid_outputs = outputs[valid_idx].reshape(-1)
    valid_labels.append(valid_outputs)

    # Take batch size and epoch count from the config constants so editing the
    # config cell actually changes training. steps_per_epoch caps every Keras
    # "epoch" at 50 batches, so validation and checkpointing run every 50 steps.
    history = model.fit(train_inputs, train_outputs, batch_size=BATCH_SIZE,
                        validation_data=(valid_inputs, valid_outputs),
                        steps_per_epoch=50,
                        callbacks=[es, checkpoint],
                        epochs=EPOCHS,
                        verbose=2).history
    history_list.append(history)

    # Reload the best weights written by ModelCheckpoint (not the last epoch's).
    model.load_weights(model_path)

    # Results
    print(f"#### FOLD {fold+1} OOF RMSE = {np.min(history['val_root_mean_squared_error']):.4f}")

    # The wrapped encoder returns a dict-like output; keep only its logits.
    valid_preds.append(model.predict(valid_inputs)['logits'])
    test_preds.append(model.predict(test_inputs)['logits'])
    

In [None]:
def plot_metrics(history):
    """Plot every training metric in ``history`` with its validation series.

    One subplot is drawn per key that does not start with ``val_``. When a
    matching ``val_<name>`` key exists, it is overlaid on the same axes.
    Pairing by name rather than by list position keeps the plot correct when
    the history holds extra keys or no validation series.

    Args:
        history: Mapping from metric name to a per-epoch list of values
            (the ``.history`` dict of a Keras ``fit`` call).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    train_names = [name for name in history if not name.startswith('val_')]
    if not train_names:
        # Nothing to draw; a zero-row subplot grid would raise.
        return

    n_rows = len(train_names)
    # squeeze=False always yields a 2-D array of axes, so a single metric no
    # longer fails on `.flatten()` of a bare Axes object.
    fig, axes = plt.subplots(n_rows, 1, sharex='col', figsize=(20, n_rows * 5),
                             squeeze=False)
    axes = axes.flatten()

    for ax, metric_name in zip(axes, train_names):
        ax.plot(history[metric_name], label='Train %s' % metric_name)
        val_metric_name = 'val_' + metric_name
        if val_metric_name in history:
            ax.plot(history[val_metric_name], label='Validation %s' % metric_name)
        ax.legend(loc='best', fontsize=16)
        ax.set_title(metric_name)

    # The x axis is shared, so label only the bottom subplot.
    axes[-1].set_xlabel('Epochs', fontsize=16)
    sns.despine(fig=fig)
    plt.show()


# Show the learning curves of each fold, numbered from 1.
for fold_number, fold_history in enumerate(history_list, start=1):
    print(f'\nFOLD: {fold_number}')
    plot_metrics(fold_history)

In [None]:
# Stack the per-fold validation labels and predictions into out-of-fold arrays.
y_true = np.concatenate(valid_labels)
y_preds = np.concatenate(valid_preds)


# Best validation RMSE reached in each fold.
for fold, history in enumerate(history_list):
    print(f"FOLD {fold+1} RMSE: {np.min(history['val_root_mean_squared_error']):.4f}")

# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
oof_rmse = np.sqrt(mean_squared_error(y_true, y_preds))
print(f'OOF RMSE: {oof_rmse:.4f}')

Submission

In [None]:
# Average the test predictions over folds and flatten to one value per row.
# (Presumably each fold's prediction is (n_test, 1) because num_labels=1.)
mean_test_pred = np.mean(test_preds, axis=0).reshape(-1)

# Take the ids from df_test, the frame the predictions were computed from, so
# ids and targets line up by construction. This no longer relies on the sample
# submission listing rows in the same order as the test file.
submission = df_test[['id']].copy()
submission['target'] = mean_test_pred
submission.to_csv('submission.csv', index=False)
display(submission.head(10))