In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from transformers import AutoTokenizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Input
from tensorflow.keras import Model
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import gc

#### Most of the code in this notebook is a modified version of this notebook: https://www.kaggle.com/dimitreoliveira/commonlit-readability-eda-roberta-tf-baseline/

#### The idea was to compare the performance of the classic BERT, DistilBERT, and RoBERTa using 5-fold CV, both with and without removing stop words.

In [None]:
# Load the competition training set and preview the first rows.
train_csv_path = "../input/commonlitreadabilityprize/train.csv"
df_data = pd.read_csv(train_csv_path)
df_data.head()

In [None]:
# Side-by-side histograms: `target` on the left, `standard_error` on the right.
fig, (ax_target, ax_stderr) = plt.subplots(1, 2, figsize=(15, 5))
df_data['target'].hist(ax=ax_target, bins=100)
df_data['standard_error'].hist(ax=ax_stderr, bins=100)

In [None]:
class BERTModel:
    """K-fold fine-tuning of a Hugging Face sequence classifier used as a regressor.

    For every fold a fresh `TFAutoModelForSequenceClassification` (with
    `num_labels=1`) is loaded from `model_path`, trained, and then used to
    predict both the held-out fold and the test set. Results are accumulated on
    the instance:

    * `oof_pred` / `oof_labels` - per-fold out-of-fold predictions and targets
    * `test_pred`               - per-fold predictions on `df_test`
    * `history_list`            - per-fold Keras training history dicts
    """

    def __init__(self, df_train, df_test, model_config):
        """Store the data frames, unpack `model_config` and load the tokenizer.

        Args:
            df_train: training frame; the columns read from it are
                `model_config['text_column']`, `target` and `standard_error`.
            df_test: test frame; the columns read from it are
                `model_config['text_column']` and `id`.
            model_config: dict with the keys `model_path`, `batch_size`,
                `seq_len`, `text_column`, `n_folds`, `epochs` and
                `remove_swords`.

        Raises:
            KeyError: if any of the keys listed above is missing.
        """
        self.df_data = df_train
        self.df_test = df_test
        self.model_path = model_config['model_path']
        self.batch_size = model_config['batch_size']
        self.seq_len = model_config['seq_len']
        self.text_column = model_config['text_column']
        self.n_folds = model_config['n_folds']
        self.epochs = model_config['epochs']
        self.remove_swords = model_config['remove_swords']
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        # Per-fold accumulators filled by `train_model`.
        self.oof_pred = []
        self.oof_labels = []
        self.test_pred = []
        self.history_list = []

    @staticmethod
    def sample_target(features, target):
        """Replace a `(mean, stddev)` target pair with one normal sample.

        Args:
            features: passed through unchanged.
            target: a `(mean, stddev)` pair.

        Returns:
            `(features, sampled_target)` where `sampled_target` is a scalar
            float32 draw from `Normal(mean, stddev)`.
        """
        mean, stddev = target
        sampled_target = tf.random.normal([], mean=tf.cast(mean, dtype=tf.float32),
                                          stddev=tf.cast(stddev, dtype=tf.float32),
                                          dtype=tf.float32)
        return (features, sampled_target)

    @staticmethod
    def custom_standardization(text):
        """Lower-case `text` and strip surrounding whitespace."""
        text = text.lower()  # if encoder is uncased
        text = text.strip()
        return text

    @staticmethod
    def clean_paragraph(paragraph, stop_words, lemmatizer):
        """Cleans paragraph before tokenization
        Source: https://www.kaggle.com/andradaolteanu/i-commonlit-explore-xgbrf-repeatedfold-model

        Args:
            paragraph: raw text.
            stop_words: container of words to drop (membership is tested with `in`).
            lemmatizer: object exposing `lemmatize(token)`.

        Returns:
            The remaining lemmatized tokens joined by single spaces.
        """
        # Tokenize & convert to lower case
        tokens = [t.lower() for t in word_tokenize(paragraph)]

        # Remove punctuation & non alphabetic characters from each word
        table = str.maketrans('', '', string.punctuation)
        tokens = [t.translate(table) for t in tokens]
        tokens = [t for t in tokens if t.isalpha()]

        # Filter out stopwords
        tokens = [t for t in tokens if t not in stop_words]

        # Lemmatizer
        return " ".join(lemmatizer.lemmatize(t) for t in tokens)

    def get_dataset(self, df_data, labeled = True, ordered = False, repeated = False,
                   is_sampled = False):
        """
        Return a Tensorflow dataset ready for training or inference.

        Args:
            df_data: frame holding `self.text_column` and, when `labeled`, the
                `target` and `standard_error` columns.
            labeled: when True each element is `(inputs, (target, standard_error))`;
                otherwise each element is just the `inputs` dict.
            ordered: when False the dataset is shuffled with a 1024-element buffer.
            repeated: when True the dataset repeats indefinitely.
            is_sampled: when True (and `labeled`) the label pair is replaced by a
                single draw from `sample_target`.

        Returns:
            A batched, prefetched `tf.data.Dataset` whose inputs are a dict with
            the keys `input_ids` and `attention_mask`.
        """
        if self.remove_swords:
            # A set gives O(1) membership tests for every token of every paragraph.
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            text = [self.clean_paragraph(paragraph, stop_words, lemmatizer)
                    for paragraph in df_data[self.text_column]]
        else:
            text = [self.custom_standardization(text) for text in df_data[self.text_column]]

        tokenized_inputs = self.tokenizer(text, max_length=self.seq_len,
                                          truncation=True, padding='max_length',
                                          return_tensors='tf')
        model_inputs = {'input_ids': tokenized_inputs['input_ids'],
                        'attention_mask': tokenized_inputs['attention_mask']}

        if labeled:
            dataset = tf.data.Dataset.from_tensor_slices((
                        model_inputs,
                        (df_data['target'], df_data['standard_error'])
                      ))
            if is_sampled:
                dataset = dataset.map(self.sample_target,
                                      num_parallel_calls=tf.data.AUTOTUNE)
        else:
            dataset = tf.data.Dataset.from_tensor_slices(model_inputs)

        if repeated:
            dataset = dataset.repeat()
        if not ordered:
            dataset = dataset.shuffle(1024)
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        return dataset

    def create_model(self, encoder):
        """ Returns prepared model from encoder

        Wraps `encoder` in a Keras `Model` with `input_ids` and
        `attention_mask` inputs of length `self.seq_len`, compiled with Adam
        (learning rate 5e-5), MSE loss and an RMSE metric.
        """
        input_ids = Input(shape=(self.seq_len,), dtype=tf.int32, name='input_ids')
        input_attention_mask = Input(shape=(self.seq_len,), dtype=tf.int32,
                                     name='attention_mask')
        outputs = encoder({'input_ids': input_ids,
                           'attention_mask': input_attention_mask})

        final_model = Model(inputs=[input_ids, input_attention_mask],
                            outputs=outputs)

        final_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=tf.metrics.RootMeanSquaredError(),
            )
        return final_model

    def train_model(self):
        """Train one model per fold and collect OOF and test predictions.

        Each fold writes its best weights to `model_{fold}.h5`, reloads them,
        and appends to `history_list`, `oof_labels`, `oof_pred` and
        `test_pred`.
        """
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=0)

        for fold, (idxT, idxV) in enumerate(kfold.split(self.df_data)):

            K.clear_session()

            # KFold yields positional indices, so rows must be taken with .iloc;
            # .loc is label based and only matches for a default RangeIndex.
            train_df = self.df_data.iloc[idxT]
            valid_df = self.df_data.iloc[idxV]

            encoder = TFAutoModelForSequenceClassification.from_pretrained(
                                        self.model_path, num_labels=1)
            model = self.create_model(encoder)

            model_path = f'model_{fold}.h5'
            es = EarlyStopping(monitor='val_root_mean_squared_error',
                               mode='min', patience=5,
                               restore_best_weights=True, verbose=1)
            checkpoint = ModelCheckpoint(model_path,
                                         monitor='val_root_mean_squared_error',
                                         mode='min', save_best_only=True,
                                         save_weights_only=True)

            train_ds = self.get_dataset(train_df, repeated=True, is_sampled=True)
            # Built once and reused for both validation and OOF prediction, so
            # the fold is not tokenized twice.
            valid_ds = self.get_dataset(valid_df, ordered=True)

            history = model.fit(
                x=train_ds,
                validation_data=valid_ds,
                steps_per_epoch=50, callbacks=[es, checkpoint], epochs=self.epochs,
                verbose=2).history

            self.history_list.append(history)

            model.load_weights(model_path)

            # OOF predictions; valid_ds is ordered, so its rows line up with valid_df.
            self.oof_labels.append(valid_df['target'].to_numpy())
            x_oof = valid_ds.map(lambda sample, target: sample)
            self.oof_pred.append(model.predict(x_oof)['logits'])

            # Test predictions
            test_ds = self.get_dataset(self.df_test, labeled=False, ordered=True)
            self.test_pred.append(model.predict(test_ds)['logits'])

            # Drop references to the fold's model and datasets before the next fold.
            del encoder, history, test_ds, train_ds, valid_ds, x_oof, model
            gc.collect()

    def get_oof_results(self):
        """Print the best validation RMSE of every fold and the overall OOF RMSE.

        Returns:
            The RMSE between all out-of-fold predictions and their targets.

        Raises:
            ValueError: if no fold has been trained yet.
        """
        if not self.oof_pred:
            raise ValueError("No out-of-fold predictions available; call train_model() first.")

        y_true = np.concatenate(self.oof_labels)
        y_preds = np.concatenate(self.oof_pred)

        for fold, history in enumerate(self.history_list):
            print(f"FOLD {fold+1} RMSE: {np.min(history['val_root_mean_squared_error']):.4f}")

        # Take the root explicitly: the `squared=False` shortcut is deprecated
        # and later removed in newer scikit-learn releases.
        oof_rmse = float(np.sqrt(mean_squared_error(y_true, y_preds)))
        print(f'OOF RMSE: {oof_rmse:.4f}')
        return oof_rmse

    def get_submission(self):
        """Returns submission df

        The frame has the `id` column of `df_test` and a `target` column
        holding the mean of the per-fold test predictions.

        Raises:
            ValueError: if no fold has been trained yet.
        """
        if not self.test_pred:
            raise ValueError("No test predictions available; call train_model() first.")

        # Copy so the new column is written to an independent frame rather than
        # to a slice of df_test (avoids SettingWithCopyWarning).
        submission = self.df_test[['id']].copy()
        # The fold average may carry a trailing singleton axis; flatten to 1-D.
        submission['target'] = np.mean(self.test_pred, axis=0).reshape(-1)
        return submission

In [None]:
# OOF RMSE and submission frame per experiment, keyed by "<model>_<remove_stopwords>".
result_dict = {}
submission_dict = {}

model_path_dict = {
    'bert':'/kaggle/input/huggingface-bert/bert-base-uncased/',
    'roberta':'/kaggle/input/huggingface-roberta/roberta-base/',
    'distil':'/kaggle/input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased/'
}

# The input frames are the same for every experiment, so read them once
# instead of once per model.
data_dir = '../input/commonlitreadabilityprize'
df_train = pd.read_csv('{}/train.csv'.format(data_dir))
df_test = pd.read_csv('{}/test.csv'.format(data_dir))

for remove_stopwords in [False]:
    for model_name, model_path in model_path_dict.items():
        # Build the config directly from the loop variables rather than
        # filling in placeholder values and overwriting them afterwards.
        model_config = {'model_path': model_path,
                        'batch_size':32,'seq_len': 256,
                        'text_column':'excerpt','n_folds':5,'epochs':30,
                        'remove_swords': remove_stopwords}
        run_key = '{}_{}'.format(model_name, remove_stopwords)

        b = BERTModel(df_train, df_test, model_config)
        b.train_model()
        result_dict[run_key] = b.get_oof_results()
        submission_dict[run_key] = b.get_submission()
        # Release the trained wrapper before loading the next encoder.
        del b
        gc.collect()

In [None]:
# Bar chart of the out-of-fold RMSE per experiment, followed by the raw numbers.
plt.rcParams["figure.figsize"] = (10, 3)
experiment_names = list(result_dict.keys())
oof_scores = list(result_dict.values())
plt.bar(experiment_names, oof_scores)
plt.title('Out of fold, RMSE')
print(result_dict)

In [None]:
# Preview the first rows of every per-model submission frame.
for model_key, model_submission in submission_dict.items():
    print(model_submission.head())

In [None]:
# Ensemble: average the test predictions of every trained model.
if not submission_dict:
    raise ValueError("submission_dict is empty; train at least one model first")

target = None
for submission_df in submission_dict.values():
    model_target = submission_df['target'].values
    # Copy the first array so accumulating never writes into a stored submission.
    target = model_target.copy() if target is None else target + model_target
# Divide by the number of models actually present instead of a hard-coded 3,
# so the mean stays correct when models are added or removed.
target /= len(submission_dict)

In [None]:
# Build the final frame from a copy of the ids. This keeps the per-model frames
# in submission_dict untouched (so the averaging cell can be re-run safely) and
# avoids relying on a loop variable left over from an earlier cell.
final_submission = next(iter(submission_dict.values()))[['id']].copy()
final_submission['target'] = target
final_submission.to_csv("submission.csv", index=False)