# CommonLit Readability: Basic EDA and RoBERTa-base


## Import required modules

In [None]:
import re
import time
import nltk
import random
import warnings
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from transformers import TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer

def seed_everything(seed=0):
    """Seed every source of randomness this notebook relies on.

    Exports ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` to the process
    environment and seeds Python's ``random``, NumPy and TensorFlow.

    Args:
        seed: Integer seed applied to all generators (default 0).
    """
    # Environment flags first; they are independent of the seeding calls.
    os.environ.update({'PYTHONHASHSEED': str(seed),
                       'TF_DETERMINISTIC_OPS': '1'})

    # Library-level generators, all fed the same seed.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Single seed for the whole notebook; it is also reused as the K-Fold random_state.
seed = 0
seed_everything(seed)
# Suppress all Python warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')

## Load dataset

In [None]:
# Read the competition train and test CSV files into DataFrames.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Print the (rows, columns) shape of each frame.
print(train.shape)
print(test.shape)

In [None]:
# Count missing values per column of the training data.
train.isnull().sum()

We don't have any missing values in the columns of our interest, i.e., excerpt, target and standard_error!

## Target : 
Our target variable ranges from about -3.67 (the most difficult excerpt to read) to 1.71 (the easiest excerpt to read).

In [None]:
# Locate the extreme rows by label rather than hard-coding their index values,
# so this cell keeps working if the rows of the training file change order.
min_target_idx = train['target'].idxmin()
max_target_idx = train['target'].idxmax()

print('Min Target Value = ', train['target'].min())
print('\nText : ', train.loc[min_target_idx, 'excerpt'])

print('\n\nMax Target Value = ', train['target'].max())
print('\nText : ', train.loc[max_target_idx, 'excerpt'])

In [None]:
# Plot the distribution of the target column with seaborn and label the axes.
sns.distplot(train['target'])
plt.title('Target Distribution', size=15)
plt.xlabel('Value')
plt.ylabel('Frequency')

## Let's see standard error

In [None]:
# Locate the rows with the smallest / largest standard error by label rather
# than hard-coding their index values, so the cell survives a reordered file.
min_se_idx = train['standard_error'].idxmin()
max_se_idx = train['standard_error'].idxmax()

print('Min Standard Error : ', train['standard_error'].min())
print('Target Value : ', train.loc[min_se_idx, 'target'])

print('\nText : ', train.loc[min_se_idx, 'excerpt'])

print('\n\nMax Standard Error : ', train['standard_error'].max())
print('Target Value : ', train.loc[max_se_idx, 'target'])

print('\nText : ', train.loc[max_se_idx, 'excerpt'])

In [None]:
# Plot the distribution of the standard_error column in red and label the axes.
sns.distplot(x=train['standard_error'], color='red')
plt.title('Standard Error Distribution', size=15)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

We can see that the standard error has outliers.

## Target vs Standard error

In [None]:
# Scatter target (x) against standard_error (y); marker size also encodes standard_error.
plt.figure(figsize=(10, 5))
sns.scatterplot(x=train['target'], y=train['standard_error'], color='black', size=train['standard_error'])
plt.title('Target vs Standard Error', size=15)
plt.show()

We can see that only one outlier is present.

## Data preprocessing

In [None]:
def clean_data(data):
    """Return a cleaned version of every excerpt in ``data``.

    Each text in ``data['excerpt']`` has every non-letter character replaced
    by a space, is lower-cased, tokenized with NLTK, stripped of English stop
    words, lemmatized with the WordNet lemmatizer and re-joined with single
    spaces.

    Args:
        data: Object indexable by ``'excerpt'`` that yields strings
            (the notebook passes the training DataFrame).

    Returns:
        list: One cleaned string per excerpt, in input order.
    """
    # Built once up front: loading the stop-word list for every token and
    # creating a lemmatizer for every excerpt is needlessly slow, and a set
    # gives constant-time membership checks.
    stop_words = set(stopwords.words('english'))
    lemma = nltk.WordNetLemmatizer()

    cleaned_excerpt = []
    for text in data['excerpt']:
        text = re.sub('[^a-zA-Z]', ' ', text).lower()
        tokens = [word for word in nltk.word_tokenize(text) if word not in stop_words]
        cleaned_excerpt.append(' '.join(lemma.lemmatize(word) for word in tokens))
    return cleaned_excerpt

In [None]:
# Store the cleaned text next to the raw excerpt; the n-gram and word-cloud cells read this column.
train['cleaned_excerpt'] = clean_data(train)

In [None]:
# Preview the training data again, now including the cleaned_excerpt column.
train.head()

## Let's plot top unigrams, bigrams and trigrams

In [None]:
def top_n_ngrams(corpus, n_gram=(1, 1), n=None):
    """Return the most frequent n-grams of ``corpus`` with their counts.

    Args:
        corpus: Iterable of documents passed to ``CountVectorizer``.
        n_gram: ``ngram_range`` tuple forwarded to ``CountVectorizer``.
        n: Number of entries to keep; ``None`` keeps all of them.

    Returns:
        list: ``(term, count)`` tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=n_gram)
    # Column-wise totals over the document-term matrix.
    totals = vectorizer.fit_transform(corpus).sum(axis=0)

    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Top 20 one-, two- and three-word sequences of the cleaned excerpts.
unigrams = top_n_ngrams(train['cleaned_excerpt'], n_gram = (1, 1), n=20)
bigrams = top_n_ngrams(train['cleaned_excerpt'], n_gram = (2, 2), n=20)
trigrams = top_n_ngrams(train['cleaned_excerpt'], n_gram = (3, 3), n=20)

In [None]:
def create_dataframe(data):
    """Turn a sequence of ``(word, freq)`` pairs into a two-column DataFrame.

    Args:
        data: Iterable whose items expose the word at position 0 and its
            frequency at position 1.

    Returns:
        pandas.DataFrame: Columns ``'word'`` and ``'freq'``, one row per item.
    """
    pairs = list(data)
    columns = {
        'word': [pair[0] for pair in pairs],
        'freq': [pair[1] for pair in pairs],
    }
    return pd.DataFrame(columns)

# Convert each (word, freq) list into a DataFrame for plotting.
uni_df = create_dataframe(unigrams)
bi_df = create_dataframe(bigrams)
tri_df = create_dataframe(trigrams)

In [None]:
# Three stacked horizontal bar charts: one per n-gram size, frequency on the x axis.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 20))
sns.barplot(x='freq', y='word', color='#00e6b8', data=uni_df, ax= ax1)
sns.barplot(x='freq', y='word', color='#ff5050', data=bi_df, ax= ax2)
sns.barplot(x='freq', y='word', color='#e600e6', data=tri_df, ax= ax3)

ax1.set_title('Top 20 Uni-grams', size=12)
ax2.set_title('Top 20 Bi-grams', size=12)
ax3.set_title('Top 20 Tri-grams', size=12)
plt.show()

## WordCloud

In [None]:
# Word cloud over all cleaned excerpts joined into one string (at most 150 words, fixed random_state).
plt.figure(figsize=(10, 10))
wc = WordCloud(stopwords=STOPWORDS,background_color="white", contour_width=2, contour_color='blue',
               width=1500, height=750,max_words=150, max_font_size=256,random_state=42)

wc.generate(' '.join(train['cleaned_excerpt']))
plt.imshow(wc)
# Hide the axes; only the rendered cloud image is of interest.
plt.axis('off')
plt.show()

## Hardware configuration for TPU

In [None]:
# Detect hardware, return appropriate distribution strategy

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f"Running on TPU {tpu.master()}")
except ValueError:
    # The resolver raised: no TPU is used and the default strategy is selected below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental name of the TPU strategy.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# Same spelling as the tf.data.AUTOTUNE references used by the dataset helpers.
AUTO = tf.data.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS : {REPLICAS}')

## Model Parameters

In [None]:
# Batch size and learning rate are both scaled by the number of replicas in sync.
BATCH_SIZE = 8 * REPLICAS
LEARNING_RATE = 1e-5 * REPLICAS
# Maximum number of epochs passed to model.fit for each fold.
EPOCHS = 35
# Patience of the EarlyStopping callback.
ES_PATIENCE = 7
# NOTE(review): PATIENCE is not referenced anywhere in the visible code — confirm it is still needed.
PATIENCE = 2
# Number of K-Fold splits.
N_FOLDS = 5
# Tokenizer max_length and model input length.
SEQ_LEN = 256
# Path handed to from_pretrained for both the tokenizer and the encoder.
BASE_MODEL = '/kaggle/input/huggingface-roberta/roberta-base/'

## Auxiliary functions

In [None]:
def custom_standardization(text):
    """Return ``text`` lower-cased with leading and trailing whitespace removed."""
    return text.lower().strip()

def sample_target(features, target):
    """Swap a ``(mean, stddev)`` target pair for one draw from that normal distribution.

    Args:
        features: Model inputs, returned unchanged.
        target: Two-element structure unpacked as ``(mean, stddev)``.

    Returns:
        tuple: ``(features, sampled_target)`` where ``sampled_target`` is a
        scalar float32 tensor drawn with ``tf.random.normal``.
    """
    mu, sigma = target
    mu = tf.cast(mu, dtype=tf.float32)
    sigma = tf.cast(sigma, dtype=tf.float32)
    draw = tf.random.normal([], mean=mu, stddev=sigma, dtype=tf.float32)
    return (features, draw)

def get_dataset(df, tokenizer, labeled=True, ordered=False, repeated=False, is_sampled=False, batch_size=32, seq_len=128):
    """
        Build a batched, prefetched TensorFlow dataset from the excerpts in ``df``.

        Args:
            df: Frame with an 'excerpt' column; 'target' and 'standard_error'
                are also read when ``labeled`` is true.
            tokenizer: Callable tokenizer returning 'input_ids' and
                'attention_mask' tensors.
            labeled: Attach ``(target, standard_error)`` as the label element.
            ordered: Skip shuffling when true.
            repeated: Repeat the dataset indefinitely when true.
            is_sampled: Map ``sample_target`` over labeled elements.
            batch_size: Number of elements per batch.
            seq_len: Length every excerpt is truncated / padded to.

        Returns:
            The resulting ``tf.data.Dataset``.
    """
    excerpts = [custom_standardization(raw) for raw in df['excerpt']]
    encoded = tokenizer(excerpts, max_length=seq_len, truncation=True, padding='max_length', return_tensors='tf')
    features = {'input_ids': encoded['input_ids'],
                'attention_mask': encoded['attention_mask']}

    if not labeled:
        dataset = tf.data.Dataset.from_tensor_slices(features)
    else:
        labels = (df['target'], df['standard_error'])
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        if is_sampled:
            dataset = dataset.map(sample_target, num_parallel_calls=tf.data.AUTOTUNE)

    # Repeat first, then shuffle with a 1024-element buffer, then batch.
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

## Model

In [None]:
def model_fn(encoder, seq_len=256):
    """Wrap ``encoder`` in a compiled Keras model with fixed-length inputs.

    Args:
        encoder: Callable model that accepts a dict with 'input_ids' and
            'attention_mask' and returns the outputs to expose.
        seq_len: Length of both integer input sequences.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate taken from the
        module-level ``LEARNING_RATE``), mean-squared-error loss and a
        root-mean-squared-error metric.
    """
    input_ids = L.Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
    input_attention_mask = L.Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')

    outputs = encoder({'input_ids': input_ids,
                      'attention_mask': input_attention_mask})

    model = Model(inputs=[input_ids, input_attention_mask], outputs=outputs)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer,
                 loss = losses.MeanSquaredError(),
                 metrics=[metrics.RootMeanSquaredError()])
    return model
# Build one model inside the strategy scope so its summary can be printed;
# the training loop creates a fresh encoder and model for every fold.
with strategy.scope():
    encoder = TFAutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
    model = model_fn(encoder, SEQ_LEN)
    
model.summary()

## Training

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
skf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
# Per-fold accumulators: out-of-fold predictions and labels, training
# histories, and test-set predictions.
oof_pred = []
oof_labels = []
history_list = []
test_pred = []

for fold, (idxT, idxV) in enumerate(skf.split(train)):
    if tpu:
        # Re-initialise the TPU system at the start of every fold.
        tf.tpu.experimental.initialize_tpu_system(tpu)
    print(f'\nFOLD: {fold+1}')
    print(f'Train: {len(idxT)} Valid: {len(idxV)}')

    # Start each fold from a fresh pretrained encoder.
    K.clear_session()
    with strategy.scope():
        encoder = TFAutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)
        model = model_fn(encoder, SEQ_LEN)

    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc would only be correct while the frame keeps its default 0..n-1 index.
    train_fold = train.iloc[idxT]
    valid_fold = train.iloc[idxV]

    # The validation dataset is ordered and unsampled, so it is built once
    # and reused for fitting and for the out-of-fold predictions.
    valid_ds = get_dataset(valid_fold, tokenizer, ordered=True,
                           batch_size=BATCH_SIZE, seq_len=SEQ_LEN)

    model_path = f'model_{fold}.h5'
    es = EarlyStopping(monitor='val_root_mean_squared_error',
                       mode='min', patience=ES_PATIENCE,
                       restore_best_weights=True, verbose=1)
    checkpoint = ModelCheckpoint(model_path,
                                 monitor='val_root_mean_squared_error',
                                 mode='min', save_best_only=True,
                                 save_weights_only=True)

    # The training dataset repeats indefinitely, so steps_per_epoch defines
    # the length of an epoch.
    history = model.fit(x=get_dataset(train_fold,
                                      tokenizer, repeated=True, is_sampled=True,
                                      batch_size=BATCH_SIZE, seq_len=SEQ_LEN),
                        validation_data=valid_ds,
                        steps_per_epoch=50,
                        callbacks=[es, checkpoint],
                        epochs=EPOCHS,
                        verbose=2).history
    history_list.append(history)
    # Reload the checkpoint with the best validation RMSE before predicting.
    model.load_weights(model_path)

    print(f"#### Fold {fold+1} OOF RMSE = {np.min(history['val_root_mean_squared_error']):.4f}")

    # Labels are (target, standard_error) pairs; keep only the target.
    oof_labels.append([target[0].numpy() for sample, target in iter(valid_ds.unbatch())])
    x_oof = valid_ds.map(lambda sample, target: sample)

    oof_pred.append(model.predict(x_oof)['logits'])

    test_ds = get_dataset(test, tokenizer, labeled=False, ordered=True, batch_size=BATCH_SIZE, seq_len=SEQ_LEN)
    x_test = test_ds.map(lambda sample: sample)
    test_pred.append(model.predict(x_test)['logits'])

## Model loss and metrics graph

In [None]:
def plot_metrics(history):
    """Plot each training metric together with its validation counterpart.

    The keys of ``history`` are split in half by position: the first half is
    treated as training metrics and the key ``size`` positions later as the
    matching validation metric. One subplot is drawn per pair.

    Args:
        history: Mapping from metric name to a sequence of per-epoch values.
    """
    metric_list = list(history.keys())
    size = len(metric_list) // 2
    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # when there is only a single metric pair (otherwise a bare Axes is returned).
    fig, axes = plt.subplots(size, 1, sharex='col', figsize=(20, size*5), squeeze=False)
    axes = axes.flatten()

    for index in range(size):
        metric_name = metric_list[index]
        val_metric_name = metric_list[index+size]
        axes[index].plot(history[metric_name], label='Train %s ' % metric_name)
        axes[index].plot(history[val_metric_name], label='Validation %s' % metric_name)
        axes[index].legend(loc='best', fontsize=16)
        axes[index].set_title(metric_name)

    plt.xlabel('Epochs', fontsize=16)
    sns.despine()
    plt.show()
    
# Draw the training curves of every fold, numbering folds from 1.
for fold, history in enumerate(history_list):
    print(f'Fold : {fold+1}')
    plot_metrics(history)

## Model evaluation
> We evaluate the model on the OOF (out-of-fold) predictions. Because we train with K-Fold, every sample is used for training in some fold, so the correct way to evaluate is to score each sample with the model of the fold in which that sample was held out for validation.

## OOF Metrics

In [None]:
# Stack the per-fold labels and predictions into single arrays.
y_true = np.concatenate(oof_labels)
y_preds = np.concatenate(oof_pred)

# Best validation RMSE recorded during each fold's training.
for fold, history in enumerate(history_list):
    print(f"Fold {fold+1} RMSE : {np.min(history['val_root_mean_squared_error']):.4f}")

# Take the square root explicitly instead of passing squared=False, which
# newer scikit-learn releases deprecate and then remove.
print(f"OOF RMSE: {np.sqrt(mse(y_true, y_preds)):.4f}")

## Error analysis, label x prediction distribution
Here we compare the distribution of the labels with the distribution of the predicted values; in a perfect scenario they would align.

In [None]:
# Pair each out-of-fold label with its prediction (first column of y_preds).
preds_df = pd.DataFrame({'Label': y_true, 'Prediction': y_preds[:, 0]})

# Overlay both distributions on the same axes for comparison.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
sns.distplot(preds_df['Label'], ax=ax, label='Label')
sns.distplot(preds_df['Prediction'], ax=ax, label='Prediction')
ax.legend()
plt.show()

In [None]:
# Joint plot of label against prediction, drawn with kind='reg'.
sns.jointplot(data=preds_df, x='Label', y='Prediction', kind='reg', height=10)
plt.show()

## Make Submission

In [None]:
# Copy the id column so that adding 'target' writes to an independent frame
# instead of a slice of `test` (avoids pandas' chained-assignment hazard).
submission = test[['id']].copy()
# Average the per-fold test predictions element-wise, then flatten the
# resulting single-column array into one value per test row.
submission['target'] = np.mean(test_pred, axis=0).ravel()
submission

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

Reference - https://www.kaggle.com/dimitreoliveira/commonlit-readability-eda-roberta-tf-baseline