In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from glob import glob
import gc

import random

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from tqdm import tqdm, tqdm_notebook
# Enable tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()

# Global figure defaults used by every plot in this notebook.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names. Use the new name when it is installed
# and fall back to the legacy one on older Matplotlib versions.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
sns.set_palette('Set2')

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

import tensorflow.keras.backend as K

import datetime
import os
import warnings
from time import time, strftime, gmtime

# Show what is available under the input directory.
print(os.listdir('../input/'))

# Ignore all warnings raised from here on.
warnings.simplefilter('ignore')

# Reference point for the total-runtime report at the end of the notebook,
# followed by a timestamp of when this run started.
start = time()
print(datetime.datetime.now())

In [None]:
def seed_everything(seed = 0):
    """Seed the random number generators used in this notebook.

    Exports `PYTHONHASHSEED` and `TF_DETERMINISTIC_OPS` as environment
    variables and seeds Python's `random`, NumPy and TensorFlow.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


seed = 777
seed_everything(seed)

In [None]:
# Directory the competition CSV files (train, test, sample submission) are read from.
base_dir = '../input/commonlitreadabilityprize/'

In [None]:
# Load the training split, report its shape and preview the first rows.
train = pd.read_csv(f'{base_dir}train.csv')
print(train.shape)
train.head()

In [None]:
# Load the test split, report its shape and preview the first rows.
test = pd.read_csv(f'{base_dir}test.csv')
print(test.shape)
test.head()

In [None]:
# Load the sample submission; its `target` column is overwritten with predictions later.
sub = pd.read_csv(f'{base_dir}sample_submission.csv')
print(sub.shape)
sub.head()

In [None]:
# Count distinct ids in each split.
n_train_ids = train['id'].nunique()
n_test_ids = test['id'].nunique()
print(f"Number of unique id in trainset: {n_train_ids}")
print(f"Number of unique id in testset: {n_test_ids}")

<code>__Distribution of Target__</code>

In [None]:
# Density of the regression target with its mean and median marked.
target_values = train['target']
target_ax = sns.kdeplot(target_values, shade = True, color = 'green')
for line_label, position, line_color in (
    ('Mean', target_values.mean(), 'r'),
    ('Median', target_values.median(), 'b'),
):
    target_ax.axvline(position, label = line_label, color = line_color, linewidth = 1, linestyle = '--')
target_ax.legend();

<code>__Distribution of Standard Error__</code>

- The standard error is only available for the training set; it is not provided for the test set.

In [None]:
# Density of the target's standard error with its mean and median marked.
stderr_values = train['standard_error']
stderr_ax = sns.kdeplot(stderr_values, shade = True, color = 'grey')
for line_label, position, line_color in (
    ('Mean', stderr_values.mean(), 'r'),
    ('Median', stderr_values.median(), 'b'),
):
    stderr_ax.axvline(position, label = line_label, color = line_color, linewidth = 1, linestyle = '--')
stderr_ax.legend();

In [None]:
# Add two length features to both splits:
#   excerpt_len     - number of characters in the excerpt
#   excerpt_wordlen - number of whitespace-separated words in the excerpt
# Words are split with `str.split()` (any run of whitespace). Splitting on a
# single ' ' would merge words separated only by a newline and would count an
# empty "word" for every repeated space.
for frame in (train, test):
    frame['excerpt_len'] = frame['excerpt'].apply(lambda x: len(str(x)))
    frame['excerpt_wordlen'] = frame['excerpt'].apply(lambda x: len(str(x).split()))

In [None]:
# Word-count range of the excerpts, reported separately for each split.
print(f"Max. word length in train - Excerpt: {train['excerpt_wordlen'].max()}")
print(f"Min. word length in train - Excerpt: {train['excerpt_wordlen'].min()}")
print()
# These two lines report the test split, so they must be labelled "test".
print(f"Max. word length in test - Excerpt: {test['excerpt_wordlen'].max()}")
print(f"Min. word length in test - Excerpt: {test['excerpt_wordlen'].min()}")

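- The maximum excerpt length (in words) helps when choosing the tokenizer's `max_len`. Keep in mind that the tokenizer splits words into subword tokens, so the token count is usually higher than the word count.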

<code>__Distribution of Text Lengths__</code>

In [None]:
# Side-by-side distributions of the training excerpts' character and word counts.
fig, (ax_chars, ax_words) = plt.subplots(1, 2)

sns.distplot(train['excerpt_len'], bins = 50, ax = ax_chars)
ax_chars.set_title('Train Character Length')

sns.distplot(train['excerpt_wordlen'], bins = 50, ax = ax_words)
ax_words.set_title('Train Word Length');

In [None]:
# Side-by-side distributions of the test excerpts' character and word counts.
fig, (ax_chars, ax_words) = plt.subplots(1, 2)

sns.distplot(test['excerpt_len'], bins = 50, ax = ax_chars)
ax_chars.set_title('Test Character Length')

sns.distplot(test['excerpt_wordlen'], bins = 50, ax = ax_words)
ax_words.set_title('Test Word Length');

In [None]:
# Assign every training row to one of 5 folds, stratified on the target after
# cutting it into 6 bins.
train['bins'] = pd.cut(train['target'], bins = 6, labels = False)
train['folds'] = -1  # placeholder until each row receives its fold number

skf = StratifiedKFold(n_splits = 5)
fold_splits = skf.split(X = train['excerpt'], y = train['bins'].values)

for fold_number, (_, holdout_rows) in enumerate(fold_splits):
    train.loc[holdout_rows, 'folds'] = fold_number

# The bins were only needed for stratification.
del train['bins']
train.head()

# RoBERTa

In [None]:
import tokenizers
from transformers import RobertaConfig, TFRobertaModel
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

# Token length used for every encoded excerpt; also the width of the model's input layers.
max_len = 256

# Folder from which build_roberta loads the RoBERTa config and pretrained weights.
roberta_path = '../input/tf-roberta/'

In [None]:
# Tokenizer used to encode the excerpts, loaded from the local roberta-base directory.
tok = RobertaTokenizer.from_pretrained('../input/roberta-base')
tok.vocab_size

In [None]:
def roberta_encode(texts, tokenizer, max_len = max_len):
    """Convert raw texts into fixed-length RoBERTa input arrays.

    Each text is encoded as `<s> ...word tokens... </s>`, truncated to at most
    `max_len` tokens and padded with `<pad>` up to exactly `max_len`.
    RoBERTa does not use token_type_ids like BERT does, so none are created.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Tokenizer exposing `encode_plus` (e.g. a `RobertaTokenizer`).
        max_len: Number of token positions in every encoded row.

    Returns:
        Tuple `(all_tokens, all_masks)` of int32 arrays, each of shape
        `(len(texts), max_len)`: the token ids and the attention mask.
    """
    # Special token ids for reference:
    #   bos_token_id <s>: 0, eos_token_id / sep_token_id </s>: 2, pad_token_id <pad>: 1
    # Rows start out as "all padding" (<pad> id 1, mask 0) and are then
    # overwritten with the encoded text.
    all_tokens = np.ones((len(texts), max_len), dtype = 'int32')
    all_masks = np.zeros((len(texts), max_len), dtype = 'int32')

    for k, text in enumerate(texts):
        # Use the tokenizer passed in by the caller rather than a notebook
        # global, so the function honours its own argument.
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = max_len,
            # Explicit padding / truncation: `pad_to_max_length` is deprecated,
            # and without `truncation` long texts are only cut through a
            # legacy fallback that emits a warning.
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
        )

        all_tokens[k, :max_len] = encoded['input_ids']
        all_masks[k, :max_len] = encoded['attention_mask']
    return all_tokens, all_masks

In [None]:
def build_roberta(max_len = max_len):
    """Build and compile the RoBERTa regression model.

    The pretrained `TFRobertaModel` is loaded from `roberta_path` and followed
    by dropout and a single linear unit. The head is applied to the model's
    first output without any pooling, so the network emits one value per token
    position and the caller is expected to reduce over the token axis.

    Args:
        max_len: Number of token positions in each input row.

    Returns:
        A compiled `tf.keras.Model` taking `[input_word_ids, input_mask]`
        (int32, shape `(batch, max_len)` each), trained with mean squared
        error and reporting root mean squared error.
    """
    input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_word_ids")
    input_mask = Input(shape = (max_len,), dtype = tf.int32, name = "input_mask")

    config = RobertaConfig.from_pretrained(roberta_path + 'config-roberta-base.json')

    roberta_model = TFRobertaModel.from_pretrained(roberta_path + 'pretrained-roberta-base.h5',
                                                   config = config)

    # First output of the transformer (per-token hidden states).
    # NOTE(review): padded positions are scored like real tokens here;
    # consider pooling (e.g. the first token) before the head - confirm intent.
    x = roberta_model([input_word_ids, input_mask])[0]

    x = tf.keras.layers.Dropout(0.2)(x)

    out = tf.keras.layers.Dense(1, activation = 'linear')(x)

    model = Model(inputs = [input_word_ids, input_mask], outputs = out)

    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases. Metrics are passed as a list, the
    # documented form.
    model.compile(Adam(learning_rate = 1e-5), loss = tf.keras.losses.MeanSquaredError(),
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])

    return model

In [None]:
# --- Encode the training data --------------------------------------------
ytrain = train['target'].values
Xtrain = roberta_encode(train['excerpt'].values, tok, max_len = max_len)
print(Xtrain[0].shape, ytrain.shape)

# --- Build a fresh model -------------------------------------------------
K.clear_session()
model = build_roberta(max_len = max_len)

# Checkpoint only the weights of the epoch with the best validation loss.
weights_file = 'roberta_model.h5'
check = ModelCheckpoint(
    weights_file,
    monitor = 'val_loss',
    mode = 'auto',
    save_best_only = True,
    save_weights_only = True,
    save_freq = 'epoch',
    verbose = 1,
)

# --- Train with a 20% validation split -----------------------------------
history = model.fit(
    Xtrain, ytrain,
    validation_split = 0.2,
    epochs = 4,
    batch_size = 8,
    callbacks = [check],
    verbose = 1,
)

# --- Restore the best checkpoint and predict the test set ----------------
print('Loading model...')
model.load_weights(weights_file)

Xtest = roberta_encode(test['excerpt'].values, tok, max_len = max_len)
print(Xtest[0].shape)

print('Predicting Test...')
preds = model.predict(Xtest, verbose = 1)


In [None]:
# Average the raw predictions over axis 1 to get one value per test row,
# then write the submission file and display it.
sub['target'] = preds.mean(axis = 1)
sub.to_csv('./submission.csv', index = False)
sub

In [None]:
# Compare the training target distribution (left) with the predicted one (right).
fig, (ax_true, ax_pred) = plt.subplots(1, 2)

panels = (
    (ax_true, train['target'], 'green', 'Train Target'),
    (ax_pred, sub['target'], 'blue', 'Predicted Target'),
)
for panel_ax, values, fill_color, panel_title in panels:
    sns.kdeplot(values, shade = True, color = fill_color, ax = panel_ax)
    panel_ax.axvline(values.mean(), label = 'Mean', color = 'r', linewidth = 1, linestyle = '--')
    panel_ax.axvline(values.median(), label = 'Median', color = 'b', linewidth = 1, linestyle = '--')
    panel_ax.legend()
    panel_ax.set_title(panel_title)

In [None]:
# Report the notebook's total wall-clock runtime as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))