In [None]:
# --- Run configuration -------------------------------------------------------
# Token length every excerpt is padded/truncated to by the tokenizer.
MAX_LEN = 250
# Number of examples per batch in the tf.data pipeline.
batch_size = 8 
# Learning rate handed to the Adam optimizer.
LR =  1.5e-5

# Hugging Face checkpoint name used for both the tokenizer and the MLM model.
PRETRAINED_MODEL = 'roberta-large'
# Directory that holds the competition's train.csv / test.csv files.
D = '../input/commonlitreadabilityprize/'


import os
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.optimizers import Adam
import transformers
from transformers import TFAutoModelWithLMHead, AutoTokenizer
import logging
# Keep log output quiet. On the root logger NOTSET means "process every
# record", which is the opposite of what is wanted here, so raise the
# threshold to ERROR to drop DEBUG/INFO/WARNING noise.
logging.getLogger().setLevel(logging.ERROR)

# AUTOTUNE sentinel; passed to Dataset.prefetch() further down so tf.data
# chooses the prefetch buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Read the training split from the configured data directory `D` rather than
# repeating the path literal, so the location is defined in exactly one place.
train_df = pd.read_csv(os.path.join(D, 'train.csv'))
train_df.head(5)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Read the test split from the configured data directory `D` rather than
# repeating the path literal, so the location is defined in exactly one place.
test_df = pd.read_csv(os.path.join(D, 'test.csv'))

In [None]:
# Store len() of every excerpt as a new column on the training frame.
train_df['excerpt_len'] = train_df['excerpt'].map(len)

# Mean excerpt length (shown as the cell output).
train_df['excerpt_len'].to_numpy().mean()

In [None]:
%%time
MAX_LEN = 250


def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    """Tokenize ``texts`` and return their token ids as one NumPy array.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Object exposing ``batch_encode_plus``; it is called with
            ``padding='max_length'`` and ``truncation=True``.
        maxlen: Value forwarded as ``max_length`` to the tokenizer.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the tokenizer
        output. Any other entries the tokenizer returns are discarded.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    return np.array(encoded['input_ids'])
    

    

# Load the tokenizer belonging to the pretrained checkpoint, then turn the
# train and test excerpts into token-id arrays padded/truncated to MAX_LEN.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
X_train = regular_encode(train_df.excerpt.tolist(), tokenizer, maxlen=MAX_LEN)
X_test = regular_encode(test_df.excerpt.tolist(), tokenizer, maxlen=MAX_LEN)


In [None]:
%%time
def prepare_mlm_input_and_labels(X, mask_token_id=None, vocab_size=None, mask_prob=0.2):
    """Build masked-language-model inputs and labels from token ids.

    Every position whose id is greater than 2 is selected with probability
    ``mask_prob``. Of the selected positions roughly 80% are replaced by the
    mask token, 10% by a random token id and 10% are left unchanged.

    Args:
        X: Integer NumPy array of token ids. It is not modified.
        mask_token_id: Id written at masked positions. Defaults to
            ``tokenizer.mask_token_id`` of the notebook-level tokenizer.
        vocab_size: Exclusive upper bound for the random replacement ids.
            Defaults to ``tokenizer.vocab_size`` of the notebook-level
            tokenizer.
        mask_prob: Probability that a non-special position is selected.

    Returns:
        Tuple ``(X_mlm, labels)``. ``X_mlm`` is a corrupted copy of ``X``.
        ``labels`` has the same shape and holds the original id at selected
        positions and -1 everywhere else (-1 marks positions to ignore).

    Raises:
        ValueError: If ``mask_token_id`` is not inside ``[0, vocab_size)``.
    """
    # Take the ids from the tokenizer instead of hard-coding them: a literal
    # such as 250001 only fits a 250002-entry vocabulary and indexes past the
    # embedding matrix of any smaller checkpoint.
    if mask_token_id is None:
        mask_token_id = tokenizer.mask_token_id
    if vocab_size is None:
        vocab_size = tokenizer.vocab_size
    if not 0 <= mask_token_id < vocab_size:
        raise ValueError(
            "mask_token_id %r is outside the vocabulary range [0, %r)"
            % (mask_token_id, vocab_size)
        )

    # Select the positions that take part in the MLM objective.
    inp_mask = np.random.rand(*X.shape) < mask_prob
    # Never select ids 0-2 (the special tokens, e.g. <s>, <pad>, </s> in
    # RoBERTa vocabularies).
    inp_mask[X <= 2] = False

    # Labels default to -1 (ignore); selected positions keep the original id.
    labels = -1 * np.ones(X.shape, dtype=int)
    labels[inp_mask] = X[inp_mask]

    # Work on a copy so the caller's array stays untouched.
    X_mlm = np.copy(X)
    # 90% of the selected positions are overwritten with the mask token;
    # the remaining 10% keep their original id.
    inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
    X_mlm[inp_mask_2mask] = mask_token_id

    # 1/9 of those 90% (i.e. 10% of the selected positions) get a random id
    # instead. The range starts at 3 to skip the special tokens; the mask id
    # itself may occasionally be drawn, which is harmless.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1/9)
    X_mlm[inp_mask_2random] = np.random.randint(3, vocab_size, inp_mask_2random.sum())

    return X_mlm, labels

In [None]:
# Stack the encoded train and test excerpts into one corpus for MLM training.
X_train_mlm = np.vstack([X_train, X_test])
# Corrupt the inputs and build the matching labels (-1 marks positions to
# ignore); the name X_train_mlm is rebound to the corrupted copy.
X_train_mlm, y_train_mlm = prepare_mlm_input_and_labels(X_train_mlm)

In [None]:
%%time

def create_dist_dataset(X, y=None, training=False):
    """Wrap arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Inputs, sliced along their first axis.
        y: Optional targets. When given, the dataset yields ``(x, y)`` pairs.
        training: When true, shuffle with a buffer of ``len(X)`` elements and
            repeat the data indefinitely.

    Returns:
        The dataset, batched with the notebook-level ``batch_size`` and
        prefetched with ``AUTO``. It is returned as a plain dataset; no
        distribution strategy is applied to it.
    """
    features = tf.data.Dataset.from_tensor_slices(X)

    if y is None:
        pipeline = features
    else:
        # Pair every input row with its target row.
        targets = tf.data.Dataset.from_tensor_slices(y)
        pipeline = tf.data.Dataset.zip((features, targets))

    if training:
        # Shuffle across the whole array, then loop forever; the consumer
        # decides how many steps make up an epoch.
        pipeline = pipeline.shuffle(len(X))
        pipeline = pipeline.repeat()

    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(AUTO)
    
    
# Training pipeline over the corrupted inputs and their labels; the third
# argument (training=True) enables shuffling and endless repetition.
train_dist_dataset = create_dist_dataset(X_train_mlm, y_train_mlm, True)

In [None]:
def create_mlm_model_and_optimizer():
    """Create the masked-language-model and its optimizer.

    Returns:
        Tuple ``(model, optimizer)``: the ``PRETRAINED_MODEL`` checkpoint
        loaded with a masked-LM head, and an Adam optimizer using the
        notebook-level learning rate ``LR``.
    """
    # TFAutoModelWithLMHead is deprecated; TFAutoModelForMaskedLM is the
    # dedicated auto class for masked-language-model heads.
    model = transformers.TFAutoModelForMaskedLM.from_pretrained(PRETRAINED_MODEL)
    optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    return model, optimizer


# Instantiate the model and optimizer, then print the model's layer summary.
mlm_model, optimizer = create_mlm_model_and_optimizer()
mlm_model.summary()

In [None]:
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy over the labelled positions only.

    Positions whose label equals -1 are removed from both ``y_true`` and
    ``y_pred`` before the loss is evaluated; ``y_pred`` is treated as logits.

    Returns:
        The result of ``tf.keras.losses.sparse_categorical_crossentropy`` for
        the positions that were kept.
    """
    # True wherever a real label is present.
    keep = tf.not_equal(y_true, -1)
    return tf.keras.losses.sparse_categorical_crossentropy(
        tf.boolean_mask(y_true, keep),
        tf.boolean_mask(y_pred, keep),
        from_logits=True,
    )

In [None]:
# Attach the optimizer and the masked loss (ignores positions labelled -1).
mlm_model.compile(optimizer = optimizer ,loss = masked_sparse_categorical_crossentropy )

In [None]:
# Whole batches needed to cover the train and test excerpts once.
STEPS_PER_EPOCH = (len(train_df) + len(test_df)) // batch_size
STEPS_PER_EPOCH

In [None]:
# Train for 2 epochs; steps_per_epoch is given explicitly because the training
# dataset repeats indefinitely and therefore has no natural end.
mlm_model.fit(train_dist_dataset,epochs = 2,steps_per_epoch=STEPS_PER_EPOCH)

In [None]:
# Write the trained model and its tokenizer into the same output directory.
mlm_model.save_pretrained('./mlm_tf-roberta-large')
tokenizer.save_pretrained('./mlm_tf-roberta-large')