Hello fellow Kagglers,

This notebook demonstrates how to pretrain the RoBERTa base model on a binary classification problem for simple and normal Wikipedia abstracts.

The dataset used for this pretraining is the [Simple/Normal Wikipedia Abstracts V1](https://www.kaggle.com/markwijkhuizen/simplenormal-wikipedia-abstracts-v1) dataset, containing ~250K simple and normal Wikipedia abstracts with an equal distribution.

[This](https://www.kaggle.com/markwijkhuizen/simple-normal-wikipedia-abstract-dataset) notebook gives a demonstration on how the dataset was created.

RoBERTa is trained for masked-language modeling, which is predicting a masked word in a sentence. This task does teach RoBERTa to understand language, but has little in common with the CommonLit competition task. Pretraining RoBERTa on an actual readability prediction task should finetune the model for readability-related tasks, such as the CommonLit competition.

In [None]:
import warnings
# Silence every Python warning so the notebook output stays readable.
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa

from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from scipy.stats import pearsonr
from transformers import RobertaTokenizer, TFRobertaModel

import os
import sys
import nltk
import string
import math
import logging
import glob
import random

# Only let TensorFlow log records of level ERROR or higher through.
tf.get_logger().setLevel(logging.ERROR)

# Enable tqdm's pandas integration.
tqdm.pandas()

# Report the library and interpreter versions used for this run.
print(f'tensorflow version: {tf.__version__}')
print(f'tensorflow keras version: {tf.keras.__version__}')
print(f'python version: {sys.version}')

In [None]:
def set_seeds(seed):
    """Seed every random source used in this notebook with ``seed``.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, TensorFlow and NumPy.

    Args:
        seed: Integer seed shared by all random sources.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, tf.random.set_seed, np.random.seed):
        seeder(seed)


# Fix the seed once, before any data or model is created.
set_seeds(42)

In [None]:
# Number of tokens per abstract: used as the tokenizer's max_length and as the
# width of the model's input layer.
SEQ_LENGTH = 250

# Load the abstracts table from the Kaggle dataset.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train = pd.read_pickle('/kaggle/input/simplenormal-wikipedia-abstracts-v1/wikipedia_abstracts.pkl')

The linguistic features will not be used in this notebook, but are saved as strings to reduce memory consumption and can be decoded using ```s.split(chr(0))```

In [None]:
# Example of accessing the part of speech feature: the 'pos' value of a row is
# a single chr(0)-delimited string, so splitting on chr(0) recovers its entries.
print(train.loc[0, 'pos'].split(chr(0)))

In [None]:
# Preview the first rows of the loaded dataset.
display(train.head())

In [None]:
# Summarize the columns of the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" output.
train.info()

# Roberta Tokenize

In [None]:
# Name of the pretrained checkpoint; used for both the tokenizer and the model
MODEL = 'roberta-base'

# Load the tokenizer that belongs to this checkpoint
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

In [None]:
def regular_encode(excerpt):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Every text is truncated or padded to ``SEQ_LENGTH`` tokens.

    Args:
        excerpt: The batch of texts to encode.

    Returns:
        A NumPy array built from the ``input_ids`` of every encoded text.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': SEQ_LENGTH,
    }
    encoded = tokenizer.batch_encode_plus(excerpt, **encode_options)

    return np.array(encoded['input_ids'])


# Compute text encoding, this will take ~5 minutes
train['input_ids'] = regular_encode(train['abstract_clean']).tolist()
display(train.head())

# Hardware Configuration

In [None]:
# Detect the available hardware and pick the matching distribution strategy.
try:
    # TPU detection. No parameters necessary if the TPU_NAME environment
    # variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', TPU.master())
except ValueError:
    # A ValueError here is treated as "no TPU available"; the default strategy
    # is used instead (the message says GPU, but the fallback also covers CPU).
    print('Running on GPU')
    TPU = None

if TPU:
    # Connect to the TPU cluster and initialize it before creating the strategy.
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas training in sync; used to scale the global batch size.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

# Set the global Keras dtype policy. The 'float32' policy selected here is
# full precision; no half-precision policy is enabled.
mixed_precision.set_policy('float32')

# Report the dtypes the active policy uses for computations and variables.
print(f'Compute dtype: {mixed_precision.global_policy().compute_dtype}')
print(f'Variable dtype: {mixed_precision.global_policy().variable_dtype}')

# Model

In [None]:
def get_model():
    """Build and compile the RoBERTa Simple/Normal abstract classifier.

    The pretrained ``MODEL`` checkpoint is wrapped in a Keras model that feeds
    the hidden state of the first token of every sequence into a 2-unit
    softmax layer. Model, loss, optimizer and metrics are all created inside
    ``strategy.scope()``.

    Returns:
        The compiled Keras model. It takes ``input_ids`` of length
        ``SEQ_LENGTH`` and outputs two class probabilities per sample.
    """
    # Reset the global Keras state before building a new model.
    tf.keras.backend.clear_session()

    with strategy.scope():
        # RoBERTa
        transformer = TFRobertaModel.from_pretrained(MODEL)
        # `shape` is a tuple without the batch dimension; note the trailing
        # comma, `(SEQ_LENGTH)` alone would just be an int.
        input_ids = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='input_ids')
        sequence_output = transformer(input_ids)[0]
        # We only need the cls_token, resulting in a 2d array
        cls_token = sequence_output[:, 0, :]
        # 2 output neurons for Simple and Normal class
        output = tf.keras.layers.Dense(2, activation='softmax', dtype=tf.float32)(cls_token)

        model = tf.keras.models.Model(inputs=[input_ids], outputs=[output])

        # Labels are integer class ids, hence the sparse loss and metric.
        loss = tf.keras.losses.SparseCategoricalCrossentropy()
        optimizer = tf.optimizers.Adam(learning_rate=1e-5)
        metrics = [
            tf.keras.metrics.SparseCategoricalAccuracy('accuracy'),
        ]

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return model


model = get_model()

In [None]:
# Print the layer overview and parameter counts of the model.
model.summary()

In [None]:
# Draw the model graph annotated with tensor shapes, dtypes and layer names;
# nested models are kept collapsed.
tf.keras.utils.plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, expand_nested=False)

# Configuration

In [None]:
# Training configuration
# Batch size per replica
BATCH_SIZE_BASE = 32
# Global batch size: the per-replica batch size times the number of replicas
BATCH_SIZE = BATCH_SIZE_BASE * REPLICAS
# Number of batches that make up one "epoch"
STEPS_PER_EPOCH = 100
# Number of whole epochs that fit in a single pass over the data; the integer
# division drops the samples that do not fill a complete epoch
EPOCHS = len(train) // (STEPS_PER_EPOCH * BATCH_SIZE)
# NOTE(review): KFOLDS is not referenced anywhere in the visible code
KFOLDS = 5

print(f'BATCH SIZE: {BATCH_SIZE}, EPOCHS: {EPOCHS}')

# Dataset

In [None]:
def get_train_dataset():
    """Build the shuffled, batched ``tf.data`` training set from ``train``.

    Rows are shuffled once with a fixed seed and then grouped into batches of
    ``BATCH_SIZE``.

    Returns:
        A ``tf.data.Dataset`` yielding ``({'input_ids': ids}, labels)``
        batches.
    """
    # Randomize the dataset order, otherwise the model will first be trained on Simple abstracts only
    idxs = np.arange(len(train))
    random.Random(42).shuffle(idxs)

    # `idxs` holds row positions, so rows are selected positionally with
    # `.iloc`. Label-based `.loc` is only equivalent while the DataFrame has
    # the default 0..n-1 index and would pick wrong rows or raise otherwise.
    train_x = {
        'input_ids': np.array(train['input_ids'].iloc[idxs].tolist()),
    }
    train_y = train['label_int'].iloc[idxs].to_numpy()

    train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))

    return train_dataset.batch(BATCH_SIZE)


train_dataset = get_train_dataset()

In [None]:
# Example of a batch: pull the first batch and show the feature keys, the
# shapes of the inputs and labels, the label dtype and the label values.
train_x, train_y = next(iter(train_dataset))
print(f'train_x keys: {list(train_x.keys())}, train_x shape: {train_x["input_ids"].shape}')
print(f'train_y shape: {train_y.shape}, train_y dtype {train_y.dtype}')
print(f'labels: {train_y}')

# Training

Training will be split in epochs of 100 steps and the dataset will be iterated once. This means all data will be seen for the first time and the training accuracy can be interpreted as validation accuracy, because no sample will be used for training twice. Training will take roughly 1.5 hours.

In [None]:
# Train for EPOCHS epochs of STEPS_PER_EPOCH batches each. EPOCHS is derived
# from the dataset size, so the run is meant to cover a single pass over the
# data. The returned history holds the per-epoch metrics plotted below.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    verbose=1,
    steps_per_epoch=STEPS_PER_EPOCH,
)

# Training History

Training metrics are shown here, an accuracy of 90\%+ is achieved!

In [None]:
def plot_history_metric(history, metric, higher_is_better=None):
    """Plot the per-epoch curve of ``metric`` from a Keras training history.

    When the history also contains ``val_<metric>``, the validation curve is
    drawn as well and its best epoch is marked with a red dot.

    Args:
        history: Object returned by ``model.fit``; only ``history.history``
            (a dict mapping metric names to per-epoch values) is read.
        metric: Key in ``history.history``, e.g. ``'loss'`` or ``'accuracy'``.
        higher_is_better: Whether the best validation value is the maximum
            (``True``) or the minimum (``False``). With the default ``None``
            it is guessed from the metric name: accuracy-like metrics are
            maximized, everything else (e.g. losses) is minimized.

    Raises:
        KeyError: If ``metric`` is not present in ``history.history``.
    """
    train_values = history.history[metric]
    n_epochs = len(train_values)
    epochs = np.arange(1, n_epochs + 1)
    # Label the first epoch and every fifth one, never past the last epoch.
    x = [1] + list(range(5, n_epochs + 1, 5))

    plt.figure(figsize=(15, 8))
    plt.plot(epochs, train_values, label='train')

    # Look up the validation series of this exact metric instead of searching
    # for the substring 'val' across all recorded keys.
    val_values = history.history.get(f'val_{metric}')
    if val_values:
        plt.plot(epochs, val_values, label='test')

        if higher_is_better is None:
            maximized = ('acc', 'auc', 'precision', 'recall', 'f1')
            higher_is_better = any(tag in metric.lower() for tag in maximized)
        pick_best = np.argmax if higher_is_better else np.argmin
        best_idx = int(pick_best(val_values))
        # The marker is left unlabeled so it stays out of the legend, and is
        # drawn above the curves so it is not hidden by them.
        plt.scatter(best_idx + 1, val_values[best_idx], color='red', s=50, marker='o', zorder=3)

    plt.title(f'Model {metric}', fontsize=24)
    plt.ylabel(metric, fontsize=18)
    plt.xlabel('epoch', fontsize=18)
    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(x, fontsize=16)  # ticks at epoch 1 and at every 5th epoch
    plt.yticks(fontsize=16)
    # Labels come from the plotted lines, so each one is tied to its own curve.
    plt.legend(prop={'size': 18})
    plt.grid()

In [None]:
# Training loss per epoch.
plot_history_metric(history, 'loss')

In [None]:
# Training accuracy per epoch.
plot_history_metric(history, 'accuracy')

# Save pretrained RoBERTa layer

In [None]:
# Walk over the layers of the trained model, printing each name, and write the
# weights of the RoBERTa layer to disk.
for l_idx, l in enumerate(model.layers):
    print(l.name)
    if l.name != 'tf_roberta_model':
        continue
    print(f'Saving layer {l_idx} with name {l.name}')
    l.save_weights('roberta_pretrained.h5')

# CommonLit Model

The next function gives an example of how to use the pretrained weights in a model for CommonLit training. It is as simple as using the ```load_weights``` function for the RoBERTa layer.

In [None]:
def get_model():
    """Build and compile the CommonLit regression model.

    The pretrained ``MODEL`` checkpoint is loaded, its weights are replaced by
    the ones saved in ``roberta_pretrained.h5``, and the hidden state of the
    first token of every sequence is fed into a single linear output unit.
    Model, loss, optimizer and metrics are all created inside
    ``strategy.scope()``.

    NOTE: this definition replaces the classifier builder of the same name
    defined earlier in the notebook.

    Returns:
        The compiled Keras model. It takes ``input_ids`` of length
        ``SEQ_LENGTH`` and outputs one value per sample.
    """
    # Reset the global Keras state before building a new model.
    tf.keras.backend.clear_session()

    with strategy.scope():
        # RoBERTa
        transformer = TFRobertaModel.from_pretrained(MODEL)
        # Load saved weights
        transformer.load_weights('roberta_pretrained.h5')

        # `shape` is a tuple without the batch dimension; note the trailing
        # comma, `(SEQ_LENGTH)` alone would just be an int.
        input_ids = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='input_ids')
        sequence_output = transformer(input_ids)[0]
        # We only need the cls_token, resulting in a 2d array
        cls_token = sequence_output[:, 0, :]
        # Single linear unit for the regression target
        output = tf.keras.layers.Dense(1, activation='linear', dtype=tf.float32)(cls_token)

        # Model
        model = tf.keras.models.Model(inputs=input_ids, outputs=output)

        loss = tf.keras.losses.MeanSquaredError()
        optimizer = tf.optimizers.Adam(learning_rate=4e-5)
        metrics = [
            tf.keras.metrics.RootMeanSquaredError(name='RMSE'),
        ]

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return model


model = get_model()

In [None]:
# Print the layer overview and parameter counts of the CommonLit model.
model.summary()