# CommonLit GPU XLM-Roberta + Semi-Supervised Learning (SSL)

**Notes:**
 - Train an XLM-RoBERTa model, leveraging text augmentation and semi-supervised learning techniques
 - Use a combination of training data, augmented training data, and external data with pseudo-labels

**References:**
 - https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta
 - https://www.kaggle.com/yeayates21/xlm-roberta-augmentation-ssl-0-9417-pub-lb
 - https://www.kaggle.com/vecxoz/jplu-tf-xlm-roberta-large
 - https://www.kaggle.com/yeayates21/commonlit-text-augmentation-eng-to-fre-to-eng
 - https://www.kaggle.com/xhlulu
 - https://www.kaggle.com/shonenkov
 - https://www.kaggle.com/vecxoz

In [None]:
import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import cloudpickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

# Helper Functions

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize texts into a fixed-width matrix of input ids.

    Args:
        texts: List of strings to encode.
        tokenizer: Object exposing a Hugging Face style ``batch_encode_plus``
            method whose result has an ``'input_ids'`` entry.
        maxlen: Width of every encoded row; shorter texts are padded and
            longer texts are truncated to this length.

    Returns:
        A numpy array built from the per-text lists of input ids.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        # `padding` supersedes the deprecated `pad_to_max_length` flag, so
        # only the current keyword is passed.
        padding='max_length',
        # Let the tokenizer truncate instead of slicing the ids afterwards:
        # a manual slice cuts off the closing special token of long texts.
        truncation=True,
        max_length=maxlen,
    )
    return np.asarray(enc_di['input_ids'])

def build_model(transformer, max_len=512, learning_rate=9e-6):
    """Build and compile a regression model on top of a transformer encoder.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: Callable Keras-compatible encoder; element ``[0]`` of its
            output is indexed as ``[:, 0, :]`` to obtain the pooled vector.
        max_len: Number of token ids per input row.
        learning_rate: Step size for the Adam optimizer.

    Returns:
        A compiled ``Model`` mapping ``input_word_ids`` to a single linear
        output, trained and monitored with mean squared error.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the vector at the first position as the summary of the whole text.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='linear')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
    )

    return model

def plot_loss(history):
    """Draw the training and validation loss curves of a fit run.

    Args:
        history: Object whose ``history`` attribute maps ``'loss'`` and
            ``'val_loss'`` to per-epoch values.
    """
    for series_name in ('loss', 'val_loss'):
        plt.plot(history.history[series_name], label=series_name)
    plt.xlabel('Epoch')
    plt.ylabel('Error [MSE]')
    plt.legend()
    plt.grid(True)

# Configurations

In [None]:
# Choose a tf.distribute strategy that matches the available hardware.
try:
    # On Kaggle the TPU_NAME environment variable is always set, so the
    # resolver can be created without arguments.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # Tensorflow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Model and sequence settings
MODEL = '../input/jplu-tf-xlm-roberta-large'
MAX_LEN = 194

# Training schedule; the batch size scales with the number of replicas
EPOCHS = 10
BATCH_SIZE = 4 * strategy.num_replicas_in_sync

# Let tf.data pick the prefetch buffer size
AUTO = tf.data.experimental.AUTOTUNE

# Row counts sampled from the external pseudo-labeled sets (None keeps all rows)
downsample1 = None # BBC pseudo-labeled data
downsample2 = 200 # fake-news pseudo-labeled data

# Tokenizer

In [None]:
# Load the tokenizer stored alongside the pretrained model at MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Data Prep

In [None]:
%%time

###
### main training data
###
train1 = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
train1, valid = train_test_split(train1, test_size=0.20, random_state=42)
print("training dataset 1 and validation dataset created.")

###
### augmented training data
###
train2 = pd.read_csv("../input/mjycommonlitdata/commonlit_train_with_augs.csv")
# Remove augmented rows whose source excerpt was held out for validation;
# otherwise the model trains on rewrites of validation texts and val_loss
# is optimistically biased.
# NOTE(review): assumes the 'excerpt' column here holds the original text
# exactly as in train.csv - confirm. If nothing matches, no rows are removed.
train2 = train2[~train2['excerpt'].isin(valid['excerpt'])]
# The augmented text takes the place of the original excerpt
train2 = train2.drop(columns=['excerpt']).rename(columns={"augtext": "excerpt"})
print("training dataset 2 created.")

###
### external data with pseudo-labels
###

# create training dataset 3
# load dataset
train3 = pd.read_csv("../input/mjycommonlitdata/bbc_commonlit_ssl.csv")
# downsample
if downsample1 is not None:
    train3 = train3.sample(n=downsample1, random_state=0)
# rename text field
train3 = train3.rename(columns={"text": "excerpt"})
print("training dataset 3 created.")

# create training dataset 4
# load dataset
train4 = pd.read_csv("../input/mjycommonlitdata/fakenews_commonlit_ssl.csv")
# downsample
if downsample2 is not None:
    train4 = train4.sample(n=downsample2, random_state=0)
# rename text field
train4 = train4.rename(columns={"text": "excerpt"})
print("training dataset 4 created.")

###
### test data
###
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
print("test dataset loaded.")

In [None]:
# Combine the four sources into one training frame, keeping only the text
# and label columns of each.
train = pd.concat(
    [frame[['excerpt', 'target']] for frame in (train1, train2, train3, train4)]
)

# The individual frames are no longer needed; free their memory.
del train1, train2, train3, train4
gc.collect()

# Preview the first rows
train.head()

In [None]:
%%time 
### tokenize each split into fixed-length id matrices for the transformer

x_train, x_valid, x_test = (
    regular_encode(frame.excerpt.values.tolist(), tokenizer, maxlen=MAX_LEN)
    for frame in (train, valid, test)
)

# Regression targets for the labeled splits
y_train, y_valid = train.target.values, valid.target.values

In [None]:
# Sanity check: report the shape of each encoded split
for _label, _encoded in (
    ("training shape:  ", x_train),
    ("validation shape:  ", x_valid),
    ("test shape:  ", x_test),
):
    print(_label, _encoded.shape)

# TF Datasets

In [None]:
# Training pipeline: repeats indefinitely, shuffled, batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(9999)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: fixed order, batches cached after the first pass.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only, batched in file order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

# Model

In [None]:
%%time

# Load the pretrained encoder and build the regression model inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)

# Print the layer-by-layer overview of the assembled model
model.summary()

# Training

In [None]:
%%time

# train_dataset repeats without end, so the length of an epoch is given
# explicitly: the number of full batches in the training data.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
)

# Save

In [None]:
model_folder = "./tf_model3"
# exist_ok keeps a re-run of this cell from failing when the folder is
# already there.
os.makedirs(model_folder, exist_ok=True)
# Save the weights
# NOTE(review): save_weights appears to treat this path as a checkpoint
# prefix, so the files may be written next to the folder rather than inside
# it - confirm before changing, since loaders may depend on the current path.
model.save_weights(model_folder)

# Training Plots

In [None]:
# Plot training vs. validation loss per epoch for the run above
plot_loss(train_history)

# Submission

In [None]:
# The model ends in a single-unit Dense layer, so predictions come back with
# one column per row; flatten them explicitly rather than relying on pandas
# to accept a 2-D value for a single column.
test['target'] = model.predict(test_dataset, verbose=1).ravel()
test[['id','target']].to_csv('submission.csv', index=False)