In [None]:
import os

import numpy as np
import pandas as pd
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
import plotly.express as px

## Setting up the TPUs

The cell below is necessary in order to initialize the TPUs.

Here, "replicas" simply refers to the number of "cores". In the case of GPUs or CPUs, the number of replicas will be 1. [Read this](https://cloud.google.com/tpu/docs/tpus#replicas) for more information.

In [None]:
# Detect hardware, return appropriate distribution strategy
# The result is stored in `strategy`: a TPUStrategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # Connect to the resolved cluster and initialise the TPU system before the
    # strategy is built on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Only ValueError triggers the fallback; any other exception propagates.
    # NOTE(review): presumably ValueError is what the resolver raises when no
    # TPU is attached - confirm for the TensorFlow version in use.
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Report how many replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

## Define variables

Make sure to keep these variables in mind as you navigate this notebook! They are all defined in the cell below, so you can easily change them and rerun the notebook.

Don't worry about the model right now. We will come back to it later.

In [None]:
# --- Tunable settings: edit here, then rerun the notebook ---

# Checkpoint name passed to `from_pretrained` for both tokenizer and encoder
model_name = 'jplu/tf-xlm-roberta-large'

# Number of epochs requested from `model.fit`
n_epochs = 10

# Token length every encoded premise/hypothesis pair is padded to
max_len = 80

# Global batch size: a fixed number of examples for each replica in sync
per_replica_batch_size = 16
batch_size = strategy.num_replicas_in_sync * per_replica_batch_size

## Load datasets

Just regular CSV files. Nothing scary here!

In [None]:
# Read the three competition CSV files from the Kaggle input directory:
#   train      - premise/hypothesis text plus the `label` column used for training
#   test       - premise/hypothesis text to predict on
#   submission - template frame that later receives the `prediction` column
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
submission = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')

## Encode Training data

Now, we need to encode the training and test data into `tokens`, which are numerical representations of our words. To learn more, [read this](https://huggingface.co/transformers/main_classes/tokenizer.html).

In [None]:
# First load the tokenizer
# It is looked up with the same `model_name` that is later used to load the
# encoder, so tokenizer and model refer to the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# `batch_encode_plus` is fed a list of [premise, hypothesis] pairs, so pull
# those two columns out of each frame as nested Python lists.
pair_columns = ['premise', 'hypothesis']
train_text = train[pair_columns].values.tolist()
test_text = test[pair_columns].values.tolist()

# Both splits are tokenized with identical settings: padding is requested up
# to `max_len`, which is also passed as the maximum length.
encode_options = dict(pad_to_max_length=True, max_length=max_len)
train_encoded = tokenizer.batch_encode_plus(train_text, **encode_options)
test_encoded = tokenizer.batch_encode_plus(test_text, **encode_options)

Train and validation split happens here:

In [None]:
# Hold out 20% of the encoded training examples for validation; the fixed
# `random_state` keeps the split the same from run to run.
# NOTE(review): only the `input_ids` field of the encodings is carried forward.
# Any other field the tokenizer returned (e.g. an attention mask, if present)
# is not used downstream - confirm this is intentional.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_encoded['input_ids'], train.label.values, 
    test_size=0.2, random_state=2020
)

# The test inputs use the same `input_ids` field.
x_test = test_encoded['input_ids']

## Convert to tf.data.Dataset

`tf.data.Dataset` is one of many different ways to define the input to our models. Here, it is a good choice since it is easily compatible with TPUs. Read more about it [in this article](https://towardsdatascience.com/how-to-use-dataset-in-tensorflow-c758ef9e4428).

In [None]:
# Let tf.data choose the prefetch buffer size on its own
auto = tf.data.experimental.AUTOTUNE

# Training stream: repeated indefinitely, shuffled through a 2048-element
# buffer, then batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(batch_size).prefetch(auto)

# Validation stream: a single ordered pass, cached after batching.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(batch_size).cache().prefetch(auto)

# Test stream: inputs only (no labels), batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(batch_size)

## Train the model

It's time to teach our lovely XLM-Roberta how to infer natural language. Notice here we are using `strategy.scope()`. We need to load `transformer_encoder` inside this scope in order to tell Tensorflow that we want our model on the TPUs. Otherwise, it will try to load it in your CPU machine!

XLM-Roberta is one of the best models out there for multilingual classification tasks. Essentially, it is a model that was trained on inherently multilingual text, using methods that helped it become larger and train longer and on more data! I highly recommend reading [this blog post by the authors](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/), as well as the [Huggingface docs](https://huggingface.co/transformers/model_doc/xlmroberta.html) on the subject.

In [None]:
# Build and compile the classifier inside the strategy scope so that its
# variables are created under the selected distribution strategy.
with strategy.scope():
    # First load the transformer layer
    transformer_encoder = TFAutoModel.from_pretrained(model_name)

    # This will be the input tokens: one row of `max_len` int32 ids per example
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")

    # Now, we encode the text using the transformers we just loaded.
    # Element 0 of the encoder's output is taken as the per-token sequence output.
    sequence_output = transformer_encoder(input_ids)[0]

    # Only extract the token used for classification, which is <s>
    # (position 0 along the second axis of the sequence output)
    cls_token = sequence_output[:, 0, :]

    # Finally, pass it through a 3-way softmax, since there's 3 possible labels
    out = Dense(3, activation='softmax')(cls_token)

    # It's time to build and compile the model
    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras optimizers no longer accept.
        Adam(learning_rate=1e-5),
        # Sparse loss: the targets are integer class ids, not one-hot vectors
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

model.summary()

Unhide the cell below to see the exact training accuracy and loss after each epoch:

In [None]:
# `train_dataset` repeats indefinitely, so Keras has to be told how many
# batches make up one epoch: the number of full batches in the training split
# (floor division drops the remainder).
n_steps = len(x_train) // batch_size

# Train for `n_epochs` epochs, evaluating on `valid_dataset` after each one.
# The returned object is kept so its `.history` can be plotted afterwards.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=n_epochs
)

## Predict on test set and submit

In [None]:
# Model outputs for every test row (one 3-way softmax row per example)
test_preds = model.predict(test_dataset, verbose=1)

# The predicted label is the index of the largest score in each row
submission['prediction'] = np.argmax(test_preds, axis=1)

In [None]:
# Write the predictions to `submission.csv` without the index column, then
# display the first rows as a quick check.
submission.to_csv('submission.csv', index=False)
submission.head()

## Visualize Training History

With Plotly Express, this can be done in one function call:

In [None]:
# Metrics recorded by `model.fit`; the plots below read the keys
# 'loss', 'val_loss', 'accuracy' and 'val_accuracy' from it.
hist = train_history.history

In [None]:
# Epochs are numbered from 1; one point per recorded loss value
epoch_axis = range(1, 1 + len(hist['loss']))

# Training vs. validation accuracy per epoch
px.line(
    hist,
    x=epoch_axis,
    y=['accuracy', 'val_accuracy'],
    title='Model Accuracy',
    labels={'x': 'Epoch', 'value': 'Accuracy'},
)

In [None]:
# Epochs are numbered from 1; one point per recorded loss value
epoch_axis = range(1, 1 + len(hist['loss']))

# Training vs. validation loss per epoch
px.line(
    hist,
    x=epoch_axis,
    y=['loss', 'val_loss'],
    title='Model Loss',
    labels={'x': 'Epoch', 'value': 'Loss'},
)