The textual entailment problem looks a lot like next-sentence prediction, except that we need to classify the relation between two sentences into three classes (entailment, neutral and contradiction) instead of two.

# Preparation

In [None]:
!pip install 'transformers==3.5.0'

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers
import seaborn as sns

First we have to set up our TPU. There is an educational notebook ["TPUs in Colab"](https://colab.research.google.com/notebooks/tpu.ipynb) and I took the following code from it:

In [None]:
# Detect the TPU attached to this runtime. A ValueError from the resolver is
# treated as "no TPU available" and reported as a RuntimeError (not a bare
# BaseException, which would bypass ordinary `except Exception` handlers).
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
    raise RuntimeError(
        'ERROR: Not connected to a TPU runtime; enable a TPU accelerator '
        'for this notebook and re-run it.'
    ) from err

# Connect to the TPU cluster, initialise it, and build the distribution
# strategy used later for model creation and compilation.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Directory that holds the competition CSV files.
DATA_DIR = '../input/contradictory-my-dear-watson/'

# Read the training set, the test set and the submission template, in that order.
train, test, submission = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

Now we need to choose our [pretrained model](https://huggingface.co/transformers/pretrained_models.html).
Let's take ```bert-base-multilingual-cased``` for now.

*P.S. I also tried ```xlm-roberta-base```, but for some reason it didn't seem to be superior.*

In [None]:
# Pretrained checkpoint name; passed to both the tokenizer's and the model's from_pretrained().
MODEL_NAME = 'bert-base-multilingual-cased'

# Tokenization

To be fed into the BERT model, our input sentences need to be combined into a single sequence separated by a special marker. Let's find out how long the resulting sequence should be:

In [None]:
# Whitespace-separated word counts per training example. These are word
# counts, not tokenizer token counts, so they only approximate sequence length.
premise_lengths = [len(premise.split()) for premise in train.premise]
hypothesis_lengths = [len(hypothesis.split()) for hypothesis in train.hypothesis]
# Reuse the per-column counts instead of splitting every sentence a second time.
primise_hypothesis_combined_length = [
    n_premise + n_hypothesis
    for n_premise, n_hypothesis in zip(premise_lengths, hypothesis_lengths)
]

# Distribution of the combined premise + hypothesis length.
plt.figure(figsize=(20,5))
sns.distplot(primise_hypothesis_combined_length);

Now we need to tokenize our data.
Fortunately, Hugging Face has some easy-to-use tokenizers. For now I'll use [BertTokenizerFast](https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast) and follow the [preprocessing](https://huggingface.co/transformers/preprocessing.html) overview:

In [None]:
# Instantiate the fast BERT tokenizer for MODEL_NAME; from_pretrained() downloads the matching vocab.
tokenizer = transformers.BertTokenizerFast.from_pretrained(MODEL_NAME)

In [None]:
# Looks like 64 would be enough.
# NOTE(review): this value is passed to the tokenizer as `max_length`, while the
# histogram above counts whitespace-separated words. Presumably sub-word
# tokenization produces more tokens than words -- confirm the limit against
# tokenized lengths.
MAX_LENGTH = 64

In [None]:
# Encode the premise/hypothesis pairs of the training set as TensorFlow tensors.
# padding='max_length' is used instead of padding=True: the latter pads only to
# the longest sequence in this call, whereas the model inputs are declared with
# a fixed width of MAX_LENGTH, so every row must be exactly that long.
train_encoded = tokenizer(text=list(train.premise.values),
                    text_pair=list(train.hypothesis.values),
                    add_special_tokens=True,
                    max_length=MAX_LENGTH,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_token_type_ids=True,
                    return_tensors='tf'
                    )

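For some reason the ```.fit()``` method of ```keras.Model``` doesn't split off a validation set for us when it is given a TensorFlow ```Dataset``` (its ```validation_split``` argument is not supported for ```Dataset``` inputs). So I had to split our data into *train* and *val* sets beforehand.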

In [None]:
BATCH_SIZE = 1024
# The first 20% of the rows are held out for validation.
val_size = int(len(train)*0.2)

# Each element pairs the dict of encoded inputs with its integer label.
dataset = tf.data.Dataset.from_tensor_slices((train_encoded.data, train.label.values))
val_dataset = dataset.take(val_size).batch(BATCH_SIZE)
# fit() does not shuffle Dataset inputs, so shuffle here. The shuffle is applied
# only after skip(), which keeps validation rows out of the training split; the
# buffer covers the whole training split for a full shuffle.
train_dataset = (
    dataset.skip(val_size)
    .shuffle(len(train) - val_size)
    .batch(BATCH_SIZE)
)

# Model building

In [None]:
# Load the pretrained encoder for MODEL_NAME inside the TPU strategy scope.
with tpu_strategy.scope():
    transformer = transformers.TFAutoModel.from_pretrained(MODEL_NAME)

In [None]:
def create_model():
    """Assemble the three-class sentence-pair classifier.

    The network takes three int32 inputs of width ``MAX_LENGTH`` named
    ``input_ids``, ``attention_mask`` and ``token_type_ids``, passes them to
    the module-level ``transformer``, takes position 0 along the second axis
    of the transformer's first output, and feeds it through a 200-unit ReLU
    layer followed by a 3-unit softmax layer.

    Returns:
        The uncompiled ``tf.keras.Model``.
    """
    def token_input(name):
        # All three inputs share the same fixed width and dtype.
        return tf.keras.Input(shape=(MAX_LENGTH,), name=name, dtype='int32')

    inputs = tuple(
        token_input(name)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    )

    sequence_output = transformer(inputs)[0]
    hidden_layer = tf.keras.layers.Dense(200, activation='relu')
    hidden = hidden_layer(sequence_output[:, 0, :])
    output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')
    return tf.keras.Model(inputs=inputs, outputs=output_layer(hidden))

In [None]:
# Build and compile the classifier inside the TPU strategy scope.
with tpu_strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # The output layer already applies softmax, so the loss receives
    # probabilities, not logits: from_logits must be False. The loss object is
    # now passed to compile() instead of being created and left unused.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    model.summary()

# Modelling

In [None]:
# Multiply the learning rate by 0.2 whenever the validation loss fails to
# improve for one epoch, but never go below 1e-7.
lr_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=1,
    min_lr=1e-7
)

# `batch_size` is intentionally not passed: train_dataset and val_dataset are
# already batched, and fit() must not be given a batch size for Dataset inputs.
history = model.fit(
    train_dataset,
    epochs=20,
    verbose=2,
    callbacks=[lr_reduction],
    validation_data=val_dataset
)

In [None]:
# Plot validation and training curves side by side: loss on the left,
# accuracy on the right. Labels and a legend make the two curves in each
# panel distinguishable.
plt.figure(figsize=(15,5))
for position, metric in enumerate(('loss', 'accuracy'), start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history['val_' + metric], label='validation')
    plt.plot(history.history[metric], label='train')
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend()

# Submission

In [None]:
# Encode the test pairs with the same settings as the training pairs.
# padding='max_length' guarantees a width of exactly MAX_LENGTH, which the
# model inputs require; padding=True would pad only to the longest test
# sequence. return_token_type_ids is requested explicitly because the model
# has a 'token_type_ids' input.
test_encoded = tokenizer(text=list(test.premise.values),
                    text_pair=list(test.hypothesis.values),
                    add_special_tokens=True,
                    max_length=MAX_LENGTH,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_token_type_ids=True,
                    return_tensors='tf'
                    )

In [None]:
# Run the trained model on the encoded test inputs, 128 examples per batch.
predictions = model.predict(test_encoded.data, batch_size=128, verbose=1)

In [None]:
# For every test row, pick the index of the highest-scoring class.
test_labels = predictions.argmax(axis=1)

Note: the `WARNING:tensorflow:Gradients do not exist for variables ...` message printed during training is discussed in https://github.com/tensorflow/tensorflow/issues/37501

In [None]:
# Store the predicted class indices in the 'prediction' column of the submission frame.
submission['prediction'] = test_labels

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index = False)