This notebook fine-tunes XLM-RoBERTa to improve the model's validation accuracy.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
os.environ["WANDB_API_KEY"] = "0"  # dummy value, set only to silence the wandb API-key warning

Setting the constants

In [None]:
SEED=42  # NOTE(review): not referenced anywhere else in the visible notebook
max_len = 48  # sequence length declared on the model's input layers (previously 50)
EPOCHS = 3   # number of training epochs (up to 10 was tried, with no improvement)
BATCH_SIZE = 64  # batch size passed to model.fit (previously 128)
LR = 1e-5  # learning rate of the Adam optimizer (previously 3e-5)

**Importing libraries**

In [None]:
from transformers import BertTokenizer, TFBertModel,TFAutoModel,TFXLMRobertaModel, AutoTokenizer
import matplotlib.pyplot as plt
import tensorflow as tf

**Let's setup our TPU**

In [None]:
# Pick a distribution strategy: use the TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # The notebook treats ValueError from the resolver as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on every path, not only on the fallback path
# (the print used to sit inside the except branch).
print('Number of replicas:', strategy.num_replicas_in_sync)

**Loading data**

In [None]:
# Read the competition's training CSV into a DataFrame.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

**Let's look at the pairs of sentences**

In [None]:
# Premise text of the training example at index 1.
train.premise.values[1]

In [None]:
# Hypothesis text of the same training example (index 1).
train.hypothesis.values[1]

In [None]:
# Label of the same training example (index 1).
train.label.values[1]

These statements are contradictory, and the label shows that.

Let's look at the distribution of languages in the training set.

In [None]:
# Count how many training rows belong to each language.
labels, frequencies = np.unique(train.language.values, return_counts=True)

# Draw the language shares as a pie chart with one-decimal percentages.
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

**Preparing data for the model**

We are using the pre-trained XLM-RoBERTa model (a cross-lingual language model) from Hugging Face.

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# An earlier version of this notebook used 'bert-base-multilingual-cased'
# together with BertTokenizer.
model_name = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)


Tokenizers turn the sequences of words into arrays of numbers.

In [None]:
def encode_sentence(s):
    """Tokenize one sentence and return its token ids, ending with a separator.

    Uses the module-level ``tokenizer``.

    Args:
        s: The sentence text to encode.

    Returns:
        A list with the vocabulary ids of the tokens of ``s`` followed by the
        id of the tokenizer's separator token.
    """
    tokens = list(tokenizer.tokenize(s))
    # Use the tokenizer's own separator instead of the BERT-only literal
    # '[SEP]': that string is not an XLM-RoBERTa special token and would be
    # converted to the unknown-token id.
    tokens.append(tokenizer.sep_token)
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Quick check: show the ids produced for a short English sentence.
encode_sentence("I love machine learning")

The model takes three kinds of input data: input word IDs, input masks, and input type IDs.

These allow the model to know that the premise and hypothesis are distinct sentences, and also to ignore any padding from the tokenizer.

We add a [CLS] token to denote the beginning of the inputs, and a [SEP] token to denote the separation between the premise and the hypothesis. We also need to pad all of the inputs to be the same size.

We are going to encode all of our premise/hypothesis pairs for input into XLM RoBERTA

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
    """Encode sentence pairs into the three padded input tensors of the model.

    Every pair is laid out as: classification token, first sentence, second
    sentence. Each sentence is encoded with ``encode_sentence`` (which uses the
    module-level tokenizer and appends a separator), so both sentences end
    with a separator token.

    Args:
        hypotheses: Sequence of strings placed FIRST in each pair.
            NOTE(review): despite the name, the notebook's call sites pass the
            premises here; the parameter names are kept for compatibility.
        premises: Sequence of strings placed SECOND in each pair (the call
            sites pass the hypotheses here).
        tokenizer: Tokenizer supplying ``cls_token``, ``pad_token_id`` and
            ``convert_tokens_to_ids``.

    Returns:
        A dict of dense tensors, each padded to the longest pair in the call:
            'input_word_ids': token ids, padded with the tokenizer's pad id.
            'input_mask': 1 for real tokens, 0 for padding.
            'input_type_ids': 0 for the classification token and the first
                sentence, 1 for the second sentence, 0 for padding.
        NOTE(review): the width is the longest pair, not ``max_len``, and no
        truncation happens here - confirm it matches the model's input shape.
    """
    sentence1 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(premises)])

    # Ask the tokenizer for its classification token instead of hard-coding
    # BERT's '[CLS]', which other vocabularies map to the unknown-token id.
    cls_id = tokenizer.convert_tokens_to_ids([tokenizer.cls_token])
    cls = [cls_id] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Ones over the ragged rows become zeros wherever to_tensor() pads.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    # Pad with the tokenizer's own pad id; 0 is only correct for BERT-style
    # vocabularies. Fall back to 0 when the tokenizer defines no pad token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    return {
        'input_word_ids': input_word_ids.to_tensor(default_value=pad_id),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
    }

In [None]:
# Encode the training pairs. The premise is passed as the first positional
# argument, so it becomes the first segment even though bert_encode names
# that parameter `hypotheses`.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

**Building and Training the Model**

We will wrap the XLM-RoBERTa transformer in a Keras functional model.

In [None]:
def build_model():
    """Build and compile the sentence-pair classifier around XLM-RoBERTa.

    Loads the pretrained encoder named by ``model_name``, feeds it three int32
    inputs of length ``max_len`` and puts a 3-way softmax layer on the
    encoder output at sequence position 0.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are named 'input_word_ids',
        'input_mask' and 'input_type_ids' and whose output has 3 class
        probabilities per example.
    """
    bert_encoder = TFXLMRobertaModel.from_pretrained(model_name)

    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # NOTE(review): the type ids contain the value 1 for the second sentence;
    # confirm the chosen checkpoint has more than one token-type embedding.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]

    # Classify from the first sequence position; softmax pairs with the
    # sparse categorical cross-entropy loss used below.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(
        inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword.
    # Candidate learning rates to compare: 1e-5, 3e-5 and 5e-5.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=LR),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return model

In [None]:
# Build and compile the model inside the strategy scope so that it is created
# under the selected distribution strategy, then print its layer summary.
with strategy.scope():
    model = build_model()
    model.summary()

In [None]:
# Callback for early stopping: monitors val_loss (lower is better) and is
# configured to restore the best weights.
# NOTE(review): patience=3 equals EPOCHS (3), so this callback presumably can
# never trigger within a single run - confirm the intended patience.
from tensorflow.keras.callbacks import EarlyStopping
eas = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                    verbose=1, mode='min', baseline=None, restore_best_weights=True)

In [None]:
# Fine-tune on the encoded training pairs, holding out 20% of them for validation.
model.fit(train_input, train.label.values, epochs = EPOCHS, verbose = 1, batch_size = BATCH_SIZE, validation_split = 0.2, callbacks = [eas])
# Idea: compare different EPOCHS values to see whether accuracy improves.

In [None]:
# Read the competition's test CSV and encode it the same way as the training
# data (premise as the first positional argument, hypothesis as the second).
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
test_input = bert_encode(test.premise.values, test.hypothesis.values, tokenizer)

**Using Test data**

In [None]:
# Preview the first rows of the test data.
test.head()

**Generating & Submitting Predictions**

In [None]:
# Predict class probabilities for the test pairs and keep, for every row,
# the index of the most probable class.
class_probs = model.predict(test_input)
predictions = list(np.argmax(class_probs, axis=1))


In [None]:
# Assemble the submission table: the test ids plus the predicted class per row.
submission = test[['id']].assign(prediction=predictions)

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv("submission.csv", index = False)