# What is BERT?
BERT stands for Bidirectional Encoder Representations from Transformers. The pre-trained BERT model can be fine-tuned with just one additional output layer to create models for NLP tasks. **Bidirectional** means that BERT learns information from both the left and the right side of a token's context during the training phase.

**[CLS]** (classification) token: placed at the beginning of the input.
**[SEP]** (separator) token: placed at the end of each sentence.
In this case, since our model expects a single input sequence, we will concatenate the premise and hypothesis sentences, putting a [SEP] token between the two sentences and a [CLS] token at the beginning.
Like this:
> [CLS] I love transformers (premise). [SEP] I like transformers. [SEP]

✓ All the input must be the same size.

* Then we will split the inputs into tokens and encode the tokens as IDs; the model outputs an embedding for each token.
Since we have multilingual data, I'll use the XLM-RoBERTa model (trained on more than 100 languages), which has the same architecture as BERT but takes two inputs while BERT takes three.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer,TFAutoModel
import tensorflow as tf
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training and test tables into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/contradictory-my-dear-watson/train.csv",
        "/kaggle/input/contradictory-my-dear-watson/test.csv",
    )
)

In [None]:
# Preview the first rows of the training table
train_data.head()

In [None]:
# Pick the distribution strategy: use a TPU when one can be set up,
# otherwise fall back to TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases expose this as
    # tf.distribute.TPUStrategy; kept as-is because the TF version in use
    # is not visible here.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # Presumably raised when no TPU is available; verify against the TF docs.
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected. This used to
# sit inside the except branch, so it was never printed when a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Training hyperparameters.
# Values tried earlier and kept for reference: MAX_LEN of 120, 227, 236, 245;
# lr = 1e-5; BATCH_SIZE = 32*strategy.num_replicas_in_sync.
EPOCHS = 20        # passes over the training data
BATCH_SIZE = 64    # rows per training batch
lr = 1e-6          # learning rate handed to the optimizer
MAX_LEN = 150      # number of token positions the model inputs are declared with

In [None]:
# Rows per language in each split (test_lang is plotted in a later cell)
train_lang, test_lang = (
    frame["language"].value_counts() for frame in (train_data, test_data)
)

# Share of each language in the training set
px.pie(names=train_lang.index, values=train_lang)

In [None]:
# Share of each language in the test set
px.pie(values=test_lang, names=test_lang.index)

In [None]:
# Class balance of the training labels, shown as a pie chart
train_label = train_data["label"].value_counts()
px.pie(names=train_label.index, values=train_label)

In [None]:
# Name of the pretrained checkpoint; it is used here to load the matching
# tokenizer and again in build_model() to load the encoder.
model_name = "joeddav/xlm-roberta-large-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode one sentence as a list of token IDs, terminated by the separator ID
def encode_sentence(s, tok=None):
    """Convert a sentence to token IDs followed by the separator token's ID.

    Args:
        s: Sentence text to tokenize.
        tok: Optional tokenizer to use. When omitted, the module-level
            ``tokenizer`` is used, as before.

    Returns:
        A list of integer token IDs: one per token of ``s``, then the ID of
        the tokenizer's own separator token.
    """
    active = tokenizer if tok is None else tok
    tokens = list(active.tokenize(s))
    # Ask the tokenizer for its separator instead of hard-coding "[SEP]".
    # That literal only exists in BERT-style vocabularies; with any other
    # vocabulary (XLM-RoBERTa included) it is looked up as an unknown token,
    # so the model would never see a real separator.
    tokens.append(active.sep_token)
    # Return IDs, not the token strings
    return active.convert_tokens_to_ids(tokens)

In [None]:
# Longest premise, counted in token IDs as returned by encode_sentence
print(train_data.premise.map(lambda x: len(encode_sentence(x))).max())

In [None]:
# Longest hypothesis, counted in token IDs as returned by encode_sentence
print(train_data.hypothesis.map(lambda x: len(encode_sentence(x))).max())

In [None]:
# tokenize() splits the text into token strings; turning those tokens into
# numeric IDs is a separate step (convert_tokens_to_ids, see encode_sentence)
tokenizer.tokenize("Don't you love Transformers?")

In [None]:
# The same sentence as token IDs, with the separator ID appended at the end
encode_sentence("Don't you love Transformers?")

**Input word IDs:** The ID list produced by encoding has a different length for each sentence in the dataset, but the model needs every row (premise plus hypothesis) to have the same length. So we pad the end of each ID list with a filler ID (zeros in the simplest case) until it is as long as the longest list in the dataset. This process is called *padding*.

**Input masks:** We must tell BERT which IDs to embed and which to ignore; BERT should ignore the padding. The input mask has the same length as the ID list and contains 1 for each actual token ID and 0 for each padding position, which BERT should ignore.

**Input type IDs:** We won't need this for the RoBERTa model. The BERT model needs this input argument because it was trained to predict the likelihood that Sentence B follows Sentence A.

In [None]:
def roberta_encode(hypothesis, premise, tokenizer):
    """Build padded model inputs from aligned hypothesis/premise sentences.

    Every row is laid out as: CLS, hypothesis tokens, SEP, premise tokens,
    SEP. The two SEP IDs come from ``encode_sentence``, which appends one to
    each sentence.

    Args:
        hypothesis: Sequence of hypothesis strings.
        premise: Sequence of premise strings, aligned with ``hypothesis``.
        tokenizer: Tokenizer providing ``cls_token``, ``pad_token_id`` and
            ``convert_tokens_to_ids``. ``encode_sentence`` is called without
            it, so pass the same tokenizer that function uses.

    Returns:
        dict with two dense tensors of the same shape:
        ``'input_word_ids'`` holds the token IDs, padded on the right, and
        ``'input_mask'`` holds 1 for real tokens and 0 for padding.
    """
    # Ragged tensors, because every sentence encodes to a different length
    sentence_1 = tf.ragged.constant([
        encode_sentence(s) for s in np.array(hypothesis)
    ])
    sentence_2 = tf.ragged.constant([
        encode_sentence(s) for s in np.array(premise)
    ])

    # One start-of-sequence ID per row. The tokenizer is asked for its own
    # CLS token: the literal '[CLS]' only exists in BERT-style vocabularies
    # and is looked up as an unknown token in any other vocabulary.
    cls_id = tokenizer.convert_tokens_to_ids([tokenizer.cls_token])
    cls = [cls_id] * sentence_1.shape[0]

    # Join the pieces row by row; rows still differ in length at this point
    input_word_ids = tf.concat([cls, sentence_1, sentence_2], axis=-1)

    # Ones for every real token; densifying fills the padding with zeros
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    inputs = {
        # Pad with the tokenizer's pad ID rather than an implicit 0, which
        # may be a real token in the vocabulary. A pad_token_id of None
        # falls back to to_tensor's default fill, as before.
        'input_word_ids': input_word_ids.to_tensor(
            default_value=tokenizer.pad_token_id
        ),
        'input_mask': input_mask,
    }

    return inputs

In [None]:
# Encode the training pairs. Arguments follow roberta_encode's
# (hypothesis, premise, tokenizer) parameter order.
inputs = roberta_encode(train_data.hypothesis.values, train_data.premise.values, tokenizer)

In [None]:
# Display the encoded input dict ('input_word_ids' and 'input_mask')
inputs

In [None]:
# Shape of the padded ID tensor before it is cut down to MAX_LEN
inputs['input_word_ids'].shape

In [None]:
def build_model():
    """Assemble and compile the 3-class classifier on the pretrained encoder.

    Reads the module-level ``model_name``, ``MAX_LEN`` and ``lr``.

    Returns:
        A compiled ``tf.keras.Model`` with two int32 inputs of width
        ``MAX_LEN`` named "input_word_ids" and "input_mask", and a 3-way
        softmax output.
    """
    # Pretrained encoder that produces the embeddings
    backbone = TFAutoModel.from_pretrained(model_name)

    # Symbolic inputs: token IDs and the matching attention mask
    ids_in = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    mask_in = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_mask")

    # The encoder output's first element is the one used for the embeddings;
    # only the vector at sequence position 0 feeds the classification head.
    sequence_output = backbone([ids_in, mask_in])[0]
    first_position = sequence_output[:, 0, :]

    # Head: dropout -> small ReLU layer -> softmax over the 3 classes
    hidden = tf.keras.layers.Dropout(.4)(first_position)
    hidden = tf.keras.layers.Dense(32, activation="relu")(hidden)
    class_probs = tf.keras.layers.Dense(3, activation="softmax")(hidden)

    classifier = tf.keras.Model(inputs=[ids_in, mask_in], outputs=class_probs)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=["accuracy"],
    )

    return classifier

In [None]:
# Build the model inside the selected distribution strategy's scope and
# print its layer summary
with strategy.scope():
    model = build_model()
    model.summary()

In [None]:
# Keep at most the first MAX_LEN columns of every input tensor
for name, tensor in list(inputs.items()):
    inputs[name] = tensor[:, :MAX_LEN]

In [None]:
# Names of the tensors that will be fed to the model
inputs.keys()

In [None]:
# Fine-tune on the encoded training pairs, with a 0.2 validation split
model.fit(
    x=inputs,
    y=train_data.label.values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=.2,
)

In [None]:
# Metric history stored on the model, plotted per epoch (epochs numbered from 1)
history = model.history.history
epoch_numbers = range(1, len(history['loss']) + 1)
px.line(
    history,
    x=epoch_numbers,
    y=['accuracy', 'val_accuracy'],
    title='Model Accuracy',
    labels={'x': 'Epoch', 'value': 'Accuracy'},
)

In [None]:
# Training and validation loss per epoch (epochs numbered from 1)
loss_epochs = range(1, 1 + len(history['loss']))
px.line(
    history,
    x=loss_epochs,
    y=['loss', 'val_loss'],
    title='Model Loss',
    labels={'x': 'Epoch', 'value': 'Loss'},
)

In [None]:
# Encode the test pairs like the training pairs, keeping at most the first
# MAX_LEN columns of each tensor
test_inputs = {
    name: tensor[:, :MAX_LEN]
    for name, tensor in roberta_encode(
        test_data.hypothesis.values, test_data.premise.values, tokenizer
    ).items()
}

In [None]:
# Per-class scores for every test row, then the index of the highest score
preds = model.predict(test_inputs)
predictions = list(np.argmax(preds, axis=-1))

In [None]:
# Submission table: the test ids plus the predicted class for each row
submission = test_data[["id"]].copy().assign(prediction=predictions)
submission.head()

In [None]:
# Write the submission file without the DataFrame index column
submission.to_csv('submission.csv', index=False)