In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
# Print the full path of every file found under /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Set here, before TensorFlow is imported further down in the notebook,
# so the flag is already in the environment when TensorFlow loads.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [None]:
# Load the competition's train and test splits into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    )
)

## Data Preprocessing

In [None]:
# (rows, columns) of the train split followed by the test split.
# The Train Dataset has 12,120 rows and 6 columns.
# Meanwhile, the Test Dataset has 5,195 rows and 5 columns.
tuple(frame.shape for frame in (df_train, df_test))

In [None]:
# Missing values per column in the Train Dataset.
df_train.isna().sum()

In [None]:
# Missing values per column in the Test Dataset.
# The 'label' column is not present in the Test Dataset.
df_test.isna().sum()

In [None]:
# Number of rows in the Train Dataset that repeat an earlier row across all columns.
df_train.duplicated(subset=None, keep='first').sum()

In [None]:
# Number of rows in the Test Dataset that repeat an earlier row across all columns.
df_test.duplicated(subset=None, keep='first').sum()

In [None]:
# Preview the first 15 training rows.
# Two sentences can be related in three ways: one entails the other, one contradicts the other, or they are unrelated.
# In this dataset, the hypothesis (2nd sentence) does one of the three to the premise (1st sentence).
# The 'label' column encodes that relationship: 0 - entailment; 1 - neutral; 2 - contradiction.
df_train.iloc[:15]

In [None]:
# Preview the first 15 test rows.
df_test.iloc[:15]

In [None]:
# Count the distinct language codes and language names in each split, then how many
# language codes the two splits have in common.
train_codes = set(df_train.lang_abv.unique())
test_codes = set(df_test.lang_abv.unique())

print('train dataset lang_abv: ', len(train_codes), ', languages: ', len(df_train.language.unique()))
print('test dataset lang_abv: ', len(test_codes), ', languages: ', len(df_test.language.unique()))
# Set intersection needs `&`; the boolean `and` would simply evaluate to the second set
# and report the test-only count instead of the overlap.
print('train dataset languages & test dataset languages: ', len(train_codes & test_codes))
# This code block shows that the Datasets contain premise-hypothesis pairs in fifteen different languages.
# The languages are Arabic, Bulgarian, Chinese, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, and Vietnamese.

## Percent Distribution of Languages in the Train and Test Datasets

In [None]:
# Pie chart of the percentage share of each of the 15 languages in the Train Dataset.
languages, counts = np.unique(df_train.language.values, return_counts=True)

fig, ax = plt.subplots(figsize=(15, 15))
ax.pie(counts, labels=languages, autopct='%1.1f%%')
plt.show()

In [None]:
# Pie chart of the percentage share of each of the 15 languages in the Test Dataset.
languages, counts = np.unique(df_test.language.values, return_counts=True)

fig, ax = plt.subplots(figsize=(15, 15))
ax.pie(counts, labels=languages, autopct='%1.1f%%')
plt.show()

## Setting up the TPU

In [None]:
import tensorflow as tf

In [None]:
# Detect the available accelerator once and build the matching distribution strategy.
# `tpu` is always bound (None when no TPU is reachable) and `gpus` is always defined,
# so the branch below also works on GPU-only and CPU-only sessions.
try:
    # TPUs are network-connected accelerators and must first be located on the network;
    # TPUClusterResolver.connect() does that and initializes the TPU system.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
except ValueError:
    # No TPU could be resolved for this session: fall back to GPU or CPU.
    tpu = None

gpus = tf.config.list_logical_devices('GPU')

if tpu:
    # The TPUStrategy object contains the distributed training code that runs on the TPU's compute cores.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    tpu_strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    tpu_strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    tpu_strategy = tf.distribute.get_strategy()
    print('Running on CPU')

# Report which accelerator was picked and how many replicas training will be spread over.
print("Number of accelerators: ", tpu_strategy.num_replicas_in_sync)

## Building the Model

In [None]:
from transformers import TFAutoModel, AutoTokenizer
# This code enables me to use the cutting-edge NLP models from Transformers library maintained and democratized by Hugging Face.

In [None]:
def input_convert(data):
    """Turn a sequence of per-example tokenizer outputs into dense model inputs.

    Args:
        data: iterable of mappings, each providing 'input_ids' (the integer
            token ids of one example) and 'attention_mask' (1 for a real
            token, 0 for padding).

    Returns:
        dict with two entries, 'input_word_ids' and 'input_mask', each a dense
        tensor with one row per example.
    """
    word_id_rows = []
    mask_rows = []
    for encoded in data:
        word_id_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])

    # Examples differ in length, so the rows are first collected as a ragged
    # tensor and then densified: to_tensor() zero-pads every row up to the
    # longest one, so no example is cut short.
    return {
        'input_word_ids': tf.ragged.constant(word_id_rows).to_tensor(),
        'input_mask': tf.ragged.constant(mask_rows).to_tensor(),
    }

In [None]:
# Separate the target from the features without mutating df_train, so this cell can be
# re-run safely (DataFrame.pop removes the column in place and raises KeyError the second time).
y = df_train['label']
df_train_features = df_train.drop(columns='label')

# Stack train and test rows (train first) so both splits are tokenized in a single pass.
df = pd.concat([df_train_features, df_test], ignore_index = True)
df_train_features.shape, df_test.shape, df.shape

In [None]:
model_name = 'joeddav/xlm-roberta-large-xnli'
# XLM-RoBERTa is a multilingual derivative of the original BERT model, developed by Facebook.
# This checkpoint is published for zero-shot text classification, especially in languages other than English.
# Zero-shot classification means evaluating on labels the classifier never saw during training.

# AutoTokenizer loads the tokenizer that matches the checkpoint.
# A tokenizer turns text into arrays of integer token ids that the model can consume.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# XLM-RoBERTa encoders accept at most 512 tokens per input; longer pairs are truncated
# instead of overflowing the model's position embeddings.
MAX_SEQ_LEN = 512

# Encode every premise/hypothesis pair as one sequence with special tokens.
# No padding is requested here: padding a single example is a no-op, and the rows are
# padded to a common length afterwards inside input_convert().
encodings = [
    tokenizer(premise, hypothesis, add_special_tokens = True, truncation = True, max_length = MAX_SEQ_LEN)
    for premise, hypothesis in zip(df['premise'], df['hypothesis'])
]

# Neural networks need inputs of identical shape, so input_convert() pads the
# variable-length encodings into rectangular tensors.
inputs = input_convert(encodings)

## Train-test Split

In [None]:
# The combined frame was built with the training rows first, so the first len(y) rows
# of every input tensor belong to the train split and the remainder to the test split.
n_train = len(y)
inputs_train = {name: tensor[:n_train, :] for name, tensor in inputs.items()}
inputs_test = {name: tensor[n_train:, :] for name, tensor in inputs.items()}

In [None]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
# The Dense layer is the regular fully connected neural network layer. It is the most common and frequently used layer.
# The Dropout layer helps prevent overfitting by randomly setting input units to 0 with a frequency of `rate` at each step during training. Inputs not set to 0 are scaled up by 1/(1-rate), so that the expected sum over all inputs is unchanged.
# Adam is a widely used adaptive-learning-rate optimizer. Note that plain Adam does not apply weight decay; decoupled weight decay is provided by the separate AdamW optimizer.


with tpu_strategy.scope():
    # The model is instantiated inside the strategy scope because variables (encoder weights,
    # Dense kernels, optimizer state) must be created under the strategy to be placed and
    # replicated on its devices.
    max_len = inputs['input_word_ids'].shape[1]

    encoder = TFAutoModel.from_pretrained(model_name)

    # The Input names match the keys of the input dictionaries that are fed to fit()/predict().
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # The encoder's first output holds one hidden vector per token; only the vector of the
    # first token (position 0) is used as the representation of the whole sentence pair.
    embedding = encoder([input_word_ids, input_mask])[0]
    first_token_vector = embedding[:,0,:]

    # ReLU is a nonlinear activation suited to hidden layers; it lets the head learn complex relationships.
    dense1 = Dense(256, activation='relu')(Dropout(0.1)(first_token_vector))
    dense2 = Dense(64, activation='relu')(dense1)
    # Softmax turns the three raw scores into a valid probability distribution over the classes,
    # which is why it is used as the activation of the final output layer.
    output = Dense(3, activation='softmax')(dense2)

    model = Model(inputs=[input_word_ids, input_mask], outputs = output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias that newer
    # Keras releases no longer accept.
    # Sparse categorical crossentropy is used because each label is a single integer class id,
    # which avoids building a one-hot vector per example.
    model.compile(Adam(learning_rate=1e-6), loss='sparse_categorical_crossentropy', metrics=['accuracy'], steps_per_execution = 100)

## Fitting the Model

In [None]:
# Early stopping is a basic technique against overfitting: training stops once the monitored
# metric (validation loss, the Keras default) has not improved for `patience` epochs, and
# the weights of the best epoch are restored.
early_stop = tf.keras.callbacks.EarlyStopping(patience = 4, restore_best_weights = True)

# A batch is the slice of the dataset passed through the network in one step;
# the global batch size scales with the number of replicas in the strategy.
batch_size = 16 * tpu_strategy.num_replicas_in_sync

# An epoch is one complete pass over the training data; verbose = 1 prints progress logs.
# The last 20% of the training rows are held out for validation.
# Callbacks are hooks that Keras invokes during training; here the only one is early stopping.
model.fit(
    x = inputs_train,
    y = y.values,
    batch_size = batch_size,
    epochs = 10,
    verbose = 1,
    callbacks = [early_stop],
    validation_split = 0.2,
)

## Generating & Submitting Predictions

In [None]:
# model.predict yields one row of class probabilities per test example;
# numpy.argmax along axis 1 returns the index of the largest probability in each row.
class_probabilities = model.predict(inputs_test)
predictions = list(np.argmax(class_probabilities, axis = 1))

In [None]:
# Pair every test id with its predicted class in a two-column frame and preview it.
submission = df_test[['id']].assign(prediction = predictions)
submission.head()

In [None]:
# Write the submission file in the format of sample_submission.csv:
#   id         - the unique identifier of each sample
#   prediction - the predicted relationship between premise and hypothesis
#                (0 for entailment, 1 for neutral, 2 for contradiction)
submission.to_csv(path_or_buf = "submission.csv", index = False)