In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
from sklearn.model_selection import train_test_split

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction

*“It is a capital mistake to theorize before one has data. Insensibly one begins to twist facts to suit theories, instead of theories to suit facts.”* - Sir Arthur Conan Doyle, Sherlock Holmes

The dataset provided for this competition contains only about 12K records. Deep learning models generally work better with more data. Thanks to [Yih-Dar SHIEH](https://www.kaggle.com/yihdarshieh/more-nli-datasets-hugging-face-nlp-library) for an excellent kernel on how to get more datasets for NLI, using the [Hugging Face nlp](https://huggingface.co/nlp/) library to easily share and access datasets and evaluation metrics for Natural Language Processing.

In this kernel, we will augment our training data with other NLI datasets. A few NLI datasets that we can get from the Hugging Face nlp library are:

1. [SNLI](https://huggingface.co/datasets/snli)
2. [Adversarial Natural Language Inference](https://huggingface.co/datasets/anli)
3. [Multi-Genre Natural Language Inference](https://huggingface.co/datasets/multi_nli)
4. [XNLI](https://huggingface.co/datasets/xnli): This is a subset of MNLI which has been translated into 14 different languages. While the others are purely English, this one is cross-lingual.



In [None]:
# Load the competition's train and test CSV files and report their (rows, columns) shapes.
train=pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
print("Shape of Train Data ",train.shape)
test=pd.read_csv("/kaggle/input/contradictory-my-dear-watson/test.csv")
print("Shape of Test Data ",test.shape)

In [None]:
# Preview the first rows of the competition training frame.
train.head()

## Install the NLP Library of Hugging Face and import it

In [None]:
# Install the Hugging Face `nlp` package into the session, then import it.
# NOTE(review): the install is unpinned, so results depend on whatever version pip resolves;
# the package is also believed to have been renamed to `datasets` upstream - confirm before reuse.
!pip install nlp
import nlp

## Getting the Stanford Natural Language Inference (SNLI) Data

In [None]:
# Download/load the SNLI dataset through the nlp library.
snli= nlp.load_dataset("snli")

In [None]:
# Display the loaded object to inspect which splits it contains.
snli

SNLI data has three datasets - train, validation and test. The train data consists of 550152 records and three classes - entailment, neutral and contradiction.

In [None]:
# Show the Python type of the container returned by load_dataset.
print("Type of SNLI Dataset ",type(snli))

In [None]:
# Inspect the train split: its summary representation and its Python type.
print(snli['train'])
print("Type of SNLI Train ",type(snli['train']))

SNLI Train contains three features: premise, hypothesis and label. The label is one of the three classes. We can iterate over the split to get the data. We will convert it to a dataframe and have a look at the data.

**Also, we need to check if any of the premise,hypothesis pair is in the test data and remove it - to avoid any data leakage**

In [None]:
# Materialise the SNLI train split as a pandas DataFrame and preview it.
snli_train_df = pd.DataFrame(snli['train'])
snli_train_df.head()

To detect duplicates between the test data and the SNLI data, we will use the merge function with **indicator=True**. This creates an additional column "**_merge**" which indicates whether a row is present in both tables or only in the left or right table. We can then identify the duplicate rows.

In [None]:
# Outer-join test and SNLI on their shared columns (premise, hypothesis).
# indicator=True adds a '_merge' column; a value of "both" would mean the pair exists in both frames.
test_dup=pd.merge(test[['premise','hypothesis']],snli_train_df[['premise','hypothesis']],how="outer",indicator=True)
test_dup.head()

In [None]:
# List the distinct '_merge' values; "both" appearing here would indicate overlap with the test set.
test_dup['_merge'].unique()


There are no duplicates between test data and the SNLI Train Data. So we can safely use the SNLI data, without having to worry about data leakage



In [None]:
# Tag every row with its source so individual datasets can be selected later via the
# 'dataset' column; SNLI rows are additionally tagged lang_abv="en" to match the
# competition schema.
# assign() returns new frames, so nothing is written into a column-slice of `train`
# (the original in-place write on that slice triggers pandas' SettingWithCopyWarning).
snli_train_df = snli_train_df.assign(lang_abv="en", dataset="snli")
train_df = train[['premise','hypothesis','lang_abv','label']].assign(dataset="train")
# NOTE(review): SNLI is believed to mark examples without a gold label as -1; verify and
# filter such rows before ever selecting 'snli' for training with a 3-class loss.
train_df = pd.concat([train_df, snli_train_df])
print("Shape of Original Train Data ",train.shape)
print("Shape of SNLI Train Data ",snli_train_df.shape )
print("Shape after merging Original Train Data With SNLI ",train_df.shape)

In [None]:
# Preview the combined frame (competition train rows followed by SNLI rows).
train_df.head()

### To get Cross Lingual Data, let us load the XNLI Dataset as well

In [None]:
# Load the XNLI dataset and display it to inspect the available splits.
xnli=nlp.load_dataset("xnli")
xnli

XNLI contains only test and validation data across 15 languages. The validation data consists of 2490 hypothesis-premise pairs, each translated into all 15 languages, and it uses the same three label classes.

In [None]:
# Print the first four validation examples together with the readable name of their label.
xnli_validation = xnli['validation']
label_names = xnli_validation.features['label'].names

for position, example in enumerate(xnli_validation):
    print('premise:', example['premise'])
    print('hypothesis:', example['hypothesis'])
    print('label:', example['label'])
    print('label name:', label_names[example['label']])
    print('-' * 80)

    # Positions start at 0, so stopping after position 3 shows exactly four examples.
    if position == 3:
        break

XNLI Data structure is slightly different from SNLI. Here we have a dictionary of premise and hypothesis across different languages.

In [None]:
# Flatten XNLI: every validation example carries one premise per language (a dict keyed
# by language code) and parallel 'language'/'translation' lists for the hypothesis.
# Emit one (premise, hypothesis, label, language) row per translation.
xnli_rows = [
    (
        example['premise'][lang],
        example['hypothesis']['translation'][pos],
        example['label'],
        lang,
    )
    for example in xnli['validation']
    for pos, lang in enumerate(example['hypothesis']['language'])
]

# Build the frame once, with the column order used by the rest of the notebook.
xnli_valid_df = pd.DataFrame(xnli_rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

In [None]:
# (rows, columns) of the flattened XNLI validation frame.
xnli_valid_df.shape

In [None]:
# Same leakage check as for SNLI: outer-join test and XNLI on (premise, hypothesis);
# '_merge' == "both" marks pairs that occur in both frames.
test_dup=pd.merge(test[['premise','hypothesis']],xnli_valid_df[['premise','hypothesis']],how="outer",indicator=True)
test_dup.head()

In [None]:
# Count rows per '_merge' category; the "both" count is the number of overlapping pairs.
test_dup['_merge'].value_counts()

There are 746 rows in the XNLI validation dataset that are also present in the test data. We have to remove those rows.

In [None]:
# Keep only the (premise, hypothesis) pairs found in both the test set and XNLI.
dup_pairs=test_dup[test_dup['_merge']=="both"]
dup_pairs.head()

In [None]:
# Build a single text key (premise + space + hypothesis) used to match leaked rows.
# `dup_pairs` is a boolean-filtered slice of `test_dup`; assign() creates a new frame
# instead of writing into that slice, which avoids pandas' SettingWithCopyWarning.
dup_pairs = dup_pairs.assign(combo=dup_pairs['premise'] + " " + dup_pairs['hypothesis'])
dup_pairs.head()

In [None]:
# Add the same premise + space + hypothesis key to the XNLI frame so it can be matched
# against the overlapping pairs.
xnli_valid_df = xnli_valid_df.assign(
    combo=xnli_valid_df['premise'] + " " + xnli_valid_df['hypothesis']
)

In [None]:
# Keep only the XNLI rows whose text key does not occur among the test-set overlaps.
leaked_combos = dup_pairs['combo']
is_leaked = xnli_valid_df['combo'].isin(leaked_combos)
xnli_valid_without_dups = xnli_valid_df[~is_leaked]

In [None]:
# Remove the helper key column. Rebinding to the frame returned by drop() avoids mutating
# a filtered slice in place, and errors='ignore' keeps the cell safe to re-run after the
# column has already been removed.
xnli_valid_without_dups = xnli_valid_without_dups.drop(columns=['combo'], errors='ignore')

In [None]:
# (rows, columns) of the XNLI frame after removing pairs that overlap with the test set.
xnli_valid_without_dups.shape

In [None]:
# Tag the rows with their source. assign() returns a new frame rather than writing into
# the boolean-filtered slice, which avoids pandas' SettingWithCopyWarning.
xnli_valid_without_dups = xnli_valid_without_dups.assign(dataset="xnli")

In [None]:
# Append the de-duplicated XNLI rows to the combined training frame and show its new shape.
# NOTE: re-running this cell appends the XNLI rows again, because train_df is rebound to the result.
train_df=pd.concat([train_df,xnli_valid_without_dups])
train_df.shape

## Load the MNLI Dataset 

In [None]:
# Load the 'mnli' configuration of the GLUE benchmark and display it to inspect its splits.
mnli=nlp.load_dataset(path='glue', name='mnli')
mnli

In [None]:
# Materialise the MNLI train split as a pandas DataFrame and preview it.
mnli_train=pd.DataFrame(mnli['train'])
mnli_train.head()

In [None]:
# (rows, columns) of the MNLI train frame.
mnli_train.shape

In [None]:
# Leakage check for MNLI: outer-join test and MNLI on (premise, hypothesis) and list the
# distinct '_merge' values; "both" appearing here would indicate overlap with the test set.
test_dup=pd.merge(test[['premise','hypothesis']],mnli_train[['premise','hypothesis']],how="outer",indicator=True)
test_dup['_merge'].unique()

There are no rows of MNLI present in the test data. So we do not worry about data leakage from the MNLI Dataset

In [None]:
# Drop the 'idx' column, which the combined training frame does not use.
# The non-in-place drop with errors='ignore' keeps this cell idempotent: the original
# in-place version raises KeyError when the cell is executed a second time.
mnli_train = mnli_train.drop(columns=['idx'], errors='ignore')

In [None]:
# Tag MNLI rows with the language code and source name used by the combined frame.
mnli_train = mnli_train.assign(lang_abv="en", dataset="mnli")

In [None]:
# Append the MNLI rows to the combined training frame and show its new shape.
# NOTE: re-running this cell appends the MNLI rows again, because train_df is rebound to the result.
train_df=pd.concat([train_df,mnli_train])
train_df.shape

After merging all the datasets, we have around 991578 rows in the data.

In [None]:
# Number of missing values in each column of the combined frame.
train_df.isna().sum()

### Encode the Data Using batch_encode_plus.

This will tokenise and encode the data, and it also allows for padding. XLM-R has been shown to significantly outperform multilingual BERT. It has the same architecture as BERT, but tokenisation is based on BPE and we no longer need token_type_ids.

In [None]:
from transformers import BertTokenizer,TFBertModel
import tensorflow as tf
from transformers import AutoTokenizer


In [None]:
# Notebook-wide configuration.
MODEL_NAME="jplu/tf-xlm-roberta-large"  # checkpoint name passed to the tokenizer/model from_pretrained calls
MAX_LEN=64  # fixed token length used for padding/truncation and for the model's input shape
BATCH_SIZE=64  # batch size for the tf.data pipelines; also used to derive steps_per_epoch

In [None]:
# AutoTokenizer selects and initialises the tokenizer matching the given model name.
tokeniser=AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.optimizers import Adam
from transformers import TFAutoModel

In [None]:
# Pick a tf.distribute strategy: use a TPU when one can be resolved, otherwise fall back
# to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("In TPU STRATERGY")
    print('Number of replicas:', strategy.num_replicas_in_sync)
# NOTE(review): presumably the resolver raises ValueError when no TPU is attached - confirm.
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
    print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
def createModel(learning_rate=1e-5, num_classes=3):
    """Build and compile the transformer-based sequence classifier.

    The pretrained encoder named by MODEL_NAME is loaded inside the distribution
    strategy scope, fed a fixed-length (MAX_LEN) tensor of token ids, and the
    hidden state of the first token is passed to a softmax Dense head.

    Args:
        learning_rate: Adam learning rate. Defaults to 1e-5, the value that was
            previously hard-coded.
        num_classes: Number of units in the softmax head. Defaults to 3.

    Returns:
        A compiled tf.keras Model mapping int32 token ids of shape
        (batch, MAX_LEN) to class probabilities of shape (batch, num_classes).
    """
    with strategy.scope():
        encoder = TFAutoModel.from_pretrained(MODEL_NAME)
        input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_token', dtype='int32')

        # The first element of the encoder output is the last hidden state,
        # shaped (batch, MAX_LEN, hidden_size).
        # NOTE(review): only token ids are passed, no attention mask, so padded positions
        # are presumably attended to like real tokens - confirm whether a mask should be fed.
        sequence_output = encoder(input_ids)[0]

        # The first position holds the sequence-level token, used here as a summary of
        # the whole premise/hypothesis pair.
        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(num_classes, activation='softmax')(cls_token)
        classification_model = Model(inputs=input_ids, outputs=output_layer)

        # Use the optimizer from tf.keras (the same Keras implementation as Model/Dense)
        # and the `learning_rate` keyword; `lr` is a deprecated alias.
        classification_model.compile(
            tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )

    return classification_model
    
    
    
    

In [None]:
def createTFDataSet(data,external_dataset=None,test_size=0.2,padding=True,max_length=MAX_LEN,truncation=True,batch_size=BATCH_SIZE):
    """Split, tokenise and wrap the training data as tf.data pipelines.

    Rows tagged dataset == "train" are always used; rows from the sources named
    in `external_dataset` are appended to them. The result is split into train
    and validation parts, encoded with the MODEL_NAME tokenizer and converted
    into batched tf.data.Dataset objects.

    Args:
        data: DataFrame with 'premise', 'hypothesis', 'label' and 'dataset' columns.
        external_dataset: None, a single source name, or an iterable of source
            names (values of the 'dataset' column) to add to the training data.
        test_size: Fraction of rows held out for validation.
        padding: True pads every sequence to `max_length` (as before); any other
            value is forwarded to the tokenizer unchanged.
        max_length: Maximum token length per encoded pair.
        truncation: Truncation setting forwarded to the tokenizer (it used to be
            ignored and hard-coded to True).
        batch_size: Batch size of both datasets.

    Returns:
        (tokeniser, train_dataset, valid_dataset, number_of_training_rows).
    """
    tokeniser = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_data = data[data['dataset'] == "train"]
    if external_dataset is not None:
        # Accept a bare string as a single source name; isin() requires a list-like.
        if isinstance(external_dataset, str):
            external_dataset = [external_dataset]
        external_data = data[data['dataset'].isin(external_dataset)]
        train_data = pd.concat([train_data, external_data])

    ### Split the data into train and validation parts
    X = train_data[['hypothesis','premise']]
    y = train_data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print("Shape of Train Data ",X_train.shape)
    print("Shape of Test Data ",X_test.shape)

    ### Encode the training and validation pairs
    # `pad_to_max_length` is deprecated; padding='max_length' is its documented equivalent.
    padding_strategy = 'max_length' if padding is True else padding
    encode_kwargs = dict(padding=padding_strategy, max_length=max_length, truncation=truncation)
    train_encoded = tokeniser.batch_encode_plus(X_train.values.tolist(), **encode_kwargs)
    val_encoded = tokeniser.batch_encode_plus(X_test.values.tolist(), **encode_kwargs)

    ### Convert the encoded data into tf.data pipelines
    auto = tf.data.experimental.AUTOTUNE
    # The training pipeline repeats indefinitely, so the caller must pass steps_per_epoch to fit().
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((train_encoded['input_ids'], y_train))
        .repeat()
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(auto)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((val_encoded['input_ids'], y_test))
        .batch(batch_size)
        .cache()
        .prefetch(auto)
    )

    return tokeniser,train_dataset,valid_dataset,X_train.shape[0]
    
    
    

In [None]:
# Build the train/validation pipelines from the competition rows plus the XNLI and MNLI rows
# (SNLI rows stay in train_df but are not selected here).
tokeniser,train_dataset,valid_dataset,train_rows=createTFDataSet(train_df,external_dataset=['xnli','mnli'])

In [None]:
# Create the model under the distribution strategy and print its layer summary.
# createModel() enters strategy.scope() itself as well.
with strategy.scope():
    model=createModel()
    model.summary()

In [None]:
# The training dataset repeats indefinitely, so the epoch length is given explicitly:
# the number of whole batches in the training rows.
n_steps =  train_rows//BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=5
)

In [None]:
# Encode the competition test pairs in the same (hypothesis, premise) order and with the
# same fixed length as the training data.
# padding='max_length' replaces the deprecated `pad_to_max_length=True`.
test_encoded = tokeniser.batch_encode_plus(
    test[['hypothesis','premise']].values.tolist(),
    padding='max_length',
    max_length=MAX_LEN,
    truncation=True,
)

# Batched, unshuffled pipeline so predictions keep the row order of `test`.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_encoded['input_ids'])
    .batch(BATCH_SIZE)
)


In [None]:
# Predict class probabilities for every test pair, then take the most probable class per row.
test_preds = model.predict(test_dataset, verbose=1)
predictions = np.argmax(test_preds, axis=1)

In [None]:
# Assemble the submission frame: one row per test id with its predicted class.
submission = pd.DataFrame({
    'id': test['id'].tolist(),
    'prediction': predictions,
})

In [None]:
# Write the submission file to the working directory without the DataFrame index.
submission.to_csv("submission.csv",index=False)

In [None]:
# Persist the trained weights, replacing any existing file of the same name.
model.save_weights("XLM_R_MNLI_XNLI.h5",overwrite=True)