In [None]:
# Setting package umum 
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

from matplotlib.pylab import rcParams
# For every plotting cell use this
# grid = gridspec.GridSpec(n_row,n_col)
# ax = plt.subplot(grid[i])
# fig, axes = plt.subplots()
rcParams['figure.figsize'] = [10,5]
plt.style.use('fivethirtyeight') 
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 150)
pd.options.display.float_format = '{:.4f}'.format

In [None]:
### Install packages
from IPython.display import clear_output

!pip install googletrans
!pip install p_tqdm
!pip install transformers==3.0.2
clear_output()

# So how to configure our input, Watson ?
Hello there! In this notebook I benchmark the performance of different input configurations using the pre-trained **DistilBERT** model available from HuggingFace. DistilBERT is a modification of the original BERT model published by Google that focuses on reducing training time while minimizing the loss in performance, which makes it a good fit for benchmarking different configurations with K-fold splits.

Here is the list of input configuration that I will experiment on
1. Combine the hypotheses and premise into one sentence
2. Using all-English translated data
3. Removing stopwords and punctuations
4. Use hypotheses and premise as separate input (so the model will have 2 input)

The performance of every configuration is measured with 10 epochs of training on a 5-fold StratifiedKFold split of the train dataset. To do this I use the functions from the following notebook with slight modifications (go and upvote this amazing notebook: https://www.kaggle.com/rohanrao/tpu-sherlocked-one-stop-for-with-tf). First, let's prepare the data.

# Overview of DistilBERT
This is a quotation from the DistilBERT paper. DistilBERT is a **small, fast, cheap and light** Transformer model based on the BERT architecture. There are many ways to make a model smaller, and one of them is **distillation**.

In a nutshell, distillation means we build a small model that can reproduce the behavior of a large model (for DistilBERT the large model is BERT). This is done by using the soft labels produced by the BERT model as the target variable for the small model, which is loosely similar to label smoothing. This process is called **teacher-student learning**.

In order to learn more about DistilBERT first we have to learn about BERT. There is a 13 min video in Youtube that explained it well (here is the link https://www.youtube.com/watch?v=OR0wfP2FD3c), go check it out. Basically DistilBERT is a smaller version of BERT with a modification
- Remove token type embedding and pooler
- Only use half of the layer on BERT

Although it removes many parts of BERT, DistilBERT performs only slightly worse than BERT while being about 60% faster in training and inference. This is exactly why I chose DistilBERT, since the goal of this notebook is benchmarking.

![](https://miro.medium.com/max/1000/1*hHwcSZEazpY_PwArgBzHtw.png)

In [None]:
# Load dataset
df_train = pd.read_csv('../input/contradictory-my-dear-watson/train.csv')
df_test = pd.read_csv('../input/contradictory-my-dear-watson/test.csv')

In [None]:
### Overview dataset
df_train.head(11)

In [None]:
### Translate text into english
import multiprocessing
from multiprocessing import Pool
from p_tqdm import p_map
from googletrans import Translator
translator = Translator()

### Function for paralallel translation
def translate_to_en(text) :
    """Translate one piece of text into English and return the translated string.

    Uses the notebook-level `translator` object; it is only read here, so no
    `global` declaration is required.
    """
    translation = translator.translate(text, dest = 'en')
    return translation.text

def pool_translated_en(list_text) :
    """Translate every text in `list_text` to English using a worker pool.

    The pool is sized at 8 workers per CPU reported by `multiprocessing`.
    Results are returned as a list in the same order as the input, and a
    tqdm progress bar tracks completion.
    """
    n_workers = multiprocessing.cpu_count() * 8

    with Pool(n_workers) as executor:
        translated_iter = executor.imap(translate_to_en, list_text)
        result = list(tqdm(translated_iter, total=len(list_text)))

    return result

def dataset_translated_en(df) :
    """Return a copy of `df` in which every non-English row is translated to English.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'premise', 'hypothesis', 'lang_abv' and
        'language'. Rows whose 'lang_abv' is not 'en' are translated.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) whose 'premise' and
        'hypothesis' are translated where needed, and whose 'lang_abv' /
        'language' are set to 'en' / 'English' for every row.
    """
    ### Initialize translated dataset
    df_trans = df.copy()

    ### Get non-english text
    # A boolean mask with a single .loc[rows, col] lookup avoids chained
    # indexing and also stays correct when the index contains duplicate labels.
    non_en_mask = df_trans['lang_abv'] != 'en'

    # Skip the (expensive) worker pool entirely when nothing needs translating.
    if non_en_mask.any() :
        list_prem = df_trans.loc[non_en_mask, 'premise'].tolist()
        list_hyp = df_trans.loc[non_en_mask, 'hypothesis'].tolist()

        ### Translate premise
        print('\nPremise Translation - En')
        df_trans.loc[non_en_mask, 'premise'] = pool_translated_en(list_prem)

        ### Translate hypotheses
        print('\nHypothesis Translation - En')
        df_trans.loc[non_en_mask, 'hypothesis'] = pool_translated_en(list_hyp)

    ### Change language value
    df_trans['lang_abv'] = 'en'
    df_trans['language'] = 'English'

    return df_trans
    
### Translate!
df_train_en = dataset_translated_en(df_train)
df_test_en = dataset_translated_en(df_test)

In [None]:
### Compare translation result
import random
list_non_en_idx = list(df_train[df_train['lang_abv']!='en'].index)

# random.randint is inclusive on BOTH ends, so randint(0, len(lst)) can return
# len(lst) and index one past the end of the list. Sampling the index labels
# directly avoids that and also copes with fewer than 5 non-English rows.
n_examples = min(5, len(list_non_en_idx))
for row_label in random.sample(list_non_en_idx, k=n_examples) :
    print('\nORIGINAL TEXT :',df_train['premise'][row_label])
    print('TRANSLATED TEXT :',df_train_en['premise'][row_label])

In [None]:
### Proportion for each languange in train dataset
print('Proportion for each languange in train dataset')
df_train['language'].value_counts() / len(df_train) * 100

Looks like the translator has done a pretty good job. We know that the train dataset consists mostly of English text (56%). In the future I will try translating the English text into the other languages so that the proportions become more uniform.

In [None]:
### Remove unnecessary whitespaces, lower text and remove punctuation
import string

def remove_punctuation(text) :
    """Return `text` with every character found in `string.punctuation` removed."""
    punctuation_chars = set(string.punctuation)
    kept_chars = filter(lambda ch : ch not in punctuation_chars, text)

    return ''.join(kept_chars)

def quick_clean_data(dataset, var) :
    """Return a copy of `dataset` with the text column `var` normalised.

    The column is lower-cased, stripped of leading/trailing whitespace, has
    all `string.punctuation` characters removed, and has internal runs of
    whitespace collapsed to a single space.

    Everything is done with vectorised `.str` operations instead of a
    row-wise `DataFrame.apply`, which is faster and lets missing values pass
    through as missing rather than raising.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the text column; it is not modified.
    var : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of `dataset`.
    """
    df = dataset.copy()

    # Translation table that deletes every punctuation character
    drop_punctuation = str.maketrans('', '', string.punctuation)

    df[var] = (
        df[var]
        .str.lower()                        # Lowercase
        .str.strip()                        # Strip whitespaces
        .str.translate(drop_punctuation)    # Remove punctuation
        .str.split()                        # Remove double whitespaces:
        .str.join(' ')                      #   split on any whitespace run, re-join
    )

    return df

list_var = ['premise','hypothesis']
for var in list_var :
    df_train = quick_clean_data(df_train, var)
    df_test = quick_clean_data(df_test, var)
    df_train_en = quick_clean_data(df_train_en, var)
    df_test_en = quick_clean_data(df_test_en, var)

In [None]:
### Make another dataset with stopwords removed
from nltk.corpus import stopwords
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

# Lazily-filled cache of the English stop words (see remove_stop_words)
_EN_STOP_WORDS = None

def remove_stop_words(text) :
    """Return `text` with English stop words removed.

    `text` is split on whitespace, every token that appears in NLTK's English
    stop-word list is dropped, and the remaining tokens are joined with single
    spaces. The comparison is case-sensitive, exactly as the tokens appear.

    The stop-word list is loaded once and kept as a frozenset: the function is
    called for every row of every dataset, so reloading the corpus and doing
    list membership tests on each call is needlessly slow.
    """
    global _EN_STOP_WORDS
    if _EN_STOP_WORDS is None :
        _EN_STOP_WORDS = frozenset(stopwords.words('english'))

    # Remove stop words
    return ' '.join(word for word in text.split() if word not in _EN_STOP_WORDS)

### Initialize dataset
df_train_no_stop = df_train_en.copy()
df_test_no_stop = df_test_en.copy()
list_var = ['premise','hypothesis']

for var in list_var :
    df_train_no_stop[var] = df_train_no_stop.progress_apply(lambda x : remove_stop_words(x[var]), axis=1)
    df_test_no_stop[var] = df_test_no_stop.progress_apply(lambda x : remove_stop_words(x[var]), axis=1)

In [None]:
### Compare stopwords result
import random
list_idx = list(df_train_no_stop.index)

# random.randint is inclusive on BOTH ends, so randint(0, len(lst)) can return
# len(lst) and index one past the end of the list. Sampling the index labels
# directly avoids that and also copes with fewer than 5 rows.
n_examples = min(5, len(list_idx))
for row_label in random.sample(list_idx, k=n_examples) :
    print('\nORIGINAL TEXT :',df_train_en['premise'][row_label])
    print('NO STOPWORDS TEXT :',df_train_no_stop['premise'][row_label])

In [None]:
### Distribution of word count in original dataset
rcParams['figure.figsize'] = [15,5]
plt.style.use('fivethirtyeight') 
sns.set_style('whitegrid')
grid = gridspec.GridSpec(1,2)

### Setting up data for plot
df_plot = df_train.copy()
df_plot['premise_word_count'] = df_plot.apply(lambda x : len(x['premise'].split()), axis=1)
df_plot['hypothesis_word_count'] = df_plot.apply(lambda x : len(x['hypothesis'].split()), axis=1)

### Plotting
list_var = ['premise_word_count', 'hypothesis_word_count']
list_color = ['#db3236','#4885ed']
for i,var in enumerate(list_var) :
    ax = plt.subplot(grid[i])
    sns.distplot(df_plot[var], kde=False, ax=ax, color=list_color[i])

plt.suptitle('Distribution of word count on ORIGINAL dataset') ;
plt.tight_layout() ;
plt.subplots_adjust(top=0.9) ;

For the original dataset we can see that premise word count kinda max out at 50 and hypothesis word count max out at 25. So it safe to use 75 as the MAX_LENGTH for modelling

In [None]:
### Distribution of word count in all english dataset
rcParams['figure.figsize'] = [15,5]
plt.style.use('fivethirtyeight') 
sns.set_style('whitegrid')
grid = gridspec.GridSpec(1,2)

### Setting up data for plot
df_plot = df_train_en.copy()
df_plot['premise_word_count'] = df_plot.apply(lambda x : len(x['premise'].split()), axis=1)
df_plot['hypothesis_word_count'] = df_plot.apply(lambda x : len(x['hypothesis'].split()), axis=1)

### Plotting
list_var = ['premise_word_count', 'hypothesis_word_count']
list_color = ['#db3236','#4885ed']
for i,var in enumerate(list_var) :
    ax = plt.subplot(grid[i])
    sns.distplot(df_plot[var], kde=False, ax=ax, color=list_color[i])

plt.suptitle('Distribution of word count on ALL ENGLISH dataset') ;
plt.tight_layout() ;
plt.subplots_adjust(top=0.9) ;

The distribution is pretty similar to the ORIGINAL dataset. So will use the same MAX LENGTH

In [None]:
### Distribution of word count in no stopwords dataset
rcParams['figure.figsize'] = [15,5]
plt.style.use('fivethirtyeight') 
sns.set_style('whitegrid')
grid = gridspec.GridSpec(1,2)

### Setting up data for plot
df_plot = df_train_no_stop.copy()
df_plot['premise_word_count'] = df_plot.apply(lambda x : len(x['premise'].split()), axis=1)
df_plot['hypothesis_word_count'] = df_plot.apply(lambda x : len(x['hypothesis'].split()), axis=1)

### Plotting
list_var = ['premise_word_count', 'hypothesis_word_count']
list_color = ['#db3236','#4885ed']
for i,var in enumerate(list_var) :
    ax = plt.subplot(grid[i])
    sns.distplot(df_plot[var], kde=False, ax=ax, color=list_color[i])

plt.suptitle('Distribution of word count on NO STOPWORDS dataset') ;
plt.tight_layout() ;
plt.subplots_adjust(top=0.9) ;

After removing the stopwords the premise now max out at 35 and hypothesis max out at 15. So there are around 10-15 word difference with the original dataset. For this dataset we will use MAX LENGTH of 50

In [None]:
### Proportion of target class
rcParams['figure.figsize'] = [8,5]
plt.style.use('fivethirtyeight') 
sns.set_style('whitegrid')

### Function to plot donut chart
def make_donut_chart(sizes, labels, colors=None, explode=None) :
    """Draw a donut chart on the current matplotlib figure.

    A pie chart is drawn from `sizes` / `labels` (with optional `colors` and
    `explode`), then a white circle is placed over its centre to turn the pie
    into a ring.
    """
    # Pie chart with the percentage of every wedge printed on it
    plt.pie(
        sizes,
        labels=labels,
        colors=colors,
        explode=explode,
        autopct='%1.1f%%',
        pctdistance=0.85,
        startangle=90,
    )

    # White disc over the centre makes the "hole" of the donut
    hole = plt.Circle((0,0),0.70,fc='white')
    plt.gcf().gca().add_artist(hole)

    plt.axis('equal')
    plt.tight_layout()
    
# Plot preparation
# value_counts() orders its result by frequency, not by class code, so the wedge
# names and colours are derived from the index instead of being hard-coded in an
# order that only matches by coincidence.
# NOTE(review): code -> name mapping taken from the competition's data
# description (0 entailment, 1 neutral, 2 contradiction) -- confirm.
LABEL_NAMES = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
LABEL_COLORS = {0: '#4285F4', 1: '#6a737b', 2: '#EA4335'}

sizes = df_train['label'].value_counts() / len(df_train) * 100
labels = [LABEL_NAMES.get(code, str(code)) for code in sizes.index]
colors = [LABEL_COLORS.get(code, '#6a737b') for code in sizes.index]
explode_donut = [0.05] * len(sizes)

# Plot
make_donut_chart(sizes, labels, colors, explode_donut)
plt.title('Percentage of label class', fontsize=18, fontname='Monospace', fontweight="bold") ;

Fairly uniform proportion of label class so it is not an imbalanced multiclassification case

In [None]:
### Make dummy label for stratified sampling on original dataset
LANGUAGE_MAP = {
            "English"   : 0,
            "Chinese"   : 1,
            "Arabic"    : 2,
            "French"    : 3,
            "Swahili"   : 4,
            "Urdu"      : 5,
            "Vietnamese": 6,
            "Russian"   : 7,
            "Hindi"     : 8,
            "Greek"     : 9,
            "Thai"      : 10,
            "Spanish"   : 11,
            "German"    : 12,
            "Turkish"   : 13,
            "Bulgarian" : 14
        }

# Encode the language name as an integer code. Values that are not keys of
# LANGUAGE_MAP (including codes written by a previous run of this cell) are
# kept as they are, so re-running the cell no longer turns the whole column
# into NaN and the stratification label into "nan_<label>".
df_train['language'] = df_train['language'].map(lambda lang : LANGUAGE_MAP.get(lang, lang))
df_train['language_label'] = df_train['language'].astype(str) + "_" + df_train['label'].astype(str)

So to conclude all the information above :
- We will use a MAX LENGTH of 75 for the original dataset, 75 for the all-English dataset, and 50 for the no-stopwords dataset
- The proportion of the target variable is pretty equal
- The languange is mostly english with the other languange consist of only 2-3% each

# Setting up accelerator and function for modelling

In [None]:
### Initialize accelerator
import tensorflow as tf

def initialize_accelerator(ACCELERATOR) :
    """Pick a `tf.distribute` strategy for the requested accelerator.

    Parameters
    ----------
    ACCELERATOR : str
        "TPU" to try connecting to a TPU first; any other value (for example
        "GPU") selects TensorFlow's default strategy. If no TPU can be
        resolved the function falls back to the "GPU" path.

    Returns
    -------
    tuple
        `(strategy, AUTO, REPLICAS)` where `AUTO` is
        `tf.data.experimental.AUTOTUNE` and `REPLICAS` is
        `strategy.num_replicas_in_sync`.
    """

    # checking TPU first
    if ACCELERATOR == "TPU":
        print("Connecting to TPU")
        try:
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
            print(f"Running on TPU {tpu.master()}")
        except ValueError:
            print("Could not connect to TPU")
            tpu = None

        if tpu:
            try:
                print("Initializing TPU")
                tf.config.experimental_connect_to_cluster(tpu)
                tf.tpu.experimental.initialize_tpu_system(tpu)
                strategy = tf.distribute.experimental.TPUStrategy(tpu)
                print("TPU initialized")
            except Exception as err:
                # `except Exception` (not a bare `except:`) so Ctrl-C / SystemExit
                # still interrupt the notebook; the reason is printed because a
                # silent fallback makes slow training very hard to diagnose.
                print("Failed to initialize TPU")
                print(f"Reason: {err!r}")
                strategy = tf.distribute.get_strategy()
        else:
            print("Unable to initialize TPU")
            ACCELERATOR = "GPU"

    # default for CPU and GPU
    if ACCELERATOR != "TPU":
        print("Using default strategy for CPU and single GPU")
        strategy = tf.distribute.get_strategy()

    # checking GPUs
    if ACCELERATOR == "GPU":
        print(f"GPUs Available: {len(tf.config.experimental.list_physical_devices('GPU'))}")

    # defining replicas
    AUTO = tf.data.experimental.AUTOTUNE
    REPLICAS = strategy.num_replicas_in_sync
    print(f"REPLICAS: {REPLICAS}")

    return strategy, AUTO, REPLICAS

STRATEGY, AUTO, REPLICAS =  initialize_accelerator('TPU')

In [None]:
### Function to do experiment
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
import gc

def run_experiments(df, var_stratified, encode_text) :
    """Cross-validate the single-input model on `df` and report fold accuracies.

    For every stratified fold the accelerator is re-initialised, a fresh model
    is built with `build_model`, trained for EPOCHS epochs while checkpointing
    the weights of the best validation epoch, and then scored on the held-out
    fold with those best weights.

    Configuration is read from notebook-level names: CV_SPLIT, SEED,
    MODEL_NAME, MAX_LENGTH, METRICS, TOKENIZER, PADDING, BATCH_SIZE, EPOCHS
    and VERBOSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain a 'label' column, the `var_stratified`
        column, and whatever columns `encode_text` reads.
    var_stratified : str
        Column used to stratify the folds.
    encode_text : callable
        Called as `encode_text(texts=..., tokenizer=..., maxlen=..., padding=...)`
        and expected to return the encoded model input.

    Returns
    -------
    list of float
        Accuracy of every fold (mean and standard deviation are also printed).
    """
    # Imported once here instead of on every fold iteration
    from tensorflow.keras.utils import to_categorical
    from sklearn.metrics import accuracy_score

    # Stratified K-fold
    skf = StratifiedKFold(n_splits = CV_SPLIT, shuffle = True, random_state = SEED)

    # Initializing predictions
    acc_oof = []

    # Iterating over folds
    for (fold, (train_index, valid_index)) in enumerate(skf.split(df, df[var_stratified])):

        # Initialize Accelerator
        STRATEGY, AUTO, REPLICAS =  initialize_accelerator('TPU')

        # Building model
        K.clear_session()
        with STRATEGY.scope():
            model = build_model(MODEL_NAME, MAX_LENGTH, METRICS)
            if fold == 0:
                # summary() prints by itself; wrapping it in print() adds a stray "None"
                model.summary()

        print("\n")
        print("#" * 19)
        print(f"##### Fold: {fold + 1} #####")
        print("#" * 19)

        # Splitting data into training and validation
        # (seeded so that a re-run shuffles the rows the same way)
        X_train = df.iloc[train_index].sample(frac=1, random_state = SEED)
        X_valid = df.iloc[valid_index]

        y_train = to_categorical(X_train['label'].values)
        y_valid = to_categorical(X_valid['label'].values)

        print("\nTokenizing")

        # Encoding text data using tokenizer
        X_train_encoded = encode_text(texts = X_train, tokenizer = TOKENIZER, maxlen = MAX_LENGTH, padding = PADDING)
        X_valid_encoded = encode_text(texts = X_valid, tokenizer = TOKENIZER, maxlen = MAX_LENGTH, padding = PADDING)

        n_train = X_train.shape[0]

        # Creating TF Dataset
        # Order matters: cache() must come BEFORE repeat(), otherwise it tries to
        # cache an infinite stream and the cache grows for the whole training
        # run. shuffle() takes a buffer size first, so the seed is passed by
        # keyword and the buffer covers the whole fold.
        ds_train = (
                            tf.data.Dataset
                            .from_tensor_slices((X_train_encoded, y_train))
                            .cache()
                            .shuffle(n_train, seed = SEED)
                            .repeat()
                            .batch(BATCH_SIZE)
                            .prefetch(AUTO)
                            )

        ds_valid = (
                            tf.data.Dataset
                            .from_tensor_slices((X_valid_encoded, y_valid))
                            .batch(BATCH_SIZE)
                            .cache()
                            .prefetch(AUTO)
                            )

        # Saving model at best accuracy epoch
        sv = tf.keras.callbacks.ModelCheckpoint(
            "model.h5",
            monitor = 'val_'+METRICS[0],
            verbose = 0,
            save_best_only = True,
            save_weights_only = True,
            mode = "max",
            save_freq = "epoch"
        )

        print("\nTraining")

        # Training model
        model.fit(
            ds_train,
            epochs = EPOCHS,
            callbacks = [sv],
            steps_per_epoch = n_train // BATCH_SIZE,
            validation_data = ds_valid,
            verbose = VERBOSE
        )

        # Validation with the weights of the best epoch
        model.load_weights("model.h5")

        pred = model.predict(ds_valid)
        acc = accuracy_score(X_valid['label'].values, np.argmax(pred, axis=1))
        acc_oof.append(acc)

        print(f"\nFold {fold + 1} Accuracy: {round(acc, 4)}\n")

        gc.collect()

    # overall CV score and standard deviation
    print(f"\nCV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
    print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}\n")

    return acc_oof

In [None]:
### Function to build model
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel

def build_model(model_name, max_len, metrics):
    """Build and compile a 3-class classifier on top of a pre-trained transformer.

    Parameters
    ----------
    model_name : str
        Name passed to `TFAutoModel.from_pretrained`.
    max_len : int
        Length of the integer `input_ids` sequence the model accepts.
    metrics : list
        Metrics handed to `model.compile`.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model: token ids in, softmax over 3 classes out.
    """
    # Encoded input
    token_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_ids")

    # Transformer output; element 0 is sliced at position 0 of the sequence
    # axis (presumably the [CLS] token representation -- TODO confirm)
    encoder = TFAutoModel.from_pretrained(model_name)
    sequence_output = encoder(token_ids)[0]
    first_token_vector = sequence_output[:, 0, :]

    # Classification head
    class_probabilities = Dense(3, activation = "softmax")(first_token_vector)

    classifier = Model(inputs = token_ids, outputs = class_probabilities)

    optimizer = Adam(learning_rate = 1e-5)
    smoothed_loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2)
    classifier.compile(optimizer = optimizer, loss = smoothed_loss, metrics = metrics)

    return classifier

# 1. Combine the hypotheses and premise into one sentence
So in this one we are gonna combine both premise and hypothesis into one sentence separated with [SEP]

For example :
- **Premise** : I love food
- **Hypothesis** : I love fried rice
- **Final sentence** : I love food [SEP] I love fried rice

In [None]:
### Make the long sentence to be predictor
# NOTE(review): HuggingFace tokenizers normally insert their own special tokens
# (e.g. [CLS] / [SEP]) when encoding, so these literal markers may end up
# duplicated in the encoded sequence -- TODO confirm against the tokenizer output.
df_train['predictor'] = " [CLS] " + df_train['premise'] + " [SEP] " + df_train['hypothesis']
df_test['predictor'] = " [CLS] " + df_test['premise'] + " [SEP] " + df_test['hypothesis']

In [None]:
### Experiment configuration
# Note that all this parameter are being used in the run_experiments function
# Think of this as a global parameter for the function (cause I'm lazy to code it as function parameter)
MAX_LENGTH = 75
MODEL_NAME = "distilbert-base-multilingual-cased"
PADDING = True
BATCH_SIZE = 16 * REPLICAS
EPOCHS = 10
CV_SPLIT = 5
SEED = 2020
VERBOSE = 1
METRICS = ["categorical_accuracy"]

In [None]:
### Load tokenizer
import transformers
from transformers import AutoTokenizer

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
### Function to encode predictor
def original_encode(texts, tokenizer, maxlen, padding):
    """Tokenize `texts['predictor']` and return only the resulting `input_ids`.

    `maxlen` is forwarded as `max_length` and `padding` as `pad_to_max_length`;
    attention masks and token type ids are not requested.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=padding,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts['predictor'], **encode_options)

    return encoded["input_ids"]

In [None]:
### Start experiment
import time
start = time.time()

run_experiments(df_train, 'language_label', original_encode)

end = time.time()
print('Time used :',(end-start)/60)

Note that :
- Using this configuration we get a CV accuracy of 56% with a standard deviation of 0.1% across folds, so there is no significant variance with this configuration
- The model give a sign of overfitting, we can conclude this by looking at the train accuracy for each fold. The train dataset can achieve accuracy of 82-85%. This is normal in deep learning model so the next step is to do **text augmentation** for the data

# 2. Using all-English translated data
The translation are made by using Google Translate package. Basically translate all the premise and hypothesis into English. In this method we still combine the premise and hypothesis into one sentence

In [None]:
### Make the long sentence to be predictor
df_train_en['predictor'] = " [CLS] " + df_train_en['premise'] + " [SEP] " + df_train_en['hypothesis']
df_test_en['predictor'] = " [CLS] " + df_test_en['premise'] + " [SEP] " + df_test_en['hypothesis']

In [None]:
### Experiment configuration
# Note that all this parameter are being used in the run_experiments function
# Think of this as a global parameter for the function (cause I'm lazy to code it as function parameter)
MAX_LENGTH = 75
MODEL_NAME = "distilbert-base-multilingual-cased"
PADDING = True
BATCH_SIZE = 16 * REPLICAS
EPOCHS = 10
CV_SPLIT = 5
SEED = 2020
VERBOSE = 1
METRICS = ["categorical_accuracy"]

In [None]:
### Load tokenizer
import transformers
from transformers import AutoTokenizer

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
### Function to encode predictor
def original_encode(texts, tokenizer, maxlen, padding):
    """Tokenize `texts['predictor']` and return only the resulting `input_ids`.

    `maxlen` is forwarded as `max_length` and `padding` as `pad_to_max_length`;
    attention masks and token type ids are not requested.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=padding,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts['predictor'], **encode_options)

    return encoded["input_ids"]

In [None]:
### Start experiment
import time
start = time.time()

run_experiments(df_train_en, 'label', original_encode)

end = time.time()
print('Time used :',(end-start)/60)

Note that :
- This config performs better than the first one. With it we get a CV accuracy of 60% with a standard deviation of 0.2% across folds, and a train accuracy of 82-85% (the same as the first config).
- So this result conclude that Googletrans translation have done a pretty good job, translating without losing the context of the sentence. But still this config still overfit the training dataset

# 3. Removing stopwords and punctuations
On the english translated dataset. Hypothetically this method will lose some context in the sentence but hey no pain to try

In [None]:
### Make the long sentence to be predictor
df_train_no_stop['predictor'] = " [CLS] " + df_train_no_stop['premise'] + " [SEP] " + df_train_no_stop['hypothesis']
df_test_no_stop['predictor'] = " [CLS] " + df_test_no_stop['premise'] + " [SEP] " + df_test_no_stop['hypothesis']

In [None]:
### Experiment configuration
# Note that all this parameter are being used in the run_experiments function
# Think of this as a global parameter for the function (cause I'm lazy to code it as function parameter)
MAX_LENGTH = 50
MODEL_NAME = "distilbert-base-multilingual-cased"
PADDING = True
BATCH_SIZE = 16 * REPLICAS
EPOCHS = 10
CV_SPLIT = 5
SEED = 2020
VERBOSE = 1
METRICS = ["categorical_accuracy"]

In [None]:
### Function to encode predictor
def original_encode(texts, tokenizer, maxlen, padding):
    """Tokenize `texts['predictor']` and return only the resulting `input_ids`.

    `maxlen` is forwarded as `max_length` and `padding` as `pad_to_max_length`;
    attention masks and token type ids are not requested.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=padding,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts['predictor'], **encode_options)

    return encoded["input_ids"]

In [None]:
### Start experiment
import time
start = time.time()

run_experiments(df_train_no_stop, 'label', original_encode)

end = time.time()
print('Time used :',(end-start)/60)

Note that :
<br> For this configuration we manage to get CV Acc of 52% with standard error 0.9% so yeah this configuration isn't suitable in this case. You can see more discussion in this link (https://www.kaggle.com/c/contradictory-my-dear-watson/discussion/172105). **But maybe we can use this stopwords for data augmentation (random stopwords deletion)**

# 4. Use hypotheses and premise as separate input (so the model will have 2 input)
Use 2 pre-trained models, one for each input. For both inputs I still use DistilBERT, but feel free to use a different model for each. This makes training take longer, but maybe this configuration can improve performance since each model now focuses on a single input.

In [None]:
### Function to build model
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel

def build_model_multimodal(model_name, max_len, metrics, max_len_hyp = None):
    """Build and compile a two-input (premise / hypothesis) 3-class classifier.

    Each input is encoded by its own pre-trained transformer; the vectors at
    sequence position 0 of both encoders are concatenated and fed to a softmax
    layer.

    Parameters
    ----------
    model_name : str
        Name passed to `TFAutoModel.from_pretrained` for both encoders.
    max_len : int
        Length of the premise token-id sequence. (It was previously accepted
        but ignored in favour of the global MAX_LENGTH_PREM; the existing
        caller passes MAX_LENGTH_PREM, so the result is unchanged.)
    metrics : list
        Metrics handed to `model.compile`.
    max_len_hyp : int, optional
        Length of the hypothesis token-id sequence. Defaults to the
        notebook-level MAX_LENGTH_HYP.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model with inputs named "input_ids_prem" and "input_ids_hyp".
    """
    if max_len_hyp is None:
        max_len_hyp = MAX_LENGTH_HYP

    # Defining encoded inputs
    input_ids_prem = Input(shape = (max_len,), dtype = tf.int32, name = "input_ids_prem")
    input_ids_hyp = Input(shape = (max_len_hyp,), dtype = tf.int32, name = "input_ids_hyp")

    # Defining transformer model embeddings
    transformer_model_prem = TFAutoModel.from_pretrained(model_name)
    transformer_embeddings_prem = transformer_model_prem(input_ids_prem)[0]
    transformer_token_prem = transformer_embeddings_prem[:, 0, :]

    transformer_model_hyp = TFAutoModel.from_pretrained(model_name)
    transformer_embeddings_hyp = transformer_model_hyp(input_ids_hyp)[0]
    transformer_token_hyp = transformer_embeddings_hyp[:, 0, :]

    # Concat 2 token
    transformer_concat = Concatenate()([transformer_token_prem, transformer_token_hyp])

    # Defining output layer
    output_values = Dense(3, activation = "softmax")(transformer_concat)

    # defining model
    model = Model(inputs = [input_ids_prem, input_ids_hyp], outputs = output_values)

    # The output layer already applies softmax, so the loss must NOT be told it
    # receives logits (from_logits=True would treat probabilities as logits).
    # This now matches the loss used by build_model.
    model.compile(optimizer = Adam(learning_rate = 1e-5),
                  loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                  metrics = metrics)

    return model

In [None]:
### Function to encode predictor
def original_encode_multimodal(texts, tokenizer, maxlen, padding):
    """Tokenize the batch `texts` directly and return only the resulting `input_ids`.

    `maxlen` is forwarded as `max_length` and `padding` as `pad_to_max_length`;
    attention masks and token type ids are not requested.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=padding,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)

    return encoded["input_ids"]

In [None]:
### Function to do experiment multimodal
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
import gc

def run_experiments_multimodal(df, var_stratified, encode_text) :
    """Cross-validate the two-input model on `df` and report fold accuracies.

    For every stratified fold the accelerator is re-initialised, a fresh model
    is built with `build_model_multimodal`, trained for EPOCHS epochs while
    checkpointing the weights of the best validation epoch, and then scored on
    the held-out fold with those best weights.

    Configuration is read from notebook-level names: CV_SPLIT, SEED,
    MODEL_NAME, MAX_LENGTH_PREM, MAX_LENGTH_HYP, METRICS, TOKENIZER, PADDING,
    BATCH_SIZE, EPOCHS and VERBOSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with 'premise', 'hypothesis', 'label' and the
        `var_stratified` column.
    var_stratified : str
        Column used to stratify the folds.
    encode_text : callable
        Called as `encode_text(texts=..., tokenizer=..., maxlen=..., padding=...)`
        once per text column and expected to return the encoded model input.

    Returns
    -------
    list of float
        Accuracy of every fold (mean and standard deviation are also printed).
    """
    # Imported once here instead of on every fold iteration
    from tensorflow.keras.utils import to_categorical
    from sklearn.metrics import accuracy_score

    # Stratified K-fold
    skf = StratifiedKFold(n_splits = CV_SPLIT, shuffle = True, random_state = SEED)

    # Initializing predictions
    acc_oof = []

    # Iterating over folds
    for (fold, (train_index, valid_index)) in enumerate(skf.split(df, df[var_stratified])):

        # Initialize Accelerator
        STRATEGY, AUTO, REPLICAS =  initialize_accelerator('TPU')

        # Building model
        K.clear_session()
        with STRATEGY.scope():
            model = build_model_multimodal(MODEL_NAME, MAX_LENGTH_PREM, METRICS)
            if fold == 0:
                # summary() prints by itself; wrapping it in print() adds a stray "None"
                model.summary()

        print("\n")
        print("#" * 19)
        print(f"##### Fold: {fold + 1} #####")
        print("#" * 19)

        # Splitting data into training and validation
        X_train = df.iloc[train_index]
        X_valid = df.iloc[valid_index]

        y_train = to_categorical(X_train['label'].values)
        y_valid = to_categorical(X_valid['label'].values)

        print("\nTokenizing")

        # Encoding text data using tokenizer
        X_train_encoded_prem = encode_text(texts = X_train['premise'], tokenizer = TOKENIZER, maxlen = MAX_LENGTH_PREM, padding = PADDING)
        X_train_encoded_hyp = encode_text(texts = X_train['hypothesis'], tokenizer = TOKENIZER, maxlen = MAX_LENGTH_HYP, padding = PADDING)
        X_valid_encoded_prem = encode_text(texts = X_valid['premise'], tokenizer = TOKENIZER, maxlen = MAX_LENGTH_PREM, padding = PADDING)
        X_valid_encoded_hyp = encode_text(texts = X_valid['hypothesis'], tokenizer = TOKENIZER, maxlen = MAX_LENGTH_HYP, padding = PADDING)

        n_train = X_train.shape[0]

        # Creating TF Dataset
        # Order matters: cache() must come BEFORE repeat(), otherwise it tries to
        # cache an infinite stream and the cache grows for the whole training
        # run. shuffle() takes a buffer size first, so the seed is passed by
        # keyword and the buffer covers the whole fold.
        ds_train = (
                            tf.data.Dataset
                            .from_tensor_slices(({"input_ids_prem": X_train_encoded_prem, "input_ids_hyp": X_train_encoded_hyp}, y_train))
                            .cache()
                            .shuffle(n_train, seed = SEED)
                            .repeat()
                            .batch(BATCH_SIZE)
                            .prefetch(AUTO)
                            )

        ds_valid = (
                            tf.data.Dataset
                            .from_tensor_slices(({"input_ids_prem": X_valid_encoded_prem, "input_ids_hyp": X_valid_encoded_hyp}, y_valid))
                            .batch(BATCH_SIZE)
                            .cache()
                            .prefetch(AUTO)
                            )

        # Saving model at best accuracy epoch
        sv = tf.keras.callbacks.ModelCheckpoint(
            "model.h5",
            monitor = 'val_'+METRICS[0],
            verbose = 0,
            save_best_only = True,
            save_weights_only = True,
            mode = "max",
            save_freq = "epoch"
        )

        print("\nTraining")

        # Training model
        model.fit(
            ds_train,
            epochs = EPOCHS,
            callbacks = [sv],
            steps_per_epoch = n_train // BATCH_SIZE,
            validation_data = ds_valid,
            verbose = VERBOSE
        )

        # Validation with the weights of the best epoch
        model.load_weights("model.h5")

        pred = model.predict(ds_valid)
        acc = accuracy_score(X_valid['label'].values, np.argmax(pred, axis=1))
        acc_oof.append(acc)

        print(f"\nFold {fold + 1} Accuracy: {round(acc, 4)}\n")

        gc.collect()

    # overall CV score and standard deviation
    print(f"\nCV Mean Accuracy: {round(np.mean(acc_oof), 4)}")
    print(f"CV StdDev Accuracy: {round(np.std(acc_oof), 4)}\n")

    return acc_oof

In [None]:
### Experiment configuration
# These names are read as globals by run_experiments_multimodal and
# build_model_multimodal instead of being passed in as function parameters.
MAX_LENGTH_PREM = 50
MAX_LENGTH_HYP = 25

# A different pre-trained model could be used for premise and hypothesis.
# In this notebook the same multilingual DistilBERT checkpoint (MODEL_NAME
# below) is used for both.
MODEL_NAME = "distilbert-base-multilingual-cased"
PADDING = True
BATCH_SIZE = 16 * REPLICAS
EPOCHS = 10
CV_SPLIT = 5
SEED = 2020
VERBOSE = 1
METRICS = ["categorical_accuracy"]

In [None]:
### Load tokenizer
import transformers
from transformers import AutoTokenizer

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
### Start experiment
import time
start = time.time()

run_experiments_multimodal(df_train_en, 'label', original_encode_multimodal)

end = time.time()
print('Time used :',(end-start)/60)

By using 1 model for each input we manage to get CV Acc of 52% with standard error 0.3%. This result is produced by using all-english dataset. Do note that the all-english single model can get CV Acc of 60% but with standard error of 1%, so this configuration also not suitable for this case

# Submission
Based on the benchmark result, it safe to say that all-english dataset give the best performance so I will use it to make submission. In here I will use XLM-RoBERTa since it is a bigger model that have much more parameter than DistilBERT

In [None]:
### Experiment configuration
# Note that all this parameter are being used in the run_experiments function
# Think of this as a global parameter for the function (cause I'm lazy to code it as function parameter)
MAX_LENGTH = 75
MODEL_NAME = "jplu/tf-xlm-roberta-large"
PADDING = True
BATCH_SIZE = 16 * REPLICAS
EPOCHS = 10
CV_SPLIT = 5
SEED = 2020
VERBOSE = 1
METRICS = ["categorical_accuracy"]

In [None]:
### Load tokenizer
import transformers
from transformers import AutoTokenizer

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
### Encode the dataset
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(df_train_en['label'].values)

# Encoding text data using tokenizer
X_train_encoded = original_encode(texts = df_train_en, tokenizer = TOKENIZER, maxlen = MAX_LENGTH, padding = PADDING)
X_test_encoded = original_encode(texts = df_test_en, tokenizer = TOKENIZER, maxlen = MAX_LENGTH, padding = PADDING)

In [None]:
### Make TF dataset
# Order matters: cache() must come BEFORE repeat(), otherwise it tries to cache
# an infinite stream and the cache grows for the whole training run. shuffle()
# takes a buffer size first, so the seed is passed by keyword and the buffer
# covers the whole training set.
ds_train = (
                    tf.data.Dataset
                    .from_tensor_slices((X_train_encoded, y_train))
                    .cache()
                    .shuffle(len(X_train_encoded), seed = SEED)
                    .repeat()
                    .batch(BATCH_SIZE)
                    .prefetch(AUTO)
                    )

ds_test = (
                    tf.data.Dataset
                    .from_tensor_slices((X_test_encoded))
                    .batch(BATCH_SIZE)
                    .prefetch(AUTO)
                    )

In [None]:
# Building model
K.clear_session()
with STRATEGY.scope():
    model = build_model(MODEL_NAME, MAX_LENGTH, METRICS)

In [None]:
### Train model
n_train = df_train_en.shape[0]

history = model.fit(
    ds_train,
    epochs = EPOCHS,
    steps_per_epoch = n_train // BATCH_SIZE,
    verbose = VERBOSE
)

In [None]:
### Make prediction
pred = model.predict(ds_test)

In [None]:
### Make submission
sub = pd.read_csv('../input/contradictory-my-dear-watson/sample_submission.csv')
sub['prediction'] = np.argmax(pred, axis=1)

sub.to_csv('submission.csv', index=False)

# Conclusion
All configurations give stable performance, judging by the small standard deviation of the accuracy across folds, and the model trained on the all-English text gives the best CV accuracy, but it still has the overfitting issue.

So for the next step :
- Use translation and stop words as text augmentation. This is supported by the performance inconsistency of the affiliated configuration.
- Benchmark other pre-trained model. JohnM have give us a list of possible model than can be used (https://www.kaggle.com/c/contradictory-my-dear-watson/discussion/171587)

P.S : 
- I have try using different pre-trained model for benchmark. This include the RoBERTa model that have been trained using MNLI dataset which have a leakage to the test dataset. If you want to seek a high lb score you can try using this model, but I don't recommend it because it is a false result.
- XLM-RoBERTa gives the most stable performance and can achieve a CV accuracy of 77-78%. I am not using it for the benchmark here because the model has a very large number of parameters (hundreds of millions, roughly 550M for the large variant)

## Feel free to ask below and give an upvote if this notebook helps you