Natural Language Inferencing (NLI) is a classic NLP (Natural Language Processing) problem that involves taking two sentences (the premise and the hypothesis) and deciding how they are related: whether the premise entails the hypothesis, contradicts it, or neither.

In this tutorial we'll look at the Contradictory, My Dear Watson competition dataset, build a preliminary model using TensorFlow 2, Keras, and BERT, and prepare a submission file.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import tensorflow as tf
from dask import bag, diagnostics
from sklearn.utils import shuffle

In [None]:
!pip install --quiet googletrans
from googletrans import Translator

Let's set up our TPU.

In [None]:
# Pick a distribution strategy: use the TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on both paths (it used to be printed only in the
# fallback branch, so nothing was shown when the TPU was actually in use).
print('Number of replicas:', strategy.num_replicas_in_sync)

# **Downloading data**
The training set contains a premise, a hypothesis, a label (0 = entailment, 1 = neutral, 2 = contradiction), and the language of the text. For more information about what these mean and how the data is structured, check out the data page: https://www.kaggle.com/c/contradictory-my-dear-watson/data

In [None]:
# Load a previously generated submission CSV and preview its first rows.
submission = pd.read_csv("/kaggle/input/output/submission (2).csv")
submission.head()

In [None]:
# Write the loaded submission to "submission1.csv" without the index column.
submission.to_csv("submission1.csv", index = False)


In [None]:
# Load the competition training set and preview its first rows.
train = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
train.head()

**Let us see the distribution of languages in the training dataset.**

In [None]:
# Bar chart of the relative frequency of each language in the training data.
language_share = train['language'].value_counts(normalize=True)
language_share.plot(kind='bar', alpha=0.7)
plt.show()

# Preparing Data for Input
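To start out, we can use a pretrained model. Here, we'll use a multilingual BERT model from Hugging Face. For more information about BERT, see: https://github.com/google-research/bert/blob/master/multilingual.md (Note: the code cell below currently loads the `jplu/tf-xlm-roberta-large` XLM-RoBERTa tokenizer; the multilingual BERT tokenizer is left commented out.)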

First, we download the tokenizer.

In [None]:
# Earlier tokenizer choices are kept commented out for reference; the active
# tokenizer is the 'jplu/tf-xlm-roberta-large' checkpoint loaded via AutoTokenizer.
# model_name = 'bert-base-multilingual-cased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
#tokenizer = RobertaTokenizer.from_pretrained('roberta-large-mnli')
#tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
tokenizer = AutoTokenizer.from_pretrained('jplu/tf-xlm-roberta-large')

Tokenizers turn sequences of words into arrays of numbers. Let's look at an example:

In [None]:
def encode_sentence(s, tok=None):
    """Tokenize one sentence and return its token ids plus a trailing separator id.

    Args:
        s: Sentence text to encode.
        tok: Optional tokenizer to use instead of the notebook-level
            ``tokenizer``. It must provide ``tokenize``,
            ``convert_tokens_to_ids`` and a ``sep_token`` attribute.

    Returns:
        The list of ids for the tokens of ``s`` followed by the id of the
        tokenizer's separator token.
    """
    tok = tokenizer if tok is None else tok
    tokens = list(tok.tokenize(s))
    # Ask the tokenizer for its own separator instead of hard-coding '[SEP]':
    # that literal is BERT-specific, and the tokenizer loaded in this notebook
    # is an XLM-RoBERTa one.
    tokens.append(tok.sep_token)
    return tok.convert_tokens_to_ids(tokens)
    

In [None]:
# Encode a sample sentence to inspect the ids the tokenizer produces.
s = "I love machine learning"
encode_sentence(s)

# Data augmentation by Translation

In [None]:
def data_translate(source_data,dest_language):
    """Translate ``source_data`` into ``dest_language`` with googletrans.

    The language code 'zh' is passed to the translator as 'zh-cn'; every
    other code is forwarded unchanged. Returns the translated text.
    """
    target = 'zh-cn' if dest_language == 'zh' else dest_language
    result = Translator().translate(source_data, dest = target)
    return result.text

In [None]:
def translation_augment(source_data, languages, fraction):
    """Build a shuffled DataFrame of translated premise/hypothesis pairs.

    For each language code in ``languages`` a new random sample of
    ``fraction`` of the rows of ``source_data`` is drawn without replacement,
    and its 'premise' and 'hypothesis' columns are translated with
    ``data_translate`` through dask bags.

    Args:
        source_data: DataFrame with 'premise', 'hypothesis' and 'label' columns.
        languages: Iterable of destination language codes.
        fraction: Fraction of ``source_data`` rows to sample per language.

    Returns:
        A shuffled DataFrame with columns 'id', 'premise', 'hypothesis',
        'lang_abv', 'language' and 'label' ('id' and 'language' are filled
        with None). An empty DataFrame is returned when ``languages`` is empty.
    """
    frames = []

    for lang in languages:
        print(lang)
        sampled_rows = source_data.sample(frac=fraction, replace = False)
        n_rows = len(sampled_rows)
        prem_bag = bag.from_sequence(sampled_rows['premise'].tolist()).map(data_translate, lang)
        hypothesis_bag = bag.from_sequence(sampled_rows['hypothesis'].tolist()).map(data_translate, lang)

        with diagnostics.ProgressBar():
            prems = prem_bag.compute()
            hyps = hypothesis_bag.compute()

        frames.append(pd.DataFrame({'id': pd.Series([None]*n_rows),
                                    'premise': pd.Series(prems),
                                    'hypothesis': pd.Series(hyps),
                                    'lang_abv': pd.Series([lang]*n_rows),
                                    'language': pd.Series([None]*n_rows),
                                    'label': pd.Series(sampled_rows['label'].values)
                                   }))

    if not frames:
        # Nothing was translated; pd.concat would raise on an empty list.
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end also avoids re-copying the accumulated frame on every iteration.
    new_df = pd.concat(frames)
    new_df = shuffle(new_df)
    return new_df


In [None]:
def data_augment(train_df, fraction):
    """Augment ``train_df`` with translations of its English rows.

    The rows whose ``lang_abv`` is 'en' are passed to ``translation_augment``
    once for every other language code present in ``train_df``; the translated
    rows are added to ``train_df`` and the combined frame is shuffled.

    Args:
        train_df: Training DataFrame containing a 'lang_abv' column plus the
            columns ``translation_augment`` reads.
        fraction: Fraction of the English rows to translate per language.

    Returns:
        The shuffled DataFrame holding the original and the translated rows.
    """
    # Use the frame that was passed in rather than the notebook-global `train`,
    # so the function works for any caller-supplied DataFrame.
    english_df = train_df.loc[train_df.lang_abv == 'en']
    # Every distinct language code except English; unlike list.remove this
    # does not raise when 'en' is absent.
    languages = [lang for lang in train_df.lang_abv.unique() if lang != 'en']

    # Explicit alternative list of target codes kept for reference:
    #     languages = ['fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
    #                  'sw', 'vi', 'es', 'el']

    print(languages)
    translated_df = translation_augment(english_df, languages, fraction)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    train_df = pd.concat([train_df, translated_df])
    train_df = shuffle(train_df)
    return train_df

In [None]:
# Replace `train` with a pre-built augmented training set loaded from CSV,
# then report how many rows it contains.
train = pd.read_csv('/kaggle/input/augment-data20/augmented_data_20percent.csv')
train.head()
len(train)

In [None]:
# print("Length of training data before augmentation", len(train))
# train = data_augment(train, fraction = 0.6)
# print("Length of training data after augmentation", len(train))

# train['lang_abv'].value_counts(normalize = True).plot(kind = 'bar', alpha = 0.7)
# plt.show()
# train.to_csv('augmented_data_60_percent.csv', index=False)
# train.head()

BERT uses three kinds of input data: input word IDs, input masks, and input type IDs.

These allow the model to know that the premise and hypothesis are distinct sentences, and also to ignore any padding from the tokenizer.

We add a [CLS] token to denote the beginning of the inputs, and a [SEP] token to denote the separation between the premise and the hypothesis. We also need to pad all of the inputs to be the same size. For more information about BERT inputs, see: https://huggingface.co/transformers/model_doc/bert.html#tfbertmodel

Now, we're going to encode all of our premise/hypothesis pairs for input into BERT.

In [None]:
def bert_encode(premises, hypotheses, tokenizer):
    """Encode premise/hypothesis pairs into the three model input tensors.

    Each premise and hypothesis is encoded with ``encode_sentence`` (which
    appends a separator id), and every row is prefixed with the tokenizer's
    start-of-sequence token id.

    Args:
        premises: Sequence of premise strings.
        hypotheses: Sequence of hypothesis strings, aligned with ``premises``.
        tokenizer: Tokenizer providing ``convert_tokens_to_ids`` and
            ``cls_token``. It is used here only for the leading token; the
            sentences themselves go through ``encode_sentence``.

    Returns:
        Dict with keys 'input_word_ids', 'input_mask' and 'input_type_ids',
        each a dense tensor produced by ``to_tensor()`` from the ragged rows:
        the mask is 1 on real tokens, and the type ids are 0 for the start
        token and premise and 1 for the hypothesis.
    """
    sen1 = tf.ragged.constant([encode_sentence(s) for s in np.array(premises)])
    sen2 = tf.ragged.constant([encode_sentence(s) for s in np.array(hypotheses)])
    # Look up the tokenizer's own start token; the bare string 'CLS' is not a
    # special token, so it would not resolve to the start-of-sequence id.
    cls = [tokenizer.convert_tokens_to_ids([tokenizer.cls_token])]*sen1.shape[0]

    input_word_ids = tf.concat([cls, sen1, sen2], axis = -1)
    # Ones on every real token; to_tensor() pads the ragged rows to a dense tensor.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_sen1 = tf.zeros_like(sen1)
    type_sen2 = tf.ones_like(sen2)
    input_type_ids = tf.concat([type_cls, type_sen1, type_sen2], axis = -1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
    }

    return inputs

In [None]:
# Encode every training premise/hypothesis pair into the model's input dict.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

# **Creating & Training Model**

In [None]:
# Sequence length declared for the three Keras input layers.
max_len = 80
def build_model():
    """Build and compile the 3-class classifier on top of a pretrained encoder.

    The 'jplu/tf-xlm-roberta-large' encoder receives word ids, mask and type
    ids; position 0 of its first output is fed to a 3-unit softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` (Adam with learning rate 1e-5, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    #bert_encoder = TFBertModel.from_pretrained(model_name)
    bert_encoder = TFRobertaModel.from_pretrained('jplu/tf-xlm-roberta-large')
    #bert_encoder = TFXLMRobertaModel.from_pretrained('xlm-mlm-100-1280')
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    # First encoder output, sliced at sequence position 0 for classification.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword, and metrics are
    # passed as a list, the documented form.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model
    

In [None]:
# Build the model inside the distribution strategy's scope and print its summary.
with strategy.scope():
    model = build_model()
    model.summary()

In [None]:
# Train for 3 epochs with batch size 16, holding out 20% of the data for validation.
model.fit(train_input, train.label.values, epochs=3, verbose=1, batch_size=16, validation_split=0.2)

In [None]:
# Persist the trained weights to an .h5 file in the working directory.
model.save_weights('RoBertamodel_augmented_data_20_percent_adam_sparse_categorical_entropy.h5')

In [None]:
# Load the competition test set and preview its first rows.
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
test.head()

In [None]:
# Encode the test premise/hypothesis pairs the same way as the training data.
test_input = bert_encode(test.premise.values, test.hypothesis.values, tokenizer)

# Generating & Submitting Predictions

In [None]:
# Predicted class per test row: index of the highest softmax probability.
# A single vectorised argmax over the class axis replaces the per-row Python loop.
predictions = np.argmax(model.predict(test_input), axis=-1)

In [None]:
# Start the submission frame from a copy of the test ids and preview it.
submission = test.id.copy().to_frame()
submission.head()


In [None]:
# Attach the predicted labels as the 'prediction' column.
submission['prediction'] = predictions

In [None]:
# Write the final submission file without the index column.
submission.to_csv("submission.csv", index = False)
