Natural Language Inference (NLI) is a classic NLP (Natural Language Processing) problem that involves taking two sentences (the _premise_ and the _hypothesis_) and deciding how they are related: whether the premise entails the hypothesis, contradicts it, or neither.

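In this tutorial we'll look at the _Contradictory, My Dear Watson_ competition dataset, translate the non-English examples into English, build a preliminary model using TensorFlow 2, Keras, and a pretrained ELECTRA transformer (a BERT-style encoder), and prepare a submission file.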

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file it contains.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Give Weights & Biases a placeholder API key so that it stops emitting its
# "missing key" warning.
os.environ.update({"WANDB_API_KEY": "0"})

In [None]:
!pip install deep_translator
!pip install transformers

import numpy as np
import pandas as pd

import seaborn as sns
from deep_translator import GoogleTranslator
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split

Let's set up our TPU.

In [None]:
# Detect a TPU if one is attached; otherwise fall back to the default
# (CPU / single-GPU) distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # TPUClusterResolver raises ValueError when no TPU can be located.
    tpu = None

if tpu is not None:
    # Connect to and initialise the TPU exactly once; re-initialising would
    # only repeat the work.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU

# Later cells open `tpu_strategy.scope()`, so expose the chosen strategy under
# that name too. This also keeps the notebook runnable without a TPU.
tpu_strategy = strategy
print('Number of replicas:', strategy.num_replicas_in_sync)

## Downloading Data

The training set contains a premise, a hypothesis, a label (0 = entailment, 1 = neutral, 2 = contradiction), and the language of the text. For more information about what these mean and how the data is structured, check out the data page: https://www.kaggle.com/c/contradictory-my-dear-watson/data

In [None]:
# Load both competition splits from the read-only Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

We reset the index so that each row carries an explicit `index` column; the translation step below uses it to report progress.

In [None]:
# Turn each frame's row index into a regular "index" column (read later by
# trans_to_eng for progress reporting), then display the resulting columns.
train, test = (frame.reset_index() for frame in (train, test))
train.columns

Translate the non-English data into English.

In [None]:
def trans_to_eng(row):
    """Return the English premise and hypothesis for one dataset row.

    Rows whose ``lang_abv`` is already ``'en'`` are passed through unchanged.
    For any other language, both sentences are translated to English with
    ``GoogleTranslator``.

    Args:
        row: Mapping-like row (for example a ``pandas.Series``) providing the
            keys ``'premise'``, ``'hypothesis'``, ``'index'`` and
            ``'lang_abv'``.

    Returns:
        A ``(premise_en, hypothesis_en)`` tuple.
    """
    premise = row['premise']
    hypothesis = row['hypothesis']
    row_number = row['index']
    source = row['lang_abv']

    if source != 'en':
        # One translator instance serves both sentences of the row; there is
        # no need to build an identical second one.
        translator = GoogleTranslator(source=source, target='en')
        premise = translator.translate(premise)
        hypothesis = translator.translate(hypothesis)

    # Lightweight progress indicator: print every 1000th row number.
    if row_number % 1000 == 0:
        print(row_number)
    return premise, hypothesis

Get the translated premise and hypothesis for both the training and test sets.

In [None]:
# Run trans_to_eng over every row, then unzip the resulting
# (premise, hypothesis) pairs into two new columns on each frame.
train_pairs = train.apply(trans_to_eng, axis=1)
train['premise_en'], train['hypothesis_en'] = zip(*train_pairs)

test_pairs = test.apply(trans_to_eng, axis=1)
test['premise_en'], test['hypothesis_en'] = zip(*test_pairs)

Let's check out the data.

In [None]:
# Spot-check a single example: print the translated premise and hypothesis of
# the row labelled 13 together with its class label.
print(f"premise: {train.loc[13, 'premise_en']}")
print(f"hypothesis: {train.loc[13, 'hypothesis_en']}")
print(f"label: {train.loc[13, 'label']}")

## Preparing Data for Input

Load the ELECTRA tokenizer and tokenize the premise/hypothesis pairs.

In [None]:
# max_length: token budget per (premise, hypothesis) pair; it is handed to the
#             tokenizer and also fixes the width of the model's Input layers.
# batch_size: number of examples per batch in the tf.data pipelines.
max_length, batch_size = 100, 16

In [None]:
# Fetch the tokenizer published with the ELECTRA-large discriminator
# checkpoint, configured to lower-case its input.
tokenizer = transformers.ElectraTokenizer.from_pretrained(
    "google/electra-large-discriminator",
    do_lower_case=True,
)

In [None]:
# Tokenize every translated (premise, hypothesis) training pair.
# padding="max_length" is used instead of padding=True: the latter pads only
# to the longest pair actually present, whereas the model declares
# Input(shape=(max_length,)), so every row must be exactly `max_length` wide.
train_encoded = tokenizer(
    text=list(train.premise_en.values),
    text_pair=list(train.hypothesis_en.values),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='tf',
)

In [None]:
# Hold out the first 33% of the rows for validation and train on the rest.
val_size = int(len(train) * 0.33)

# Pair the tokenizer's output dict with the integer class labels, row by row.
features = train_encoded.data
labels = train.label.values
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

val_dataset = dataset.take(val_size).batch(batch_size)
train_dataset = dataset.skip(val_size).batch(batch_size)

Model

In [None]:
# Checkpoint of the pretrained encoder; this is the same checkpoint string the
# tokenizer is loaded from.
MODEL_NAME = 'google/electra-large-discriminator'
# Load the pretrained weights inside the strategy scope so that the variables
# created while loading are placed under `tpu_strategy`.
with tpu_strategy.scope():
    transformer = transformers.TFAutoModel.from_pretrained(MODEL_NAME)

## Creating & Training Model

In [None]:
def create_model():
    """Build the three-class NLI classifier on top of the global ``transformer``.

    The model takes three int32 inputs of width ``max_length`` named
    ``input_ids``, ``attention_mask`` and ``token_type_ids``. The first output
    of the transformer is sliced at sequence position 0 (presumably the
    ``[CLS]`` token) and fed through two ReLU dense layers (400 and 100 units)
    followed by a 3-unit softmax layer.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping the three inputs to class
        probabilities.
    """
    feature_names = ('input_ids', 'attention_mask', 'token_type_ids')
    inputs = tuple(
        tf.keras.Input(shape=(max_length,), name=feature_name, dtype='int32')
        for feature_name in feature_names
    )

    # Element 0 of the transformer's output is the per-token sequence output.
    hidden_states = transformer(inputs)[0]
    first_token = hidden_states[:, 0, :]

    hidden = tf.keras.layers.Dense(400, activation='relu')(first_token)
    hidden = tf.keras.layers.Dense(100, activation='relu')(hidden)
    probabilities = tf.keras.layers.Dense(
        3, activation='softmax', name='output_layer'
    )(hidden)

    return tf.keras.Model(inputs=inputs, outputs=probabilities)

In [None]:
# Build and compile the classifier inside the strategy scope so that its
# variables are created under `tpu_strategy`.
with tpu_strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # The output layer already applies softmax, so the loss must treat the
    # predictions as probabilities (from_logits=False). This is what the
    # 'sparse_categorical_crossentropy' string shortcut does as well.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    model.summary()

In [None]:
# Multiply the learning rate by 0.3 whenever the validation loss fails to
# improve for one epoch, but never go below 1e-7.
lr_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=1,
    min_lr=1e-7,
)

# `train_dataset` and `val_dataset` are already batched tf.data pipelines, so
# `batch_size` is not passed to fit(): Keras expects it to be left unset for
# dataset inputs.
history = model.fit(
    train_dataset,
    epochs=10,
    verbose=2,
    callbacks=[lr_reduction],
    validation_data=val_dataset,
)

In [None]:
# Tokenize every translated (premise, hypothesis) test pair.
# padding="max_length" is used instead of padding=True: the latter pads only
# to the longest pair actually present, whereas the model declares
# Input(shape=(max_length,)), so every row must be exactly `max_length` wide.
encoding_test = tokenizer(
    text=list(test.premise_en.values),
    text_pair=list(test.hypothesis_en.values),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='tf',
)

In [None]:
# Score the test set; each row of `pred_submission` holds the three class
# probabilities produced by the softmax output layer.
pred_submission = model.predict(
    encoding_test.data,
    batch_size=128,
    verbose=1,
)

# The predicted label is the index of the highest-probability class.
test_pred_labels = np.argmax(pred_submission, axis=1)

# Attach the predictions and keep only the two columns written to the
# submission file.
test['prediction'] = test_pred_labels
submission_columns = ['id', 'prediction']
test = test[submission_columns]


In [None]:
# Write the id/prediction pairs to the submission file in the working
# directory; index=False keeps the DataFrame index out of the CSV.
test.to_csv("submission.csv", index = False)