In [None]:
#python basics
from matplotlib import pyplot as plt
from tqdm import tqdm
import math, os, re, time, random, json, gc
import numpy as np, pandas as pd, seaborn as sns

#deep learning basics
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa

#nlp augmentation
!pip install --quiet googletrans
from googletrans import Translator

#easy way to shuffle rows
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#get current TensorFlow version
print("Currently using Tensorflow version " + tf.__version__)

# It's Elementary, My Dear Watson

**Natural Language Inference (NLI) is a specific type of NLP task where we must determine whether or not a hypothesis is true based on a premise. Specifically, given a pair of sentences, can we classify them into three different classes: 0 - entailment, 1 - contradiction, 2 - neutral?**

**The current leading model in this field is RoBERTa, described by its creators as a 'robustly optimized BERT pretraining approach'. It changes some of the key hyperparameters of BERT and removes the next-sentence pretraining objective altogether. The original paper can be found [here](https://arxiv.org/abs/1907.11692) and the source code [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta)**

**Now, we have 15 different languages in our dataset, so we cannot use the standard pre-trained RoBERTa model as it has only been trained on English sequences. Luckily, there is [XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html) (original paper can be found [here](https://arxiv.org/abs/1911.02116)) which has been trained on 2.5TB of filtered CommonCrawl data in 100 different languages. The implementation procedure is the same as RoBERTa's, so it is easy enough to deploy. Let's see how:**

In [None]:
SEED = 34

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and then
    seeds TensorFlow, NumPy and the stdlib ``random`` module with it.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    #same seed for each library, in a fixed order
    for apply_seed in (tf.random.set_seed, np.random.seed, random.seed):
        apply_seed(seed)
    
seed_everything(SEED)

In [None]:
#preferred accelerator; we fall back to the default (CPU / single GPU)
#strategy whenever a TPU cannot be found or cannot be initialized
DEVICE = 'TPU'
tpu = None

if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            #`except _:` referenced an undefined name, so a real failure here
            #surfaced as a NameError; catch it properly and fall back so that
            #`strategy` is always defined further down
            print("failed to initialize TPU")
            print(f"reason: {err!r}")
            tpu = None
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

#choose batch size - will depend on cores of our device
BATCH_SIZE = 16 * REPLICAS

# EDA

**A very brief data exploration**

In [None]:
#get CSV files
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

print(f'Train shape: {train.shape}')
train.head()

In [None]:
print(f'Test shape: {test.shape}')
test.head()

In [None]:
#peek at a premise/hypothesis pair and their label
print(f"Premise: {train['premise'].values[0]}")
print(f"Hypothesis: {train['hypothesis'].values[0]}")
print(f"Label: {train['label'].values[0]}")

In [None]:
#peek at a premise/hypothesis pair and their label
print(f"Premise: {train['premise'].values[1]}")
print(f"Hypothesis: {train['hypothesis'].values[1]}")
print(f"Label: {train['label'].values[1]}")

In [None]:
#explore the distribution of classes and languages
fig, ax = plt.subplots(figsize = (15, 10))

#for maximum aesthetics
palette = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True)

#pass columns by keyword (a positional Series is interpreted as `data` by
#newer seaborn releases) and draw on the axes we just created
graph1 = sns.countplot(data = train, x = 'language', hue = 'label', palette = palette, ax = ax)

#set title
graph1.set_title('Distribution of Languages and Labels')

plt.tight_layout()
plt.show()

# Back-Translation

**In computer vision problems, there is a virtual infinitude of techniques you can use to augment your images ranging from simple techniques like randomly flipping images to blending images together with CutMix or MixUp. In natural language processing, it is not as easy to come up with similar augmentation strategies because it is hard to determine which transformations will preserve the meaning of the original words:**

![](https://amitness.com/images/semantic-invariance-nlp.png)
*Image from [@amitness](https://www.kaggle.com/amitness) on his excellent post on NLP augmentation [here](https://amitness.com/2020/05/data-augmentation-for-nlp/)*   


**The first thought I had was to randomly replace words with their synonyms or to randomly add word synonyms to the sequence, but then I saw [this kernel](https://www.kaggle.com/jpmiller/augmenting-data-with-translations) which is based on [this discussion thread](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/48038) and realized we can do better: we can use translation to augment our data and try several things:**

1. We can experiment and see if training our model on one language is better/worse than training on multiple languages
2. We can change the distribution of languages in our dataset, perhaps translating sentences to low-resource languages like Swahili and Urdu
3. We can randomly translate sentences to another language and then translate them back to the original like so:


![](https://amitness.com/images/backtranslation-en-fr.png)

*Image from [@amitness](https://www.kaggle.com/amitness) on his excellent post on NLP augmentation [here](https://amitness.com/2020/05/data-augmentation-for-nlp/)*

**Please note that some of these language codes are slightly different within the `googletrans` Python API. See [here](https://py-googletrans.readthedocs.io/en/latest/) for more**

In [None]:
def back_translate(sequence, PROB = 1):
    """Augment a sentence by translating it to a random language and back.

    Parameters
    ----------
    sequence : str
        Text to augment.
    PROB : float, default 1
        Probability of returning the back-translated text; otherwise the
        input is returned unchanged.

    Returns
    -------
    str
        The back-translated text, or ``sequence`` itself when the detected
        language is not in the supported list or the probability roll fails.
    """
    languages = ['en', 'fr', 'th', 'tr', 'ur', 'ru', 'bg', 'de', 'ar', 'zh-cn', 'hi',
                 'sw', 'vi', 'es', 'el']

    #instantiate translator
    translator = Translator()

    #store original language so we can convert back
    org_lang = translator.detect(sequence).lang

    #if detected language not in our list of languages, do nothing
    if org_lang not in languages:
        return sequence

    #randomly choose a *different* language to translate sequence to
    #(compare by value: `is not` tests object identity and never excluded
    #the original language)
    random_lang = np.random.choice([lang for lang in languages if lang != org_lang])

    #apply with certain probability - decide before translating so that we
    #do not spend two translation requests on a result we would throw away
    if np.random.uniform(0, 1) > PROB:
        return sequence

    #translate to new language and back to original
    translated = translator.translate(sequence, dest = random_lang).text
    translated_back = translator.translate(translated, dest = org_lang).text

    return translated_back

#check performance
for i in range(5):
    output = back_translate('I genuinely have no idea what the output of this sequence of words will be')
    print(output)

**I have already created augmented datasets with the above translation method in my kernel [here](https://www.kaggle.com/tuckerarrants/using-google-translate-for-nlp-augmentation/edit/run/40695539), and the datasets can be found [here](https://www.kaggle.com/tuckerarrants/contradictorywatsontranslationaug) and [here](https://www.kaggle.com/tuckerarrants/contradictorywatsontwicetranslatedaug). Let's quickly compare the three separate datasets:**

In [None]:
train.head()

In [None]:
#offline loading of augmented datasets
train_aug = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/translation_aug_train.csv')
train_aug.head()

In [None]:
#offline loading of augmented datasets
train_twice_aug = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/twice_translated_aug_train.csv')
train_twice_aug.head()

In [None]:
#offline loading of augmented datasets
train_thrice_aug = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/thrice_translation_aug_train.csv')
train_thrice_aug.head()

**Wonderful! Now you could take this dataset and apply the same procedure to generate an ever more diverse set of augmentations, or you could increase the complexity of the translation by chaining together multiple languages, i.e.**

> English -> French -> Russian -> .... -> English

**You can experiment to see if adding samples from these back-translated training datasets gives you better performance OR you can use the back-translated test datasets for TTA**

# Upsampling with Translation

**Now we can use Google Translate to add additional samples for training by translating premise/hypothesis pairs to 'low-resource' languages. I have already done this [here](https://www.kaggle.com/tuckerarrants/using-google-translate-for-nlp-augmentation) so we can just import them. The following datasets are the original train dataset translated to Vietnamese, Hindi, and Bulgarian:**

In [None]:
#get CSV files
train_vi = pd.read_csv("../input/contradictorytranslatedtrain/train_vi.csv")
train_hi = pd.read_csv("../input/contradictorytranslatedtrain/train_hi.csv")
train_bg = pd.read_csv("../input/contradictorytranslatedtrain/train_bg.csv")

In [None]:
#sanity check
train_vi.head()

In [None]:
#sanity check
train_hi.head()

In [None]:
#sanity check
train_bg.head()

**Great, so if we wanted to experiment with adding these newly translated samples, we can simply add them to training. Be careful though, you do not want RoBERTa to overfit the validation data by seeing labels of the same premise/hypothesis pair in different languages, so split your data into train/validation before upsampling and remove the validation samples from your training set, if you decide to experiment with this upsampling technique**

# RoBERTa

**HuggingFace Transformers makes it unbelievably easy to use transformers. In fact, you don't even need to specify the transformer or tokenizer: its architecture can be guessed from the name or path of the pretrained model you specify in the `from_pretrained` method. To read more about AutoModels/Tokenizers, see [this](https://huggingface.co/transformers/model_doc/auto.html)**

In [None]:
#get HuggingFace transformers
!pip install --quiet transformers

#import model and Tokenizer
from transformers import TFAutoModel, AutoTokenizer

#get paths to TensorFlow XLM-RoBERTa base and large models
roberta_base = "jplu/tf-xlm-roberta-base"
roberta_large = 'jplu/tf-xlm-roberta-large'

### 1. Original Dataset


In [None]:
#offline load back-translated test samples
test_bt = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/translation_aug_test.csv')
test_bt_twice = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/twice_translated_aug_test.csv')
test_bt_thrice = pd.read_csv('../input/contradictorywatsontwicetranslatedaug/thrice_translation_aug_test.csv')

**Below is a function that covers the 2 step process where we tokenize our text data with a HuggingFace object `TOKENIZER` and then convert it into a `tf.data.Dataset` object for use with TPU:**

In [None]:
TOKENIZER = AutoTokenizer.from_pretrained(roberta_large)

#function to encode text and convert dataset to tensor dataset
def to_tf_dataset(dataset, max_len, repeat = False, shuffle = False, labeled = True, batch_size = BATCH_SIZE):
    """Tokenize premise/hypothesis pairs and wrap them in a `tf.data.Dataset`.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain `premise` and `hypothesis` columns, plus `label` when
        `labeled` is True.
    max_len : int
        Length passed to the tokenizer as `max_length`.
    repeat : bool
        Repeat the dataset indefinitely.
    shuffle : bool
        Shuffle with a 2048-element buffer and allow non-deterministic order.
    labeled : bool
        Yield `(input_ids, label)` pairs instead of `input_ids` only.
    batch_size : int
        Number of examples per batch.

    Returns
    -------
    tf.data.Dataset
        Batched and prefetched dataset.
    """
    dataset_text = dataset[['premise', 'hypothesis']].values.tolist()
    #NOTE(review): `pad_to_max_length` is deprecated in newer transformers
    #releases in favour of `padding = 'max_length'` - left untouched here
    dataset_enc = TOKENIZER.batch_encode_plus(dataset_text, pad_to_max_length = True, max_length = max_len)

    if labeled:
        tf_dataset = tf.data.Dataset.from_tensor_slices((dataset_enc['input_ids'], dataset['label']))
    else:
        tf_dataset = tf.data.Dataset.from_tensor_slices((dataset_enc['input_ids']))

    #shuffle BEFORE repeating: shuffling an already-repeated stream lets the
    #buffer mix samples from consecutive passes over the data
    if shuffle:
        tf_dataset = tf_dataset.shuffle(2048)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        tf_dataset = tf_dataset.with_options(opt)

    if repeat:
        tf_dataset = tf_dataset.repeat()

    tf_dataset = tf_dataset.batch(batch_size)
    tf_dataset = tf_dataset.prefetch(AUTO)

    return tf_dataset

**It is generally a good idea to use a learning rate scheduler when transfer learning. Our pretrained model already knows quite a bit, so we want to start the learning rate at 0 - if we start with a high learning rate, there is a chance we 'erase' the weights that the model already had, defeating the purpose of transfer learning. We then slowly increase the learning rate as the model adapts to the new data:** 

**That being said, I am still figuring out the best learning rate schedule as the current one does not seem to provide much increase in score/smoother training**

In [None]:
###########################################
#### Configuration
###########################################
#learning rate at epoch 0, at the peak, and the floor used by the smooth decay
LR_START = 1e-6
LR_MAX = 1e-6 * 8
LR_MIN = 1e-6
#number of epochs spent ramping up / holding the peak
LR_RAMPUP_EPOCHS = 2
LR_SUSTAIN_EPOCHS = 0
#multiplicative decay factor applied after the sustain phase
LR_DECAY = .8

#stepwise schedule
def lrfn_step(epoch):
    """Linear ramp-up to LR_MAX, optional plateau, then decay by LR_DECAY every 2 epochs."""
    #linear warm-up from LR_START towards LR_MAX
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START

    #hold the peak
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    #staircase decay: the exponent only increases every second epoch
    return LR_MAX * LR_DECAY**((epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS)//2)


#smoothish schedule
def lrfn_smooth(epoch):
    """Linear ramp-up to LR_MAX, optional plateau, then exponential decay towards LR_MIN."""
    #linear warm-up from LR_START towards LR_MAX
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START

    #hold the peak
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    #per-epoch exponential decay of the part above LR_MIN
    return (LR_MAX - LR_MIN) * LR_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    
lr_callback_step = tf.keras.callbacks.LearningRateScheduler(lrfn_step, verbose = True)
lr_callback_smooth = tf.keras.callbacks.LearningRateScheduler(lrfn_smooth, verbose = True)

#visualize learning rate schedule
rng = [i for i in range(25)]
y1 = [lrfn_step(x) for x in rng]
y2 = [lrfn_smooth(x) for x in rng]
fix, ax = plt.subplots(1,2, figsize = (15, 5))
ax[0].plot(rng, y1)
ax[1].plot(rng, y2)
plt.tight_layout()
print("Learning rate schedule for step schedule: {:.3g} to {:.3g} to {:.3g}".format(y1[0], max(y1), y1[-1]))
print("Learning rate schedule for smooth schedule: {:.3g} to {:.3g} to {:.3g}".format(y2[0], max(y2), y2[-1]))

**Let's create a simple model with a RoBERTa layer connected to a `softmax` activated layer with 3 nodes to classify our premise/hypothesis pairs as 3 different labels:**

In [None]:
#helper function to create our model
def build_model(transformer_layer, max_len, learning_rate):
    """Build and compile a 3-class classifier on top of a pretrained transformer.

    Parameters
    ----------
    transformer_layer : str
        Name or path handed to `TFAutoModel.from_pretrained`.
    max_len : int
        Length of the `input_ids` sequences fed to the model.
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Compiled model mapping `input_ids` to softmax probabilities over
        the 3 labels.
    """
    #must use this to send to TPU cores
    with strategy.scope():
        #define input(s)
        input_ids = tf.keras.Input(shape = (max_len,), dtype = tf.int32)

        #insert roberta layer
        roberta = TFAutoModel.from_pretrained(transformer_layer)
        roberta = roberta(input_ids)[0]

        #only need <s> token here, so we extract it now
        out = roberta[:, 0, :]

        out = tf.keras.layers.BatchNormalization()(out)

        #add our softmax layer
        out = tf.keras.layers.Dense(3, activation = 'softmax')(out)

        #assemble model and compile
        #(`learning_rate` is the supported keyword; `lr` is a deprecated alias)
        model = tf.keras.Model(inputs = input_ids, outputs = out)
        model.compile(
                        optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
                        loss = 'sparse_categorical_crossentropy',
                        metrics = ['accuracy'])

    return model

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

###########################################
#### Configuration
###########################################

LR_RATE = 5e-6
EPOCHS = 15
FOLDS = 4
MAX_LEN = 85
STEPS_PER_EPOCH = len(train) // BATCH_SIZE
#number of views averaged for TTA (original + three back-translated copies)
TTA = 4
VERBOSE = 2


def _predict_views(model, frames):
    """Return `model` predictions for each unlabeled DataFrame in `frames`, in order."""
    return [
        model.predict(
            to_tf_dataset(frame, shuffle = False, labeled = False, repeat = False, max_len = MAX_LEN),
            verbose = VERBOSE)
        for frame in frames
    ]

############################################
#### Training
############################################

#fold-averaged test probabilities, without and with TTA
preds = np.zeros((len(test), 3))
preds_tta = np.zeros((len(test), 3))
skf = StratifiedKFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

#folds are stratified on language, not on label
for fold,(train_index,val_index) in enumerate(skf.split(train, train['language'])):

    #to clear TPU memory
    if DEVICE=='TPU':
        if tpu: tf.tpu.experimental.initialize_tpu_system(tpu)

    #build model
    K.clear_session()
    model = build_model(roberta_large, max_len = MAX_LEN, learning_rate = LR_RATE)

    #save best model from each fold
    #(one path variable so saving, loading and removing cannot drift apart)
    ckpt_path = f'fold-{fold}.h5'
    sv = tf.keras.callbacks.ModelCheckpoint(ckpt_path, monitor = 'val_loss', verbose = 0,
                        save_best_only = True, save_weights_only = True, mode = 'min')

    #get our datasets
    #(skf.split yields positional indices, hence .iloc)
    train_ds = to_tf_dataset(train.iloc[train_index], labeled = True, shuffle = True, repeat = True, max_len = MAX_LEN)
    val_ds = to_tf_dataset(train.iloc[val_index], labeled = True, shuffle = False, repeat = False, max_len = MAX_LEN)


    #and go
    print('')
    print('#'*25); print('#### FOLD',fold+1)
    print('Training...'); print('')
    history = model.fit(train_ds, validation_data = val_ds, callbacks = [sv],
                        epochs = EPOCHS, steps_per_epoch = STEPS_PER_EPOCH,
                        verbose = VERBOSE); print('')


    print('Loading best model...')
    model.load_weights(ckpt_path)

############################################
#### Validation
############################################

    #predict validation with TTA
    print('Predicting validation with TTA...')

    #offline load pre-back-translated datasets
    #NOTE(review): assumes the augmented frames are row-aligned with `train` - confirm
    val_df = train.iloc[val_index]
    val_df_bt = train_aug.iloc[val_index]
    val_df_bt_twice = train_twice_aug.iloc[val_index]
    val_df_bt_thrice = train_thrice_aug.iloc[val_index]

    #predict with augmented validation sets
    val_pred1, val_pred2, val_pred3, val_pred4 = _predict_views(
        model, [val_df, val_df_bt, val_df_bt_twice, val_df_bt_thrice])

    val_preds = (val_pred1 + val_pred2 + val_pred3 + val_pred4) / TTA

    print(f"Without TTA: {accuracy_score(val_df['label'], val_pred1.argmax(axis = 1))}")
    print(f"With TTA: {accuracy_score(val_df['label'], val_preds.argmax(axis = 1))}")
    print('')

############################################
#### Prediction
############################################

    #predict the test set with TTA
    print('Predicting OOF with TTA...')

    #predict with original and back-translated test sets
    pred1, pred2, pred3, pred4 = _predict_views(
        model, [test, test_bt, test_bt_twice, test_bt_thrice])

    preds_tta += (pred1 + pred2 + pred3 + pred4) / TTA / FOLDS
    preds += pred1 / FOLDS

    #so we don't hit memory limits
    #(remove the file we actually wrote rather than a hard-coded /kaggle/working path)
    os.remove(ckpt_path)
    del model ; z = gc.collect()

# Submission

In [None]:
USE_TTA = False

In [None]:
#choose the fold-averaged probabilities with or without test-time augmentation
final_probs = preds_tta if USE_TTA else preds

#one predicted class per test id
submission = pd.DataFrame()
submission['id'] = test['id']
submission['prediction'] = final_probs.argmax(axis = 1)
    
#sanity check 
submission.head()

In [None]:
submission.to_csv('submission.csv', index = False)
print('Submission saved')