Hello! 

In this notebook, I made an attempt to compile the ideas of other contestants.

The main differences of my work are as follows:


* Using the full MNLI & XNLI datasets for XLM-R training
* I skipped SNLI dataset because it showed poor validation results
* Training for more epochs (6) without regard to the competition's 120-minute run-time limit, so the model weights are saved at the end
* The competition's training dataset is not used for training; part of it is used only for validation
* Average pooling is used to extract features from the transformer. It is preferable to using the CLS token, as recommended in many works, for example the `Sentence-BERT` paper
* I tried to make the code as modular, clean, and self-documenting as possible

Also, I have little experience with TensorFlow Datasets and related tools, so I would appreciate comments on how to speed things up and reduce memory usage.

References:

[XLM-Roberta pretrained on NLI](https://www.kaggle.com/qinhui1999/more-nli-datasets-xmlr-large)

[Tensorflow TPU starter](https://www.kaggle.com/xhlulu/contradictory-watson-concise-keras-xlm-r-on-tpu)

[Sentence BERT paper](https://arxiv.org/abs/1908.10084)

In [None]:
# Environment setup: upgrade pip and transformers, then install the `nlp`
# dataset loader. Standard output is discarded to keep the cell quiet.
# NOTE(review): none of these installs is version-pinned, so a re-run may pull
# releases that behave differently from the ones this notebook was written for.
!pip install --upgrade pip > /dev/null
!pip install --upgrade transformers > /dev/null
# NOTE(review): the `nlp` package appears to have been renamed to `datasets`
# upstream - confirm before changing, since the code below imports `nlp`.
!pip install nlp > /dev/null

In [None]:
import os
import gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import nlp

import plotly.express as px

In [None]:
def init_strategy():
    """Create the distribution strategy used to build and train the model.

    Tries to resolve, connect to and initialise a TPU cluster and wrap it in
    a TPU strategy. If any of those steps raises ``ValueError`` the default
    TensorFlow strategy (CPU / single GPU) is returned instead.

    Returns:
        The TPU strategy when the TPU setup succeeds, otherwise the result of
        ``tf.distribute.get_strategy()``.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        tpu_strategy = tf.distribute.experimental.TPUStrategy(resolver)
    except ValueError:
        # TPU setup failed: fall back to the default strategy.
        fallback = tf.distribute.get_strategy()
        print("Init CPU/GPU strategy")
        return fallback

    print("Init TPU strategy")
    return tpu_strategy

def build_model(model_name, maxlen, head="avg_pooling"):
    """Build and compile a 3-class classifier on top of a pretrained encoder.

    Args:
        model_name: Checkpoint name or path passed to
            ``TFAutoModel.from_pretrained``.
        maxlen: Width of the ``input_ids`` input, i.e. the number of token ids
            per example.
        head: How the per-token encoder output is reduced to one vector:
            ``"cls"`` takes the first position, ``"avg_pooling"`` averages
            over the sequence axis, ``"max_pooling"`` takes the maximum over
            the sequence axis.

    Returns:
        A compiled Keras ``Model`` mapping ``input_ids`` to a 3-way softmax.

    Raises:
        NotImplementedError: If ``head`` is not one of the supported values.
    """
    # Validate before loading the encoder so a typo does not cost a large
    # checkpoint download / initialisation.
    if head not in ("cls", "avg_pooling", "max_pooling"):
        raise NotImplementedError(
            f"Unknown head {head!r}; expected 'cls', 'avg_pooling' or 'max_pooling'"
        )

    # model encoding
    input_ids = Input(shape=(maxlen,), dtype=tf.int32, name="input_ids")
    encoder = TFAutoModel.from_pretrained(model_name)
    # First element of the encoder output; presumably the per-token hidden
    # states of shape (batch, maxlen, hidden) - confirm for the model in use.
    # NOTE(review): only input_ids are passed (no attention mask), so padding
    # positions take part in attention and in the pooling below.
    encoder_output = encoder(input_ids)[0]

    # convert transformer encodings to 1d-vector
    if head == "cls":
        features = encoder_output[:, 0, :]  # first token as the feature vector
    elif head == "avg_pooling":
        features = GlobalAveragePooling1D()(encoder_output)
    else:  # "max_pooling" (validated above)
        features = GlobalMaxPooling1D()(encoder_output)

    # 3-class softmax
    out = Dense(3, activation='softmax')(features)

    # define model
    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def tokenize_dataframe(df, tokenizer, max_length):
    """Tokenize the premise/hypothesis pairs of ``df`` to fixed-width id lists.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns and,
            optionally, a ``label`` column.
        tokenizer: Object exposing ``batch_encode_plus`` (a Hugging Face
            tokenizer in this notebook).
        max_length: Number of token ids per example; longer pairs are
            truncated and shorter ones are padded up to this length.

    Returns:
        Tuple ``(x, y)``. ``x`` is the ``input_ids`` value returned by the
        tokenizer (kept as returned, not converted to an array, because a
        later cell merges two such results with ``+=``). ``y`` is the
        ``label`` column as an array, or ``None`` when ``df`` has no
        ``label`` column.
    """
    # tokenize
    text_pairs = df[['premise', 'hypothesis']].values.tolist()
    # Pad to max_length rather than to the longest pair in this call:
    # padding=True yields a width that depends on the data, so separately
    # tokenized frames could disagree with each other and with the model's
    # fixed-width input.
    encoded = tokenizer.batch_encode_plus(
        text_pairs,
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    # features
    x = encoded['input_ids']
    # labels (absent for the test set)
    y = df.label.values if 'label' in df.columns else None
    return x, y

def load_mnli(use_validation=True):
    """Load GLUE/MNLI through ``nlp`` and return it as a DataFrame.

    Args:
        use_validation: When true, the ``validation_matched`` and
            ``validation_mismatched`` splits are appended after ``train``;
            otherwise only ``train`` is read.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv`` (always ``'en'``). Records with an empty premise or
        hypothesis, or a label outside {0, 1, 2}, are dropped.
    """
    corpus = nlp.load_dataset(path='glue', name='mnli')
    if use_validation:
        splits = ['train', 'validation_matched', 'validation_mismatched']
    else:
        splits = ['train']

    def _valid_rows():
        # Yield one output row per usable record, in split order.
        for split in splits:
            for rec in corpus[split]:
                premise, hypothesis, label = rec['premise'], rec['hypothesis'], rec['label']
                if premise and hypothesis and label in {0, 1, 2}:
                    yield premise, hypothesis, label, 'en'

    return pd.DataFrame(
        list(_valid_rows()),
        columns=['premise', 'hypothesis', 'label', 'lang_abv'],
    )

def load_snli(use_validation=True):
    """Load SNLI through ``nlp`` and return it as a DataFrame.

    Args:
        use_validation: When true, the ``validation`` split is appended after
            ``train``; otherwise only ``train`` is read.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv`` (always ``'en'``). Records with an empty premise or
        hypothesis, or a label outside {0, 1, 2}, are dropped.
    """
    allowed_labels = {0, 1, 2}
    snli = nlp.load_dataset(path='snli')
    split_names = ['train', 'validation'] if use_validation else ['train']

    rows = []
    for split_name in split_names:
        for item in snli[split_name]:
            premise = item['premise']
            hypothesis = item['hypothesis']
            label = item['label']
            if not (premise and hypothesis and label in allowed_labels):
                continue
            rows.append((premise, hypothesis, label, 'en'))

    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

def load_xnli():
    """Load every split of XNLI through ``nlp`` and flatten it per language.

    Each source record holds a ``premise`` mapping (language -> text), a
    ``hypothesis`` mapping with parallel ``language`` / ``translation``
    lists, and a ``label``. One output row is produced per hypothesis
    language for which a premise in the same language exists.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv``. Records with a label outside {0, 1, 2} are dropped, as
        are language pairs whose premise or hypothesis text is missing or
        empty.
    """
    rows = []
    dataset = nlp.load_dataset(path='xnli')
    for split in dataset.keys():
        for record in dataset[split]:
            hypothesis, premise, label = record['hypothesis'], record['premise'], record['label']
            if not (hypothesis and premise and label in {0, 1, 2}):
                continue
            for lang, translation in zip(hypothesis['language'], hypothesis['translation']):
                premise_text = premise.get(lang)
                # Skip missing AND empty texts, matching the empty-string
                # filtering done by the MNLI / SNLI loaders.
                if not premise_text or not translation:
                    continue
                rows.append((premise_text, translation, label, lang))
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])


In [None]:
# Pretrained checkpoint; used for both the tokenizer and the encoder.
MODEL = 'jplu/tf-xlm-roberta-large'
# Number of epochs passed to model.fit.
EPOCHS = 6
# Token length used for tokenizer truncation and for the model's input width.
MAXLEN = 120
# Despite the name, this selects which corpus the fit cell TRAINS on:
# "dataset", "mnli", "xnli" or "mnli+xnli". In every mode validation uses the
# held-out split of the competition data. It is also part of the weights file name.
VALIDATION = "mnli+xnli"

# Distribution strategy; needed here only to size the global batch.
# The training cell calls init_strategy() again before building the model.
strategy = init_strategy()
# 16 examples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Passed to Dataset.prefetch by build_dataset.
auto = tf.data.experimental.AUTOTUNE

def preprocess(df):
    """Tokenize ``df`` with the notebook-wide ``tokenizer`` and ``MAXLEN``.

    Thin convenience wrapper; returns whatever ``tokenize_dataframe`` returns.
    """
    return tokenize_dataframe(df, tokenizer=tokenizer, max_length=MAXLEN)

In [None]:
%%time 

# load data: competition files
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
# NOTE(review): `submission` is not referenced again in the cells visible here.
submission = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')

# external NLI corpora (SNLI is available via load_snli but is not loaded here)
mnli = load_mnli()
xnli = load_xnli()

# tokenize the competition data; the test frame yields no labels
x, y = preprocess(train)
x_test, _ = preprocess(test)

# project dataset validation: hold out 20% of the competition training data,
# with a fixed seed so the split is the same on every run
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=2020)

# nli datasets
x_mnli, y_mnli = preprocess(mnli)
x_xnli, y_xnli = preprocess(xnli)

# the raw frames are no longer needed once tokenized; release them
del mnli, xnli
gc.collect()

In [None]:
%%time

# datasets
def build_dataset(x, y, mode, batch_size, shuffle_buffer=2048):
    """Wrap features (and labels) in a batched ``tf.data.Dataset``.

    Args:
        x: Features, sliced along the first axis.
        y: Labels aligned with ``x``; ignored when ``mode`` is ``"test"``.
        mode: ``"train"`` -> repeated, shuffled, batched, prefetched;
            ``"valid"`` -> batched, cached, prefetched;
            ``"test"`` -> batched features only.
        batch_size: Batch size applied in every mode.
        shuffle_buffer: Size of the shuffle buffer in ``"train"`` mode.

    Returns:
        The configured ``tf.data.Dataset``.

    Raises:
        NotImplementedError: If ``mode`` is not one of the supported values.
    """
    if mode == "train":
        # The dataset repeats forever, so the caller must pass steps_per_epoch
        # to model.fit.
        # NOTE(review): shuffling happens after repeat() and only within a
        # window of `shuffle_buffer` examples, so input that is ordered by
        # source stays largely ordered - consider a larger buffer.
        return (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .repeat()
            .shuffle(shuffle_buffer)
            .batch(batch_size)
            .prefetch(auto)
        )
    if mode == "valid":
        return (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .batch(batch_size)  # honour the argument, not the global BATCH_SIZE
            .cache()
            .prefetch(auto)
        )
    if mode == "test":
        return (
            tf.data.Dataset
            .from_tensor_slices(x)
            .batch(batch_size)  # honour the argument, not the global BATCH_SIZE
        )
    raise NotImplementedError(
        f"Unknown mode {mode!r}; expected 'train', 'valid' or 'test'"
    )

# Competition data: the full training set, then its train / validation split,
# then the unlabeled test set.
# NOTE(review): `dataset` (full competition training set) is not used by the
# training cell visible below.
dataset = build_dataset(x, y, "train", BATCH_SIZE)
train_dataset = build_dataset(x_train, y_train, "train", BATCH_SIZE)
valid_dataset = build_dataset(x_valid, y_valid, "valid", BATCH_SIZE)
test_dataset = build_dataset(x_test, None, "test", BATCH_SIZE)

# merge XNLI & MNLI
# x_mnli is extended IN PLACE: from here on it holds the MNLI rows followed by
# the XNLI rows, and x_xnli no longer exists. y_mnli and y_xnli stay separate.
# Because x_xnli is deleted, this cell cannot be re-run without first
# re-running the tokenization cell.
x_mnli += x_xnli
del x_xnli; gc.collect()
nli_dataset = build_dataset(x_mnli, np.concatenate([y_mnli, y_xnli]), "train", BATCH_SIZE)

In [None]:
# fit parameters
fit_params = dict(epochs=EPOCHS, verbose=2)
validation = VALIDATION

# The dataset cell extended x_mnli in place with the XNLI rows and deleted
# x_xnli, while y_mnli / y_xnli were left separate. The first len(y_mnli) rows
# of x_mnli are therefore MNLI and the remainder XNLI.
n_mnli = len(y_mnli)
assert len(x_mnli) == n_mnli + len(y_xnli), "x_mnli is expected to hold MNLI followed by XNLI"

# Pick the training dataset and its size for the selected mode. The MNLI-only
# and XNLI-only datasets are built on demand so they cost no memory otherwise.
# This runs before the model is created so a bad mode fails fast.
if validation == "dataset":
    fit_dataset, n_samples = train_dataset, len(x_train)
elif validation == "mnli":
    fit_dataset = build_dataset(x_mnli[:n_mnli], y_mnli, "train", BATCH_SIZE)
    n_samples = n_mnli
elif validation == "xnli":
    fit_dataset = build_dataset(x_mnli[n_mnli:], y_xnli, "train", BATCH_SIZE)
    n_samples = len(y_xnli)
elif validation == "mnli+xnli":
    fit_dataset, n_samples = nli_dataset, len(x_mnli)
else:
    raise ValueError(
        f"Unknown VALIDATION {validation!r}; "
        "expected 'dataset', 'mnli', 'xnli' or 'mnli+xnli'"
    )

# create TPU context
# it's significant to make tpu context every training to free up memory
strategy = init_strategy()
with strategy.scope():
    model = build_model(MODEL, MAXLEN)

# Training datasets repeat forever, so one epoch is defined as one pass worth
# of full batches over the selected corpus.
steps_per_epoch = n_samples // BATCH_SIZE
history = model.fit(
    fit_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_dataset,
    **fit_params
)


In [None]:
# Persist the trained weights; the file name records the training source and
# the number of epochs.
weights_file = f"XLMR_{VALIDATION}_ep{EPOCHS}.h5"
model.save_weights(weights_file)

# Best validation accuracy reached in any epoch.
hist = history.history
best_val_accuracy = max(hist['val_accuracy'])
print(best_val_accuracy)

# Training vs. validation accuracy per epoch (epochs numbered from 1).
epoch_numbers = range(1, len(hist['loss']) + 1)
px.line(
    hist,
    x=epoch_numbers,
    y=['accuracy', 'val_accuracy'],
    title='Model Accuracy',
    labels={'x': 'Epoch', 'value': 'Accuracy'},
)