Hello!

This is the inference notebook for this [train notebook](https://www.kaggle.com/alturutin/watson-xml-r-nli-train).

I also interpret the results and show why the accuracy in this competition is so high.


In [None]:
# Upgrade pip and transformers, then install the `nlp` package imported below.
# Standard output of each command is redirected to /dev/null to keep the cell quiet.
!pip install --upgrade pip > /dev/null
!pip install --upgrade transformers > /dev/null
!pip install nlp > /dev/null

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import plotly.express as px

# NN
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import nlp

In [None]:
def init_strategy():
    """Return the ``tf.distribute`` strategy to build and run the model under.

    A TPU cluster is resolved, connected to and initialized first. If any of
    those steps raises ``ValueError``, the current default strategy is used
    instead.

    Returns:
        A TPU strategy when the TPU setup succeeded, otherwise the result of
        ``tf.distribute.get_strategy()``.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        tpu_strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Init TPU strategy")
        return tpu_strategy
    except ValueError:
        # No TPU available: fall back to the default (CPU / single GPU) strategy.
        fallback_strategy = tf.distribute.get_strategy()
        print("Init CPU/GPU strategy")
        return fallback_strategy

def build_model(model_name, maxlen, head="avg_pooling"):
    """Build and compile a 3-class classifier on top of a pretrained encoder.

    Args:
        model_name: Name or path passed to ``TFAutoModel.from_pretrained``.
        maxlen: Length of the ``input_ids`` sequence the model accepts.
        head: How the encoder output is reduced to one vector per example:
            ``"cls"`` takes position 0 of the sequence axis,
            ``"avg_pooling"`` applies ``GlobalAveragePooling1D`` and
            ``"max_pooling"`` applies ``GlobalMaxPooling1D``.

    Returns:
        A compiled Keras ``Model`` mapping ``input_ids`` to a 3-way softmax.

    Raises:
        NotImplementedError: If ``head`` is not one of the supported values.
    """
    # Validate first so a typo in `head` fails before the encoder is loaded.
    if head not in ("cls", "avg_pooling", "max_pooling"):
        raise NotImplementedError(f"unsupported head: {head!r}")

    input_ids = Input(shape=(maxlen,), dtype=tf.int32, name="input_ids")
    encoder = TFAutoModel.from_pretrained(model_name)
    # First encoder output; presumably the per-token hidden states
    # (batch, sequence, hidden) -- TODO confirm against the model docs.
    encoder_output = encoder(input_ids)[0]

    # convert transformer encoding to vector
    if head == "cls":
        features = encoder_output[:, 0, :]  # using first token as encoder feature map
    elif head == "avg_pooling":
        features = GlobalAveragePooling1D()(encoder_output)
    else:  # "max_pooling"
        features = GlobalMaxPooling1D()(encoder_output)

    # 3class softmax
    out = Dense(3, activation='softmax')(features)

    # define model
    model = Model(inputs=input_ids, outputs=out)
    model.compile(
        # `learning_rate` replaces the deprecated `lr` keyword.
        Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def tokenize_dataframe(df, tokenizer, max_length):
    """Tokenize premise/hypothesis pairs of ``df`` into fixed-length id lists.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns and,
            optionally, a ``label`` column.
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_length: Length every encoded pair is padded or truncated to.

    Returns:
        Tuple ``(x, y)`` where ``x`` is ``encoded['input_ids']`` and ``y`` is
        the array of ``label`` values, or ``None`` when ``df`` has no
        ``label`` column.
    """
    # tokenize
    pairs = df[['premise', 'hypothesis']].values.tolist()
    encoded = tokenizer.batch_encode_plus(
        pairs,
        # Pad to `max_length` rather than to the longest pair in this call, so
        # every frame yields the same sequence length as the model input.
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    # features
    x = encoded['input_ids']
    # labels
    y = df.label.values if 'label' in df.columns else None
    return x, y

def build_dataset(x, y, mode, batch_size):
    """Wrap features (and labels) in a batched ``tf.data.Dataset``.

    Args:
        x: Features passed to ``from_tensor_slices``.
        y: Labels paired with ``x``; ignored when ``mode == "test"``.
        mode: ``"train"`` (repeat, shuffle with a 2048 buffer, batch, prefetch),
            ``"valid"`` (batch, cache, prefetch) or ``"test"`` (features only,
            batched).
        batch_size: Number of examples per batch.

    Returns:
        The configured ``tf.data.Dataset``.

    Raises:
        NotImplementedError: If ``mode`` is not one of the supported values.
    """
    # Referenced directly instead of through a module-level alias so the
    # function does not depend on another cell having been run first.
    autotune = tf.data.experimental.AUTOTUNE

    if mode == "train":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .repeat()
            .shuffle(2048)
            .batch(batch_size)
            .prefetch(autotune)
        )
    elif mode == "valid":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices((x, y))
            .batch(batch_size)
            .cache()
            .prefetch(autotune)
        )
    elif mode == "test":
        dataset = (
            tf.data.Dataset
            .from_tensor_slices(x)
            .batch(batch_size)
        )
    else:
        raise NotImplementedError(f"unsupported mode: {mode!r}")
    return dataset

def load_mnli(use_validation=True):
    """Load GLUE MNLI through ``nlp`` as a flat DataFrame.

    Args:
        use_validation: When true, the ``validation_matched`` and
            ``validation_mismatched`` splits are read in addition to ``train``.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv`` (always ``'en'``). Records with an empty premise or
        hypothesis, or a label outside {0, 1, 2}, are dropped.
    """
    glue_mnli = nlp.load_dataset(path='glue', name='mnli')
    if use_validation:
        splits = ['train', 'validation_matched', 'validation_mismatched']
    else:
        splits = ['train']

    valid_labels = {0, 1, 2}
    rows = []
    for split in splits:
        for rec in glue_mnli[split]:
            premise, hypothesis, label = rec['premise'], rec['hypothesis'], rec['label']
            if not (premise and hypothesis):
                continue
            if label not in valid_labels:
                continue
            rows.append((premise, hypothesis, label, 'en'))
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

def load_xnli():
    """Load every split of XNLI through ``nlp`` as a flat DataFrame.

    Each record is expanded into one row per hypothesis language, pairing the
    hypothesis translation with the premise text stored under the same
    language key. Languages with no premise entry are skipped.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label`` and
        ``lang_abv``. Records with an empty premise or hypothesis, or a label
        outside {0, 1, 2}, are dropped.
    """
    xnli_data = nlp.load_dataset(path='xnli')
    valid_labels = {0, 1, 2}
    rows = []
    for split in xnli_data.keys():
        for rec in xnli_data[split]:
            hypothesis, premise, label = rec['hypothesis'], rec['premise'], rec['label']
            if not (hypothesis and premise and label in valid_labels):
                continue
            for lang, hyp_text in zip(hypothesis['language'], hypothesis['translation']):
                prem_text = premise.get(lang)
                if prem_text is not None:
                    rows.append((prem_text, hyp_text, label, lang))
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])


In [None]:
# Notebook-wide configuration.
# Pretrained checkpoint name, used for both the tokenizer and the encoder.
MODEL = 'jplu/tf-xlm-roberta-large'
# Maximum token length used for tokenization and as the model input length.
MAXLEN = 120
strategy = init_strategy()
# 16 examples per replica reported by the strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Prefetch buffer setting read by build_dataset.
auto = tf.data.experimental.AUTOTUNE

def preprocess(df):
    """Tokenize ``df`` with the notebook's ``tokenizer`` and ``MAXLEN``.

    Returns:
        The ``(x, y)`` pair produced by ``tokenize_dataframe``.
    """
    return tokenize_dataframe(df, tokenizer=tokenizer, max_length=MAXLEN)

In [None]:
# Load the competition train/test tables and the sample submission template.
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')
submission = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')

# Tokenize both tables; the labels returned for the test table are discarded.
x, y = preprocess(train)
x_test, _ = preprocess(test)
# "test" mode builds a features-only dataset, so no labels are passed.
test_dataset = build_dataset(x_test, None, "test", BATCH_SIZE)

# load external datasets for interpretation purpose
mnli = load_mnli()
xnli = load_xnli()

#### Let's build a toy search engine: it looks for an exact match of the query in the knowledge base.

In [None]:
import re
import string
# Regex character class matching every ASCII punctuation character except the
# apostrophe. The characters are escaped so that `\`, `]`, `^` and `-` are
# taken literally inside the class (unescaped, `\]` would be read as an
# escaped bracket and the backslash itself would never be matched).
punct = '[' + re.escape(''.join(c for c in string.punctuation if c != "'")) + ']'

def preprocess_query(q):
    """Normalize text for exact-match lookup.

    Lower-cases ``q``, replaces punctuation (apostrophes excepted) with
    spaces, collapses runs of spaces into one and strips surrounding
    whitespace, so that texts differing only in punctuation or spacing
    normalize to the same string.

    Args:
        q: Text to normalize.

    Returns:
        The normalized string.
    """
    q = q.lower()
    q = re.sub(punct, ' ', q)
    q = re.sub('[ ]{2,}', ' ', q)
    # Leading/trailing punctuation leaves a stray space that would otherwise
    # make "Hello." and "Hello" normalize differently.
    return q.strip()

def search_in_base(q, kb):
    """Report whether the normalized query is contained in ``kb``.

    Args:
        q: Raw query text; normalized with ``preprocess_query`` before lookup.
        kb: Container of already-normalized texts supporting ``in``.

    Returns:
        ``1`` if the normalized query is in ``kb``, otherwise ``0``.
    """
    normalized = preprocess_query(q)
    return 1 if normalized in kb else 0


In [None]:
# Stack the premise text and language code of the train and test tables into one frame.
premises = pd.concat([train[['premise', 'lang_abv']], test[['premise', 'lang_abv']]])

In [None]:
# Knowledge base: the set of normalized MNLI premises.
knowledge_base = set(mnli['premise'].apply(preprocess_query))
# 1 where the normalized premise is found in the knowledge base, else 0.
premises['mnli'] = premises['premise'].apply(lambda q: search_in_base(q, knowledge_base))
# Share (in percent) of English premises found in MNLI.
# NOTE(review): `premises` is built from both train and test rows, although the message says "train set".
print(f"fraction of train set english premises occurence in MNLI = {premises.loc[premises.lang_abv=='en', 'mnli'].mean() * 100}%")

In [None]:
# Knowledge base: the set of normalized XNLI premises (replaces the MNLI one).
knowledge_base = set(xnli['premise'].apply(preprocess_query))
# 1 where the normalized premise is found in the knowledge base, else 0.
premises['xnli'] = premises['premise'].apply(lambda q: search_in_base(q, knowledge_base))
# Share (in percent) of non-English premises found in XNLI.
# NOTE(review): `premises` is built from both train and test rows, although the message says "train set".
print(f"fraction of train set non-english premises occurence in XNLI = {premises.loc[premises.lang_abv!='en', 'xnli'].mean() * 100}%")

#### As we can see, the external datasets completely contain the train and test data, so the high accuracy is obviously the result of overfitting :)

In [None]:
# Rebuild the model under a distribution strategy and load the saved weights.
strategy = init_strategy()
with strategy.scope():
    model = build_model(MODEL, MAXLEN)
    model.load_weights("../input/watson-xlmr-models/XLMR_mnlixnli_ep6.h5")
    
# Predict on the tokenized competition train table and report accuracy against its labels.
dataset = build_dataset(x, y, "valid", BATCH_SIZE)
pr = np.argmax(model.predict(dataset), axis=1)
print(f"accuracy {accuracy_score(y, pr):.4f}")

# Predict the test table and save the highest-probability class per row as the submission.
test_preds = model.predict(test_dataset, verbose=0)
submission['prediction'] = test_preds.argmax(axis=1)
submission.to_csv('submission.csv', index=False)