In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

## Loading and exploring the data 

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
df_train, df_test = map(
    pd.read_csv,
    (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    ),
)

In [None]:
# Peek at the first five training rows.
df_train.head(n=5)

In [None]:
# Peek at the last five test rows.
df_test.tail(n=5)

In [None]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (df_train, df_test))

In [None]:
# Number of missing values per column in the training data.
df_train.isna().sum()

In [None]:
# Number of missing values per column in the test data.
df_test.isna().sum()

In [None]:
# Count training rows that exactly repeat an earlier row.
df_train.duplicated(keep='first').sum()

In [None]:
# Plot the distribution of the target column to see whether the classes are balanced.
df_train['label'].hist()

In [None]:
# How many languages are in the dataset?
print('train lang_abv: ', len(df_train.lang_abv.unique()), ', languages: ', len(df_train.language.unique()))
print('test lang_abv: ', len(df_test.lang_abv.unique()), ', languages: ', len(df_test.language.unique()))

# Size of the overlap between train and test language codes.
# `&` is set intersection; the boolean `and` would simply return the second
# operand (the test set), so the overlap would always equal the test count.
train_langs = set(df_train.lang_abv.unique())
test_langs = set(df_test.lang_abv.unique())
print('train lang & test lang: ', len(train_langs & test_langs))

In [None]:
# Print one example per language: for every language code in the training data,
# take the first row carrying that code and show its language name and premise.
for lang_code in df_train['lang_abv'].unique():
    sample = df_train.loc[df_train['lang_abv'] == lang_code].iloc[0]
    print(sample['language'], sample['premise'])

## TPU Setup

In [None]:
import tensorflow as tf

In [None]:
# Try to attach to a TPU; fall back to TensorFlow's default strategy when the
# resolver cannot find one (signalled by ValueError).
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Instantiate the strategy with the resolver; assigning the bare class would
    # leave `strategy` without a usable scope() or replica count.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Pick a distribution strategy for the best available hardware:
# TPU > multiple GPUs > single GPU > CPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError:
    tpu = None

# Only query GPUs when no TPU was found, and always bind `gpus` so the
# branches below never depend on which path the try/except took.
gpus = [] if tpu else tf.config.experimental.list_logical_devices("GPU")

if tpu:
    # Connect to the TPU cluster before initialising the TPU system.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

## XLM-RoBERTa model

In [None]:
from transformers import TFAutoModel, AutoTokenizer

In [None]:
def input_convert(data):
    """Collect per-example encodings into two rectangular tensors.

    Args:
        data: iterable of mappings, each providing 'input_ids' and
            'attention_mask' sequences (one mapping per example).

    Returns:
        dict with keys 'input_word_ids' and 'input_mask'. Each value is built
        by stacking the per-example sequences as a ragged tensor and converting
        it to a dense tensor with ``to_tensor()``.
    """
    token_ids, attention = [], []
    for encoded in data:
        token_ids.append(encoded['input_ids'])
        attention.append(encoded['attention_mask'])

    # Rows may differ in length, hence ragged first and densify afterwards.
    return {
        'input_word_ids': tf.ragged.constant(token_ids).to_tensor(),
        'input_mask': tf.ragged.constant(attention).to_tensor(),
    }

In [None]:
# Separate the target from the features without mutating df_train: popping the
# column in place makes this cell raise KeyError when it is run a second time.
y = df_train['label']

# Stack train and test features (train rows first) so both are tokenized together.
df = pd.concat([df_train.drop(columns='label'), df_test], ignore_index = True)
df_train.shape, df_test.shape, df.shape

In [None]:
model_name = 'joeddav/xlm-roberta-large-xnli'

# tokenizing
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode every premise/hypothesis pair as one sequence pair. Iterating the two
# columns directly avoids a label-based lookup per row and does not depend on
# the frame's index values.
# NOTE(review): each call sees a single pair, so padding=True presumably has
# nothing to pad against; rows are brought to equal length in input_convert.
mask = [
    tokenizer(premise, hypothesis, padding = True, add_special_tokens = True)
    for premise, hypothesis in zip(df['premise'], df['hypothesis'])
]

inputs = input_convert(mask)

In [None]:
# split df into train and test
# Train rows were concatenated first, so the leading len(y) rows of each tensor
# are the labelled examples and the remainder belongs to the test set.
train_rows = len(y)
inputs_train = {name: tensor[:train_rows, :] for name, tensor in inputs.items()}
inputs_test = {name: tensor[train_rows:, :] for name, tensor in inputs.items()}

In [None]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# build model
# The encoder, the classification head and the optimizer are all created inside
# the strategy scope chosen earlier.
with strategy.scope():
    # All rows were padded to a common width, so axis 1 is the sequence length.
    max_len = inputs['input_word_ids'].shape[1]

    encoder = TFAutoModel.from_pretrained(model_name)

    # Input names match the keys of the dicts produced by input_convert, which
    # lets the dicts be passed straight to fit() / predict().
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # NOTE(review): element 0 of the encoder output is presumably the per-token
    # hidden states; the head below uses only position 0 of each sequence.
    embedding = encoder([input_word_ids, input_mask])[0]
    dense1 = Dense(256, activation='relu')(Dropout(0.1)(embedding[:,0,:]))
    dense2 = Dense(64, activation='relu')(dense1)
    output = Dense(3, activation='softmax')(dense2)

    model = Model(inputs=[input_word_ids, input_mask], outputs = output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras releases reject.
    model.compile(Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'], steps_per_execution = 100)

# fit the model
# Stop once validation loss (the callback's default monitor) has not improved
# for 3 epochs, and roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)
model.fit(inputs_train, y.values, epochs = 10, verbose = 1, validation_split = 0.1,
                    batch_size = 16 * strategy.num_replicas_in_sync, callbacks = [early_stop])

In [None]:
# make predictions
# For every test row, take the index of the highest score along the class axis.
class_scores = model.predict(inputs_test)
predictions = list(np.argmax(class_scores, axis=1))

In [None]:
# submit the result
# Pair each test id with its predicted class and write the file without the index.
submission = df_test[['id']].assign(prediction = predictions)
submission.to_csv("submission.csv", index = False)