In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

In [None]:
!pip install nlp
from nlp import load_dataset

In [None]:
def load_mnli(use_validation=True):
    """Load the 'multi_nli' dataset as a DataFrame of labelled sentence pairs.

    Args:
        use_validation: When true, rows from the 'validation_matched' and
            'validation_mismatched' splits are appended after the 'train'
            rows; otherwise only 'train' is used.

    Returns:
        A DataFrame with columns 'premise', 'hypothesis', 'label' and
        'lang_abv' (always 'en'). Rows with an empty premise or hypothesis,
        or a label outside {0, 1, 2}, are dropped.
    """
    dataset = load_dataset('multi_nli')
    print(dataset['train'])

    splits = ['train']
    if use_validation:
        splits += ['validation_matched', 'validation_mismatched']

    accepted_labels = {0, 1, 2}

    def _valid_rows():
        # Yield one (premise, hypothesis, label, 'en') tuple per usable record.
        for split in splits:
            for record in dataset[split]:
                premise, hypothesis, label = record['premise'], record['hypothesis'], record['label']
                if not (premise and hypothesis and label in accepted_labels):
                    continue
                yield premise, hypothesis, label, 'en'

    return pd.DataFrame(
        list(_valid_rows()),
        columns=['premise', 'hypothesis', 'label', 'lang_abv'],
    )

In [None]:
# Load MNLI including its two validation splits (the function's default).
mnli = load_mnli(use_validation=True)

In [None]:
# Competition training rows, restricted to the columns used downstream.
X = train.loc[:, ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label']]

In [None]:
# Add a constant placeholder id and a constant language name to the MNLI rows,
# then order the columns as: id, premise, hypothesis, lang_abv, language, label.
mnli = mnli[['premise', 'hypothesis', 'lang_abv', 'label']].assign(
    id='xxx',
    language='English',
)[['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label']]

In [None]:
# Combine the competition rows with the MNLI rows into one training frame.
# The competition part is rebuilt from `train` instead of reusing `X`, so
# re-running this cell cannot append the MNLI rows a second time.
# ignore_index gives the combined frame a clean 0..n-1 index instead of two
# overlapping ranges.
X = pd.concat(
    [train[['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label']], mnli],
    axis=0,
    ignore_index=True,
)

In [None]:
# Tokenizer matching the pretrained checkpoint that build_model() loads.
MODEL_NAME = "joeddav/xlm-roberta-large-xnli"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def encode_sentence(s):
    """Tokenize one sentence and return its token ids followed by a separator id.

    Uses the module-level ``tokenizer``.

    Args:
        s: The raw sentence text.

    Returns:
        A list of integer token ids whose last element is the id of
        ``tokenizer.sep_token``.
    """
    tokens = list(tokenizer.tokenize(s))
    # Use the tokenizer's own separator instead of the BERT literal '[SEP]':
    # that literal is not in the XLM-RoBERTa vocabulary, so it would be
    # encoded as the unknown token rather than a real separator.
    tokens.append(tokenizer.sep_token)
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
    """Encode sentence pairs into padded token-id and attention-mask tensors.

    Each pair is laid out as ``<cls> first <sep> second <sep>``. The special
    tokens come from ``tokenizer.cls_token`` / ``tokenizer.sep_token`` rather
    than the BERT literals '[CLS]' / '[SEP]', which a non-BERT vocabulary
    (such as XLM-RoBERTa's) would encode as the unknown token.

    Args:
        hypotheses: Sequence of strings placed first in each pair.
            NOTE(review): the calls in this notebook pass the premise column
            here, so the parameter names are swapped relative to their use.
        premises: Sequence of strings placed second in each pair; must have
            the same length as ``hypotheses``.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``. It is used for all encoding.

    Returns:
        A dict with two dense integer tensors of identical shape
        (number of pairs, length of the longest encoded pair):
        'input_word_ids' holds the token ids, padded with 0, and
        'input_mask' holds 1 for real tokens and 0 for padding.
    """
    num_examples = len(hypotheses)

    def _encode(sentence):
        # Token ids of one sentence followed by the separator id.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append(tokenizer.sep_token)
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    # One single-element row holding the classification-token id per example.
    cls = [tokenizer.convert_tokens_to_ids([tokenizer.cls_token])] * num_examples

    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Ones over the ragged rows become 1 for real tokens and 0 for the padding
    # that to_tensor() adds to reach the longest row.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
    }

In [None]:
# Encode the competition training pairs (premise first, hypothesis second).
train_input = bert_encode(train['premise'].values, train['hypothesis'].values, tokenizer)

In [None]:
# Encode the combined competition + MNLI pairs (premise first, hypothesis second).
Xinput = bert_encode(X['premise'].values, X['hypothesis'].values, tokenizer)

In [None]:
# Encode the competition test pairs (premise first, hypothesis second).
test_input = bert_encode(test['premise'].values, test['hypothesis'].values, tokenizer)

In [None]:
# Sequence length the model inputs are declared with; the encoded tensors are
# sliced to at most this many tokens before training and prediction.
# (The original comment notes the initial tutorial used 50.)
max_len = 236

def build_model():
    """Build and compile the 3-class sentence-pair classifier.

    The pretrained 'joeddav/xlm-roberta-large-xnli' encoder is fed the
    'input_word_ids' and 'input_mask' inputs (each of length ``max_len``).
    The encoder output at position 0 is passed to a 3-unit softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` using Adam (learning rate 1e-5),
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    encoder = TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # First encoder output; indexed below as (batch, position, feature).
    embedding = encoder([input_word_ids, input_mask])[0]

    # Classify from the vector at the first sequence position.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Select the distribution strategy: TPU if one is reachable, else all GPUs when
# there is more than one, else TensorFlow's default strategy.
#
# Detection and initialisation happen exactly once. The earlier version built a
# TPU strategy, then detected the TPU again and re-initialised it, discarding
# the first strategy and skipping the cluster connection on the second pass.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    # Raised when no TPU is available.
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    gpus = tf.config.experimental.list_logical_devices("GPU")
    if len(gpus) > 1:
        strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    else:
        # Single GPU or CPU only: the default strategy is sufficient.
        strategy = tf.distribute.get_strategy()
    

In [None]:
# Build the model inside the strategy scope so that it is created under the
# selected distribution strategy, then print its layer summary.
with strategy.scope():
    model = build_model()
    model.summary()

# Save a diagram of the model graph, including tensor shapes, to model.png.
tf.keras.utils.plot_model(model, to_file="model.png", show_shapes=True)

In [None]:
# Mark the layer at index 2 as trainable.
# NOTE(review): presumably this is the pretrained encoder (after the two Input
# layers) - confirm against model.summary().
encoder_layer = model.layers[2]
encoder_layer.trainable = True

In [None]:
# Keep at most the first max_len positions of every encoded training tensor.
train_input = {name: tensor[:, :max_len] for name, tensor in train_input.items()}

In [None]:
# Keep at most the first max_len positions of every combined-training tensor.
Xinput = {name: tensor[:, :max_len] for name, tensor in Xinput.items()}

In [None]:
# Stop when the monitored validation metric has not improved for 3 epochs and
# restore the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# 16 examples per replica, scaled by the number of replicas in the strategy.
global_batch_size = 16 * strategy.num_replicas_in_sync

# validation_split holds out a 1% fraction of the provided samples for validation.
model.fit(
    Xinput,
    X.label.values,
    batch_size=global_batch_size,
    epochs=5,
    verbose=1,
    validation_split=0.01,
    callbacks=[early_stop],
)

In [None]:
# Re-read the test table and encode its pairs (premise first, hypothesis second).
test = pd.read_csv(filepath_or_buffer="../input/contradictory-my-dear-watson/test.csv")
test_input = bert_encode(test['premise'].values, test['hypothesis'].values, tokenizer)

In [None]:
# Keep at most the first max_len positions of every encoded test tensor.
test_input = {name: tensor[:, :max_len] for name, tensor in test_input.items()}

In [None]:
# For each test row, take the index of the largest of the three model outputs.
pred = list(np.argmax(model.predict(test_input), axis=-1))

In [None]:
# Two-column frame: the test ids and the predicted class for each of them.
submission = test[['id']].assign(prediction=pred)

In [None]:
# Display the submission frame (id, prediction) for a visual check before it is written out.
submission

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)