In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
# Choose the tf.distribute strategy used for model creation and training.
# When a TPU cluster can be resolved, connect to it and build a TPUStrategy;
# TPUClusterResolver raises ValueError when none is available, in which case
# the default strategy is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on every path (it used to be printed only inside
# the fallback branch, so a successful TPU connection printed nothing).
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Read the competition's training and test CSVs into DataFrames (train first,
# then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    )
)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
train.describe(include='all')


In [None]:
# Peek at the first five training rows.
train.head(5)

In [None]:
# Remove the `id` column from the training frame only; the test frame keeps
# its ids because the submission file is built from them later.
# Note: this rebinds `train`, so running the cell a second time raises
# KeyError (the column is already gone).
train = train.drop(columns='id')
train.head()

In [None]:
# Drop the `language` column from both frames (presumably redundant with the
# `lang_abv` column that the plots below rely on — verify against the data).
train = train.drop(columns='language')
test = test.drop(columns='language')
train.head()

In [None]:
# Bar chart of how many training rows there are per language abbreviation.
sns.countplot(x='lang_abv', data=train)

In [None]:
# Same distribution as a pie chart: share of training rows per language
# abbreviation, labelled with one-decimal percentages.
plt.figure(figsize=(9, 9))
rows_per_language = train.groupby('lang_abv').size()
rows_per_language.plot.pie(autopct='%1.1f%%')


In [None]:
# Hugging Face checkpoint identifier used to load the tokenizer below.
model_name ='joeddav/xlm-roberta-large-xnli'
# Tokenizer matching that checkpoint; used by the encoding helpers and by the
# cells that build the `masked` columns.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def encode_premise_sentence(s):
    """Tokenize a premise and return its token ids, led by the start token.

    Args:
        s: Premise text.

    Returns:
        The ids produced by ``tokenizer.convert_tokens_to_ids`` for the
        tokenizer's start-of-sequence token followed by the premise's tokens.
    """
    # Use the tokenizer's own start marker instead of BERT's literal '[CLS]':
    # the checkpoint loaded for `tokenizer` is an XLM-RoBERTa model, whose
    # vocabulary does not contain '[CLS]', so the literal would not be encoded
    # as the intended special token.
    tokens = [tokenizer.cls_token]
    tokens.extend(tokenizer.tokenize(s))
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def encode_hypothesis_sentence(s):
    """Tokenize a hypothesis and return its ids wrapped in separator tokens.

    Args:
        s: Hypothesis text.

    Returns:
        The ids produced by ``tokenizer.convert_tokens_to_ids`` for
        separator + hypothesis tokens + separator.
    """
    # Use the tokenizer's own separator instead of BERT's literal '[SEP]',
    # which is not a special token of the XLM-RoBERTa tokenizer in use.
    # NOTE(review): the tokenizer's native pair layout, as produced by
    # `tokenizer(premise, hypothesis)`, may place a different number of
    # separators between the two segments — prefer that call for model inputs.
    separator = tokenizer.sep_token
    tokens = [separator, *tokenizer.tokenize(s), separator]
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Hand-built encoding of each (premise, hypothesis) pair: the premise ids and
# the hypothesis ids are concatenated into one list per row and stored in a
# new `tokenized` column.
tokenized = [
    encode_premise_sentence(premise_text) + encode_hypothesis_sentence(hypothesis_text)
    for premise_text, hypothesis_text in zip(train['premise'], train['hypothesis'])
]
train['tokenized'] = tokenized
train.head()

In [None]:
# Encode every training pair with the tokenizer itself and keep the returned
# encoding objects in a `masked` column; `input_convert` later reads their
# 'input_ids' and 'attention_mask' entries.
# NOTE(review): each call encodes a single pair, so `padding=True` presumably
# has nothing to pad against — confirm against the tokenizer documentation.
mask = [
    tokenizer(premise_text, hypothesis_text, padding=True, add_special_tokens=True)
    for premise_text, hypothesis_text in zip(train['premise'], train['hypothesis'])
]
train['masked'] = mask
train.head()

In [None]:
# Show the tokenizer output stored for the first training row.
train['masked'][0]

In [None]:
# Fixed sequence length of the model inputs.
# NOTE(review): presumably chosen from the longest tokenized pair — confirm.
max_len = 237


def build_model(pretrained_name=None, sequence_length=None, learning_rate=1e-5):
    """Build and compile the 3-class classifier on top of a pretrained encoder.

    Args:
        pretrained_name: Checkpoint passed to ``TFAutoModel.from_pretrained``.
            Defaults to the notebook-level ``model_name`` so the encoder always
            matches the checkpoint the tokenizer was loaded from.
        sequence_length: Width of both integer inputs. Defaults to ``max_len``.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_word_ids, input_mask]``
        and producing a softmax over 3 classes.
    """
    if pretrained_name is None:
        pretrained_name = model_name
    if sequence_length is None:
        sequence_length = max_len

    bert_encoder = TFAutoModel.from_pretrained(pretrained_name)
    input_word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask")

    # First element of the encoder output; only token ids and the attention
    # mask are fed (no token type ids).
    embedding = bert_encoder([input_word_ids, input_mask])[0]
    # Classify from the vector at the first sequence position.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
def input_convert(data):
    """Stack per-example tokenizer outputs into dense model inputs.

    Args:
        data: Iterable of mapping-like encodings, each exposing 'input_ids'
            and 'attention_mask'.

    Returns:
        dict with keys 'input_word_ids' and 'input_mask'; each value is the
        corresponding per-example lists combined through
        ``tf.ragged.constant(...).to_tensor()``.
    """
    # Materialise once so the examples can be traversed for each output key.
    encodings = list(data)
    # Model input name -> field of the tokenizer output that feeds it.
    source_field = {
        'input_word_ids': 'input_ids',
        'input_mask': 'attention_mask',
    }
    return {
        input_name: tf.ragged.constant([enc[field] for enc in encodings]).to_tensor()
        for input_name, field in source_field.items()
    }

In [None]:
# Turn the training encodings into dense tensors that are exactly `max_len`
# wide, matching the Input(shape=(max_len,)) layers declared in build_model.
train_input = input_convert(train['masked'].values)
for key in train_input.keys():
    clipped = train_input[key][:, :max_len]  # drop anything beyond max_len
    # Slicing can only shorten: when the longest row is shorter than max_len
    # the tensor would stay narrower than the model input, so right-pad the
    # remainder with zeros (the same fill value to_tensor() uses).
    shortfall = max_len - int(clipped.shape[1])
    train_input[key] = tf.pad(clipped, [[0, 0], [0, shortfall]])

In [None]:
# Stop training after 3 epochs without improvement (on the callback's default
# monitored quantity) and roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# Training configuration: 5 epochs, batches of 128, last 10% of the data held
# out for validation.
fit_options = dict(
    epochs=5,
    verbose=1,
    batch_size=128,
    validation_split=0.1,
    callbacks=[early_stop],
)

# Build, summarise and train the model under the selected distribution strategy.
with strategy.scope():
    model = build_model()
    model.summary()
    model.fit(train_input, train['label'].values, **fit_options)

In [None]:
# Encode every test pair with the tokenizer, mirroring the training cell, and
# keep the returned encoding objects in a `masked` column for `input_convert`.
mask = [
    tokenizer(premise_text, hypothesis_text, padding=True, add_special_tokens=True)
    for premise_text, hypothesis_text in zip(test['premise'], test['hypothesis'])
]
test['masked'] = mask
test.head()

In [None]:
# Turn the test encodings into dense tensors that are exactly `max_len` wide,
# matching the Input(shape=(max_len,)) layers declared in build_model.
test_input = input_convert(test['masked'].values)
for key in test_input.keys():
    clipped = test_input[key][:, :max_len]  # drop anything beyond max_len
    # Slicing can only shorten: when the longest test row is shorter than
    # max_len the tensor would stay narrower than the model input, so
    # right-pad the remainder with zeros (the same fill value to_tensor() uses).
    shortfall = max_len - int(clipped.shape[1])
    test_input[key] = tf.pad(clipped, [[0, 0], [0, shortfall]])

In [None]:
# The model emits one row of 3 softmax scores per test example; the predicted
# label is the index of the largest score in each row.
class_scores = model.predict(test_input)
predictions = list(np.argmax(class_scores, axis=1))

In [None]:
# Peek at the first rows of the test frame (now including the `masked` column).
test.head()

In [None]:
# Assemble the submission: the test ids alongside the predicted labels,
# written to submission.csv without the index column.
submission = test[['id']].assign(prediction=predictions)
submission.to_csv("submission.csv", index=False)

In [None]:
# Sanity-check the first rows of the submission frame.
submission.head()