In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


## **Load Data**

In [None]:
# Pick a distribution strategy: use the TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # Raised by the resolver when no TPU is available.
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on BOTH paths (it used to be printed only on the
# CPU/GPU fallback because the print lived inside the except block).
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Read the competition's train and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    )
)

In [None]:
# Quick structural overview of the training frame before any preprocessing.
train.info()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
train.describe(include='all')

# Data Preprocessing

In [None]:
# Remove the row identifier from the training frame (the test frame keeps its
# 'id', which is needed for the submission file).
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'id' is already gone.
train = train.drop(columns='id', errors='ignore')
train.head()

In [None]:
# Drop the 'language' column from both splits; 'lang_abv' is kept and is the
# column used by the plots below.
# errors='ignore' makes the cell safe to execute more than once.
train = train.drop(columns='language', errors='ignore')
test = test.drop(columns='language', errors='ignore')
train.head(5)

In [None]:
# Bar chart of how many training rows each language abbreviation has.
sns.countplot(data=train, x='lang_abv')

In [None]:
# Share of training rows per language abbreviation, drawn as a pie chart.
plt.figure(figsize=(9, 9))
rows_per_language = train.groupby('lang_abv').size()
rows_per_language.plot.pie(autopct='%1.1f%%')

In [None]:
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer
# Hugging Face checkpoint identifier; the tokenizer is loaded from this id so
# its vocabulary matches that checkpoint.
model_name ='joeddav/xlm-roberta-large-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize

In [None]:
def encode_premise_sentence(s, tok=None):
    """Convert a premise string to token ids, prefixed with the start marker.

    Args:
        s: Premise text.
        tok: Optional tokenizer exposing ``cls_token``, ``tokenize`` and
            ``convert_tokens_to_ids``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        List of token ids: the tokenizer's own classification/start token
        followed by the ids of the tokenised premise.
    """
    tok = tokenizer if tok is None else tok
    # Use the tokenizer's own start token rather than the BERT literal
    # '[CLS]': the checkpoint loaded in this notebook is XLM-RoBERTa, whose
    # special tokens differ, so the hard-coded BERT marker would not resolve
    # to the intended id.
    tokens = [tok.cls_token, *tok.tokenize(s)]
    return tok.convert_tokens_to_ids(tokens)

In [None]:
# Smoke check: encode an arbitrary string to inspect the produced ids.
encode_premise_sentence("jsalkgfad")

In [None]:
def encode_hypo_sentence(s, tok=None):
    """Convert a hypothesis string to token ids wrapped in separator markers.

    Args:
        s: Hypothesis text.
        tok: Optional tokenizer exposing ``sep_token``, ``tokenize`` and
            ``convert_tokens_to_ids``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        List of token ids: separator, hypothesis token ids, separator.
    """
    tok = tokenizer if tok is None else tok
    # Use the tokenizer's own separator instead of the literal '[sep]', which
    # is lower-cased and BERT-specific, so it does not match the special
    # tokens of the XLM-RoBERTa checkpoint used here.
    # NOTE(review): the exact pair layout (e.g. a doubled separator between
    # segments) is model-specific; tokenizer(premise, hypothesis) is the
    # authoritative way to build a pair and is what the model inputs use.
    tokens = [tok.sep_token, *tok.tokenize(s), tok.sep_token]
    return tok.convert_tokens_to_ids(tokens)

In [None]:
# Smoke check for the hypothesis encoder defined in the previous cell (this
# cell used to call the premise encoder again, so the new function was never
# exercised).
encode_hypo_sentence("jsalkgfad")

In [None]:
# One id sequence per row: premise ids followed by hypothesis ids.
# Iterating the two columns with zip avoids label-based train[col][i] lookups,
# which only work while the frame still has its default 0..n-1 index.
tokenized = [
    encode_premise_sentence(premise) + encode_hypo_sentence(hypothesis)
    for premise, hypothesis in zip(train['premise'], train['hypothesis'])
]
train['tokenized'] = tokenized
train.head()

# Attention Mask and Token Type ID:

In [None]:
# Encode every premise/hypothesis pair with the tokenizer itself so it adds
# its own special tokens and returns the matching attention mask.
# Each pair is encoded on its own, so a padding flag would have nothing to pad
# against; rows are brought to a common width later, when the lists are turned
# into tensors. zip replaces per-row train[col][i] lookups, which depend on
# the default 0..n-1 index.
mask = [
    tokenizer(premise, hypothesis, add_special_tokens=True)
    for premise, hypothesis in zip(train['premise'], train['hypothesis'])
]
train['masked'] = mask
train.head(5)


# Train Model


In [None]:
# Fixed sequence length fed to the model.
# NOTE(review): presumably the longest tokenised training pair - confirm.
max_len=237
def build_model(pretrained_name='joeddav/xlm-roberta-large-xnli', seq_len=None, learning_rate=1e-5):
    """Build and compile a 3-class classifier on top of a pretrained encoder.

    Args:
        pretrained_name: Hugging Face checkpoint id for the encoder.
        seq_len: Input sequence length; defaults to the notebook-level
            ``max_len`` when omitted.
        learning_rate: Step size for the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``input_word_ids`` and
        ``input_mask`` and producing softmax probabilities over 3 labels.
    """
    if seq_len is None:
        seq_len = max_len

    bert_encoder = TFAutoModel.from_pretrained(pretrained_name)
    input_word_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name="input_mask")

    # First element of the encoder output is the per-token hidden states.
    embedding = bert_encoder([input_word_ids, input_mask])[0]
    # Classify from the hidden state of the first token of each sequence.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
def input_convert(data):
    """Turn a sequence of tokenizer outputs into dense model-input tensors.

    Args:
        data: Iterable of mappings, each providing 'input_ids' and
            'attention_mask' lists.

    Returns:
        Dict with 'input_word_ids' and 'input_mask' tensors. Rows of unequal
        length are converted via a ragged tensor, so shorter rows are filled
        with ``to_tensor``'s default value up to the longest row.
    """
    encodings = list(data)
    columns = {
        'input_word_ids': [enc['input_ids'] for enc in encodings],
        'input_mask': [enc['attention_mask'] for enc in encodings],
    }
    return {
        name: tf.ragged.constant(rows).to_tensor()
        for name, rows in columns.items()
    }

In [None]:
# Bring every training tensor to exactly max_len columns, the width the model
# inputs are declared with.
train_input = input_convert(train['masked'].values)
for key in train_input.keys():
    clipped = train_input[key][:, :max_len]  # truncate rows longer than max_len
    # Slicing alone cannot widen a tensor: if the longest row is shorter than
    # max_len, right-pad with zeros (zero in the mask marks padding) so the
    # shape still matches the model.
    shortfall = max_len - clipped.shape[1]
    train_input[key] = tf.pad(clipped, [[0, 0], [0, shortfall]])

In [None]:
# Halt training after 3 epochs without improvement on the callback's default
# monitored quantity, and roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=3)
fit_options = dict(
    epochs=5,
    verbose=1,
    batch_size=128,
    validation_split=0.1,  # hold out 10% of the training rows for validation
    callbacks=[early_stop],
)
with strategy.scope():
    model = build_model()
    model.summary()
    model.fit(train_input, train['label'].values, **fit_options)

# Prediction

In [None]:
# Encode the test pairs the same way as the training pairs: the tokenizer
# adds its own special tokens and returns the attention mask.
# Each pair is encoded individually, so no padding flag is passed; rows are
# brought to a common width when they are converted to tensors. zip replaces
# per-row test[col][i] lookups, which depend on the default 0..n-1 index.
mask = [
    tokenizer(premise, hypothesis, add_special_tokens=True)
    for premise, hypothesis in zip(test['premise'], test['hypothesis'])
]
test['masked'] = mask
test.head()

In [None]:
# Bring every test tensor to exactly max_len columns, the width the model was
# built with.
test_input = input_convert(test['masked'].values)
for key in test_input.keys():
    clipped = test_input[key][:, :max_len]  # truncate rows longer than max_len
    # The longest test row may be shorter than max_len, in which case slicing
    # leaves the tensor too narrow for the model; right-pad with zeros (zero
    # in the mask marks padding) to restore the expected width.
    shortfall = max_len - clipped.shape[1]
    test_input[key] = tf.pad(clipped, [[0, 0], [0, shortfall]])

In [None]:
# Predicted label per test row = index of the highest class probability.
class_probabilities = model.predict(test_input)
predictions = list(np.argmax(class_probabilities, axis=1))

In [None]:
# Peek at the test frame; its 'id' column is what the submission is keyed on.
test.head()

In [None]:
# Pair each test id with its predicted label and write the submission file.
submission = pd.DataFrame({'id': test['id'], 'prediction': predictions})
submission.to_csv("submission.csv", index=False)

In [None]:
# Sanity-check the first rows of the submission frame.
submission.head()