In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
df_test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Summary statistics for every training column; include='all' also covers non-numeric columns.
df_train.describe(include='all')

In [None]:
# Remove the 'id' column from the training frame.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution would otherwise raise KeyError.
df_train = df_train.drop(columns='id', errors='ignore')
df_train.head()

In [None]:
# Summary statistics for every test column; include='all' also covers non-numeric columns.
df_test.describe(include='all')

In [None]:
# Remove the 'language' column from both frames ('lang_abv' is the column the
# later plots group by).
# errors='ignore' keeps this cell re-runnable after the column has been dropped.
df_train = df_train.drop(columns='language', errors='ignore')
df_test = df_test.drop(columns='language', errors='ignore')

In [None]:
# Preview the training frame after the column drops above.
df_train.head()

In [None]:
# Preview the test frame after the column drop above.
df_test.head()

In [None]:
# Bar chart of the number of training rows per 'lang_abv' value.
sns.countplot(x='lang_abv', data=df_train)

In [None]:
import matplotlib.pyplot as plt
# Pie chart of the share of training rows per 'lang_abv' value,
# with each slice labelled as a percentage to one decimal place.
plt.figure(figsize=(9,9))
df_train.groupby('lang_abv').size().plot(kind='pie', autopct='%1.1f%%')

In [None]:
from transformers import BertTokenizer, TFBertModel, TFAutoModel,AutoTokenizer
# Checkpoint identifier used to load the tokenizer; build_model() further down
# loads its encoder from the same identifier string.
model_name ='joeddav/xlm-roberta-large-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def encode_premise_sentence(s):
    """Tokenize a premise and return its token ids, prefixed with the start token.

    Args:
        s: The premise text.

    Returns:
        A list of vocabulary ids: the tokenizer's classification/start token
        followed by the ids of the tokens of ``s``.
    """
    # The literal '[CLS]' is a BERT convention; the tokenizer loaded in this
    # notebook is an XLM-RoBERTa one, for which that string is not a special
    # token. Asking the tokenizer for its own start token keeps the ids valid
    # whichever checkpoint is loaded.
    tokens = [tokenizer.cls_token]
    tokens.extend(tokenizer.tokenize(s))
    return tokenizer.convert_tokens_to_ids(tokens)
encode_premise_sentence("tusharmittal")

In [None]:
def encode_hypo_sentence(s):
    """Tokenize a hypothesis and return its token ids, wrapped in separator tokens.

    Args:
        s: The hypothesis text.

    Returns:
        A list of vocabulary ids: separator, the ids of the tokens of ``s``,
        separator.
    """
    # The lower-case literal '[sep]' is not a special token; use the separator
    # the loaded tokenizer actually defines.
    # NOTE(review): the tokenizer's built-in pair encoding (used for the model
    # inputs in a later cell) may place separators differently; within this
    # notebook this helper only feeds the 'tokenized' column.
    separator = tokenizer.sep_token
    tokens = [separator]
    tokens.extend(tokenizer.tokenize(s))
    tokens.append(separator)
    return tokenizer.convert_tokens_to_ids(tokens)
# Demo call on the hypothesis helper defined in this cell.
encode_hypo_sentence("tusharmittal")

In [None]:
# For each training row, concatenate the premise ids and the hypothesis ids
# into one list and store the result in a new 'tokenized' column.
tokenized = [
    encode_premise_sentence(premise) + encode_hypo_sentence(hypothesis)
    for premise, hypothesis in zip(df_train['premise'], df_train['hypothesis'])
]
df_train['tokenized'] = tokenized
df_train.head()

In [None]:
# Run the tokenizer on every (premise, hypothesis) pair of the training frame
# and collect the resulting encodings, one per row.
mask = [
    tokenizer(premise, hypothesis, padding=True, add_special_tokens = True)
    for premise, hypothesis in zip(df_train['premise'], df_train['hypothesis'])
]

In [None]:
# Attach the per-row tokenizer encodings as a 'masked' column.
# assign() returns a new frame, so df_train itself is left untouched (a plain
# `df_train_panda = df_train` only creates a second name for the same object,
# and the column would silently be added to df_train as well).
df_train_panda = df_train.assign(masked=mask)
df_train_panda.head(5)

In [None]:
# Fixed number of token positions the model's input layers are declared with.
max_len = 236


def build_model(learning_rate=1e-5):
    """Build and compile the classifier on top of the pretrained encoder.

    The network takes two integer inputs of width ``max_len``
    (``input_word_ids`` and ``input_mask``), runs them through the pretrained
    encoder, and feeds the encoder output at sequence position 0 into a
    3-unit softmax layer.

    Args:
        learning_rate: Step size for the Adam optimizer. Defaults to 1e-5.

    Returns:
        A compiled ``tf.keras.Model`` using sparse categorical cross-entropy
        loss and an accuracy metric.
    """
    encoder = TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')

    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    # First element of the encoder output; it is indexed below as
    # (batch, position, feature).
    embedding = encoder([input_word_ids, input_mask])[0]
    # Classify from the vector at the first sequence position only.
    first_position = embedding[:, 0, :]
    output = tf.keras.layers.Dense(3, activation='softmax')(first_position)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # in tf.keras optimizers.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
def input_convert(data):
    """Stack per-example encodings into dense tensors keyed by model input name.

    Args:
        data: Iterable of mapping-like encodings, each providing 'input_ids'
            and 'attention_mask' sequences (lengths may differ between rows).

    Returns:
        A dict with keys 'input_word_ids' and 'input_mask'. Each value is a
        ragged constant built from the corresponding sequences and converted
        to a dense tensor with ``to_tensor()``.
    """
    ids_per_row = [encoding['input_ids'] for encoding in data]
    mask_per_row = [encoding['attention_mask'] for encoding in data]

    return {
        'input_word_ids': tf.ragged.constant(ids_per_row).to_tensor(),
        'input_mask': tf.ragged.constant(mask_per_row).to_tensor(),
    }

In [None]:
# Turn the training encodings into dense tensors and force each one to exactly
# `max_len` columns, the width the model's input layers are declared with.
df_train_panda_input = input_convert(df_train_panda['masked'].values)
for key in df_train_panda_input.keys():
    fitted = df_train_panda_input[key][:, :max_len]  # cut anything longer
    shortfall = max_len - int(fitted.shape[1])
    if shortfall > 0:
        # Slicing alone cannot widen a tensor: if the longest row is shorter
        # than max_len, zero-fill on the right so the width still matches.
        fitted = tf.pad(fitted, [[0, 0], [0, shortfall]])
    df_train_panda_input[key] = fitted

In [None]:
# Try to attach to a TPU and build a TPU distribution strategy; a ValueError
# from the setup is treated as "no TPU" and the default strategy is used.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Instantiate the strategy with the resolver; assigning the bare class
    # would leave `strategy` without a usable scope()/num_replicas_in_sync.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()

# Report the replica count on both paths, not only on the fallback.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Pick a distribution strategy: TPU if one is detected, otherwise mirrored
# multi-GPU, otherwise the default (single GPU or CPU) strategy.
tpu = None
gpus = []  # always defined, so the branches below never hit an unbound name
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError:
  # No TPU detected: look for GPUs instead.
  gpus = tf.config.experimental.list_logical_devices("GPU")

if tpu:
  # Connect to the TPU cluster before initialising it, so this cell does not
  # rely on another cell having made the connection already.
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.experimental.TPUStrategy(tpu,)
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
  strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
  print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
  strategy = tf.distribute.get_strategy()
  print('Running on single GPU ', gpus[0].name)
else:
  strategy = tf.distribute.get_strategy()
  print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

In [None]:
# Create the model inside the selected strategy's scope.
with strategy.scope():
    model = build_model()

# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Stop training once the monitored quantity has not improved for 3 epochs and
# roll the model back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# 16 examples per replica of the active strategy.
global_batch_size = 16 * strategy.num_replicas_in_sync

# Train for at most 12 epochs, holding out 1% of the data for validation.
model.fit(
    df_train_panda_input,
    df_train_panda['label'].values,
    epochs=12,
    verbose=1,
    validation_split=0.01,
    batch_size=global_batch_size,
    callbacks=[early_stop],
)

In [None]:
# Reload the test split from disk, restoring the columns dropped from df_test
# earlier. The absolute path matches the one used for the initial load, so the
# cell does not depend on the current working directory.
df_test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

In [None]:
# Same preprocessing as for the training frame, applied to the test frame.

# Concatenated premise + hypothesis ids, kept as a 'tokenized' column.
tokenized = [
    encode_premise_sentence(premise) + encode_hypo_sentence(hypothesis)
    for premise, hypothesis in zip(df_test['premise'], df_test['hypothesis'])
]
df_test['tokenized'] = tokenized


# Tokenizer encodings for every (premise, hypothesis) pair.
mask = [
    tokenizer(premise, hypothesis, padding=True, add_special_tokens = True)
    for premise, hypothesis in zip(df_test['premise'], df_test['hypothesis'])
]

df_test['masked'] = mask


# Dense tensors forced to exactly `max_len` columns, the width the model's
# input layers are declared with.
df_test_panda_input = input_convert(df_test['masked'].values)
for key in df_test_panda_input.keys():
    fitted = df_test_panda_input[key][:, :max_len]  # cut anything longer
    shortfall = max_len - int(fitted.shape[1])
    if shortfall > 0:
        # Slicing alone cannot widen a tensor: if the longest test row is
        # shorter than max_len, zero-fill on the right so predict() receives
        # the expected width.
        fitted = tf.pad(fitted, [[0, 0], [0, shortfall]])
    df_test_panda_input[key] = fitted




In [None]:
# Display the dict of test input tensors built in the previous cell.
df_test_panda_input

In [None]:
# Score the test inputs and keep, for each row, the index of the largest
# of the model's outputs.
class_scores = model.predict(df_test_panda_input)
predictions = list(np.argmax(class_scores, axis=1))

In [None]:
# Two-column frame: the test ids alongside the predicted class index per row.
submission = df_test[['id']].copy()
submission = submission.assign(prediction=predictions)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# (rows, columns) of the submission frame.
submission.shape

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index = False)