In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split into `df` and the test split into `df_1`.
df, df_1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
    )
)
df.head()

In [None]:
# Print the first training example: premise, hypothesis, then label.
for column in (df.premise, df.hypothesis, df.label):
    print(column.values[0])

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Count how many training rows exist per language value.
labels, frequencies = np.unique(
    df.language.values,
    return_counts=True,
)

# Pie chart with one wedge per language, sized by its row count.
plt.pie(x=frequencies, labels=labels)
plt.show()

In [None]:
# Name of the pre-trained checkpoint; reused for both the tokenizer and the encoder.
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Tokenize ``s`` and return its token ids followed by the ``[SEP]`` id.

    Uses the module-level ``tokenizer``; no ``[CLS]`` token is added here.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
# Quick sanity check: show the ids produced for a short sample sentence.
encode_sentence("My name is Sunny")

In [None]:
def bert_encode(hypotheses, premises, tokenizer, max_len=None):
    """Build the three BERT input tensors for a batch of sentence pairs.

    Each pair is laid out as ``[CLS] first [SEP] second [SEP]``, where
    ``first`` comes from ``hypotheses`` and ``second`` from ``premises``.

    NOTE(review): the call sites in this notebook pass the premise column as
    the first argument and the hypothesis column as the second, so the
    parameter names are swapped relative to actual usage. They are kept
    unchanged for backward compatibility.

    Args:
        hypotheses: Sequence of strings used as the first segment (type id 0).
        premises: Sequence of strings used as the second segment (type id 1).
        tokenizer: Tokenizer used to look up the ``[CLS]`` id. The sentences
            themselves are tokenized by ``encode_sentence``, which reads the
            module-level ``tokenizer``.
        max_len: Optional fixed sequence length. When given, every output is
            padded/truncated to exactly ``max_len`` columns. When ``None``
            (default) outputs are padded to the longest example in the batch,
            which is the original behaviour.

    Returns:
        dict with dense tensors under the keys ``'input_word_ids'``,
        ``'input_mask'`` and ``'input_type_ids'``.
    """
    sentence1 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant(
        [encode_sentence(s) for s in np.array(premises)])

    # One [CLS] id per example, prepended to every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # shape=None keeps to_tensor's default (pad to the longest row);
    # [None, max_len] pads short rows and truncates long ones to max_len.
    dense_shape = None if max_len is None else [None, max_len]

    # Ones over real token positions; padding positions get to_tensor's fill value.
    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=dense_shape)

    # Segment ids: 0 for [CLS] + first sentence, 1 for the second sentence.
    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor(shape=dense_shape)

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(shape=dense_shape),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs

In [None]:
# Encode the training pairs: premise column first, hypothesis column second.
df_input = bert_encode(
    df.premise.values,
    df.hypothesis.values,
    tokenizer,
)

In [None]:
# Keep at most the first 50 token positions of every encoded tensor
# (the same width as `max_len`, which is defined in a later cell).
df_input = {
    name: tensor[:, :50]
    for name, tensor in df_input.items()
}

In [None]:
max_len = 50  # sequence length expected by the model's Input layers


def build_model():
    """Build and compile a 3-class classifier on top of a pre-trained BERT encoder.

    Reads the module-level ``model_name`` (checkpoint to load) and ``max_len``
    (fixed input sequence length).

    Returns:
        A compiled ``tf.keras.Model`` taking three int32 inputs of shape
        ``(max_len,)`` named ``input_word_ids``, ``input_mask`` and
        ``input_type_ids``, and producing a softmax over 3 classes.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # First element of the encoder output; presumably the per-token hidden
    # states -- confirm against the installed transformers version.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the vector at sequence position 0 (where [CLS] is placed).
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Instantiate the classifier defined by build_model() and print its layer summary.
model = build_model()
model.summary()

In [None]:
# Fine-tune on the encoded training pairs, holding out 20% of them for validation.
model.fit(
    df_input,
    df.label.values,
    batch_size=50,
    epochs=5,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Preview the first rows of the test split.
df_1.head(n=5)

In [None]:
# Encode the test pairs the same way as the training pairs: premise first, hypothesis second.
df_1_input = bert_encode(
    df_1.premise.values,
    df_1.hypothesis.values,
    tokenizer,
)

In [None]:
# Truncate every encoded test tensor to the model's input width. Using
# `max_len` (instead of a repeated literal 50) keeps the test inputs in sync
# with the Input layers created in build_model().
df_1_input = {
    name: tensor[:, :max_len]
    for name, tensor in df_1_input.items()
}

In [None]:
# Predicted class per test row: index of the highest score in each output row.
predictions = list(
    np.argmax(model.predict(df_1_input), axis=1)
)

In [None]:
# Submission frame: the test ids alongside the predicted class for each row.
submission = df_1[['id']].copy()
submission['prediction'] = predictions

In [None]:
# Preview the first rows of the submission frame.
submission.head(n=5)

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv(
    "submission.csv",
    index=False,
)