In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

In [None]:
# Dummy (non-secret) Weights & Biases key, set only to silence the warning.
os.environ.update({"WANDB_API_KEY": "0"})

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Choose the distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU could be resolved.
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This used to
# live inside the `except` branch, so it was never printed when a TPU was found.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# Load the competition's training and test splits, in that order.
train, test = map(pd.read_csv, (
    "/kaggle/input/contradictory-my-dear-watson/train.csv",
    "/kaggle/input/contradictory-my-dear-watson/test.csv",
))

In [None]:
# Pie chart of how the training examples are distributed across languages.
labels, frequencies = np.unique(train["language"].to_numpy(), return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

### **BERT**




In [None]:
# Single source of truth for the pretrained checkpoint: the tokenizer is loaded
# from `model_name` so it cannot drift from whatever else is loaded by that name.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# def encode_sentence(s):
#    tokens = list(tokenizer.tokenize(s))
#    tokens.append('[SEP]')
   
#    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# encode_sentence("I love machine learning")

In [None]:
# One entry is appended per encode_sentence call (see below); plotted later as a check.
lengths = []

def encode_sentence(s, max_len=150):
  """Tokenize ``s`` and return exactly ``max_len`` token ids.

  The text is split with the module-level ``tokenizer``; sequences longer than
  ``max_len`` are truncated and shorter ones are right-filled with ``'[PAD]'``
  tokens before being converted to ids.

  Args:
    s: Text to encode.
    max_len: Fixed number of tokens to produce (defaults to 150, the value
      that was previously hard-coded).

  Returns:
    A list of ``max_len`` token ids.

  Raises:
    ValueError: If ``max_len`` is negative.

  Side effects:
    Appends the final (padded/truncated) token count to the global ``lengths``
    list, so every recorded value equals ``max_len``.
  """
  if max_len < 0:
    raise ValueError("max_len must be non-negative, got %r" % (max_len,))

  tokens = list(tokenizer.tokenize(s))[:max_len]
  # A sequence already at max_len gets an empty padding list.
  tokens = tokens + ['[PAD]'] * (max_len - len(tokens))
  lengths.append(len(tokens))
  return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
  """Build the BERT input tensors for a batch of (hypothesis, premise) pairs.

  Each row is laid out as ``[CLS] hypothesis [SEP] premise [SEP]``, where both
  sentences come from ``encode_sentence``.

  Args:
    hypotheses: Sequence of hypothesis strings.
    premises: Sequence of premise strings, same length as ``hypotheses``.
    tokenizer: Tokenizer used to look up the ``[CLS]``, ``[SEP]`` and ``[PAD]``
      ids. NOTE(review): ``encode_sentence`` uses the module-level tokenizer,
      not this argument - both are expected to be the same object.

  Returns:
    A dict of dense tensors keyed by ``'input_word_ids'``, ``'input_mask'``
    (1 for real tokens, 0 for padding) and ``'input_type_ids'`` (0 for
    ``[CLS]`` and the hypothesis segment, 1 for the premise segment).

  Raises:
    ValueError: If ``hypotheses`` and ``premises`` differ in length.
  """
  num_examples = len(hypotheses)
  if len(premises) != num_examples:
    raise ValueError(
        "hypotheses and premises must have the same length "
        "(got %d and %d)" % (num_examples, len(premises)))

  cls_id, sep_id, pad_id = tokenizer.convert_tokens_to_ids(
      ['[CLS]', '[SEP]', '[PAD]'])
  cls = [[cls_id]] * num_examples
  sep = [[sep_id]] * num_examples

  sentence1 = tf.ragged.constant([
      encode_sentence(s)
      for s in np.array(hypotheses)])
  sentence2 = tf.ragged.constant([
      encode_sentence(s)
      for s in np.array(premises)])

  # Terminate each segment with its own [SEP].
  sentence1 = tf.concat([sentence1, sep], axis=-1)
  sentence2 = tf.concat([sentence2, sep], axis=-1)

  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
  # Fill any ragged remainder with the pad id so it is masked out below too.
  word_ids = input_word_ids.to_tensor(default_value=pad_id)

  # The mask must be 0 on padding. encode_sentence inserts explicit [PAD]
  # tokens, so an all-ones mask would make the model attend to them.
  input_mask = tf.cast(tf.not_equal(word_ids, pad_id), word_ids.dtype)

  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)

  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': word_ids,
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs

In [None]:
# Start from an empty length log, then encode the training split followed by the test split.
lengths = []
train_input, test_input = (
    bert_encode(frame.hypothesis.values, frame.premise.values, tokenizer)
    for frame in (train, test)
)

In [None]:
# Plot the token counts logged by encode_sentence, one point per encoded sentence.
# They are logged after padding/truncating to 150 tokens, so this is a sanity
# check that every sentence ended up at the same fixed length.
plt.plot(lengths)

In [None]:
# Row length produced by bert_encode:
# 1 ([CLS]) + 150 (hypothesis) + 1 ([SEP]) + 150 (premise) + 1 ([SEP]) = 303.
max_len = 303

def build_model():
    """Build and compile the BERT-based 3-class classifier.

    The pretrained encoder named by ``model_name`` is fed word ids, attention
    mask and segment ids of length ``max_len``. The hidden state at sequence
    position 0 (where bert_encode places ``[CLS]``) goes through two fully
    connected layers and a 3-way softmax.

    Returns:
        A compiled ``tf.keras.Model`` with Adam (learning rate 1e-5),
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # List order is word ids, attention mask, segment ids; [0] takes the
    # encoder's first output, which is sliced per token below.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]

    # Adding more FC layers on top of the position-0 vector.
    # NOTE(review): neither Dense layer sets an activation, so FC1 -> FC2
    # composes to a single linear map; consider a non-linearity (e.g. 'relu').
    FC1 = tf.keras.layers.Dense(2048)(embedding[:,0,:])
    FC2 = tf.keras.layers.Dense(512)(FC1)
    output = tf.keras.layers.Dense(3, activation='softmax')(FC2)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` alias.
    # The sparse loss is used because labels are passed as integer class ids.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build and compile the model inside the selected distribution strategy's scope.
with strategy.scope():
    model = build_model()

# Printing the layer summary only reads the built model.
model.summary()

In [None]:
# Number of training labels (displayed as the cell's output).
train["label"].to_numpy().size

In [None]:
# Fine-tune for two epochs, holding out the last 20% of the training data for validation.
model.fit(
    x=train_input,
    y=train.label.values,
    batch_size=64,
    epochs=2,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Take the highest-probability class for every test row and pair it with the row id.
test_probabilities = model.predict(test_input)
predictions = list(map(np.argmax, test_probabilities))
submission = test[["id"]].copy()
submission = submission.assign(prediction=predictions)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)