In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install nlpaug
!pip install fairseq

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Process-wide environment configuration, set before the ML libraries are imported.
os.environ["WANDB_API_KEY"] = "0"  # dummy value, only there to silence the wandb warning
# Directory exported as MODEL_DIR; nothing in this notebook reads it directly.
os.environ["MODEL_DIR"] = '../model'

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
import nlpaug.augmenter.word as naw

"""from nlpaug.util.file.download import DownloadUtil
DownloadUtil.download_word2vec(dest_dir='.') # Download word2vec model"""

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # The resolver failing with ValueError is treated as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected (previously
# this was only printed on the CPU/GPU fallback path).
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Load the competition training set and cut it into a leading training part
# and a trailing validation part (no shuffling).
train_normal = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

validation_split = 0.2

# Row position at which the validation part starts.
split_at = int(len(train_normal.index) * (1 - validation_split))

train_data = train_normal.iloc[:split_at, :]
validation_data = train_normal.iloc[split_at:, :]

def insert_row(idx, df, df_insert):
    """Return a new frame equal to `df` with `df_insert` inserted at position `idx`.

    Parameters
    ----------
    idx : int
        Positional index (as used by ``df.iloc``) before which the new row(s)
        are placed. ``0`` prepends; ``len(df)`` or larger appends.
    df : pandas.DataFrame
        Source frame; it is not modified.
    df_insert : pandas.Series, dict or pandas.DataFrame
        Row(s) to insert. A Series/dict is treated as a single row whose
        labels/keys are column names.

    Returns
    -------
    pandas.DataFrame
        The combined frame with a fresh 0..n-1 index.
    """
    if isinstance(df_insert, dict):
        df_insert = pd.Series(df_insert)
    if isinstance(df_insert, pd.Series):
        # A Series is one row: turn it into a one-row frame and let pandas
        # re-infer column dtypes (the transpose leaves everything as object).
        df_insert = df_insert.to_frame().T.infer_objects()
    # DataFrame.append was removed in pandas 2.0; a single concat does the
    # same job and works on both old and new pandas versions.
    return pd.concat([df.iloc[:idx], df_insert, df.iloc[idx:]], ignore_index=True)

In [None]:
# if we test backtranslation
# (disabled: the code below is kept inside a bare triple-quoted string; the
# string is now properly closed so this cell no longer raises a SyntaxError)

"""
train_augmented = pd.read_csv("../input/augmented/Transformed data/train_backtrsl.csv")


for index, row in train_augmented.iloc[:int(len(train_augmented.index)*(1-validation_split))].iterrows():
    train_data = insert_row(index*2, train_data, row)
validation_split = validation_split/(2-validation_split)"""

In [None]:
# if we test low resource languages text augmentation
# (disabled: everything below is a bare triple-quoted string, so none of it
# runs; remove the surrounding quotes to enable it)
"""
train_vi = pd.read_csv("../input/augmented/Transformed data/train_translated_vi.csv")
train_bg = pd.read_csv("../input/augmented/Transformed data/train_translated_bg.csv")
train_hi = pd.read_csv("../input/augmented/Transformed data/train_translated_hi.csv")

for index, row in train_vi.iloc[:int(len(train_vi.index)*(1-validation_split))].iterrows():
    train_data = insert_row(index*2, train_data, row)

for index, row in train_bg.iloc[:int(len(train_bg.index)*(1-validation_split))].iterrows():
    train_data = insert_row(index*3, train_data, row)
    
for index, row in train_hi.iloc[:int(len(train_hi.index)*(1-validation_split))].iterrows():
    train_data = insert_row(index*4, train_data, row)
    
validation_split = validation_split/(4-3*validation_split)"""

In [None]:
# if we use the nlpaug module
#
# For every English row of the training split, a synonym-augmented copy is
# inserted right after the original row in train_data.
# NOTE: this cell extends train_data; re-run the split cell before re-running
# it, otherwise the augmented rows are added a second time.

aug = naw.SynonymAug(aug_src='wordnet')


def _augment_text(text):
    """Return one synonym-augmented string for `text`.

    Depending on the installed nlpaug version, `aug.augment` may return either
    a plain string or a list of strings; both cases are reduced to one string.
    """
    augmented = aug.augment(text)
    if isinstance(augmented, (list, tuple)):
        # Fall back to the untouched text if the augmenter produced nothing.
        return augmented[0] if augmented else text
    return augmented


# Rows of train_normal that belong to the training split, i.e. everything not
# held out in validation_data. Derived from the frames themselves so the bound
# does not drift when validation_split is re-scaled below or by another cell.
n_train_rows = len(train_normal.index) - len(validation_data.index)

# Work on a plain list of row dicts and build the frame once at the end,
# instead of rebuilding the whole DataFrame for every inserted row.
records = train_data.to_dict('records')

cpt = 0
for index, row in train_normal.iloc[:n_train_rows].iterrows():
    if row['language'] == 'English':
        cpt += 1
        # cpt - 1 rows were already inserted before this one, so position
        # index + cpt is the slot directly after the original row.
        records.insert(index + cpt, {
            'id': row['id'],
            'premise': _augment_text(row['premise']),
            'hypothesis': _augment_text(row['hypothesis']),
            'lang_abv': 'en',
            'language': 'English',
            'label': row['label'],
        })

train_data = pd.DataFrame(records, columns=train_data.columns)

# Fraction of the combined (train_data + validation_data) frame taken up by
# the validation rows; replaces the hard-coded dataset size of 12120.
validation_split = len(validation_data.index) / (len(train_data.index) + len(validation_data.index))

In [None]:
# Number of training rows after augmentation.
print(len(train_data.index))
# Training rows first, validation rows last, with a fresh 0..n-1 index.
train = pd.concat([train_data, validation_data], ignore_index=True)
# Kept as the last expression so the notebook actually renders the preview
# (a bare expression in the middle of a cell is silently discarded).
train_data.head()

In [None]:
# Name of the pretrained checkpoint; build_model() loads the encoder from the same name.
model_name = 'bert-base-multilingual-cased'
# Tokenizer loaded from that checkpoint name; used as a notebook-level global by encode_sentence().
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Tokenize `s` with the notebook-level `tokenizer`, add a trailing
    '[SEP]' token, and return the corresponding list of token ids."""
    return tokenizer.convert_tokens_to_ids([*tokenizer.tokenize(s), '[SEP]'])

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
    """Encode sentence pairs into the three input tensors used by the model.

    Each pair is laid out as ``[CLS] first [SEP] second [SEP]``, where
    "first" comes from `hypotheses` and "second" from `premises`.

    Parameters
    ----------
    hypotheses : sequence of str
        Sentences placed first in each pair (segment id 0).
    premises : sequence of str
        Sentences placed second in each pair (segment id 1).
    tokenizer
        Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``. It is
        used for all tokenization here (previously the sentences were encoded
        with the notebook-level global tokenizer and this argument was only
        used for the '[CLS]' id).

    Returns
    -------
    dict
        ``'input_word_ids'``, ``'input_mask'`` and ``'input_type_ids'``, each
        a dense tensor obtained with ``.to_tensor()`` from the ragged rows
        (so rows are padded to the longest pair in this call).
    """
    def _encode(sentence):
        # Token ids of the sentence followed by the id of '[SEP]'.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    # One '[CLS]' id per example, prepended to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Ones on real tokens; densifying the ragged tensor fills the padding.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # Segment ids: 0 for '[CLS]' and the first sentence, 1 for the second.
    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs

In [None]:
# Encode every (premise, hypothesis) pair of the combined train frame.
# Note: the premises are passed as bert_encode's first positional parameter
# (which is named `hypotheses`), so the premise is the first segment of each pair.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

In [None]:
# Sequence length declared on the Keras inputs below.
# NOTE(review): bert_encode pads to the longest pair it is given rather than
# to max_len - confirm the declared input shape is what is intended.
max_len = 50

def build_model():
    """Build and compile the 3-class classifier on top of the pretrained BERT encoder.

    The encoder is loaded from `model_name`; the hidden state at position 0
    of its first output feeds a 3-unit softmax layer. The model is compiled
    with Adam (learning rate 1e-5), sparse categorical cross-entropy and
    accuracy.

    Returns
    -------
    tf.keras.Model
        Compiled model taking the inputs named 'input_word_ids',
        'input_mask' and 'input_type_ids'.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    
    # First encoder output, indexed at sequence position 0 for classification.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:,0,:])
    
    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by current Keras optimizers.
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [None]:
# Build and compile the model inside the distribution strategy's scope,
# then print its layer summary.
with strategy.scope():
    model = build_model()
    model.summary()

In [None]:
# Train for 2 epochs. `train` was built as train_data followed by validation_data,
# and validation_split is the fraction of samples Keras holds out for validation
# (per the Keras docs it is taken from the end of the arrays, before shuffling).
model.fit(train_input, train.label.values, epochs = 2, verbose = 1, batch_size = 64, validation_split = validation_split)

In [None]:
# Load the competition test set and encode it with the same argument order
# (premise first, hypothesis second) as the training data.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
test_input = bert_encode(test.premise.values, test.hypothesis.values, tokenizer)

In [None]:
# For each test example, take the index of the highest of the 3 softmax outputs as the predicted class.
predictions = [np.argmax(i) for i in model.predict(test_input)]

In [None]:
# Pair every test id with its predicted class and write the submission file
# (two columns: id, prediction; no index column).
submission = test[['id']].assign(prediction=predictions)

submission.to_csv("submission.csv", index = False)