# **Watson using BERT**

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

**Prepare TPU**

In [None]:
# Pick a distribution strategy: use the TPU when it can be set up, otherwise
# fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError during TPU setup is treated as "no TPU available".
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected (previously
# this was only printed on the fallback path).
print('Number of replicas:', strategy.num_replicas_in_sync)

**Read dataset**

In [None]:
# Load the training split from train.csv into a DataFrame.
train_df = pd.read_csv('../input/contradictory-my-dear-watson/train.csv')
# Preview the first rows (shown as the cell output in a notebook).
train_df.head()

**Show the percentage of languages used**

In [None]:
# Count how many training rows belong to each language and draw the shares
# as a pie chart with one-decimal percentage labels.
labels, frequencies = np.unique(
    train_df['language'].values,
    return_counts=True,
)
plt.figure(figsize=(8, 8))
plt.pie(
    frequencies,
    labels=labels,
    autopct='%1.1f%%',
)
plt.show()

**Set up the tokenizer**

In [None]:
# Pretrained checkpoint name; used for the tokenizer here and for the encoder
# loaded in build_model().
model_name = 'bert-base-multilingual-cased'
# Load the pretrained tokenizer that matches the checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Tokenize *s* with the module-level tokenizer and return its token ids.

    A trailing '[SEP]' token is appended before the tokens are converted to
    ids, so the returned list always ends with the separator's id.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

**Preprocess data**

In [None]:
# Pull the two text columns out of the training frame as arrays.
premise, hypothesis = (
    train_df[column].values for column in ('premise', 'hypothesis')
)

In [None]:
# Normalise both text columns the same way: replace every run of digits with
# a single '0', then lowercase the result. Note that lower() is applied to
# every row regardless of language, not only to English text.
# The pattern is a raw string so that '\d' is not parsed as an (invalid)
# string escape sequence.
premise = [re.sub(r'\d+', '0', s).lower() for s in premise]
hypothesis = [re.sub(r'\d+', '0', s).lower() for s in hypothesis]

In [None]:
def bert_encode(premise, hypothesis, tokenizer):
    """Encode premise/hypothesis pairs into the three BERT input tensors.

    Each pair is laid out as ``[CLS] premise [SEP] hypothesis [SEP]``.

    Args:
        premise: Iterable of premise strings.
        hypothesis: Iterable of hypothesis strings, aligned with ``premise``.
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``. It is used for every token, including
            the sentence tokens (previously the module-level tokenizer was
            used for those and this argument only for '[CLS]').

    Returns:
        Dict of dense tensors, each padded by ``to_tensor()`` to the longest
        encoded pair:
            'input_word_ids': token ids,
            'input_mask': 1 at every real token position,
            'input_type_ids': 0 for '[CLS]' and the premise segment,
                1 for the hypothesis segment.
    """
    def _encode(sentence):
        # Token ids for one sentence, terminated by the '[SEP]' id.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in premise])
    sentence2 = tf.ragged.constant([_encode(s) for s in hypothesis])

    # One single-element [CLS] row per example, to prepend to every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # The mask is built on the ragged tensor, so only real tokens get a 1;
    # padding positions are filled in when converting to a dense tensor.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
    }
    return inputs

In [None]:
# Encode the preprocessed training premise/hypothesis pairs into the dict of
# input tensors consumed by the model.
x_train = bert_encode(premise, hypothesis, tokenizer)

**Build Model**

In [None]:
# Sequence length declared on the Keras inputs below.
# NOTE(review): bert_encode() pads to the longest encoded pair rather than to
# max_len, so the fed tensors may not match this declared length — confirm
# this is intended.
max_len = 20


def build_model():
    """Build and compile a 3-class classifier on top of the pretrained encoder.

    The encoder named by ``model_name`` receives the word ids, mask and type
    ids; the vector at position 0 of its first output is passed through a
    3-unit softmax layer.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are named
        'input_word_ids', 'input_mask' and 'input_type_ids', matching the
        keys of the dict produced by ``bert_encode``.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the first position (the one holding the '[CLS]' token).
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])
    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the model inside the selected distribution strategy's scope, then
# print its layer summary.
with strategy.scope():
    model = build_model()
    model.summary()

**Train using the TF Bert Model**

In [None]:
# Train on the encoded training pairs, using the `label` column as targets,
# for 8 epochs with a batch size of 64.
model.fit(x_train, train_df.label.values, epochs = 8, batch_size = 64)

**Read test dataset**

In [None]:
# Load the test split from test.csv into a DataFrame.
test_df = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
# Preview the first rows (shown as the cell output in a notebook).
test_df.head()

**Show the percentage of languages used in the test set**

In [None]:
# Count how many test rows belong to each language and draw the shares as a
# pie chart with one-decimal percentage labels.
labels, frequencies = np.unique(
    test_df['language'].values,
    return_counts=True,
)
plt.figure(figsize=(8, 8))
plt.pie(
    frequencies,
    labels=labels,
    autopct='%1.1f%%',
)
plt.show()

**Preprocess test data**

In [None]:
# Apply the same normalisation the training text received (digit runs -> '0',
# then lowercase) so the model sees test inputs in the form it was trained
# on, then encode the pairs.
test_premise = [re.sub(r'\d+', '0', s).lower() for s in test_df.premise.values]
test_hypothesis = [re.sub(r'\d+', '0', s).lower() for s in test_df.hypothesis.values]
x_test = bert_encode(test_premise, test_hypothesis, tokenizer)

**Predict the answer**

In [None]:
# Predict class probabilities for the test set and keep, for each row, the
# index of the highest-scoring class.
test_probabilities = model.predict(x_test)
y_test = list(np.argmax(test_probabilities, axis=-1))

In [None]:
# Pair each test row's id with its predicted class to form the submission table.
sub = pd.DataFrame({'id': test_df['id'].values, 'prediction': y_test})
# Preview the first rows (shown as the cell output in a notebook).
sub.head()

In [None]:
# Write the submission table to submission.csv without the DataFrame index column.
sub.to_csv('submission.csv', index = False)