In [None]:
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#  Copyright (c) 2021. Mohamed Reda Bouadjenek, Deakin University              +
#           Email:  reda.bouadjenek@deakin.edu.au                              +
#                                                                              +
#  Licensed under the Apache License, Version 2.0 (the "License");             +
#   you may not use this file except in compliance with the License.           +
#    You may obtain a copy of the License at:                                  +
#                                                                              +
#                 http://www.apache.org/licenses/LICENSE-2.0                   +
#                                                                              +
#    Unless required by applicable law or agreed to in writing, software       +
#    distributed under the License is distributed on an "AS IS" BASIS,         +
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  +
#    See the License for the specific language governing permissions and       +
#    limitations under the License.                                            +
#                                                                              +
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [None]:
# To be run on Google Colab!
# Uncomment the commands below to download the `preprocessing` package that the
# import cell relies on and to install the third-party dependencies.
# !mkdir preprocessing/
# !wget --directory-prefix=preprocessing/  https://raw.githubusercontent.com/rbouadjenek/DeepQA/master/DeepQA/preprocessing/__init__.py   > /dev/null 2> /dev/null 
# !wget --directory-prefix=preprocessing/ https://raw.githubusercontent.com/rbouadjenek/DeepQA/master/DeepQA/preprocessing/preprocessing.py  > /dev/null 2> /dev/null 
# !pip install tokenizers transformers keras_metrics > /dev/null 2> /dev/null 

# Uncomment to mount Google Drive; the export cell at the end of the notebook
# moves its archive to /content/drive/MyDrive.
# from google.colab import drive
# drive.mount('/content/drive')

# Set to True to initialise a TPU strategy and build the models inside its scope.
use_tpu = False


In [None]:
import json
import os
import random
import tarfile

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.python.keras import backend as K
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, BertConfig, TFBertModel

from preprocessing.preprocessing import create_squad_examples, create_inputs_targets, get_SQuAD1, get_SQuAD2, get_NewsQA

# Seed every random number generator in use (NumPy, Python's `random`, TensorFlow)
# with the same fixed value so that repeated runs start from the same state.
for seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    seed_rng(1)


# Data preprocessing

In [None]:
# Maximum sequence length: passed to `create_squad_examples` and used as the
# width of the model's input layers.
max_len = 300

# Create the tokenizer.
# The vocabulary of `bert-base-uncased` is cached under the user's home directory
# so that the fast WordPiece tokenizer can be built from a local file.
save_path = os.path.expanduser("~") + "/.bert_base_uncased/"
vocab_file = save_path + "vocab.txt"
# Check for the vocabulary file itself rather than the directory: a directory left
# behind without `vocab.txt` (e.g. an interrupted first run) is then repaired
# instead of making the tokenizer construction below fail on every run.
if not os.path.exists(vocab_file):
    slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    os.makedirs(save_path, exist_ok=True)
    slow_tokenizer.save_pretrained(save_path)
tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=True)


In [None]:
# Get the raw data.
# Enable exactly one loader; each call is unpacked into the raw training,
# validation and test splits consumed by the preparation cell below.
# NOTE(review): the hyper-parameter cell further down has one group of settings
# per dataset — presumably it should match the loader enabled here; confirm.

# raw_train_data, raw_val_data, raw_test_data = get_SQuAD1()
raw_train_data, raw_val_data, raw_test_data = get_SQuAD2()
# raw_train_data, raw_val_data, raw_test_data = get_NewsQA()


In [None]:
# Prepare the data

def _summarise_split(targets, skipped, total_skipped):
    """Print the length of the first target array of a split and its two skipped-question counters."""
    print('Size: ', len(targets[0]))
    print('skipped_questions: ', skipped)
    print('skipped_questions: ', total_skipped)


# Training split — the only one that passes the extra positional `False` flag.
train_squad_examples, train_skipped_questions = create_squad_examples(
    raw_train_data, max_len, tokenizer, False
)
(x_train, y1_train, y2_train, y3_train,
 train_total_skipped_questions) = create_inputs_targets(train_squad_examples)
_summarise_split(y1_train, train_skipped_questions, train_total_skipped_questions)

# Validation split.
eval_squad_examples, val_skipped_questions = create_squad_examples(
    raw_val_data, max_len, tokenizer
)
(x_eval, y1_eval, y2_eval, y3_eval,
 val_total_skipped_questions) = create_inputs_targets(eval_squad_examples)
_summarise_split(y1_eval, val_skipped_questions, val_total_skipped_questions)

# Test split.
test_squad_examples, test_skipped_questions = create_squad_examples(
    raw_test_data, max_len, tokenizer
)
(x_test, y1_test, y2_test, y3_test,
 test_total_skipped_questions) = create_inputs_targets(test_squad_examples)
_summarise_split(y1_test, test_skipped_questions, test_total_skipped_questions)


# Create the Keras model

In [None]:
# Define the model
def create_model(learn_rate=5e-5, dropout_prob=0.3):
    """Build and compile the BERT-based answer-span baseline.

    A pre-trained ``bert-base-uncased`` encoder is followed by two bias-free
    ``Dense(1)`` heads: one scores every token position as the answer start,
    the other as the answer end. Each head's scores go through dropout, are
    flattened to one value per position and normalised with a softmax.

    Args:
        learn_rate: Learning rate handed to the Adam optimizer.
        dropout_prob: Dropout probability used for BERT's hidden and attention
            dropout as well as for the dropout layer placed on each head.

    Returns:
        A compiled ``keras.Model`` that takes
        ``[input_ids, token_type_ids, attention_mask]`` (each of length
        ``max_len``, read from the notebook's global) and outputs
        ``[start_probs, end_probs]``.
    """
    ## BERT encoder
    configuration = BertConfig(hidden_dropout_prob=dropout_prob, attention_probs_dropout_prob=dropout_prob)
    encoder = TFBertModel.from_pretrained("bert-base-uncased", config=configuration)

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Only the first element of the encoder output is used
    # (presumably the per-token hidden states — confirm against TFBertModel).
    embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

    # Start-position head: one score per token, flattened to a vector per example.
    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Dropout(dropout_prob)(start_logits)
    start_logits = layers.Flatten()(start_logits)

    # End-position head: same structure with its own weights.
    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Dropout(dropout_prob)(end_logits)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated and
    # rejected by recent Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=learn_rate)
    model.compile(optimizer=optimizer, loss=[loss, loss], metrics=['accuracy'],
                  run_eagerly=False)
    return model


In [None]:
# Optional TPU setup, executed only when `use_tpu` is True.
if use_tpu:
    # Resolve the TPU cluster, connect to it and initialise the TPU system, then
    # create the distribution strategy whose scope is used when building models.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    

# Hyper-parameter search: train one model per (learning rate, dropout) pair

In [None]:
# Stop a run once the validation loss has not improved for 3 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Grid search: train and evaluate one model for every (learning rate, dropout) pair.
for lr in [9e-06, 1e-05, 3e-05, 5e-05, 7e-05, 9e-05, 1e-04]:
    for dropout_prob in [0, 0.2]:
        print("Learning rate = " + str(lr) + ", dropout_prob = " + str(dropout_prob))
        # Release the Keras global state left by the previous configuration;
        # without this, building 14 BERT models in one loop keeps accumulating
        # memory. Done before seeding so the seeds below stay in effect.
        tf.keras.backend.clear_session()
        # Re-seed for every configuration so that runs differ only in their
        # hyper-parameters.
        np.random.seed(1)
        random.seed(1)
        tf.random.set_seed(1)
        if use_tpu:
            with strategy.scope():
                model = create_model(lr, dropout_prob)
        else:
            model = create_model(lr, dropout_prob)
        history = model.fit(
            x_train,
            y1_train,
            validation_data=(x_eval, y1_eval),
            epochs=20,
            verbose=1,
            batch_size=128,
            callbacks=[callback],
        )
        results = model.evaluate(x_eval, y1_eval, batch_size=128)
        print(f"Results validation: {results}, Learning rate =  {lr}, dropout_prob = {dropout_prob}")
        print('-'*50)


# Run the model with the best parameters

In [None]:
# Hyper-parameters for the final run. Enable exactly one group; it must match
# the dataset loaded earlier in the notebook. Leaving every group commented out
# makes `file_name` undefined and lets `lr` / `dropout_prob` silently keep the
# values of the last grid-search iteration.

# SQuAD2.0 (active: the data-loading cell calls get_SQuAD2())
file_name = "SQuAD2.0_"
lr = 5e-05
dropout_prob = 0.2

# NewsQA
# file_name = "NewQA_"
# lr = 9E-06
# dropout_prob = 0.2


In [None]:
# Final run: train a fresh model with the selected `lr` / `dropout_prob`
# (both must already be defined) and report its metrics on the test split.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Drop the Keras global state left behind by previously built models.
tf.keras.backend.clear_session()
np.random.seed(1)
random.seed(1)
tf.random.set_seed(1)
if use_tpu:
    with strategy.scope():
        model = create_model(lr, dropout_prob)
else:
    # Without this branch no new model is created when running off-TPU, and the
    # already-trained model from the last grid-search iteration would be
    # fine-tuned further instead of starting from the pre-trained weights.
    model = create_model(lr, dropout_prob)


history = model.fit(
    x_train,
    y1_train,
    validation_data=(x_eval, y1_eval),
    epochs=20,
    verbose=1,
    batch_size=128,
    callbacks=[callback],
)
model.evaluate(x_test, y1_test, batch_size=128)

In [None]:
# Run inference on the test split, then on the validation split. Each call
# returns the model's two outputs, unpacked as start and end predictions.
test_outputs = model.predict(x_test, verbose=1)
test_pred_start_baseline, test_pred_end_baseline = test_outputs
eval_outputs = model.predict(x_eval, verbose=1)
eval_pred_start_baseline, eval_pred_end_baseline = eval_outputs
    

In [None]:
# Prefix identifying this model in every exported file name.
algorithm = "Baseline1_"
# Names of all files written so far; used afterwards to build the archive.
file_list = []


def save_to_file(name, data):
    """Write ``data`` to disk as JSON and remember the file that was created.

    The file name is ``algorithm + file_name + name`` (both prefixes are
    notebook globals) and is appended to the global ``file_list``.

    Args:
        name: Suffix of the output file name, e.g. ``"test_pred_start.txt"``.
        data: Object exposing ``tolist()`` (such as a NumPy array) whose list
            form is JSON-serialisable.
    """
    payload = json.dumps(data.tolist())
    out_path = algorithm + file_name + name
    file_list.append(out_path)
    with open(out_path, "w") as handle:
        handle.write(payload)

save_to_file("test_pred_start.txt", test_pred_start_baseline)
save_to_file("test_pred_end.txt", test_pred_end_baseline)

save_to_file("eval_pred_start.txt", eval_pred_start_baseline)
save_to_file("eval_pred_end.txt", eval_pred_end_baseline)

save_to_file("y2_eval.txt",y2_eval)
save_to_file("y3_eval.txt", np.array([v.tolist() for v in y3_eval]))

save_to_file("y2_test.txt",y2_test)
save_to_file("y3_test.txt", np.array([v.tolist() for v in y3_test]))

tar = tarfile.open(algorithm + file_name + "predictions.tar.gz", "w:gz")
for name in file_list:
    tar.add(name)
tar.close()

!mv *predictions.tar.gz /content/drive/MyDrive


# Debugging

In [None]:
# Smoke test: build a fresh model, then fit and evaluate it on the first `i`
# test examples only. Note that this rebinds the global `model`.
np.random.seed(1)
random.seed(1)
tf.random.set_seed(1)
model = create_model(5e-05, 0)
model.summary()
i = 16
# Slice the three model inputs and the two targets once and reuse them for
# both training and evaluation.
x_debug = [x_test[k][0:i] for k in range(3)]
y_debug = [y1_test[k][0:i] for k in range(2)]
history = model.fit(
    x_debug,
    y_debug,
    epochs=2,
    verbose=1,
    batch_size=4,
)
model.evaluate(x_debug, y_debug, batch_size=4)
