In [None]:
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#  Copyright (c) 2021. Mohamed Reda Bouadjenek, Deakin University              +
#           Email:  reda.bouadjenek@deakin.edu.au                              +
#                                                                              +
#  Licensed under the Apache License, Version 2.0 (the "License");             +
#   you may not use this file except in compliance with the License.           +
#    You may obtain a copy of the License at:                                  +
#                                                                              +
#                 http://www.apache.org/licenses/LICENSE-2.0                   +
#                                                                              +
#    Unless required by applicable law or agreed to in writing, software       +
#    distributed under the License is distributed on an "AS IS" BASIS,         +
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  +
#    See the License for the specific language governing permissions and       +
#    limitations under the License.                                            +
#                                                                              +
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [None]:
# To be run on Google Colab!
# On a fresh runtime, uncomment the next line to install the third-party
# packages used by this notebook (the command's output is discarded).
# !pip install tokenizers transformers keras_metrics > /dev/null 2> /dev/null 
# Switch for the TPU path: when True, a later cell connects to a TPU and the
# model is created inside the resulting strategy scope; when False the model
# is created without a distribution strategy.
use_tpu = False


In [None]:
import os
import random
from tensorflow import keras
import numpy as np
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras import layers
from preprocessing.preprocessing import create_squad_examples, create_inputs_targets, get_SQuAD1, get_SQuAD2, get_NewsQA

# Data preprocessing

In [None]:
# Maximum sequence length. It is passed to create_squad_examples and also
# fixes the length of the model's input layers in create_model.
max_len = 300

# Create the tokenizer.
# The slow HuggingFace tokenizer is only used to obtain the vocabulary file,
# which is cached under the user's home directory; the fast WordPiece
# tokenizer is then built from that file.
save_path = os.path.expanduser("~") + "/.bert_base_uncased/"
vocab_file = os.path.join(save_path, "vocab.txt")
# Test for the vocabulary file itself rather than for the directory: a cache
# directory left behind without vocab.txt (e.g. an interrupted first run)
# would otherwise skip the download and fail when loading the vocabulary.
if not os.path.exists(vocab_file):
    slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # exist_ok=True because the directory may already exist without the file.
    os.makedirs(save_path, exist_ok=True)
    slow_tokenizer.save_pretrained(save_path)
tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=True)


In [None]:
# Get the raw data.
# Exactly one loader is active at a time; its result is unpacked into the
# train / validation / test splits used by the following cells. To switch
# dataset, comment out the active line and uncomment one of the alternatives.
# raw_train_data, raw_val_data, raw_test_data = get_SQuAD1()
raw_train_data, raw_val_data, raw_test_data = get_SQuAD2()
# raw_train_data, raw_val_data, raw_test_data = get_NewsQA()

In [None]:
# Prepare the data
def _prepare_split(raw_data, *extra_args):
    """Turn one raw split into model inputs/targets and report its size.

    Args:
        raw_data: the raw split as returned by the dataset loader.
        *extra_args: optional extra positional arguments forwarded unchanged
            to create_squad_examples after (raw_data, max_len, tokenizer).
            The training split passes ``False``; the meaning of that flag is
            defined in preprocessing.preprocessing and is not visible here.

    Returns:
        A 7-tuple ``(examples, skipped, x, y1, y2, y3, total_skipped)`` made
        of the two values returned by create_squad_examples followed by the
        five values returned by create_inputs_targets, in that order.
    """
    examples, skipped = create_squad_examples(raw_data, max_len, tokenizer, *extra_args)
    x, y1, y2, y3, total_skipped = create_inputs_targets(examples)
    print('Size: ', len(y1[0]))
    # Both counters are reported under the same label: the first comes from
    # create_squad_examples, the second from create_inputs_targets.
    print('skipped_questions: ', skipped)
    print('skipped_questions: ', total_skipped)
    return examples, skipped, x, y1, y2, y3, total_skipped


# Training split (the only one that passes the extra ``False`` argument).
(train_squad_examples, train_skipped_questions,
 x_train, y1_train, y2_train, y3_train,
 train_total_skipped_questions) = _prepare_split(raw_train_data, False)

# Validation split.
(eval_squad_examples, val_skipped_questions,
 x_eval, y1_eval, y2_eval, y3_eval,
 val_total_skipped_questions) = _prepare_split(raw_val_data)

# Test split.
(test_squad_examples, test_skipped_questions,
 x_test, y1_test, y2_test, y3_test,
 test_total_skipped_questions) = _prepare_split(raw_test_data)

# Create the Keras model

In [None]:
# Define the f1-score measure to be used during training
def recall_m(y_true, y_pred):
    """Recall over the tensors passed in: true positives / actual positives.

    The product ``y_true * y_pred`` and ``y_true`` are each clipped to
    [0, 1] and rounded before being summed, so the counts are taken over
    0/1 values. A small epsilon in the denominator avoids division by zero
    when there are no positives.

    Implemented with ``tf`` ops (the notebook never defines the ``K`` backend
    alias the original code referred to).

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, multiplied element-wise with ``y_true``.

    Returns:
        A scalar tensor with the recall value.
    """
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    possible_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true, 0, 1)))
    recall = true_positives / (possible_positives + keras.backend.epsilon())
    return recall

def precision_m(y_true, y_pred):
    """Precision over the tensors passed in: true positives / predicted positives.

    The product ``y_true * y_pred`` and ``y_pred`` are each clipped to
    [0, 1] and rounded before being summed, so the counts are taken over
    0/1 values. A small epsilon in the denominator avoids division by zero
    when nothing is predicted positive.

    Implemented with ``tf`` ops (the notebook never defines the ``K`` backend
    alias the original code referred to).

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, multiplied element-wise with ``y_true``.

    Returns:
        A scalar tensor with the precision value.
    """
    true_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
    predicted_positives = tf.reduce_sum(tf.round(tf.clip_by_value(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + keras.backend.epsilon())
    return precision

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor.

    Returns:
        A scalar tensor ``2 * p * r / (p + r + epsilon)``; the epsilon keeps
        the result finite when both precision and recall are zero.
    """
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    # keras.backend.epsilon() replaces the undefined ``K.epsilon()``.
    return 2 * ((precision * recall) / (precision + recall + keras.backend.epsilon()))


In [None]:
# Define the model
def create_model(learn_rate=5e-5, dropout_prob=0.3):
    """Build and compile the BERT-based per-token classifier.

    The model takes three integer inputs of length ``max_len`` (module-level
    constant), runs them through a pretrained "bert-base-uncased" encoder,
    maps every position to a single value with a bias-free Dense layer and
    applies a sigmoid, giving one output in [0, 1] per input position.

    Args:
        learn_rate: learning rate for the Adam optimizer.
        dropout_prob: dropout probability used both inside the encoder
            (hidden and attention dropout) and in the Dropout layer of the
            classification head.

    Returns:
        A compiled ``keras.Model`` with binary cross-entropy loss and the
        metrics ``f1_m`` and ``'accuracy'`` (in that order).
    """
    ## BERT encoder
    # Only the two dropout rates are overridden; every other setting is the
    # BertConfig default.
    configuration = BertConfig(hidden_dropout_prob=dropout_prob, attention_probs_dropout_prob=dropout_prob)
    encoder = TFBertModel.from_pretrained("bert-base-uncased", config=configuration)
    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # First element of the encoder output; presumably the per-token hidden
    # states (see the transformers TFBertModel documentation).
    embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

    # One logit per position, then flatten the trailing size-1 axis.
    out = layers.Dense(1, name="logit", use_bias=False)(embedding)
    out = layers.Flatten()(out)
    # NOTE(review): dropout is applied to the logits right before the
    # sigmoid; kept as in the original design - confirm this is intended.
    out = layers.Dropout(dropout_prob)(out)
    out = layers.Activation(keras.activations.sigmoid)(out)
    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[out],
    )
    # loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # loss = keras.losses.CategoricalCrossentropy(from_logits=False)
    # The output already went through a sigmoid, hence from_logits=False.
    loss = keras.losses.BinaryCrossentropy(from_logits=False)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and not accepted by newer Keras optimizers.
    optimizer = keras.optimizers.Adam(learning_rate=learn_rate)
    model.compile(optimizer=optimizer, loss=[loss], metrics=[f1_m, 'accuracy'])

    return model


In [None]:
if use_tpu:
    # Resolve the TPU, connect to it and initialise it before creating the
    # distribution strategy; the order of these calls matters.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy. `strategy` is only defined (and
    # only used by the training cell) when use_tpu is True.
    strategy = tf.distribute.TPUStrategy(tpu)
    

# Start the training

In [None]:
# Early stopping on the validation loss with a patience of 3 epochs.
# NOTE(review): restore_best_weights is not set, so presumably the model keeps
# the weights from the last epoch that ran rather than the best one - confirm
# this is intended before relying on the post-training evaluation.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)


In [None]:
# Grid over learning rate and dropout probability (currently a single
# combination). A fresh model is built for every combination so runs do not
# share weights.
for lr in [5e-05]:
    for dropout_prob in [0.2]:
        print("Learning rate = " + str(lr) + ", dropout_prob = " + str(dropout_prob))
        if use_tpu:
            # On TPU the model must be created inside the strategy scope.
            with strategy.scope():
                model = create_model(lr, dropout_prob)
        else:
            model = create_model(lr, dropout_prob)
        # Train on the y2 targets, validating on the evaluation split; the
        # early-stopping callback may end training before 20 epochs.
        history = model.fit(
            x_train,
            y2_train,
            validation_data=(x_eval, y2_eval),
            epochs=20,
            verbose=1,
            batch_size=128,
            callbacks=[callback],
        )
        results = model.evaluate(x_eval, y2_eval, batch_size=128)
        # The model is compiled with metrics [f1_m, 'accuracy'], so `results`
        # holds three values: loss, f1 and accuracy. The label lists all three
        # (the original label named only loss and accuracy).
        print(f"eval loss, eval f1, eval acc: {results}, Learning rate =  {lr}, dropout_prob = {dropout_prob}")
        print('-'*50)

