In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub

import pandas as pd

from transformers import AutoTokenizer, AutoConfig

import numpy as np

In [None]:
# TF-Hub handle that hub.KerasLayer loads the BERT encoder from.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4'
# CSV files of the chaii question-answering dataset, read with pd.read_csv.
trainFile = '../input/chaii-hindi-and-tamil-question-answering/train.csv'
testFile = '../input/chaii-hindi-and-tamil-question-answering/test.csv'

In [None]:
# Load the encoder as a Keras layer; trainable=True lets its weights be updated during fit.
bert_layer = hub.KerasLayer(tfhub_handle_encoder,trainable=True)
# Tokenizer and config come from the Hugging Face checkpoint of the same name.
# NOTE(review): assumes this checkpoint's vocabulary matches the TF-Hub encoder — confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
config = AutoConfig.from_pretrained('bert-base-multilingual-cased')

# Write local copies of both into one directory (presumably for offline reuse — confirm).
tokenizer.save_pretrained('bert-base-multilingual-cased-tokenizer')
config.save_pretrained('bert-base-multilingual-cased-tokenizer')

In [None]:
# Read the train and test splits into DataFrames.
traindf = pd.read_csv(trainFile)
testdf = pd.read_csv(testFile)
# Show (rows, columns) of the training frame as a quick sanity check.
print(traindf.shape)

In [None]:
# Path the fine-tuned model is saved to by train_model and loaded from by test_model.
model_name = 'bert-chaii'
# Sequence length of the model's Input layers in buildModel; tokenized windows
# fed to the model must have this same length.
max_seq_length = 384

In [None]:
# Preview the first rows of the training data.
traindf.head()

In [None]:
# Preview the first rows of the test data.
testdf.head()

In [None]:
def context_offsets_index(offsets):
    """Return the index just past the leading segment of one window's offsets.

    ``offsets`` is a sequence of ``(char_start, char_end)`` pairs for one
    tokenized window. The scan walks forward while start offsets keep
    non-decreasing and stops at the first position that no longer belongs to
    the leading (question) segment. Callers slice ``offsets[result:]`` and
    count answer token indices relative to that slice.

    The scan stops when either
      * the start offset drops below the previous one, or
      * a zero-width ``(0, 0)`` span appears after position 0.

    The second rule covers questions that tokenize to zero or one token: every
    start offset before the separator is then 0, so the "drop" test alone never
    fires and the scan would run on into the context.
    NOTE(review): this relies on the tokenizer reporting special tokens as
    ``(0, 0)``, as Hugging Face fast tokenizers do — confirm for other tokenizers.

    Args:
        offsets: iterable of 2-item ``(char_start, char_end)`` pairs.

    Returns:
        int: index of the last position of the leading segment, plus one
        (``1`` for empty input).
    """
    last_start = 0
    idx = 0
    for x, (ostart, oend) in enumerate(offsets):
        # Position 0 is exempt: the window's first token is itself (0, 0).
        is_separator = x > 0 and ostart == 0 and oend == 0
        if is_separator or ostart < last_start:
            break
        idx = x
        last_start = ostart
    return idx + 1

In [None]:
def make_samples(question, context, start_char_idx=None, answer_text=None, verbose=False):
    """Tokenize a question/context pair into training windows labelled with the answer span.

    The pair is encoded with overflow (stride 128) into windows of
    ``max_seq_length`` tokens. For each window, the tokens whose character
    span overlaps the answer ``[start_char_idx, start_char_idx + len(answer_text))``
    are located; windows containing none of them are dropped.

    Note that ``start_token_idx`` / ``end_token_idx`` are counted from the
    position returned by ``context_offsets_index`` for that window, not from
    the start of the window; ``test_model`` decodes predictions the same way.

    Args:
        question: question text.
        context: passage text the answer is taken from.
        start_char_idx: character index in ``context`` where the answer starts.
        answer_text: the answer string; only its length is used for labelling.
        verbose: when True, print the question, the answer text and the
            decoded token span for every produced window (alignment debugging).

    Returns:
        list[dict]: one dict per window overlapping the answer, with keys
        ``input_word_ids``, ``input_type_ids``, ``input_mask``,
        ``start_token_idx`` and ``end_token_idx``. An empty list is returned
        when no answer is supplied or the span does not fit inside ``context``.
    """
    # Without a labelled answer there is nothing to build targets from.
    if start_char_idx is None or answer_text is None:
        return []

    # end_char_idx is exclusive, so an answer finishing exactly at the end of
    # the context (end_char_idx == len(context)) is still valid.
    end_char_idx = start_char_idx + len(answer_text)
    if start_char_idx < 0 or end_char_idx > len(context):
        return []

    # Tokenize only after validation so rejected rows cost nothing.
    # max_length follows max_seq_length so windows match the model's Input shape.
    encoding = tokenizer.encode_plus(question, context, return_tensors='np',
                                     max_length=max_seq_length, stride=128,
                                     return_overflowing_tokens=True,
                                     padding="max_length", truncation=True,
                                     return_offsets_mapping=True)

    windows = zip(encoding.input_ids, encoding.token_type_ids,
                  encoding.attention_mask, encoding.offset_mapping)

    samples = []
    for window_ids, window_type_ids, window_mask, window_offsets in windows:
        context_index = context_offsets_index(window_offsets)

        # A token belongs to the answer when its character span intersects the
        # answer span. Zero-width spans (special/padding tokens) never match.
        ans_token_idx = [
            idx
            for idx, (tok_start, tok_end) in enumerate(window_offsets[context_index:])
            if max(tok_start, start_char_idx) < min(tok_end, end_char_idx)
        ]
        if not ans_token_idx:
            continue

        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        if verbose:
            print('Question=', question)
            print('answer text=', answer_text)
            print('string=', tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(window_ids[context_index:][start_token_idx:end_token_idx+1])))

        samples.append({'input_word_ids': window_ids, 'input_type_ids': window_type_ids,
                        'input_mask': window_mask, 'start_token_idx': start_token_idx,
                        'end_token_idx': end_token_idx})

    return samples

In [None]:
def chaii_train_data(df):
    """Build the flat list of training samples for every row of ``df``.

    Each row's ``question``, ``context``, ``answer_start`` and ``answer_text``
    are passed to ``make_samples``; the per-window samples it returns are
    concatenated in row order.

    Args:
        df: object with ``iterrows()`` yielding ``(index, row)`` pairs, where
            ``row`` supports lookup of the four columns above.

    Returns:
        list: all samples produced for all rows (possibly empty).
    """
    train_data_samples = []
    for _, row in df.iterrows():
        samples = make_samples(row['question'], row['context'],
                               row['answer_start'], row['answer_text'])
        # make_samples can yield nothing (None or an empty list) for a row
        # whose answer span is rejected; skip it instead of iterating None.
        if samples:
            train_data_samples.extend(samples)

    return train_data_samples

In [None]:
def create_bert_inputs(samples):
    """Stack per-window sample dicts into the arrays handed to ``model.fit``.

    Args:
        samples: iterable of dicts, each holding ``input_word_ids``,
            ``input_type_ids``, ``input_mask``, ``start_token_idx`` and
            ``end_token_idx``.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the list
        ``[input_word_ids, input_mask, input_type_ids]`` and ``y`` is the list
        ``[start_token_idx, end_token_idx]``; every entry is a NumPy array
        with one element per sample.
    """
    feature_keys = ("input_word_ids", "input_mask", "input_type_ids")
    label_keys = ("start_token_idx", "end_token_idx")

    # Materialise once so the samples can be read once per key.
    records = list(samples)
    stacked = {
        key: np.array([record[key] for record in records])
        for key in feature_keys + label_keys
    }

    features = [stacked[key] for key in feature_keys]
    labels = [stacked[key] for key in label_keys]
    return features, labels

In [None]:
def buildModel():
    """Build the span-prediction model on top of the shared ``bert_layer``.

    Three int32 inputs of length ``max_seq_length`` (``input_word_ids``,
    ``input_mask``, ``input_type_ids``) are fed to ``bert_layer``. Its
    ``sequence_output`` goes through two bias-free ``Dense(1)`` heads, one for
    the start position and one for the end position. Each head is flattened
    and passed through a softmax over the sequence positions.

    Returns:
        tf.keras.Model: inputs ``[input_word_ids, input_mask, input_type_ids]``,
        outputs ``[start_probs, end_probs]``.
    """
    def _token_input(name):
        # All three inputs share the same shape and dtype; only the name differs.
        return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=name)

    input_word_ids = _token_input('input_word_ids')
    input_mask = _token_input('input_mask')
    input_type_ids = _token_input('input_type_ids')

    bert_inputs = {'input_word_ids': input_word_ids, 'input_mask': input_mask, 'input_type_ids': input_type_ids}
    bert_outputs = bert_layer(bert_inputs)

    # Only the per-token output is needed for span prediction.
    sequence_output = bert_outputs["sequence_output"]  # [batch_size, seq_length, 768].

    # One scalar logit per token, flattened to one row of logits per example.
    start_logits = tf.keras.layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
    start_logits = tf.keras.layers.Flatten()(start_logits)
    end_logits = tf.keras.layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
    end_logits = tf.keras.layers.Flatten()(end_logits)

    # The model emits probabilities, so the loss must use from_logits=False.
    start_probs = tf.keras.layers.Activation(tf.keras.activations.softmax)(start_logits)
    end_probs = tf.keras.layers.Activation(tf.keras.activations.softmax)(end_logits)

    return tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids],
                          outputs=[start_probs, end_probs])

In [None]:
def train_model(epochs=17, batch_size=8, learning_rate=1e-5):
    """Fine-tune the span-prediction model on ``traindf`` and save it to ``model_name``.

    Builds the training windows with ``chaii_train_data``, converts them with
    ``create_bert_inputs``, compiles the model from ``buildModel`` with one
    sparse categorical cross-entropy loss per output (start and end), fits it
    and saves the result.

    Args:
        epochs: number of passes over the training windows.
        batch_size: number of windows per gradient step.
        learning_rate: Adam learning rate.
    """
    train_samples = chaii_train_data(traindf)
    x, y = create_bert_inputs(train_samples)

    print(f"{len(train_samples)} training points created.")

    model = buildModel()

    # The model outputs softmax probabilities, hence from_logits=False.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # Use `learning_rate`: `lr` is a deprecated alias that newer Keras releases reject.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9,
                                         beta_2=0.98, epsilon=1e-9)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    model.summary()

    model.fit(x, y, epochs=epochs, batch_size=batch_size)
    model.save(model_name)

In [None]:
def test_model():
    """Load the saved model and print one predicted answer per row of ``testdf``.

    Each question/context pair is tokenized into overlapping windows exactly
    as in training. For every window the most probable start and end
    positions are taken; they are interpreted relative to the position
    returned by ``context_offsets_index`` (the same convention the training
    labels use) and mapped back to a character span of the context. Across
    the windows of one question, the candidate with the highest
    ``start probability * end probability`` is kept. If no window yields a
    usable span the answer is reported as ``"unknown"``.
    """
    model = tf.keras.models.load_model(model_name)
    for _, row in testdf.iterrows():
        example_id = row['id']
        question = row['question']
        context = row['context']
        # max_length follows max_seq_length so windows match the model's Input shape.
        encoding = tokenizer.encode_plus(question, context, return_tensors='np',
                                         max_length=max_seq_length, stride=128,
                                         return_overflowing_tokens=True,
                                         padding="max_length", truncation=True,
                                         return_offsets_mapping=True)

        offsets = encoding.offset_mapping

        # Same input order as the model was built with: ids, mask, type ids.
        x = [encoding.input_ids, encoding.attention_mask, encoding.token_type_ids]

        pred_start, pred_end = model.predict(x)

        best_score = 0.0
        answer = "unknown"
        for window_idx, (start_probs, end_probs) in enumerate(zip(pred_start, pred_end)):
            start_prob = float(start_probs.max())
            end_prob = float(end_probs.max())
            print('start probability=', start_prob, ' , end probability=', end_prob)
            start = int(np.argmax(start_probs))
            end = int(np.argmax(end_probs))

            context_index = context_offsets_index(offsets[window_idx])
            sample_offsets = offsets[window_idx][context_index:]

            # start == end is a valid single-token answer; only reversed spans are rejected.
            if start > end:
                continue
            if end >= len(sample_offsets):
                continue

            char_start = sample_offsets[start][0]
            char_end = sample_offsets[end][1]
            # Special/padding tokens map to (0, 0) and would give an empty or reversed span.
            if char_end <= char_start:
                continue

            score = start_prob * end_prob
            if score > best_score:
                best_score = score
                answer = context[char_start:char_end]

        # Exactly one line per question, including the "unknown" fallback.
        print("id=", example_id, ", Q=", question, ", A=", answer)

In [None]:
# Fine-tune and save the model, then reload it and print predictions for the test set.
train_model()
test_model()