**Packages**

In [None]:
import json
from tqdm.notebook import tqdm

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D, Dense, Dropout, Input, concatenate, Conv1D, Activation, Flatten

from nltk.corpus import stopwords
import re


**Constants Definition**

In [None]:
# data to load
# Questions are selected by position in TRAIN_PATH. The training call passes
# (0, NUM_OF_TRAIN_QUESTIONS) and the validation call passes
# (NUM_OF_TRAIN_QUESTIONS, NUM_OF_VAL_QUESTIONS) as (start, end) bounds, so
# NUM_OF_VAL_QUESTIONS is an end bound, not a number of questions.
NUM_OF_TRAIN_QUESTIONS = 1000
NUM_OF_VAL_QUESTIONS = 1050
# load_data keeps a non-answer candidate only when its index is a multiple of
# SAMPLE_RATE (the annotated answer candidate is always kept).
SAMPLE_RATE = 15
TRAIN_PATH = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'


# long answer model parameters
EPOCHS = 80
BATCH_SIZE = 16
# Also selects which GloVe file is read: glove.6B.<EMBED_SIZE>d.txt
EMBED_SIZE = 100
# Passed as `class_weight` to model.fit; keys are the 0/1 labels.
CLASS_WEIGHTS = {0: 0.5, 1: 5.}

# short answer model parameters
SHORT_EPOCHS = 80
SHORT_BATCH_SIZE = 16
# Also selects which GloVe file is read: glove.6B.<SHORT_EMBED_SIZE>d.txt
SHORT_EMBED_SIZE = 200

## **Reading The Data Set From Json files**

**reading data helper funcs**

In [None]:
def get_line_of_data(file):
    """Read the next line from an open text file and return it parsed as JSON."""
    return json.loads(file.readline())


def get_question_and_document(line):
    """Split one parsed record into its question, document tokens and first annotation.

    Returns:
        (question_text, document_text split on single spaces, annotations[0])
    """
    question_text = line['question_text']
    document_tokens = line['document_text'].split(' ')
    first_annotation = line['annotations'][0]
    return question_text, document_tokens, first_annotation
                
                
def get_long_candidate(i, annotations, candidate):
    """Describe candidate number ``i`` of a question.

    Returns:
        (label, long_start, long_end) where ``label`` is True when ``i`` equals
        the annotated long-answer candidate index, and the other two values are
        the candidate's start/end token positions in the document text.
    """
    is_annotated_answer = (i == annotations['long_answer']['candidate_index'])
    return is_annotated_answer, candidate['start_token'], candidate['end_token']

def form_data_row(question, label, text, long_start, long_end):
    """Build one dataset row: the question, the candidate text and its label.

    The candidate text is the slice ``text[long_start:long_end]`` of the
    document tokens, re-joined with single spaces.
    """
    candidate_tokens = text[long_start:long_end]
    return {
        'question': question,
        'long_answer': ' '.join(candidate_tokens),
        'is_long_answer': label,
    }


def load_data(file_path, questions_start, questions_end):
    """Build the long-answer classification frame from a slice of a JSON-lines file.

    Reads the questions stored on lines ``questions_start`` (inclusive) to
    ``questions_end`` (exclusive) of ``file_path`` and emits one row per kept
    long-answer candidate. The annotated answer candidate is always kept; any
    other candidate is kept only when its index is a multiple of
    ``SAMPLE_RATE``, which down-samples the non-answer candidates.

    Args:
        file_path: path to a JSON-lines file with one question per line.
        questions_start: zero-based index of the first line to read.
        questions_end: zero-based index one past the last line to read.

    Returns:
        pandas.DataFrame with the columns produced by ``form_data_row``.
    """
    from itertools import islice

    rows = []

    with open(file_path, encoding='utf-8') as file:
        # Actually skip the first `questions_start` lines. Reading from the top
        # of the file for every range made a (1000, 1050) request return the
        # same questions as (0, 50). islice also stops cleanly at end of file.
        selected_lines = islice(file, questions_start, questions_end)
        expected_count = max(questions_end - questions_start, 0)

        for raw_line in tqdm(selected_lines, total=expected_count):
            line = json.loads(raw_line)
            question, text, annotations = get_question_and_document(line)

            # A separate name for the candidate index avoids shadowing a loop
            # variable of the enclosing loop.
            for candidate_idx, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(candidate_idx, annotations, candidate)

                if label or candidate_idx % SAMPLE_RATE == 0:
                    rows.append(
                        form_data_row(question, label, text, long_start, long_end)
                    )

    return pd.DataFrame(rows)

In [None]:
# Validation split: bounds passed are (NUM_OF_TRAIN_QUESTIONS, NUM_OF_VAL_QUESTIONS).
val_df = load_data(TRAIN_PATH, NUM_OF_TRAIN_QUESTIONS, NUM_OF_VAL_QUESTIONS)

In [None]:
val_df.head(5)

In [None]:
# Training split: bounds passed are (0, NUM_OF_TRAIN_QUESTIONS).
train_df = load_data(TRAIN_PATH, 0, NUM_OF_TRAIN_QUESTIONS)

In [None]:
train_df.head(5)

## **DATA PRE-PROCESSING PART**

In [None]:
# Cache for stop-word sets so the NLTK corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(sentence, stop_words=None):
    """Drop stop words (e.g. "is", "been", "have") from a sentence.

    The sentence is split on whitespace, words found in ``stop_words`` are
    removed, and the remainder is re-joined with single spaces. Matching is
    case-sensitive, exactly as a plain membership test against the list.

    Args:
        sentence: text to filter.
        stop_words: optional collection of words to remove. Defaults to NLTK's
            English stop-word list, loaded once and cached as a set so the
            membership test no longer rebuilds the list for every word.

    Returns:
        The filtered sentence as a string.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    return ' '.join(word for word in sentence.split() if word not in stop_words)

def remove_html(sentence):
    """Strip every ``<...>`` span (shortest match, within one line) from the text."""
    return re.sub(r'<.*?>', r'', sentence)

def clean_df(df):
    """Clean the 'long_answer' and 'question' columns of ``df`` in place.

    Each column first has stop words removed and then has ``<...>`` spans
    stripped. The same (mutated) frame is returned for convenience.
    """
    for column in ('long_answer', 'question'):
        df[column] = df[column].apply(remove_stopwords)
        df[column] = df[column].apply(remove_html)

    return df

In [None]:
# Clean both splits the same way: the validation text must go through the same
# stop-word and HTML stripping as the training text, otherwise the model is
# evaluated on differently formatted input than it was trained on.
train_df = clean_df(train_df)
val_df = clean_df(val_df)

**Saving cleaned train data**

In [None]:
# Written to the working directory; the reload cell that follows reads a copy
# from a different location ('../input/cleaned-tens/').
train_df.to_csv('mycsvfile.csv',index=False)

**Reading cleaned train data**

In [None]:
# pandas is already imported in the first cell; this import is a repeat.
import pandas as pd
# Rebinds train_df to the previously saved cleaned data.
# NOTE(review): a value that was an empty string when saved is presumably read
# back as NaN by read_csv — confirm none of the text columns contain NaN.
train_df=pd.read_csv('../input/cleaned-tens/mycsvfile.csv')

In [None]:
train_df.head(10)

**Tokenizer**

In [None]:
def define_tokenizer(df_series):
    """Fit a Keras word-level tokenizer on several text series.

    Args:
        df_series: list of pandas Series of strings; they are concatenated and
            used together as the fitting corpus.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``.
    """
    corpus = pd.concat(df_series)

    tokenizer_options = dict(
        num_words=None,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=' ',
        char_level=False,
        oov_token=None,
        document_count=0,
    )
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)

    # Builds the vocabulary (word_index) from the corpus.
    fitted_tokenizer.fit_on_texts(corpus)

    return fitted_tokenizer

In [None]:
# One shared vocabulary fitted on the answers and questions of both splits.
tokenizer = define_tokenizer([
    train_df.long_answer, 
    train_df.question,
    val_df.long_answer, 
    val_df.question
])
# Spot-check a single vocabulary entry. .get() yields None instead of raising
# KeyError when the loaded sample happens not to contain the word.
tokenizer.word_index.get('tracy')

**Encoding with max length=300**

In [None]:
MAX_LEN = 300


def encode(sentences, tokenizer):
    """Convert sentences to fixed-width integer sequences.

    Each sentence is mapped to word ids by ``tokenizer`` and the result is
    padded at the end ('post') to ``MAX_LEN`` columns.

    NOTE(review): ``truncating`` is not passed, so sequences longer than
    MAX_LEN are cut on Keras' default side (documented as 'pre', i.e. the
    leading tokens are dropped) — confirm this is intended for callers that
    index token positions from the start of the text.
    """
    token_ids = tokenizer.texts_to_sequences(sentences)
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(token_ids, padding='post', maxlen=MAX_LEN)

In [None]:
# Encoded inputs for the long-answer model: one row of word ids per sentence.
train_long_answers = encode(train_df['long_answer'].values, tokenizer)
train_questions = encode(train_df['question'].values, tokenizer)

val_long_answers = encode(val_df['long_answer'].values, tokenizer)
val_questions = encode(val_df['question'].values, tokenizer)

 **Labels converted to 0-1 integers**

In [None]:
# Targets for the long-answer classifier: is_long_answer cast to 0/1 integers.
train_labels = train_df.is_long_answer.astype(int).values
val_labels = val_df.is_long_answer.astype(int).values

**Import Words Embedding Using Glove**

In [None]:
# Map each GloVe word to its pre-trained vector (EMBED_SIZE dimensions).
embedding_dict = {}

glove_path = '../input/glove-global-vectors-for-word-representation/glove.6B.' + str(EMBED_SIZE) + 'd.txt'

# Explicit UTF-8 keeps the read independent of the platform's default encoding.
# The `with` block closes the file, so no separate close() call is needed.
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors

**Generate Embedding Matrix for our words ...**

In [None]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer id is
# i. Words without a GloVe vector keep an all-zero row.
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, EMBED_SIZE))

# No bounds check is needed: num_words is len(word_index) + 1 and Keras assigns
# ids 1..len(word_index), so every id is a valid row. (The former
# `if i > num_words` guard could never be true.)
for word, i in tokenizer.word_index.items():
    emb_vec = embedding_dict.get(word)

    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

## **LONG MODEL PART**

**Embedding layer**

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
# The same layer object is applied to both the question and the answer input.
embedding = tf.keras.layers.Embedding(
    len(tokenizer.word_index) + 1,
    EMBED_SIZE,
    embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
    trainable = False
)

**Architecture**

In [None]:
# Two-branch classifier: question and candidate answer are encoded separately
# (shared embedding -> spatial dropout -> bidirectional LSTM -> max over time),
# concatenated, and passed through two dense layers to a single sigmoid output.

# question encoding
question_input = Input(shape=(None,))
question_x = embedding(question_input)
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)
question_x = GlobalMaxPooling1D()(question_x)

# answer encoding
answer_input = Input(shape=(None,))
answer_x = embedding(answer_input)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)
answer_x = GlobalMaxPooling1D()(answer_x)

# classification
combined_x = concatenate([question_x, answer_x])
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
output = Dense(1, activation='sigmoid')(combined_x)

# combine model parts into one
# Input order is [answer, question]; every fit/predict call must pass its
# arrays in that same order.
model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=output)

**Compile**

In [None]:
# Binary classification set-up. The plotting and F1 cells read these metrics
# from the training history under the keys 'binary_accuracy', 'recall' and
# 'precision' (plus their 'val_' counterparts).
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam',
    metrics=['BinaryAccuracy' ,'Recall', 'Precision' ]
)

**Callbacks**

In [None]:
# Learning-rate reduction after 2 stagnant epochs, early stop after 5.
# NOTE(review): both callbacks monitor 'loss' (the training loss) although
# validation data is supplied to fit — confirm 'val_loss' is not what is wanted.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),
]

**Train The Model**

In [None]:
# Train the long-answer classifier. Inputs are passed as [answers, questions],
# matching the input order the model was built with. The returned History is
# read by the plotting and F1 cells.
history = model.fit(
    x = [train_long_answers, train_questions], 
    y = train_labels,
    validation_data = (
        [val_long_answers, val_questions], 
        val_labels
    ),
    epochs = EPOCHS,
    callbacks = callbacks,
    class_weight = CLASS_WEIGHTS,
    batch_size = BATCH_SIZE,
    shuffle = True
)

**Save Model**

In [None]:
# NOTE(review): load_model is imported here but not called in this cell, and it
# comes from the standalone `keras` package rather than `tf.keras` — confirm it
# is needed.
from keras.models import load_model
# Saved under './long_model_full'; the load cell that follows reads from
# '../input/long-model-qa/long_model', i.e. a different location.
model.save('./long_model_full')

**Import our trained Model from Inputs**

In [None]:
# `keras` is also used further down to load the short-answer model.
from tensorflow import keras
# Rebinds `model`: from here on it is the model restored from disk, not the
# one trained in this session.
model = keras.models.load_model('../input/long-model-qa/long_model')

**Model Summary**

In [None]:
model.summary()

**Loss & Accuracy Visualization**

In [None]:
# A model restored with load_model has not been fitted in this session, so its
# `history` attribute may be missing or empty. Adopt it only when it really
# holds per-epoch logs; otherwise keep the History returned by model.fit, which
# is what the plots and F1 computation below need.
model_history = getattr(model, 'history', None)
if model_history is not None and getattr(model_history, 'history', None):
    history = model_history

In [None]:
# Plot per-epoch accuracy and loss for training vs. validation, in two figures.
# The keys read here must exist in history.history; they presumably correspond
# to the metrics passed to model.compile — verify after changing the metrics.
accuracy =history.history['binary_accuracy']
val_accuracy = history.history['val_binary_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(accuracy))
plt.plot(epochs, accuracy,  label='Training accuracy')
plt.plot(epochs, val_accuracy,label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
# Start a second figure so the loss curves are not drawn over the accuracy curves.
plt.figure()
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss,label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Number of epochs actually run (one loss entry is recorded per epoch).
print('Num of Epochs: {0}'.format(
    len(history.history['loss'])
))

**F1 Score**

In [None]:
def f1_from_history(history_log, prefix=''):
    """Compute F1 from the last-epoch precision and recall of a history dict.

    Args:
        history_log: dict of per-epoch metric lists (``history.history``).
        prefix: '' for the training metrics, 'val_' for the validation metrics.

    Returns:
        2 * P * R / (P + R), or 0.0 when both precision and recall are zero
        (the plain formula would raise ZeroDivisionError in that case).
    """
    recall = history_log[prefix + 'recall'][-1]
    precision = history_log[prefix + 'precision'][-1]

    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)


print('Train F1 score: {0:.4f}'.format(
    f1_from_history(history.history)
))

print('Validation F1 score: {0:.4f}'.format(
    f1_from_history(history.history, 'val_')
))

## **SHORT MODEL PART**

**Extract the short answer from a long answer.**

In [None]:
def get_short_answer(annotations, long_start, long_end):
    """Locate the first annotated short answer relative to its long answer.

    Returns:
        (short_start, short_end): the first short answer's token positions
        with ``long_start`` subtracted, or (0, 0) when the annotation has no
        short answers. ``long_end`` is accepted but not used.
    """
    if len(annotations['short_answers']) == 0:
        return 0, 0

    first_short = annotations['short_answers'][0]
    relative_start = first_short['start_token'] - long_start
    relative_end = first_short['end_token'] - long_start

    return relative_start, relative_end
    

def form_short_data_row(question, text, long_start, long_end, short_start, short_end):
    """Build one short-answer dataset row.

    The long answer is ``text[long_start:long_end]`` joined with spaces; the
    short answer is the ``[short_start:short_end]`` slice of the long answer's
    space-separated tokens, joined the same way. The two short offsets are
    stored in the row as well.
    """
    long_answer = ' '.join(text[long_start:long_end])
    short_tokens = long_answer.split(' ')[short_start:short_end]

    return {
        'question': question,
        'long_answer': long_answer,
        'short_answer': ' '.join(short_tokens),
        'short_start': short_start,
        'short_end': short_end,
    }


def load_short_data(file_path, questions_start, questions_end):
    """Build the short-answer frame from a slice of a JSON-lines file.

    Reads the questions stored on lines ``questions_start`` (inclusive) to
    ``questions_end`` (exclusive) of ``file_path``. For each question, only the
    candidate marked as the annotated long answer produces a row, holding the
    long answer text and the short-answer offsets relative to it.

    Args:
        file_path: path to a JSON-lines file with one question per line.
        questions_start: zero-based index of the first line to read.
        questions_end: zero-based index one past the last line to read.

    Returns:
        pandas.DataFrame with the columns produced by ``form_short_data_row``.
    """
    from itertools import islice

    rows = []

    with open(file_path, encoding='utf-8') as file:
        # Actually skip the first `questions_start` lines; reading from the top
        # for every range made the validation range return training questions.
        # islice also stops cleanly if the file has fewer lines than requested.
        selected_lines = islice(file, questions_start, questions_end)
        expected_count = max(questions_end - questions_start, 0)

        for raw_line in tqdm(selected_lines, total=expected_count):
            line = json.loads(raw_line)
            question, text, annotations = get_question_and_document(line)

            for candidate_idx, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(candidate_idx, annotations, candidate)

                if label:
                    short_start, short_end = get_short_answer(annotations, long_start, long_end)

                    rows.append(
                        form_short_data_row(question, text, long_start, long_end, short_start, short_end)
                    )

    return pd.DataFrame(rows)

In [None]:
# Short-answer splits, requested with the same line bounds as the long-answer splits.
train_short_df = load_short_data(TRAIN_PATH, 0, NUM_OF_TRAIN_QUESTIONS)
val_short_df = load_short_data(TRAIN_PATH, NUM_OF_TRAIN_QUESTIONS, NUM_OF_VAL_QUESTIONS)

In [None]:
train_short_df.head()

In [None]:
# Encoded inputs for the short-answer model.
# NOTE: these four names were previously bound to the long-answer model's
# inputs; from here on they hold the short-answer data instead.
train_long_answers = encode(train_short_df['long_answer'].values, tokenizer)
train_questions = encode(train_short_df['question'].values, tokenizer)

val_long_answers = encode(val_short_df['long_answer'].values, tokenizer)
val_questions = encode(val_short_df['question'].values, tokenizer)

In [None]:
def form_short_labels(df, sentence_length):
    """Build one-hot start/end position targets for the short-answer model.

    Args:
        df: frame with integer columns 'short_start' and 'short_end'.
        sentence_length: width of the label arrays (number of token positions).

    Returns:
        (start_labels, end_labels): two float arrays of shape
        (len(df), sentence_length). Row r has a 1 at its start / end position
        when both positions lie inside [0, sentence_length); otherwise the row
        is left all zeros.
    """
    n_rows = len(df)
    start_labels = np.zeros((n_rows, sentence_length))
    end_labels = np.zeros((n_rows, sentence_length))

    if n_rows == 0:
        # An empty frame may have no columns at all; nothing to label.
        return start_labels, end_labels

    # Iterate by position so the result does not depend on df's index labels.
    spans = zip(df['short_start'].tolist(), df['short_end'].tolist())

    for row, (start, end) in enumerate(spans):
        # Bound by sentence_length (not a fixed 300) and reject negatives,
        # which would otherwise silently index from the end of the row.
        if 0 <= start < sentence_length and 0 <= end < sentence_length:
            start_labels[row, start] = 1
            end_labels[row, end] = 1

    return start_labels, end_labels


# Label width is MAX_LEN, the same width encode() pads the inputs to.
train_start_labels, train_end_labels = form_short_labels(train_short_df, MAX_LEN)
val_start_labels, val_end_labels = form_short_labels(val_short_df, MAX_LEN)

In [None]:
# load from file
# Map each GloVe word to its pre-trained vector (SHORT_EMBED_SIZE dimensions).
# This rebinds embedding_dict, embedding_matrix and embedding for the
# short-answer model.
embedding_dict = {}

glove_path = '../input/glove-global-vectors-for-word-representation/glove.6B.' + str(SHORT_EMBED_SIZE) + 'd.txt'

# Explicit UTF-8 keeps the read independent of the platform's default encoding.
# The `with` block closes the file, so no separate close() call is needed.
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors

# write to matrix
# Row i holds the vector of the word with tokenizer id i; words without a GloVe
# vector keep an all-zero row.
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, SHORT_EMBED_SIZE))

# No bounds check is needed: Keras assigns ids 1..len(word_index), so every id
# is a valid row. (The former `if i > num_words` guard could never be true.)
for word, i in tokenizer.word_index.items():
    emb_vec = embedding_dict.get(word)

    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

# load as tensorflow embedding
embedding = tf.keras.layers.Embedding(
    len(tokenizer.word_index) + 1,
    SHORT_EMBED_SIZE,
    embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
    trainable = False
)

**Architecture**

In [None]:
# Span model: question and answer are each encoded by two stacked bidirectional
# LSTMs that keep one vector per token (return_sequences=True). The two
# sequences are concatenated and two identical heads each produce a softmax
# over token positions, one for the start and one for the end of the answer.

# encode question
question_input = Input(shape=(None,))
question_x = embedding(question_input)
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(200, return_sequences=True))(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)

# encode answer
answer_input = Input(shape=(None,))
answer_x = embedding(answer_input)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(250, return_sequences=True))(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)

# merge the encodings
# NOTE(review): no axis is given, so this presumably joins along the feature
# axis and requires both inputs to have the same number of tokens — confirm.
combined_x = concatenate([question_x, answer_x])

# predict start index
start_x = Dropout(0.1)(combined_x) 
# Conv1D with 1 filter and kernel size 1: one score per token position.
start_x = Conv1D(1,1)(start_x)
start_x = Flatten()(start_x)
start_x = Activation('softmax', name='start_token_out')(start_x)

# predict end index
end_x = Dropout(0.1)(combined_x) 
end_x = Conv1D(1,1)(end_x)
end_x = Flatten()(end_x)
end_x = Activation('softmax', name='end_token_out')(end_x)

# merge the parts into one model
# Input order is [answer, question]; outputs are [start, end].
short_model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=[start_x, end_x])

**Compile**

In [None]:
# Both outputs are softmax distributions over token positions and are trained
# against the one-hot start/end label arrays with categorical cross-entropy.
short_model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam',
    metrics=['categorical_accuracy', 'Recall', 'Precision']
)

**Callbacks**

In [None]:
# Rebinds `callbacks` for the short-answer model: learning-rate reduction after
# 3 stagnant epochs, early stop after 5.
# NOTE(review): both callbacks monitor 'loss' (the training loss) although
# validation data is supplied to fit — confirm 'val_loss' is not what is wanted.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=3, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),
]

**Train The Model**


In [None]:
# Train the span model. Inputs are [answers, questions] and targets are
# [start labels, end labels], matching the model's input and output order.
# Rebinds `history`, which previously held the long-answer training log.
history = short_model.fit(
    x = [train_long_answers, train_questions], 
    y = [train_start_labels, train_end_labels],
    validation_data = (
        [val_long_answers, val_questions], 
        [val_start_labels, val_end_labels]
    ),
    epochs = SHORT_EPOCHS,
    callbacks = callbacks,
    batch_size = SHORT_BATCH_SIZE,
    shuffle = True
)

**Save Model**


**Import our trained Model from Inputs**


In [None]:
# Restore a previously saved short-answer model from an HDF5 file.
# NOTE(review): the result is bound to `short_loaded_model`, while the summary
# and prediction cells in this notebook use `short_model` — confirm which of
# the two is meant to be used for inference.
short_loaded_model = keras.models.load_model('../input/short-model-qa/short_model.h5')

**Model Summary**

In [None]:
short_model.summary()

In [None]:
# Number of epochs run and the training loss of the last one.
print('Epoch: {0}'.format(len(history.history['loss'])))
print('Loss: {0}'.format(history.history['loss'][-1]))

In [None]:
# Plot per-epoch training vs. validation loss for the short-answer model.
# Rebinds `loss`, `val_loss` and `epochs` from the long-answer plotting cell.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(loss))
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss,label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
# Number of epochs actually run (one loss entry is recorded per epoch).
print('Num of Epochs: {0}'.format(
    len(history.history['loss'])
))

## **TEST DATA PART**

**Load Test data**

In [None]:
def get_line_of_data_test(file):
    """Read the next line from an open text file and return it parsed as JSON."""
    raw_line = file.readline()
    return json.loads(raw_line)


def get_question_and_document_test(line):
    """Split one parsed test record into question, document tokens and example id.

    Returns:
        (question_text, document_text split on single spaces, example_id)
    """
    record_id = line['example_id']
    question_text = line['question_text']
    document_tokens = line['document_text'].split(' ')
    return question_text, document_tokens, record_id

def get_long_candidate_test(i,candidate):
    """Return the (start_token, end_token) pair of a long-answer candidate.

    ``i`` (the candidate's index) is accepted but not used.
    """
    return candidate['start_token'], candidate['end_token']


def form_data_row_test(question, text, long_start, long_end, example_id):
    """Build one test-set row for a long-answer candidate.

    Holds the question, the candidate text (``text[long_start:long_end]``
    joined with spaces), the candidate's token bounds and the example id.
    """
    candidate_text = ' '.join(text[long_start:long_end])
    return {
        'question': question,
        'long_answer': candidate_text,
        'long_start': long_start,
        'long_end': long_end,
        'example_id': example_id,
    }

def load_test_data(file_path, questions_start, questions_end):
    """Load every long-answer candidate of a slice of the test file.

    Reads the questions stored on lines ``questions_start`` (inclusive) to
    ``questions_end`` (exclusive) of ``file_path`` and emits one row per
    candidate, in file order.

    Args:
        file_path: path to a JSON-lines file with one question per line.
        questions_start: zero-based index of the first line to read.
        questions_end: zero-based index one past the last line to read.

    Returns:
        (rows_df, candidates): a pandas.DataFrame with the columns produced by
        ``form_data_row_test`` and a list holding, for each question read, the
        number of candidate rows it contributed (0 for a question without
        candidates).
    """
    from itertools import islice

    rows = []
    candidates = []

    with open(file_path, encoding='utf-8') as file:
        # Skip to `questions_start`; islice also stops cleanly at end of file,
        # so asking for more lines than the file has no longer crashes.
        selected_lines = islice(file, questions_start, questions_end)
        expected_count = max(questions_end - questions_start, 0)

        for raw_line in tqdm(selected_lines, total=expected_count):
            line = json.loads(raw_line)
            question, text, example_id = get_question_and_document_test(line)
            long_candidates = line['long_answer_candidates']

            for candidate_idx, candidate in enumerate(long_candidates):
                long_start, long_end = get_long_candidate_test(candidate_idx, candidate)
                rows.append(
                    form_data_row_test(question, text, long_start, long_end, example_id)
                )

            # Count from the list itself; relying on the loop variable left
            # over after the loop gives a wrong count when the list is empty.
            candidates.append(len(long_candidates))

    return pd.DataFrame(rows), candidates


In [None]:
Test_PATH = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'

# test_df has one row per long-answer candidate; test_candidates has one entry
# per question.
# NOTE(review): 346 is presumably the number of questions in the test file —
# confirm, or derive it from the file instead of hard-coding it.
test_df,test_candidates = load_test_data(Test_PATH, 0, 346)

In [None]:
test_df.head(10)

In [None]:
def test_question(question, long_answer):
    """Score one (question, candidate answer) pair with the long-answer model.

    Both texts are encoded together, each is given a leading batch axis, and
    they are passed to ``model.predict`` in [answer, question] order.

    Returns:
        The raw output of ``model.predict`` for this single pair.
    """
    encoded_question, encoded_answer = encode([question, long_answer], tokenizer)

    answer_batch = np.expand_dims(encoded_answer, axis=0)
    question_batch = np.expand_dims(encoded_question, axis=0)

    return model.predict([answer_batch, question_batch])

**Predict Long Answers**

In [None]:
# Walk test_df (one row per candidate, grouped by question) and, for every
# question, pick the candidate with the highest predicted probability.

j = 0                                   # index of the current question in test_candidates
curr_test_cand = test_candidates[j]     # candidates of the current question still to score
prob_predictions = []                   # predictions collected for the current question

# long_PredictionIdx[0] is a running offset: the test_df row at which the
# current question starts. Entries 1.. are the test_df rows of the chosen
# candidates, one per question; the short-answer cell reads them.
long_PredictionIdx = []
long_PredictionIdx.append(0)

# Final answers, written to the submission file.
long_prediction_ids = []
long_prediction_strings = []

for i in range(test_df.shape[0]):

    curr_pred = test_question(test_df.question.iloc[i], test_df.long_answer.iloc[i])

    prob_predictions.append(curr_pred)
    curr_test_cand -= 1

    # The candidate just scored was the last one of the current question.
    if curr_test_cand == 0:

        # Best candidate within the question, then its absolute row in test_df.
        # (argmax over the collected list assumes each prediction holds a
        # single probability, so the flat index equals the candidate number.)
        long_pred = np.amax(prob_predictions)
        long_pred_idx = np.argmax(prob_predictions)
        offset = long_PredictionIdx[0]
        idx = offset + long_pred_idx

        # Answer with "start:end" only when the model is at least 50% sure;
        # otherwise leave the prediction blank.
        if long_pred >= 0.5:
            long_Prediction_string = str(test_df.long_start.iloc[idx]) + ':' + str(test_df.long_end.iloc[idx])
        else:
            long_Prediction_string = ""
        long_prediction_strings.append(long_Prediction_string)

        # No trailing comma in the id: the submission is written with
        # csv.writer, which adds the separators itself; a comma inside the
        # field would end up quoted as part of the example_id.
        long_prediction_id = str(test_df.example_id.iloc[idx]) + '_long'
        long_prediction_ids.append(long_prediction_id)
        long_PredictionIdx.append(idx)

        # Move on to the next question.
        j = j + 1
        long_PredictionIdx[0] = i + 1
        if j < len(test_candidates):
            curr_test_cand = test_candidates[j]
            prob_predictions = []
            
#print(long_prediction_ids)
#print(long_PredictionIdx)

**Predict short Answer**

In [None]:
def test_short_answer(question, long_answer):
    """Run the short-answer model on one (question, long answer) pair.

    Both texts are encoded together, each is given a leading batch axis, and
    they are passed to ``short_model.predict`` in [answer, question] order.

    Returns:
        (pred_start, pred_end): the largest value in the model's first output
        and the largest value in its second output for this pair.
    """
    encoded_answer, encoded_question = encode([long_answer, question], tokenizer)

    answer_batch = np.expand_dims(encoded_answer, axis=0)
    question_batch = np.expand_dims(encoded_question, axis=0)

    # Stack the two model outputs so they can be indexed as [output, sample].
    stacked_outputs = np.array(short_model.predict([answer_batch, question_batch]))

    return np.amax(stacked_outputs[0, 0]), np.amax(stacked_outputs[1, 0])


In [None]:
# For every question, run the short-answer model on the long-answer candidate
# chosen by the long-answer prediction loop.
short_example_ids = []
short_prediction_strings = []
predictions = []    # (start_prob, end_prob) per question, kept for inspection

# long_PredictionIdx[0] is the running row offset of the long-answer loop, not
# a chosen candidate, so only entries 1.. are used.
for idx in long_PredictionIdx[1:]:

    start_prob, end_prob = test_short_answer(test_df.question.iloc[idx], test_df.long_answer.iloc[idx])
    predictions.append((start_prob, end_prob))

    # "YES" only when both the start and the end probability reach 0.5.
    if start_prob >= 0.5 and end_prob >= 0.5:
        short_prediction_string = "YES"
    else:
        short_prediction_string = "NO"
    short_prediction_strings.append(short_prediction_string)

    # No trailing comma in the id: the submission is written with csv.writer,
    # which adds the separators itself; a comma inside the field would end up
    # quoted as part of the example_id.
    short_example_id = str(test_df.example_id.iloc[idx]) + '_short'
    short_example_ids.append(short_example_id)
    
    #print(i,short_example_id,short_prediction_string)    

**Creating Submission File**

In [None]:
import csv

header = ['example_id', 'PredictionString']

# Fail before touching the file if the four result lists are out of step,
# instead of stopping half-way through writing.
result_lengths = {
    len(long_prediction_ids),
    len(long_prediction_strings),
    len(short_example_ids),
    len(short_prediction_strings),
}
if len(result_lengths) != 1:
    raise ValueError('long and short prediction lists must have the same length')

# newline='' lets the csv module control line endings (the csv documentation
# asks for it; without it blank rows can appear on some platforms).
with open("submission.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

    submission_rows = zip(
        long_prediction_ids, long_prediction_strings,
        short_example_ids, short_prediction_strings,
    )

    # One "_long" row followed by one "_short" row per question.
    for long_id, long_string, short_id, short_string in submission_rows:
        # Strip any trailing comma left in an id: csv.writer inserts the field
        # separator itself, and a comma inside the field would be quoted into
        # the example_id column.
        writer.writerow([long_id.rstrip(','), long_string])
        writer.writerow([short_id.rstrip(','), short_string])