In [1]:
import json
import pandas as pd

In [25]:
def load_json(dataset_path="SQUAD MATERIAL/test_set.json"):
        '''Load testdata from json file'''    
        with open(dataset_path) as f:
            raw_json = json.load(f)

        return raw_json['data']

In [15]:
jsn = load_json("SQUAD MATERIAL/training_set.json")

In [52]:
with open("SQUAD MATERIAL/test_set.json", 'w') as f:
    d = {'data': jsn[0:4]}
    raw_json = json.dump(d, f)


In [8]:
def find_word_index(char_idx, context):
    return context[0:char_idx].count(' ')

In [26]:
def create_dataframe(data):
    contexts = []
    questions = []
    answers_text = []
    answers_start = []
    answers_end = []
    question_ids = []
    answers_word_start = []
    answers_word_end = []
    for i in range(len(data)):
        paragraphs = data[i]['paragraphs']
        for sub_para in paragraphs:
            for q_a in sub_para['qas']:
                questions.append(q_a['question'])
                q_a_answer_starts = []
                q_a_answer_ends = []
                q_a_answers = []
                q_a_ans_word_idx_start = []
                q_a_ans_word_idx_end = []

                for answer in q_a['answers']:
                    answer_end = answer['answer_start'] + len(answer['text'])
                    q_a_answer_starts.append(answer['answer_start'])
                    q_a_answer_ends.append(answer_end)
                    q_a_answers.append(answer['text'])
                    q_a_ans_word_idx_start.append(find_word_index(answer['answer_start'], sub_para['context']))
                    q_a_ans_word_idx_end.append(find_word_index(answer_end, sub_para['context']))
                    
                answers_start.append(q_a_answer_starts)
                answers_end.append(q_a_answer_ends)
                answers_word_start.append(q_a_ans_word_idx_start)                
                answers_word_end.append(q_a_ans_word_idx_end)
                answers_text.append(q_a_answers)
                question_ids.append(q_a['id'])
                contexts.append(sub_para['context'])   
    df = pd.DataFrame({"questionID":question_ids, "answer_text": answers_text, "context":contexts, "question": questions, "answer_start": answers_start, "answer_word_start": answers_word_start, "answer_end": answers_end, "answer_word_end": answers_word_end})
    return df

In [53]:
data = load_json()
df = create_dataframe(data)

In [65]:
df['questionID'][40:60]

40    5733ae924776f41900661013
41    5733ae924776f41900661015
42    5733ae924776f41900661016
43    5733ae924776f41900661017
44    5733afd3d058e614000b6045
45    5733afd3d058e614000b6047
46    5733afd3d058e614000b6048
47    5733afd3d058e614000b6046
48    5733afd3d058e614000b6049
49    5733b0fb4776f41900661041
50    5733b0fb4776f41900661043
51    5733b0fb4776f41900661044
52    5733b0fb4776f41900661045
53    5733b0fb4776f41900661042
54    5733b1da4776f41900661068
55    5733b1da4776f41900661069
56    5733b1da4776f4190066106a
57    5733b1da4776f4190066106b
58    5733b1da4776f41900661067
59    5733b2fe4776f4190066108f
Name: questionID, dtype: object

In [64]:
df['answer_text'][40:60]

40                                 [19.7%]
41        [the top 10 to 15 in the nation]
42                                 [39.1%]
43                   [more than 750 miles]
44                          [18th overall]
45                                   [8th]
46                           [1st overall]
47                             [USA Today]
48                                 [57.6%]
49         [Father Joseph Carrier, C.S.C.]
50                             [1851–1921]
51                [the Science Department]
52                   [Evolution and Dogma]
53    [Professor of Chemistry and Physics]
54                                  [1882]
55                [Professor Jerome Green]
56                           [Around 1899]
57               [Father Julius Nieuwland]
58                  [an early wind tunnel]
59                  [The Lobund Institute]
Name: answer_text, dtype: object

In [46]:
#------------------------------------------------------------------------------
# File for using the trained models to get predictions
#------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json
import pandas as pd
from functools import reduce
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import sys
from keras import backend as K



_EPSILON = 1e-7
def categorical_cross_entropy_loss(target, output):
    output /= tf.reduce_sum(output, -1, True)
    # manual computation of crossentropy
    epsilon = K.constant(_EPSILON, output.dtype.base_dtype)
    output = tf.clip_by_value(output, epsilon, 1. - epsilon)
    return - tf.reduce_sum(target * tf.math.log(output), -1)

losses = {"start_output": categorical_cross_entropy_loss, "end_output": categorical_cross_entropy_loss}

lossWeights = {"start_output": 1.0, "end_output": 1.0}
# Load model
def load_model(dir='./models/model_22_12_2021_11_16_34'):
    '''
        - Load the trained model using keras.models.load_model
        - Load the tokenizer word index to give the words 
        the same index as in training.
        - Load MAX_SEQ_LEN used in training to pad testset 
        to the correct length.
        Outputs:
            Trained model
            Tokenizer word to index dictionary
            Max sequence length (int)
    '''
    print("Loading model...")
    model = keras.models.load_model(f'{dir}/model', custom_objects={'test_loss':categorical_cross_entropy_loss})
    with open(f'{dir}/tokenizer.txt') as f:
        tokenizer_word_index = json.load(f)
    with open(f'{dir}/MAX_SEQ_LEN.txt') as f:
        MAX_SEQ_LEN = json.load(f)
    return model, tokenizer_word_index, MAX_SEQ_LEN


def get_test_data(path, tokenizer_word_index, MAX_SEQ_LEN):
    '''
        Loads testdata, makes it into a dataframe, lower the 
        text and strip text and tokenize the words using the 
        word_index dict from training and padds sequences.
        Input:
            Path to the testdata file
            tokenizer word index list from training
            MAX_SEQ_LEN (int) used in training
        Ouput:
            tokenized context (np.array)
            tokenized question (np.array)
            dataframe

    '''
    print(f'Get test data from {path}')
    # Import json file from path
    def load_json(dataset_path="training_set.json"):
        '''Load testdata from json file'''    
        with open(dataset_path) as f:
            raw_json = json.load(f)

        return raw_json['data']

    def create_dataframe(data):
        '''Create dataframe of the given data'''
        contexts = []
        questions = []
        question_ids = []
        for i in range(len(data)):
            paragraphs = data[i]['paragraphs']
            for sub_para in paragraphs:
                for q_a in sub_para['qas']:
                    questions.append(q_a['question'])
                    question_ids.append(q_a['id'])
                    contexts.append(sub_para['context'])   
        df = pd.DataFrame({"questionID":question_ids, "context":contexts, "question": questions})
        return df

    def clean_text(dataframe):
        '''Make the text into lower and remove all leading and trailing whitespace'''
        def lower(text: str) -> str:
            return text.lower()
        def strip_text(text: str) -> str:
            return text.strip()  

        PREPROCESSING_PIPELINE = [
                            lower,
                            strip_text
                            ]

        def text_prepare(text: str) -> str:
            """
            Applies a list of pre-processing functions in sequence (reduce).
            """

            filter_methods = PREPROCESSING_PIPELINE
            if type(text) == list:
                new_row = [reduce(lambda txt, f: f(txt), filter_methods, x) for x in text]
            else:
                new_row = reduce(lambda txt, f: f(txt), filter_methods, text)
            return new_row
        for key in ['context', 'question']:
            dataframe[key] = dataframe[key].apply(lambda txt: text_prepare(txt))
        
        return dataframe

    def textToTensor(tokenizer, max_len, text):
        '''
            Converts text to tensors by converting the words into the correct indexes. 
            Then padds the tensors with 0 vlaues
        '''
        seq = tokenizer.texts_to_sequences(text)
        padded = pad_sequences(sequences=seq, maxlen=max_len, padding='post')
        return padded

    def tokenize(df, tokenizer_word_index, MAX_SEQ_LEN):
        '''Creates a tokenizer using the word_index dicitonary from training'''
        tokenizer = Tokenizer()
        tokenizer.word_index = tokenizer_word_index
        context = textToTensor(tokenizer, MAX_SEQ_LEN, df['context'])
        question = textToTensor(tokenizer, MAX_SEQ_LEN, df['question'])
        return context, question

    data = load_json(path)
    df = create_dataframe(data)
    df = clean_text(df)
    context, question = tokenize(df, tokenizer_word_index, MAX_SEQ_LEN)
    return context, question, df


def get_predicitons(model, context, questions):
    '''Use the model to predict on the testset'''
    print('Get predicitons..')
    predictions = model.predict([questions, context])
    print("Gotten the predcitions!")
    return predictions


def make_answer_dict(start_preds, end_preds, df):
    '''Convert predicitons to a dicitonary containing question ID and answer text'''
    print('Convert predicitons to answer text..')
    def get_word_index(prediction):
        return [np.argmax(prediction[i]) for i in range(len(prediction))]

    def get_answer_text(start, end, index, df):
        '''Get answer text from context'''
        words = df['context'][index].split(' ')[start:end+1]
        answ = " ".join(words)
        # NB!!!: fore some reason the end is projected to be before the start so the answers are empty strings. 
        # Just doing this for now.
        if answ == "":
            print("\n\nSOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED\n\n")
            answ = df['context'][index].split(' ')
            print('satrta', start)
            print('len', len(answ))
            answ = answ[start]
        return answ

    answer_dict = {}
    start_indxs = get_word_index(start_preds)
    end_indxs = get_word_index(end_preds)
    for i in range(len(start_preds)):
        question_id = df['questionID'][i]
        start_index = start_indxs[i]
        end_index = end_indxs[i]
        answr_text = get_answer_text(start_index, end_index, i, df)
        answer_dict[question_id] = answr_text
    return answer_dict

def write_predictions(answer_dict, path):
    '''Write answers to a prediciton file'''
    print(f'Saving answer to {path}')
    with open(path, 'w') as file:
     file.write(json.dumps(answer_dict))

In [47]:
test_path = './SQUAD MATERIAL/test_set.json'
prediction_path = 'predictions.txt'

In [48]:
model, tokenizer_word_index, MAX_SEQ_LEN = load_model()


Loading model...


2021-12-22 15:02:37.512242: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.


KeyboardInterrupt: 

In [32]:
context, question, df = get_test_data(test_path, tokenizer_word_index, MAX_SEQ_LEN)


Get test data from ./SQUAD MATERIAL/test_set.json


In [51]:
df

Unnamed: 0,questionID,context,question
0,5733be284776f41900661182,"architecturally, the school has a catholic cha...",to whom did the virgin mary allegedly appear i...
1,5733be284776f4190066117f,"architecturally, the school has a catholic cha...",what is in front of the notre dame main building?
2,5733be284776f41900661180,"architecturally, the school has a catholic cha...",the basilica of the sacred heart at notre dame...
3,5733be284776f41900661181,"architecturally, the school has a catholic cha...",what is the grotto at notre dame?
4,5733be284776f4190066117e,"architecturally, the school has a catholic cha...",what sits on top of the main building at notre...
...,...,...,...
1309,573410864776f419006617e5,other authors have focused on the structural c...,"in the build-up to genocide, what have other a..."
1310,573410864776f419006617e6,other authors have focused on the structural c...,what processes are thought to create an evolut...
1311,573410864776f419006617e7,other authors have focused on the structural c...,who revealed the starting points of this evolu...
1312,573410864776f419006617e8,other authors have focused on the structural c...,a history of what is just one factor that cont...


In [33]:
pred_start, pred_end = get_predicitons(model, context, question)


Get predicitons..
Gotten the predcitions!


In [49]:
answer_dict = make_answer_dict(pred_start, pred_end, df)

Convert predicitons to answer text..


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 38
len 114


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 170
len 219


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta -1
len 97


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta -1
len 97


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 68
len 97


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 33
len 159


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 24
len 190


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 89
len 190


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 24
len 190


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta 135
len 190


SOMETHING STRANGE IS GOING ON AND END PRED IS BEFORE START PRED


satrta -1
len 157


SOMETHING STRANG

In [50]:
write_predictions(answer_dict, prediction_path)


Saving answer to predictions.txt


In [45]:
with open('SQUAD MATERIAL/test_set.json') as f:
    dataset_json = json.load(f)
dataset = dataset_json['data']