In [2]:
import pandas as pd

# Load the training and development splits (paths are relative to the notebook).
train, dev = map(pd.read_csv, ('../dataset/train.csv', '../dataset/dev.csv'))

In [16]:
# Peek at the first row of the dev split to see its columns.
dev.head(n=1)


Unnamed: 0,id,question,context,answers,c_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'France', 'answer_start': 159}, {'te...",0


In [38]:
# Peek at the first five rows of the training split.
train.head(n=5)

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0


In [52]:
def preprocess_squad_data(data):
    """Pull the context, question and answer-text columns out of ``data``.

    Args:
        data: Table-like object indexable by column name whose columns expose
            ``.tolist()``; it must have 'context', 'question' and 'text'
            columns.

    Returns:
        A ``(contexts, questions, answers)`` tuple of plain Python lists, one
        entry per row, in row order.
    """
    wanted_columns = ('context', 'question', 'text')
    return tuple(data[column].tolist() for column in wanted_columns)

# Split the training frame into parallel lists of contexts, questions and answers.
train_contexts, train_questions, train_answers = preprocess_squad_data(data=train)

def has_float_element(lst):
    """Tell whether ``lst`` contains at least one ``float`` instance.

    Args:
        lst: Any iterable of values.

    Returns:
        True as soon as a float (NaN included) is found, otherwise False.
    """
    for element in lst:
        if isinstance(element, float):
            return True
    return False

# Make every answer a string so the tokenizer can process it. A missing value
# (NaN/None) becomes an empty string rather than the literal text "nan", so a
# missing answer does not turn into a bogus "nan" token in the vocabulary and
# in the training targets.
train_answers = ['' if pd.isna(x) else str(x) for x in train_answers]

In [53]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [54]:
# Fit a single vocabulary over answers, contexts and questions together.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*train_answers, *train_contexts, *train_questions])

# One slot more than the number of distinct words in the fitted word index.
num_tokens = 1 + len(tokenizer.word_index)

In [57]:
# Turn each collection of texts into lists of integer token ids.
answers_seq, contexts_seq, questions_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_answers, train_contexts, train_questions)
)

# Pad the input and target sequences to the same length: the longest
# sequence found across answers, contexts and questions.
max_seq_length = max(
    max(map(len, group))
    for group in (answers_seq, contexts_seq, questions_seq)
)

# Post-pad (padding appended at the end) every collection to that length.
answers_seq, contexts_seq, questions_seq = (
    pad_sequences(group, maxlen=max_seq_length, padding='post')
    for group in (answers_seq, contexts_seq, questions_seq)
)

# Width of the learned token vectors fed to both LSTMs; 10 keeps the
# per-timestep feature size the inputs were previously declared with.
embedding_dim = 10

# The padded arrays passed to fit() are 2-D (samples, max_seq_length) matrices
# of integer token ids, so each Input takes one id per timestep and a shared
# Embedding layer turns every id into a dense vector. Declaring the inputs as
# (max_seq_length, 10) without an embedding does not match that data.
token_embedding = tf.keras.layers.Embedding(num_tokens, embedding_dim)

# Encoder: reads the context ids and keeps only its final hidden/cell states.
encoder_inputs = Input(shape=(max_seq_length,), dtype='int32')
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(token_embedding(encoder_inputs))
encoder_states = [state_h, state_c]

# Decoder: reads the question ids, starting from the encoder states, and
# emits a softmax over the vocabulary at every timestep.
decoder_inputs = Input(shape=(max_seq_length,), dtype='int32')
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    token_embedding(decoder_inputs), initial_state=encoder_states
)
decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
model.fit([contexts_seq, questions_seq], answers_seq, batch_size=32, epochs=10)