## Data Preprocessing

In [None]:
import pickle
import numpy as np

# Both files are read with pickle, so they are opened in binary mode even
# though they carry a .txt extension.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files when they come from a trusted source.
with open('train_qa.txt', mode='rb') as train_file:
    train_data = pickle.load(train_file)

with open('test_qa.txt', mode='rb') as test_file:
    test_data = pickle.load(test_file)
# Peek at the first two training samples.
train_data[:2]
# train_data is a list of tuples, each consisting of 3 parts: story, question, answer.
train_data[0]
# Join the tokens of the first story and of the first question with spaces
# so they read as plain sentences.
' '.join(train_data[0][0])
' '.join(train_data[0][1])
# Answer of the first sample.
train_data[0][2]
# Concatenate both splits (test samples first, then train samples).
all_data = test_data + train_data
len(all_data)
# Unique tokens of the first story.
set(train_data[0][0])
# Build the vocabulary from every story, question and answer.
vocabulary = set()
for story, question, answer in all_data:
    # Stories and questions are token sequences: add each token in place
    # instead of rebuilding the whole set on every iteration.
    vocabulary.update(story)
    vocabulary.update(question)
    # The answer is a single token (it is later used as one dictionary key),
    # so add it whole. Wrapping it in set() would split the string into
    # individual characters and pollute the vocabulary.
    vocabulary.add(answer)
# Guarantee both answer words are present even if one never occurs in the data.
vocabulary.add('no')
vocabulary.add('yes')

# The Keras embedding layer requires one slot more than the vocabulary size.
vocab_len = 1 + len(vocabulary)
print("Actual length of the vocabulary: ", vocab_len-1)

# Token count of every story, and the longest story overall.
all_story_len = [len(sample[0]) for sample in all_data]
max_story_len = max(all_story_len)

# Longest question overall.
max_question_len = max(len(sample[1]) for sample in all_data)
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
# An empty filter list means no characters are stripped from the tokens
# (see the Keras Tokenizer documentation for the default filter set).
tokenizer = Tokenizer(filters=[])
# Fit on a sorted list rather than on the set itself. Every token occurs
# exactly once, so the index a word receives follows the order in which the
# tokens are fed in. The iteration order of a set of strings can differ
# between interpreter runs (string hash randomization), which would make the
# word indices -- and therefore any saved model -- irreproducible.
tokenizer.fit_on_texts(sorted(vocabulary))


tokenizer.word_index
# Split the training tuples into three parallel lists.
train_story_text = [story_tokens for story_tokens, _, _ in train_data]
train_question_text = [question_tokens for _, question_tokens, _ in train_data]
train_answers = [answer_token for _, _, answer_token in train_data]

# Convert every training story into a list of word indices.
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
# Create our own lists of word indices, with padding.
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    """Convert (story, question, answer) samples into model-ready arrays.

    Args:
        data: iterable of (story, question, answer) tuples. Story and
            question are sequences of string tokens; answer is one string
            token.
        word_index: mapping from lower-cased token to integer index. The
            default is captured once, when the function is defined.
        max_story_len: length passed to pad_sequences for the stories.
        max_question_len: length passed to pad_sequences for the questions.

    Returns:
        A tuple (stories, questions, answers): the two index sequences
        after pad_sequences, and an array holding one one-hot row of width
        len(word_index) + 1 per sample.

    Raises:
        KeyError: if a token (after lower-casing) is missing from word_index.
    """
    # Stories = X
    X = []

    # Questions = Xq
    Xq = []

    # One-hot encoded correct answers (e.g. 'yes' / 'no')
    Y = []

    # Width of every one-hot row; it does not change across samples.
    one_hot_width = len(word_index) + 1

    for story, query, answer in data:
        # Replace every token by its index, e.g. [23, 14, 15].
        X.append([word_index[word.lower()] for word in story])
        Xq.append([word_index[word.lower()] for word in query])

        # Lower-case the answer exactly like the story and question tokens,
        # so a capitalised answer does not raise KeyError.
        one_hot = np.zeros(one_hot_width)
        one_hot[word_index[answer.lower()]] = 1
        Y.append(one_hot)

    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))
# Vectorize both splits using the default word index and padding lengths.
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)
# Inspect the vectorized test stories and the one-hot test answers.
inputs_test
answers_test
# Indices the tokenizer assigned to the two answer words.
tokenizer.word_index['yes']
tokenizer.word_index['no']
from keras.models import Sequential, Model
from keras.layers import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM
# Symbolic inputs for the padded story and question index sequences.
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))
vocab_size = len(vocabulary) + 1  # vocabulary size plus one; used as input_dim of every Embedding below
# INPUT ENCODER M: embeds each story token into a 112-dimensional vector.
encoder_m = Sequential()
encoder_m.add(Embedding(input_dim=vocab_size, output_dim=112))
# INPUT ENCODER C: embeds each story token into a vector of size
# max_question_len, so that its output can be added to `match` below.
encoder_c = Sequential()
encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))


# QUESTION ENCODER: 112-dimensional embedding followed by dropout (rate 0.2).
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=112, input_length=max_question_len))
question_encoder.add(Dropout(0.2))

# Apply the encoders to the inputs.
# Expected shapes -- TODO confirm with model.summary():
#   input_encoded_m:  (samples, max_story_len, 112)
#   input_encoded_c:  (samples, max_story_len, max_question_len)
#   question_encoded: (samples, max_question_len, 112)
input_encoded_m = encoder_m(input_sequence)
input_encoded_c = encoder_c(input_sequence)
question_encoded = question_encoder(question)

#match = dot([input_encoded_m, question_encoded], axes=(2,2)) # earlier variant: plain dot product along the embedding dimension
match = dot([input_encoded_m, question_encoded], axes=-1, normalize=True)  # normalize=True: implements cosine similarity
match = Activation('softmax')(match)

# Add the second story encoding to the match weights, swap the last two axes,
# then append the question encoding.
output = add([match, input_encoded_c]) 
output = Permute((2,1))(output) 
answer = concatenate([output, question_encoded])

# Note: answer is presumably (batch_size, max_question_len, max_story_len + 112) here -- TODO confirm
answer = LSTM(64)(answer) # LSTM with 64 units
#print(answer.shape)
#answer = Dropout(0.2)(answer)
# Project onto one output per vocabulary index; the answer words are among them.
answer = Dense(vocab_size)(answer) # Dense layer with vocab_size units
# Turn the outputs into a probability distribution.
answer = Activation('softmax')(answer)

model = Model([input_sequence, question], answer)
#model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.summary()
# NOTE: the test split doubles as validation data during training.
history = model.fit([inputs_train, queries_train], answers_train, batch_size=84, epochs=100, validation_data=([inputs_test, queries_test], answers_test))

import matplotlib.pyplot as plt
%matplotlib inline
# Show which metrics were recorded during training.
print(history.history.keys())

# Accuracy per epoch: training curve first, validation curve second,
# matching the order of the legend entries below.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])

plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Save the model to 'RNNbasedChatbot.h5' (path relative to the working directory).
# NOTE(review): the tokenizer's word index is not saved anywhere in the visible
# code; it is needed to encode inputs and decode predictions later.
model.save('RNNbasedChatbot.h5')

In [None]:
# Run the model on the vectorized test stories and questions.
pred_result = model.predict([inputs_test, queries_test])

In [None]:
# Reverse lookup table: word index -> word.
index_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [None]:
# Take the highest-scoring index of every prediction row and map it back to
# its word through the reverse lookup table.
predictions = np.argmax(pred_result, axis=1)
pred_answers = list(map(index_word.__getitem__, predictions))
pred_answers

['no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no