In [2]:
import pickle
import numpy as np

In [3]:
train_file_path = '/home/viper/Downloads/UPDATED_NLP_COURSE/TextFiles/train_qa.txt'

In [4]:
with open(train_file_path, 'rb') as file:
    train_data = pickle.load(file)

In [5]:
test_file_path = '/home/viper/Downloads/UPDATED_NLP_COURSE/TextFiles/test_qa.txt'

In [6]:
with open(test_file_path, 'rb') as file:
    test_data = pickle.load(file)

In [17]:
all_data = test_data + train_data

In [20]:
vocab = set()

for story, question, answer in all_data:
    vocab = vocab.union(set(story), set(question))

In [22]:
vocab.add('yes')
vocab.add('no')

In [25]:
vocab_len = len(vocab) + 1 # keras pad_sequences requires padding.

In [24]:
vocab_len

38

In [31]:
max_story_len = 0

for story, question, answer in all_data:
    if len(story) > max_story_len:
        max_story_len = len(story)

In [32]:
max_story_len

156

In [54]:
max_question_len = max([len(data[1]) for data in all_data])

In [56]:
max_question_len

6

In [35]:
# Vectorizing data

In [37]:
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [38]:
tokenizer = Tokenizer(filters=[])

In [39]:
tokenizer.fit_on_texts(vocab)

In [44]:
word_dict = tokenizer.word_index

In [45]:
train_story_text = []
train_question_text = []
train_answers = []

In [47]:
for story, question, answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)

In [51]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
# converting text sequences to vector or number sequences

In [59]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=max_story_len, max_question_len=max_question_len):
    X = [] # stories
    Xq = [] # question
    Y = [] # answers
    
    for story, question, answer in data:
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[question.lower()] for question in question]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1 
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    
    return (pad_sequences(X, maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [60]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)

In [61]:
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [65]:
# Building the network

In [66]:
from keras.models import Sequential, Model

In [68]:
from keras.layers import Embedding

In [69]:
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [70]:
input_sequence = Input((max_story_len,))
question = Input((max_question_len,))

In [71]:
vocab_size = len(vocab) + 1 

In [76]:
# Input encoder M
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.35))

# output of input_encoder_m = (samples, story_maxlen, embedding_dim)

In [77]:
# Input encoder M
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.35))

# output of input_encoder_c = (samples, story_maxlen, max_question_len)

In [78]:
# Input encoder M
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_answer_len))
question_encoder.add(Dropout(0.35))

# output of input_encoder_c = (samples, query_maxlen, max_answer_len)

In [79]:
# encoded <--- encoder(input)

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

In [81]:
match = dot([input_encoded_m, question_encoded], axes=(2,2))
match = Activation('softmax')(match)

In [83]:
response = add([match, input_encoded_c])
response = Permute((2,1))(response)

In [84]:
answer = concatenate([response, question_encoded])

In [85]:
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate')>

In [86]:
answer = LSTM(32)(answer)

In [87]:
answer

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'lstm')>

In [88]:
answer = Dropout(0.4)(answer)
answer = Dense(vocab_size)(answer)

In [89]:
answer = Activation('softmax')(answer)

In [90]:
model = Model([input_sequence, question], answer)

In [94]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [95]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 156)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 6)]          0           []                               
                                                                                                  
 sequential_3 (Sequential)      (None, None, 64)     2432        ['input_1[0][0]']                
                                                                                                  
 sequential_5 (Sequential)      (None, 6, 64)        2432        ['input_2[0][0]']                
                                                                                              

In [None]:
history = model.fit([inputs_train, queries_train], answers_train, batch_size=32, epochs=20, validation_data=([inputs_test, queries_test], answers_test))

In [97]:
model.load_weights('/home/viper/Downloads/UPDATED_NLP_COURSE/06-Deep-Learning/chatbot_10.h5')

In [98]:
pred_results = model.predict(([inputs_test, queries_test]))



In [102]:
val_max = np.argmax(pred_results[0])

In [103]:
for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

In [104]:
k

'.'

In [105]:
pred_results[0][val_max]

0.98382735

In [106]:
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [107]:
story = 'John left the kitchen . Sandra dropped milk in kitchen . '

In [108]:
story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'milk',
 'in',
 'kitchen',
 '.']

In [109]:
question = 'Is the milk in kitchen ? '

In [110]:
question.split()

['Is', 'the', 'milk', 'in', 'kitchen', '?']

In [112]:
data = [(story.split(), question.split(), 'yes')]

In [113]:
data

[(['John',
   'left',
   'the',
   'kitchen',
   '.',
   'Sandra',
   'dropped',
   'milk',
   'in',
   'kitchen',
   '.'],
  ['Is', 'the', 'milk', 'in', 'kitchen', '?'],
  'yes')]

In [114]:
n_story, n_ques, n_ans = vectorize_stories(data)

In [116]:
preds = model.predict(([n_story, n_ques]))



In [118]:
preds.argmax()

6