In [1]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models
import requests, zipfile, io
from tensorflow.keras import preprocessing , utils
import os
import yaml
import string
import re

In [2]:
regex = '(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )'

data_path = "./chatbot_nlp/data"
files = os.listdir(data_path+"/")

questions = list()
answers = list()

for file_name in files:
    stream = open(data_path+"/"+file_name)
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for conv in conversations:
        replies = conv[1:]
        ans = ''
        for rep in replies:
            if type(rep) is str:
                ans += ' ' + rep
        if ans is not '':
            answers.append(re.sub(regex, r' ', ans))
            questions.append(re.sub(regex, r' ', conv[0]))

for i in range(len(answers)):
    answers[i] = "<START> " + answers[i] + " <END>"

# Tokenizing
t = preprocessing.text.Tokenizer(filters='')
t.fit_on_texts(questions+answers)
vocab_size = len(t.word_index)+1
print("Vocab size: ", vocab_size)

# encoder input
tokenized_q = t.texts_to_sequences(questions)
question_maxlen = max([len(x) for x in questions])
padded_q = preprocessing.sequence.pad_sequences(tokenized_q, maxlen=question_maxlen, padding='post')
encoder_input_data = np.array(padded_q)
print(encoder_input_data.shape, question_maxlen)

# decoder input
tokenized_ans = t.texts_to_sequences(answers)
answer_maxlen = max([len(x) for x in answers])
padded_ans = preprocessing.sequence.pad_sequences(tokenized_ans, maxlen=answer_maxlen, padding='post')
decoder_input_data = np.array(padded_ans)
print(decoder_input_data.shape, answer_maxlen)

# decoder output
tokenized_ans = t.texts_to_sequences(answers)
for i in range(len(tokenized_ans)):
    tokenized_ans[i]=tokenized_ans[i][1:]
padded_ans = preprocessing.sequence.pad_sequences(tokenized_ans, maxlen=answer_maxlen, padding='post')
onehot = utils.to_categorical(padded_ans, vocab_size)
decoder_output_data = np.array(onehot)
print(decoder_output_data.shape)

Vocab size:  1912
(564, 128) 128
(564, 455) 455
(564, 455, 1912)


In [3]:
##### Building training model
embedding_matrix = np.load('./data/embedding_matrix.npy')

embedding = layers.Embedding(vocab_size, 200, weights=[embedding_matrix], trainable=False)

encoder_input = layers.Input(shape=(None, ))
encoder_lstm = layers.LSTM(200, return_state=True)
_ , state_h1, state_c1 = encoder_lstm(embedding(encoder_input))
encoder_states = [state_h1, state_c1]

decoder_input = layers.Input(shape=(None, ))
decoder_lstm = layers.LSTM(200, return_state=True, return_sequences=True)
decoder_output, _, _ = decoder_lstm(embedding(decoder_input), initial_state=encoder_states)
decoder_dense = layers.Dense(vocab_size, activation='softmax')
output = decoder_dense(decoder_output)

train_model = models.Model([encoder_input, decoder_input], output)
train_model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

train_model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    382400      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 200), (None, 320800      embedding[0][0]              

In [4]:
train_model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=40 )
# train_model.save( 'model.h5' )

Train on 564 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x25dee59df98>

In [4]:
train_model.load_weights('model.h5')

In [5]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_input, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        embedding(decoder_input) , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_input] + decoder_states_inputs,
        [decoder_output] + decoder_states)
    
    return encoder_model , decoder_model

In [6]:
def str_to_tokens( sentence : str ):
    translator = str.maketrans('', '', string.punctuation)
    words = sentence.lower().translate(translator).split()
    
    tokens_list = list()
    for word in words:
        try:
            a = tokenizer.word_index[ word ]
        except KeyError:
            a = 2
        tokens_list.append( a ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [7]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


ValueError: Graph disconnected: cannot obtain value for tensor Tensor("input_1:0", shape=(None, None), dtype=float32) at layer "input_1". The following previous layers were accessed without issue: ['input_2']