# Chatbot using LSTM-Encoder-Decoder

The Encoder-Decoder LSTM is a recurrent neural network designed to address sequence-to-sequence problems, sometimes called seq2seq.

Sequence-to-sequence prediction problems are challenging because the number of items in the input and output sequences can vary. For example, text translation and learning to execute programs are examples of seq2seq problems.

One approach to seq2seq prediction problems that has proven very effective is called the Encoder-Decoder LSTM.

This architecture is comprised of two models: one for reading the input sequence and encoding it into a fixed-length vector, and a second for decoding the fixed-length vector and outputting the predicted sequence. The use of the models in concert gives the architecture its name of Encoder-Decoder LSTM designed specifically for seq2seq problems.

![nn.png](encoder_decoder.png)

Seq2seq model will be trained on [Cornell Movie - Dialogs Corpus](https://www.kaggle.com/Cornell-University/movie-dialog-corpus) Dataset

In [0]:
# Importing the libraries
from keras.models import Model, model_from_json
from keras.layers.recurrent import LSTM
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint,EarlyStopping
from collections import Counter
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

np.random.seed(42)

In [0]:
BATCH_SIZE = 64
NUM_EPOCHS = 100
HIDDEN_UNITS = 256
MAX_INPUT_SEQ_LENGTH = 40
MAX_TARGET_SEQ_LENGTH = 40
MAX_VOCAB_SIZE = 800
DATA_PATH = 'dataset/movie_lines_cleaned_10k.txt'
WEIGHT_FILE_PATH = 'models/word-seq.h5'

In [0]:
# Importing the dataset
lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n')
input_texts = []
target_texts = []

## PART 1 - DATA PREPROCESSING 

In [0]:
input_counter = Counter()
target_counter = Counter()

In [0]:
# Getting separately the questions and the answers
prev_words = []
for line in lines:

    next_words = [w.lower() for w in nltk.word_tokenize(line)]
    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1

        target_words = next_words[:]
        target_words.insert(0, 'START')
        target_words.append('END')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)

    prev_words = next_words

In [0]:
# Creating two dictionaries that map the questions words and the answers words to a unique integer
input_word2idx = dict()
target_word2idx = dict()
for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1

In [0]:
# Adding the last tokens to these two dictionaries
input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

# Creating the inverse dictionary of the answerswords2int dictionary
input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)

np.save(project_path+'models/word-input-word2idx.npy', input_word2idx)
np.save(project_path+'models/word-input-idx2word.npy', input_idx2word)
np.save(project_path+'models/word-target-word2idx.npy', target_word2idx)
np.save(project_path+'models/word-target-idx2word.npy', target_idx2word)

In [11]:
# Translating all the questions and the answers into integers
encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        w2idx = 1  # default [UNK]
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)

    encoder_input_data.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)

context = dict()
context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

print(context)
np.save(project_path+'models/word-context.npy', context)

{'num_encoder_tokens': 802, 'num_decoder_tokens': 801, 'encoder_max_seq_length': 40, 'decoder_max_seq_length': 42}


In [0]:
# generate traning and testing batches
def generate_batch(input_data, output_text_data):
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0  # default [UNK]
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

## PART 2 - BUILDING THE SEQ2SEQ MODEL

In [13]:
# Creating the Encoder RNN
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length, name='encoder_embedding')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]
# Creating the Decoder RNN
decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

json = model.to_json()
open(project_path+'models/word-architecture.json', 'w').write(json)








3664

In [14]:
# create train and test sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(encoder_input_data, target_texts, test_size=0.2, random_state=42)

print(len(Xtrain))
print(len(Xtest))

7997
2000


In [0]:
# train and test batch size 
train_num_batches = len(Xtrain) // BATCH_SIZE
test_num_batches = len(Xtest) // BATCH_SIZE

In [0]:
# generate train and test datra
train_gen = generate_batch(Xtrain, Ytrain)
test_gen = generate_batch(Xtest, Ytest)

In [0]:
# callbacks
# checkpoint = ModelCheckpoint(WEIGHT_FILE_PATH, monitor ='val_loss', verbose = 1, save_best_only = True, mode ='max')
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

## PART 3 - TRAINING THE SEQ2SEQ MODEL

In [18]:
model.fit_generator(generator=train_gen, steps_per_epoch=train_num_batches,
                    epochs=NUM_EPOCHS,
                    verbose=1, validation_data=test_gen, validation_steps=test_num_batches)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/100






Epoch 00001: val_loss improved from -inf to 1.43352, saving model to /content/drive/My Drive/DLCP/openwork/Chatbot_with_Encoder_Decoder/models/word-weights.h5
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.43352
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.43352
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.43352
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.43352
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.43352
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.43352
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.43352
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.43352
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.43352
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.43352
Epoch 12/100

Epoch 00012: val_loss did not improve from 1.43352
Epoch 13/100

Epoch

<keras.callbacks.History at 0x7fcb2015eef0>

In [0]:
# save the model
model.save_weights(WEIGHT_FILE_PATH)

## PART 4 - TESTING THE SEQ2SEQ MODEL

In [0]:
class ChatBot():
    """
    This is ChatBot class it takes weights for the Neural Network, compliling model
    and returns prediction in responce to input text
    """
    def __init__(self):
        """
        define all required parameters, rebuild model and load weights
        """
        self.input_word2idx = np.load(project_path+'models/word-input-word2idx.npy',allow_pickle=True).item()
        self.input_idx2word = np.load(project_path+'models/word-input-idx2word.npy',allow_pickle=True).item()
        self.target_word2idx = np.load(project_path+'models/word-target-word2idx.npy',allow_pickle=True).item()
        self.target_idx2word = np.load(project_path+'models/word-target-idx2word.npy',allow_pickle=True).item()
        context = np.load(project_path+'models/word-context.npy',allow_pickle=True).item()
        self.max_encoder_seq_length = context['encoder_max_seq_length']
        self.max_decoder_seq_length = context['decoder_max_seq_length']
        self.num_encoder_tokens = context['num_encoder_tokens']
        self.num_decoder_tokens = context['num_decoder_tokens']
        self.ultimate_question = 'Answer to the Ultimate Question of Life, the Universe, and Everything'

        encoder_inputs = Input(shape=(None, ), name='encoder_inputs')
        encoder_embedding = Embedding(input_dim=self.num_encoder_tokens, output_dim=HIDDEN_UNITS,
                                      input_length=self.max_encoder_seq_length, name='encoder_embedding')
        encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm")
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
        encoder_states = [encoder_state_h, encoder_state_c]

        decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs')
        decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
        # Loading the weights
        self.model.load_weights(project_path+'models/word-seq.h5')
        self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

        self.encoder_model = Model(encoder_inputs, encoder_states)

        decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)
    # Setting up the chat
    def reply(self, input_text):
        """
        Takes input_text and return predicted responce
        :param input_text: string
        :return: predicted_text: string
        """
        if input_text == self.ultimate_question:
            return '42'
        input_seq = []
        input_wids = []
        for word in nltk.word_tokenize(input_text.lower()):
            idx = 1
            if word in self.input_word2idx:
                idx = self.input_word2idx[word]
            input_wids.append(idx)
        input_seq.append(input_wids)
        input_seq = pad_sequences(input_seq, self.max_encoder_seq_length)
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            if sample_word != 'START' and sample_word != 'END':
                target_text += ' ' + sample_word

            if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip().replace('UNK', '')

In [0]:
# start chatbot
bot = ChatBot()

In [59]:
bot.reply('Where are you from?')

"i do n't know ."

In [61]:
bot.reply('What is your name?')

' ,  .'