# Seq2Seq: Encoder-Decoder Chatbot 

![](https://cdn-images-1.medium.com/max/2560/1*1I2tTjCkMHlQ-r73eRn4ZQ.png)

In [2]:
import numpy as np
import pandas as pd
import string
import pickle
import operator
import matplotlib.pyplot as plt
%matplotlib inline

## Step 1. Import Data

In [3]:
# Read the raw conversation data from the .txt file.
import codecs

FIELD_SEPARATOR = " +++$+++ "

with codecs.open("movie_lines.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")

# One list of fields per line of the file.
conversations = [line.split(FIELD_SEPARATOR) for line in lines]

In [4]:
conversations[:6]

[['L1045', 'u0', 'm0', 'BIANCA', 'They do not!'],
 ['L1044', 'u2', 'm0', 'CAMERON', 'They do to!'],
 ['L985', 'u0', 'm0', 'BIANCA', 'I hope so.'],
 ['L984', 'u2', 'm0', 'CAMERON', 'She okay?'],
 ['L925', 'u0', 'm0', 'BIANCA', "Let's go."],
 ['L924', 'u2', 'm0', 'CAMERON', 'Wow']]

In [5]:
# Keep only the numeric line id and the utterance text.
chats = {
    int(fields[0][1:]): fields[4]  # drop the leading letter of the id, e.g. "L1045" -> 1045
    for fields in conversations
    if len(fields) > 4             # skip lines that do not have all five fields
}

In [8]:
# (id, utterance) pairs ordered by line id.
sorted_chats = sorted(chats.items(), key=operator.itemgetter(0))

In [9]:
# Group utterances whose line ids are consecutive into conversations:
# { group key: [utterances in order] }.
# A jump of more than 1 between neighbouring ids closes the current group.
conves_dict = {}
conves_ids = []
for i in range(1, len(sorted_chats)):
    prev_id, prev_text = sorted_chats[i - 1]
    curr_id, curr_text = sorted_chats[i]
    if curr_id - prev_id == 1:
        # The first pair of a run also has to contribute the run's opening line.
        if not conves_ids:
            conves_ids.append(prev_text)
        conves_ids.append(curr_text)
    else:
        # Gap in the ids: store the finished run under its position, as before.
        conves_dict[i] = conves_ids
        conves_ids = []

# A run is only stored when the gap *after* it is seen, so the final run must be
# flushed explicitly; otherwise the last conversation of the corpus is dropped.
if conves_ids:
    conves_dict[len(sorted_chats)] = conves_ids

In [10]:
# Pair up the lines of every conversation as (context, reply): lines 0-1, 2-3, ...
# zip() stops at the shorter slice, so a trailing line without a partner is discarded.
context_and_target = [
    pair
    for conves in conves_dict.values()
    for pair in zip(conves[0::2], conves[1::2])
]

In [11]:
# ペア完成
context_and_target[:5]

[("Drink up, Charley. We're ahead of you.", "I'm not thirsty."),
 ("After what we been hearing about your brother, I thought your throat'd be kind of dry.",
  "So they're walking along and smiling. That doesn't mean he's going to talk. There's no evidence until he gives public testimony."),
 ("Thanks for the legal advice, Charley. That's what we always kept you around for.  Now how do we keep him from giving this testimony? Isn't that the er as you put it main order of business?",
  'He was always a good kid. You know that.'),
 ('Did you change your hair?', 'No.'),
 ('I believe I have found a faster way.', 'Hmmm?')]

In [12]:
context, target = zip(*context_and_target)

In [13]:
context = list(context)
target = list(target)

In [14]:
context[:5]

["Drink up, Charley. We're ahead of you.",
 "After what we been hearing about your brother, I thought your throat'd be kind of dry.",
 "Thanks for the legal advice, Charley. That's what we always kept you around for.  Now how do we keep him from giving this testimony? Isn't that the er as you put it main order of business?",
 'Did you change your hair?',
 'I believe I have found a faster way.']

In [16]:
target[:5]

["I'm not thirsty.",
 "So they're walking along and smiling. That doesn't mean he's going to talk. There's no evidence until he gives public testimony.",
 'He was always a good kid. You know that.',
 'No.',
 'Hmmm?']

## Step 2. Preprocessing for text data

In [17]:
# from my_seq2seq_text_cleanear import text_modifier, nonalpha_remover
import re
# Sequence length used for padding and for the model inputs.
# It has to be 20 from the start: a later cell assigns MAX_LEN = 20 and the padded
# arrays / target tensor shown in the outputs are 20 wide, so any other value here
# breaks a top-to-bottom run of the notebook.
MAX_LEN = 20

In [18]:
# Ordered (pattern, replacement) rules applied by clean_text.
# Order matters: "won't" / "can't" must be expanded before the generic "n't" rule,
# and punctuation is stripped last.
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    '''Lower-case `text`, expand common English contractions and strip punctuation.

    Parameters
    ----------
    text : str
        Raw utterance.

    Returns
    -------
    str
        The cleaned utterance. Apostrophes not covered by a rule (e.g. "there's")
        are left as they are.
    '''
    text = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text

### 2-1. Clean Text

In [19]:
# Cleaned version of every reply.
tidy_target = [clean_text(conve) for conve in target]

In [44]:
tidy_target[:5]

['i am not thirsty',
 "so they are walking along and smiling that does not mean he is going to talk there's no evidence until he gives public testimony",
 'he was always a good kid you know that',
 'no',
 'hmmm']

In [21]:
# Cleaned version of every context sentence.
tidy_context = [clean_text(conve) for conve in context]

In [43]:
tidy_context[:5]

['drink up charley we are ahead of you',
 'after what we been hearing about your brother i thought your throat would be kind of dry',
 'thanks for the legal advice charley that is what we always kept you around for  now how do we keep him from giving this testimony is not that the er as you put it main order of business',
 'did you change your hair',
 'i believe i have found a faster way']

In [23]:
# Decoder sentences are wrapped in <BOS> ... <EOS> tags.
bos = "<BOS> "
eos = " <EOS>"
final_target = ["".join((bos, conve, eos)) for conve in tidy_target]
encoder_inputs, decoder_inputs = tidy_context, final_target

In [24]:
import codecs
# Load the encoder sentences from encoder_inputs.txt, one sentence per line.
with codecs.open("encoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")

# After splitting on "\n" no element contains a newline, so each line is used as-is.
encoder_text = list(lines)

In [25]:
len(encoder_text)

143865

In [42]:
encoder_text[:5]

['did you change your hair',
 'i missed you',
 'it was a bratwurst  i was eating lunch',
 'you the new guy',
 "c'mon  i am supposed to give you the tour"]

In [28]:
# Load the decoder sentences from decoder_inputs.txt, one sentence per line.
with codecs.open("decoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")

# After splitting on "\n" no element contains a newline, so each line is used as-is.
decoder_text = list(lines)

In [41]:
decoder_text[:5]

['<BOS> no <EOS>',
 '<BOS> it says here you exposed yourself to a group of freshmen girls <EOS>',
 '<BOS> with the teeth of your zipper <EOS>',
 '<BOS> so they tell me <EOS>',
 '<BOS> so  which dakota you from <EOS>']

### 2-2. MAKE VOCABULARY

In [32]:
# First, collect the distinct words of the full corpus (in first-seen order)
# to inspect the size of the raw vocabulary.

full_text = encoder_text + decoder_text
dictionary = []
seen_words = set()  # set membership keeps the scan linear instead of quadratic
for text in full_text:
    for word in text.split():
        if word not in seen_words:
            seen_words.add(word)
            dictionary.append(word)

In [33]:
from keras.preprocessing.text import Tokenizer
# Vocabulary cap handed to the tokenizer. VOCAB_SIZE is reassigned further down
# to len(index2word) + 1 before it is used for the tensors and the model.
# NOTE(review): the Keras docs state that Tokenizer(num_words=N) keeps only the
# N-1 most frequent words when converting texts, while index2word below keeps every
# index < 15000 — confirm the two limits are meant to differ.
VOCAB_SIZE = 14999
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

Using TensorFlow backend.


In [34]:
full_text = encoder_text + decoder_text

In [88]:
# Build the vocabulary (word -> index) from the combined encoder + decoder corpus.
tokenizer.fit_on_texts(full_text)
word_index = tokenizer.word_index
len(word_index)

{'genetically': 14816,
 'ecentus': 64088,
 'houston': 4172,
 'cufflinks': 30399,
 "annabelle's": 23767,
 'bank': 729,
 'afeard': 36839,
 'bloodbath': 14490,
 'yourselfwhat': 45398,
 'presenting': 61925,
 "how'm": 8189,
 'cacannot': 55395,
 'recognizable': 17327,
 'theauthorities': 36973,
 'vulva': 33204,
 "lock's": 29999,
 'brazilian': 14590,
 'indemnity': 22496,
 'ham': 9053,
 'davy': 15081,
 'slugged': 45153,
 'tangiers': 12876,
 'stilson': 56238,
 "agent's": 25183,
 'thrive': 19121,
 'strutted': 47291,
 'beforeuh': 55015,
 'nests': 28352,
 'nypd': 21304,
 'gobs': 15586,
 'newscast': 55773,
 'narrative': 19846,
 'thump': 53119,
 'levelled': 48721,
 'outsville': 59006,
 'ins': 12754,
 'deeds': 2862,
 "traffic's": 32209,
 'petyr': 64952,
 'lights': 1429,
 'crocs': 19025,
 'conclusively': 34333,
 'workingman': 18617,
 "hearst's": 60679,
 'duncan': 6649,
 'nobody': 363,
 'burritosenchiladas': 41408,
 'stretcher': 15739,
 'varmints': 48685,
 'preachnone': 61143,
 'ampi': 63536,
 'goddamnd

In [36]:
# Reverse dictionary (index -> word), restricted to indices below 15000.
index2word = {v: k for k, v in word_index.items() if v < 15000}

In [53]:
index2word

{1: 'bos',
 2: 'eos',
 3: 'you',
 4: 'i',
 5: 'the',
 6: 'to',
 7: 'is',
 8: 'a',
 9: 'not',
 10: 'it',
 11: 'that',
 12: 'do',
 13: 'and',
 14: 'are',
 15: 'of',
 16: 'in',
 17: 'have',
 18: 'what',
 19: 'me',
 20: 'we',
 21: 'he',
 22: 'am',
 23: 'this',
 24: 'for',
 25: 'will',
 26: 'know',
 27: 'was',
 28: 'your',
 29: 'my',
 30: 'on',
 31: 'be',
 32: 'no',
 33: 'with',
 34: 'but',
 35: 'they',
 36: 'would',
 37: 'just',
 38: 'all',
 39: 'like',
 40: 'did',
 41: 'about',
 42: 'get',
 43: 'so',
 44: 'out',
 45: 'if',
 46: 'here',
 47: 'she',
 48: 'him',
 49: 'up',
 50: 'how',
 51: 'got',
 52: 'can',
 53: 'want',
 54: 'think',
 55: 'at',
 56: 'there',
 57: 'one',
 58: 'right',
 59: 'go',
 60: 'now',
 61: 'well',
 62: 'going',
 63: 'her',
 64: 'why',
 65: 'see',
 66: 'as',
 67: 'oh',
 68: 'his',
 69: 'could',
 70: 'yes',
 71: 'who',
 72: 'good',
 73: 'when',
 74: 'cannot',
 75: 'from',
 76: 'where',
 77: 'were',
 78: 'yeah',
 79: 'tell',
 80: 'come',
 81: 'some',
 82: 'been',
 83: 'an

In [38]:
# Forward dictionary (word -> index) limited to the words kept in index2word.
word2index = {word: idx for idx, word in index2word.items()}

In [52]:
word2index

{'genetically': 14816,
 'houston': 4172,
 'rick': 1745,
 'burgel': 8723,
 'plot': 4502,
 'bloodbath': 14490,
 'sand': 2420,
 'pricks': 11562,
 "how'm": 8189,
 'error': 4531,
 'deformed': 14246,
 'er': 1323,
 'sophie': 5409,
 'lobe': 11981,
 'brazilian': 14590,
 "death's": 8435,
 'ham': 9053,
 'interference': 6712,
 'sissy': 6543,
 'thoughtful': 7289,
 'steak': 3396,
 'volumes': 13976,
 'associated': 7963,
 'again': 174,
 'safer': 3472,
 'iraqis': 5354,
 'welcome': 819,
 'sally': 1964,
 'commerce': 7402,
 'savior': 10105,
 'unusual': 2045,
 'ins': 12754,
 'july': 4601,
 'seniors': 14615,
 'deeds': 2862,
 'hallucination': 11053,
 'barks': 7326,
 'channels': 5858,
 'insanity': 7328,
 'swimmer': 14770,
 'hysteria': 9887,
 'arson': 10070,
 'duncan': 6649,
 'edge': 2219,
 'nobody': 363,
 'annoying': 7240,
 'eightyfive': 13204,
 'watery': 14169,
 'go': 59,
 'kike': 13936,
 'condition': 1370,
 'roasted': 13960,
 "fuck's": 5281,
 'godfather': 5549,
 'airlines': 9240,
 '79': 14249,
 'lamont': 12

In [40]:
len(word2index) == len(index2word)

True

In [45]:
len(index2word)

14999

### 2-3. ONE-HOT VECTORIZER

In [46]:
# Convert every encoder sentence into a list of word indices.
encoder_sequences = tokenizer.texts_to_sequences(encoder_text)
# (conversion to np.array left disabled) encoder_sequences = np.array(encoder_sequences)

In [47]:
# Same conversion for the decoder sentences.
decoder_sequences = tokenizer.texts_to_sequences(decoder_text)
# (conversion to np.array left disabled) decoder_sequences = np.array(decoder_sequences)

In [48]:
encoder_sequences[:5]

[[40, 3, 436, 28, 621],
 [4, 950, 3],
 [10, 27, 8, 4, 27, 1107, 802],
 [3, 5, 186, 168],
 [662, 4, 22, 346, 6, 130, 3, 5, 2407]]

In [49]:
# Sanity check: for each sequence, print the first index above 14999 (if there is one).
for seqs in encoder_sequences:
    oversized = next((idx for idx in seqs if idx > 14999), None)
    if oversized is not None:
        print(oversized)

In [54]:
VOCAB_SIZE = len(index2word) + 1
VOCAB_SIZE

15000

In [55]:
decoder_sequences[:5]

[[1, 32, 2],
 [1, 10, 278, 46, 3, 4344, 226, 6, 8, 1147, 15, 570, 2],
 [1, 33, 5, 1286, 15, 28, 13725, 2],
 [1, 43, 35, 79, 19, 2],
 [1, 43, 256, 8636, 3, 75, 2]]

### 2-4. PADDING

In [64]:
from keras.preprocessing.sequence import pad_sequences
# Pad / truncate every sequence to MAX_LEN tokens; both are applied at the end ('post').
# NOTE(review): truncating='post' cuts tokens off the end of long decoder sequences,
# which presumably removes their closing end marker — confirm this is acceptable.
encoder_input_data = pad_sequences(encoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')
decoder_input_data = pad_sequences(decoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')

In [86]:
decoder_input_data[:5]

array([[    1,    32,     2,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [    1,    10,   278,    46,     3,  4344,   226,     6,     8,
         1147,    15,   570,     2,     0,     0,     0,     0,     0,
            0,     0],
       [    1,    33,     5,  1286,    15,    28, 13725,     2,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [    1,    43,    35,    79,    19,     2,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [    1,    43,   256,  8636,     3,    75,     2,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]], dtype=int32)

In [87]:
decoder_input_data.shape

(143865, 20)

In [60]:
import numpy as np
# MAX_LEN is also assigned in an earlier cell; keep both values in sync, because the
# padded inputs above are built from whichever value is current when that cell runs.
MAX_LEN = 20
num_samples = len(encoder_sequences)
# Dense one-hot target tensor of shape (num_samples, MAX_LEN, VOCAB_SIZE), float32.
# NOTE(review): with the shape reported in the output below, (143865, 20, 15000),
# this is roughly 172 GB once fully materialised — consider integer targets with a
# sparse loss if memory is a problem.
decoder_output_data = np.zeros((num_samples, MAX_LEN, VOCAB_SIZE), dtype="float32")

In [63]:
decoder_output_data.shape

(143865, 20, 15000)

In [66]:
# Fill the 3-D one-hot target tensor for teacher forcing.
# The target at time step t must be the decoder *input* token at step t + 1
# (the input sequence shifted left by one, i.e. without the leading start token).
# Writing the token at its own position would train the model to echo its input.
# The last time step has no successor and stays all-zero.
for i, seqs in enumerate(decoder_input_data):
    for j, seq in enumerate(seqs[1:]):
        decoder_output_data[i, j, seq] = 1.

In [67]:
decoder_output_data.shape

(143865, 20, 15000)

### 2-5. Word2Vec: pretrained glove vector

In [68]:
# word -> float32 vector, parsed from the GloVe text file
# (each line: the word followed by its coefficients, whitespace-separated).
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print("Glove Loded!")

Glove Loded!


In [69]:
embedding_dimention = 50
def embedding_matrix_creater(embedding_dimention, word_index, embeddings=None):
    """Build the initial weight matrix for the embedding layer.

    Parameters
    ----------
    embedding_dimention : int
        Length of each embedding vector (number of matrix columns).
    word_index : dict
        Mapping word -> row index. The matrix has ``len(word_index) + 1`` rows,
        so indices are expected to lie in ``1..len(word_index)``.
    embeddings : dict, optional
        Mapping word -> vector. Defaults to the module-level ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(len(word_index) + 1, embedding_dimention)``.
    """
    if embeddings is None:
        embeddings = embeddings_index
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings.get(word)
        # Words missing from the pretrained vectors keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [70]:
embedding_matrix = embedding_matrix_creater(50, word_index=word2index)

In [73]:
# Imported here because this cell uses Embedding before the cell that holds the
# other Keras layer imports.
from keras.layers import Embedding

# Embedding layer shared by encoder and decoder, initialised with the GloVe matrix
# and left trainable.
embed_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=50, trainable=True,)
embed_layer.build((None,))
embed_layer.set_weights([embedding_matrix])

## Step 3. Build Seq2Seq Model

In [72]:
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [74]:
def seq2seq_model_builder(HIDDEN_DIM=300):
    """Assemble the encoder-decoder training model.

    Both inputs are integer sequences of length MAX_LEN that pass through the shared
    `embed_layer`. The encoder LSTM's final states initialise the decoder LSTM, whose
    per-step outputs go through a softmax layer over VOCAB_SIZE words.

    Parameters
    ----------
    HIDDEN_DIM : int
        Number of units in each LSTM.

    Returns
    -------
    keras.models.Model
        Model mapping [encoder input, decoder input] to per-step word probabilities.
    """
    # Encoder: only the final hidden and cell states are used.
    encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
    _, state_h, state_c = encoder_LSTM(embed_layer(encoder_inputs))

    # Decoder: starts from the encoder states and emits an output at every step.
    decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)
    decoder_outputs, _, _ = decoder_LSTM(
        embed_layer(decoder_inputs), initial_state=[state_h, state_c]
    )

    # Per-step softmax over the vocabulary.
    output_layer = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))
    return Model([encoder_inputs, decoder_inputs], output_layer(decoder_outputs))

In [75]:
model = seq2seq_model_builder(HIDDEN_DIM=300)

In [76]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 50)       750000      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 300), (None, 421200      embedding_1[0][0]                
__________

In [77]:
pwd

'/Users/akr712/Desktop/トレーニング待ち/CHATBOT'

In [None]:
from keras.utils import plot_model
# Write the diagram relative to the working directory rather than to a
# machine-specific absolute path that only exists on one computer.
plot_model(model, to_file='seq2seq.png')

In [78]:
model.compile(optimizer='adam', loss ='categorical_crossentropy', metrics = ['accuracy'])

## Step 4. Training Model

In [79]:
BATCH_SIZE = 32
EPOCHS = 5

In [80]:
encoder_input_data.shape

(143865, 20)

In [83]:
from sklearn.model_selection import train_test_split
# The one-hot targets are split in the next cell by plain slicing
# (decoder_output_data[:train_num] / [train_num:]), so this split must keep the
# original row order. With the default shuffle=True the inputs would be permuted
# while the targets are not, and every sample would get the wrong target.
en_train, en_val, ja_train, ja_val = train_test_split(
    encoder_input_data, decoder_input_data, shuffle=False
)

In [84]:
# Split the one-hot targets at the same size as the training inputs.
# NOTE(review): these slices are in the original row order, so they only line up
# with en_train / ja_train if the input split above preserves row order — confirm.
train_num = len(en_train)
target_train = decoder_output_data[:train_num]
target_val = decoder_output_data[train_num:]

In [None]:
history = model.fit([en_train, ja_train], 
                     target_train, 
                     epochs=EPOCHS, 
                     batch_size=BATCH_SIZE,
                     validation_data=([en_val, ja_val], target_val))

Train on 107898 samples, validate on 35967 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

#### Visualize Learning History

In [None]:
# 正確性の可視化
import matplotlib.pyplot as plt
%matplotlib inline

# Training vs. validation accuracy per epoch.
# Keras versions differ in the history key ('acc' vs 'accuracy'), so accept either.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.figure(figsize=(10, 6))
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Visualise the loss: training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Save the architecture (as JSON) and the trained weights.
json_string = model.to_json()
# Use a context manager so the file is flushed and closed deterministically.
with open('seq2seq.json', 'w') as json_file:
    json_file.write(json_string)
model.save_weights('seq2seq_weights.h5')

In [None]:
%ls

In [None]:
pwd

## Step 5. Inference 

In [None]:
# model_from_json is a module-level function of keras.models; Model instances
# have no from_json method.
from keras.models import model_from_json

with open('seq2seq.json', 'r') as json_file:
    loaded_model_json = json_file.read()
chat_model = model_from_json(loaded_model_json)

# load weights into new model (same file name that the save cell writes)
chat_model.load_weights('seq2seq_weights.h5')

In [None]:
ball = "I am Hungry. What do you suggest?"

In [None]:
import re
cleaned_ball = clean_text(ball)

In [None]:
# Encode the cleaned question as word indices.
# Words outside the vocabulary are skipped instead of raising KeyError.
query = [word2index[word] for word in cleaned_ball.split() if word in word2index]
# Pad exactly like the training data: MAX_LEN tokens, zeros appended at the end.
query = pad_sequences([query], maxlen=MAX_LEN, padding='post', truncating='post')

In [None]:
# Rebuild the encoder / decoder graph for inference from the *trained* layers of `model`.
# seq2seq_model_builder creates its layers as local variables, so they are fetched
# from the model here: the encoder LSTM is the one built without return_sequences,
# the decoder LSTM the one built with it.
lstm_layers = [layer for layer in model.layers if isinstance(layer, LSTM)]
encoder_LSTM = next(layer for layer in lstm_layers if not layer.return_sequences)
decoder_LSTM = next(layer for layer in lstm_layers if layer.return_sequences)
# Trained softmax head; a freshly constructed Dense layer would have random weights.
output_layer = next(layer for layer in model.layers if isinstance(layer, TimeDistributed))

encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
encoder_embedding = embed_layer(encoder_inputs)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
decoder_embedding = embed_layer(decoder_inputs)
decoder_outputs, _h, _c = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])

outputs = output_layer(decoder_outputs)

# Encoder-only model: question -> final encoder states.
chat_model = Model(encoder_inputs, [state_h, state_c])

In [None]:
# define the inputs for the decoder LSTM
h = Input(shape = (300, ))
c = Input(shape = (300, ))

In [None]:
# Decoder model for step-wise inference: (decoder tokens, h, c) -> (probabilities, new h, new c).
_outputs, _h, _c = decoder_LSTM(decoder_embedding, initial_state=[h, c])
# Reuse the trained softmax head of `model`; a new Dense layer would be untrained.
trained_head = next(layer for layer in model.layers if isinstance(layer, TimeDistributed))
outputs = trained_head(_outputs)
# The inputs must be the decoder's own Input tensors: the token input plus the two
# state inputs h and c (not the encoder input or the encoder's state tensors).
chat_model = Model([decoder_inputs, h, c], [outputs, _h, _c])

In [None]:
# pass in the question to the encoder LSTM, to get the final encoder states of the encoder LSTM.
# A dedicated encoder model is built here because `chat_model` has already been
# reassigned to the decoder model by the preceding cell.
encoder_model = Model(encoder_inputs, [state_h, state_c])
query_h, query_c = encoder_model.predict(query)

In [None]:
# Initialise the decoder input with the start-of-sentence token. It goes into the last
# position because the decoding loop reads the prediction of the last time step.
# The vocabulary stores the tag as 'bos' (see index2word: 1 -> 'bos'), so that is
# the key to look up.
answer = np.zeros((1, MAX_LEN))
answer[0, -1] = word2index['bos']

In [None]:
# Greedy decoding: feed the last generated token back into the decoder until the
# end-of-sentence token is produced or MAX_LEN tokens have been generated.

# the vocabulary stores the end tag as 'eos' (see index2word: 2 -> 'eos')
eos_index = word2index['eos']

# words generated at each time step
answer_1 = []

# run the inference model for at most MAX_LEN steps
for _ in range(MAX_LEN):
    # make predictions for the given input token and the current LSTM states
    prediction, prediction_h, prediction_c = chat_model.predict([answer, query_h, query_c])

    # from the predictions of shape (num_examples, maxLen, vocab_size), take the most
    # probable token of the last time step
    token_arg = int(np.argmax(prediction[0, -1, :]))

    # index 0 is the padding value and has no entry in index2word; skip it instead
    # of raising KeyError
    word = index2word.get(token_arg)
    if word is not None:
        answer_1.append(word)

    # stop once the end-of-sentence token has been generated
    if token_arg == eos_index:
        break

    # re-initialise the decoder input with the token just produced and carry the LSTM
    # states of this step over to the next one
    answer = np.zeros((1, MAX_LEN))
    answer[0, -1] = token_arg
    query_h = prediction_h
    query_c = prediction_c

# print the answer generated for the given question
print (" ".join(answer_1))