# Seq2Seq: Encoder-Decoder Chatbot 

![](https://cdn-images-1.medium.com/max/2560/1*1I2tTjCkMHlQ-r73eRn4ZQ.png)

In [2]:
import numpy as np
import pandas as pd
import string
import pickle
import operator
import matplotlib.pyplot as plt
%matplotlib inline

## Step 1. Import Data

In [3]:
# .txtから会話データを取得する
import codecs

with codecs.open("movie_lines.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")
    conversations = []
    for line in lines:
        data = line.split(" +++$+++ ")
        conversations.append(data)

In [4]:
conversations[:6]

[['L1045', 'u0', 'm0', 'BIANCA', 'They do not!'],
 ['L1044', 'u2', 'm0', 'CAMERON', 'They do to!'],
 ['L985', 'u0', 'm0', 'BIANCA', 'I hope so.'],
 ['L984', 'u2', 'm0', 'CAMERON', 'She okay?'],
 ['L925', 'u0', 'm0', 'BIANCA', "Let's go."],
 ['L924', 'u2', 'm0', 'CAMERON', 'Wow']]

In [5]:
# idと会話だけ取り出す
chats = {}
for tokens in conversations:
    if len(tokens) > 4:
        idx = tokens[0][1:]
        chat = tokens[4]
        chats[int(idx)] = chat

In [6]:
chats

{524288: 'You bastards!!',
 524293: "I knew this would happen! Those bastard Canadians have now killed a child! Can't people see the damage that film is doing?!",
 524294: 'He was killed doing something he saw in the movie. It was Terrance and Phillip... THEY killed Kenny.',
 524295: 'You bastards.',
 524296: "This is it! The time for action is NOW!!  <b>Something must be done! This is like a spreading rash! They're pulling out our children's brains and filling them with trash! Can't you see what this is leading to? A world of smut and sex and poo! I believe the good fight has begun! Something must be done!</b>",
 524308: "I bet him he couldn't do it... I bet him a hundred dollars!",
 524309: "Come on, Cartman. It's not your fault.",
 524310: "No, I know. I'm just fucking STOKED I don't have to pay him!",
 524311: 'Oh.',
 524312: "I can't believe he's dead.",
 524313: "Yeah, I'm having total deja vu right now. Like this has all happened before...",
 524315: 'How can they do this?',
 52

In [7]:
# idと会話をセットにする
sorted_chats = sorted(chats.items(), key = lambda x: x[0])
sorted_chats

[(49, 'Did you change your hair?'),
 (50, 'No.'),
 (51, 'You might wanna think about it'),
 (59, 'I missed you.'),
 (60, 'It says here you exposed yourself to a group of freshmen girls.'),
 (61, 'It was a bratwurst.  I was eating lunch.'),
 (62, 'With the teeth of your zipper?'),
 (63, 'You the new guy?'),
 (64, 'So they tell me...'),
 (65, "C'mon.  I'm supposed to give you the tour."),
 (66, 'So -- which Dakota you from?'),
 (67, "North, actually.  How'd you   ?"),
 (68, 'I was kidding. People actually live there?'),
 (69, "Yeah.  A couple.  We're outnumbered by the cows, though."),
 (70, 'How many people were in your old school?'),
 (71, 'Thirty-two.'),
 (72, 'Get out!'),
 (73, 'How many people go here?'),
 (74, 'Couple thousand. Most of them evil'),
 (77, "That I'm used to."),
 (78,
  'Yeah, but these guys have never seen a horse.  They just jack off to Clint Eastwood.'),
 (87, 'That girl -- I --'),
 (88, 'You burn, you pine, you perish?'),
 (89, 'Who is she?'),
 (90, "Bianca Stratf

In [8]:
# 会話のペアごとに辞書を作る { 会話セットid: [会話リスト] }
conves_dict = {}
counter = 1
conves_ids = []
for i in range(1, len(sorted_chats)+1):
    if i < len(sorted_chats):
        if (sorted_chats[i][0] - sorted_chats[i-1][0]) == 1:
            # 1つ前の会話の頭の文字がないのを確認
            if sorted_chats[i-1][1] not in conves_ids:
                conves_ids.append(sorted_chats[i-1][1])
            conves_ids.append(sorted_chats[i][1])
        elif (sorted_chats[i][0] - sorted_chats[i-1][0]) > 1:            
            conves_dict[counter] = conves_ids
            conves_ids = []
        counter += 1
    else:
        pass

In [9]:
conves_dict

{196609: ["Drink up, Charley. We're ahead of you.",
  "I'm not thirsty.",
  "After what we been hearing about your brother, I thought your throat'd be kind of dry.",
  "So they're walking along and smiling. That doesn't mean he's going to talk. There's no evidence until he gives public testimony.",
  "Thanks for the legal advice, Charley. That's what we always kept you around for.  Now how do we keep him from giving this testimony? Isn't that the er as you put it main order of business?",
  'He was always a good kid. You know that.'],
 3: ['Did you change your hair?', 'No.', 'You might wanna think about it'],
 283990: ['I believe I have found a faster way.', 'Hmmm?', 'Uh-uh.'],
 294913: ["I got all I could get which was 750,000 shares plus 5000 March 50 calls. Average price of $47 a share And $4 per contract for the call. I just wish I could've got more.",
  "Don't expect to get it all, sport, you'll burn out. First rule of business is never get emotional about stock, clouds the judgme

In [10]:
context_and_target = []
for conves in conves_dict.values():
    # ペアがない会話は捨てる
    if len(conves) % 2 != 0:
        conves = conves[:-1]
    for i in range(0, len(conves), 2):
        context_and_target.append((conves[i], conves[i+1]))

In [11]:
# ペア完成
context_and_target[:5]

[("Drink up, Charley. We're ahead of you.", "I'm not thirsty."),
 ("After what we been hearing about your brother, I thought your throat'd be kind of dry.",
  "So they're walking along and smiling. That doesn't mean he's going to talk. There's no evidence until he gives public testimony."),
 ("Thanks for the legal advice, Charley. That's what we always kept you around for.  Now how do we keep him from giving this testimony? Isn't that the er as you put it main order of business?",
  'He was always a good kid. You know that.'),
 ('Did you change your hair?', 'No.'),
 ('I believe I have found a faster way.', 'Hmmm?')]

In [12]:
context, target = zip(*context_and_target)

In [13]:
context = list(context)
target = list(target)

In [14]:
context[:5]

["Drink up, Charley. We're ahead of you.",
 "After what we been hearing about your brother, I thought your throat'd be kind of dry.",
 "Thanks for the legal advice, Charley. That's what we always kept you around for.  Now how do we keep him from giving this testimony? Isn't that the er as you put it main order of business?",
 'Did you change your hair?',
 'I believe I have found a faster way.']

In [15]:
target[:10]

["I'm not thirsty.",
 "So they're walking along and smiling. That doesn't mean he's going to talk. There's no evidence until he gives public testimony.",
 'He was always a good kid. You know that.',
 'No.',
 'Hmmm?',
 "Don't expect to get it all, sport, you'll burn out. First rule of business is never get emotional about stock, clouds the judgment. Where do we stand?",
 "You can have the one next to yours if you want. One twenty.  It ain't took.",
 "That's got two double beds.",
 'Good luck.',
 'Very attractive.  Good idea.  Now I <u>really</u> want to fuck you.']

## Step 2. Preprocessing for text data

In [16]:
# from my_seq2seq_text_cleanear import text_modifier, nonalpha_remover
import re
MAX_LEN = 12

In [17]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

### 2-1. Clean Text

In [18]:
tidy_target = []
for conve in target:
    text = clean_text(conve)
    tidy_target.append(text)

In [19]:
tidy_target[:10]

['i am not thirsty',
 "so they are walking along and smiling that does not mean he is going to talk there's no evidence until he gives public testimony",
 'he was always a good kid you know that',
 'no',
 'hmmm',
 'do not expect to get it all sport you will burn out first rule of business is never get emotional about stock clouds the judgment where do we stand',
 'you can have the one next to yours if you want one twenty  it ai not took',
 'that is got two double beds',
 'good luck',
 'very attractive  good idea  now i ureallyu want to fuck you']

In [20]:
tidy_context = []
for conve in context:
    text = clean_text(conve)
    tidy_context.append(text)

In [21]:
tidy_context[:10]

['drink up charley we are ahead of you',
 'after what we been hearing about your brother i thought your throat would be kind of dry',
 'thanks for the legal advice charley that is what we always kept you around for  now how do we keep him from giving this testimony is not that the er as you put it main order of business',
 'did you change your hair',
 'i believe i have found a faster way',
 'i got all i could get which was 750000 shares plus 5000 march 50 calls average price of $47 a share and $4 per contract for the call i just wish i could have got more',
 'what about one fortytwo',
 'no one fortytwo',
 'ever take a look at the women who work in pet stores  wow',
 'jesus are you gay enough or what']

In [22]:
# decoderのinputには<BOS>と<EOS>タグ
bos = "<BOS> "
eos = " <EOS>"
final_target = [bos + conve + eos for conve in tidy_target] 
encoder_inputs = tidy_context
decoder_inputs = final_target

In [23]:
import codecs
with codecs.open("encoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")
    encoder_text = []
    for line in lines:
        data = line.split("\n")[0]
        encoder_text.append(data)

In [24]:
len(encoder_text)

143865

In [25]:
encoder_text[:10]

['did you change your hair',
 'i missed you',
 'it was a bratwurst  i was eating lunch',
 'you the new guy',
 "c'mon  i am supposed to give you the tour",
 'north actually  how would you   ',
 'yeah  a couple  we are outnumbered by the cows though',
 'thirtytwo',
 'how many people go here',
 'that i am used to']

In [26]:
with codecs.open("decoder_inputs.txt", "rb", encoding="utf-8", errors="ignore") as f:
    lines = f.read().split("\n")
    decoder_text = []
    for line in lines:
        data = line.split("\n")[0]
        decoder_text.append(data)

In [27]:
decoder_text[5]

'<BOS> i was kidding people actually live there <EOS>'

### 2-2. MAKE VOCABRALY

In [28]:
full_text = encoder_text + decoder_text

In [29]:
# 一旦もともと辞書サイズを調べる
dictionary = []
for text in full_text:
    words = text.split()
    for i in range(0, len(words)):
        if words[i] not in dictionary:
            dictionary.append(words[i])

In [30]:
from keras.preprocessing.text import Tokenizer
VOCAB_SIZE = 14999
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

Using TensorFlow backend.


In [31]:
# 辞書を作る
tokenizer.fit_on_texts(full_text)
word_index = tokenizer.word_index
len(word_index)

65283

In [32]:
# リバースした辞書を用意
index2word = {}
for k, v in word_index.items():
    if v < 15000:
        index2word[v] = k
    if v > 15000:
        continue

In [33]:
index2word

{1: 'bos',
 2: 'eos',
 3: 'you',
 4: 'i',
 5: 'the',
 6: 'to',
 7: 'is',
 8: 'a',
 9: 'not',
 10: 'it',
 11: 'that',
 12: 'do',
 13: 'and',
 14: 'are',
 15: 'of',
 16: 'in',
 17: 'have',
 18: 'what',
 19: 'me',
 20: 'we',
 21: 'he',
 22: 'am',
 23: 'this',
 24: 'for',
 25: 'will',
 26: 'know',
 27: 'was',
 28: 'your',
 29: 'my',
 30: 'on',
 31: 'be',
 32: 'no',
 33: 'with',
 34: 'but',
 35: 'they',
 36: 'would',
 37: 'just',
 38: 'all',
 39: 'like',
 40: 'did',
 41: 'about',
 42: 'get',
 43: 'so',
 44: 'out',
 45: 'if',
 46: 'here',
 47: 'she',
 48: 'him',
 49: 'up',
 50: 'how',
 51: 'got',
 52: 'can',
 53: 'want',
 54: 'think',
 55: 'at',
 56: 'there',
 57: 'one',
 58: 'right',
 59: 'go',
 60: 'now',
 61: 'well',
 62: 'going',
 63: 'her',
 64: 'why',
 65: 'see',
 66: 'as',
 67: 'oh',
 68: 'his',
 69: 'could',
 70: 'yes',
 71: 'who',
 72: 'good',
 73: 'when',
 74: 'cannot',
 75: 'from',
 76: 'where',
 77: 'were',
 78: 'yeah',
 79: 'tell',
 80: 'come',
 81: 'some',
 82: 'been',
 83: 'an

In [34]:
word2index = {}
for k, v in index2word.items():
    word2index[v] = k

In [35]:
len(word2index) == len(index2word)

True

In [36]:
len(index2word)

14999

### 2-3. ONE-HOT VECTORIZER

In [37]:
# 単語のシーケンスを作る np.arrayにする
encoder_sequences = tokenizer.texts_to_sequences(encoder_text)
# encider_sequences = np.array(encider_sequences)

In [38]:
# デコーダーデータ
decoder_sequences = tokenizer.texts_to_sequences(decoder_text)
# decoder_sequences = np.array(decoder_sequences)

In [39]:
encoder_sequences[:5]

[[40, 3, 436, 28, 621],
 [4, 950, 3],
 [10, 27, 8, 4, 27, 1107, 802],
 [3, 5, 186, 168],
 [662, 4, 22, 346, 6, 130, 3, 5, 2407]]

In [40]:
decoder_sequences
m = 0
for seq in decoder_sequences:
    if len(seq)>m:
        m=len(seq)
print(m)

358


In [41]:
for seqs in encoder_sequences:
    for seq in seqs:
        if seq > 14999:
            print(seq)
            break

In [42]:
VOCAB_SIZE = len(index2word) + 1
VOCAB_SIZE

15000

In [43]:
import numpy as np
MAX_LEN = 20
num_samples = len(encoder_sequences)
decoder_output_data = np.zeros((num_samples, MAX_LEN, VOCAB_SIZE), dtype="float32")

In [44]:
# outputの３Dテンソル
for i, seqs in enumerate(decoder_sequences):
    for j, seq in enumerate(seqs):
        if j > 0:
            decoder_output_data[i][j][seq] = 1.

IndexError: index 20 is out of bounds for axis 0 with size 20

In [None]:
decoder_output_data.shape

### 2-4. PADDING

In [None]:
from keras.preprocessing.sequence import pad_sequences
encoder_input_data = pad_sequences(encoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')
decoder_input_data = pad_sequences(decoder_sequences, maxlen=MAX_LEN, dtype='int32', padding='post', truncating='post')

In [None]:
decoder_input_data[0]

### 2-5. Word2Vec: pretrained glove vector

In [None]:
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

print("Glove Loded!")

In [None]:
embedding_dimention = 50
def embedding_matrix_creater(embedding_dimention, word_index):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
embedding_matrix = embedding_matrix_creater(50, word_index=word2index)

In [None]:
embed_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=50, trainable=True,)
embed_layer.build((None,))
embed_layer.set_weights([embedding_matrix])

## Step 3. Build Seq2Seq Model

In [None]:
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [None]:
def seq2seq_model_builder(HIDDEN_DIM=300):
    
    encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    encoder_embedding = embed_layer(encoder_inputs)
    encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
    encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
    
    decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
    decoder_embedding = embed_layer(decoder_inputs)
    decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)
    decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])
    
    # dense_layer = Dense(VOCAB_SIZE, activation='softmax')
    outputs = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], outputs)
    
    return model

In [None]:
model = seq2seq_model_builder(HIDDEN_DIM=300)

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss ='categorical_crossentropy', metrics = ['accuracy'])

## Step 4. Training Model

In [None]:
BATCH_SIZE = 32
EPOCHS = 5

In [None]:
encoder_input_data.shape

In [None]:
from sklearn.model_selection import train_test_split
en_train, en_val, ja_train, ja_val = train_test_split(encoder_input_data, decoder_input_data)

In [None]:
train_num = len(en_train)
target_train = decoder_output_data[:train_num]
target_val = decoder_output_data[train_num:]

In [None]:
history = model.fit([en_train, ja_train], 
                     target_train, 
                     epochs=EPOCHS, 
                     batch_size=BATCH_SIZE,
                     validation_data=([en_val, ja_val], target_val))

#### Visualize Learning History

In [None]:
# 正確性の可視化
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10, 6))
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# 損失関数の可視化
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# 重みを保存する
json_string = model.to_json()
open('seq2seq.json', 'w').write(json_string)
model.save_weights('seq2seq_weights.h5')

In [None]:
%ls

In [None]:
pwd

## Step 5. Inference 

In [None]:
json_file = open('seq2seq.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
chat_model = model.from_json(loaded_model_json)

# load weights into new model
chat_model.load_weights("model.h5")

In [None]:
ball = "I am Hungry. What do you suggest?"

In [None]:
import re
cleaned_ball = clean_text(ball)

In [None]:
query = cleaned_ball.split()
query = np.array([word2index[word] for word in query])
query = pad_sequences([query], maxlen=20)

In [None]:
encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
encoder_embedding = embed_layer(encoder_inputs)
# encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
decoder_embedding = embed_layer(decoder_inputs)
# decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)
decoder_outputs, _h, _c = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])

# dense_layer = Dense(VOCAB_SIZE, activation='softmax')
outputs = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(decoder_outputs)

chat_model = Model(encoder_inputs, [state_h, state_c])

In [None]:
# define the inputs for the decoder LSTM
h = Input(shape = (300, ))
c = Input(shape = (300, ))

In [None]:
_outputs, _h, _c = decoder_LSTM(decoder_embedding, initial_state=[h, c])
outputs = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(_outputs)
chat_model = Model([encoder_inputs, state_h, state_c], [outputs, _h, _c])

In [None]:
# pass in the question to the encoder LSTM, to get the final encoder states of the encoder LSTM
query_h, query_c = chat_model.predict(query)

In [None]:
# initialize the answer that will be generated for the 'BOS' input. Since we have used pre-padding for padding sequences,
# the last token in the 'answer' variable is initialised with the index for 'BOS'.
answer = np.zeros((1, MAX_LEN))
answer[0, -1] = word2index['<BOS >']

In [None]:
# i keeps track of the length of the generated answer. This won't allow the model to genrate sequences with more than 20 words.
i = 1

# make a new list to store the words generated at each time step
answer_1 = []

# flag to stop the model when 'EOS' tag is generated or when 20 time steps have passed.
flag = 0

# run the inference model
while flag != 1:
    # make predictions for the given input token and encoder states
    prediction, prediction_h, prediction_c = chat_model.predict([answer, query_h, query_c])
    
    # from the generated predictions of shape (num_examples, maxLen, vocab_size), find the token with max probability
    token_arg = np.argmax(prediction[0, -1, :])
    
    # append the corresponding word of the index to the answer_1 list
    answer_1.append(index2word[token_arg])
    
    # set flag to 1 if 'EOS' token is generated or 20 time steps have passed
    if token_arg == word2index[' <EOS>'] or i > 20:
        flag = 1
    # re-initialise the answer variable, and set the last token to the output of the current time step. This is then passed
    # as input to the next time step, along with the LSTM states of the current time step
    answer = np.zeros((1, MAX_LEN))
    answer[0, -1] = token_arg
    query_h = prediction_h
    query_c = prediction_c
    
    # increment the count of the loop
    i += 1
    
 # print the answer generated for the given question
print (" ".join(answer_1))