In [1]:
import pandas as pd
import numpy as np

# Load movie_dialog.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("movie_dialog.csv")

In [2]:
# Name the two columns so later cells can read them as df.statement and df.reply.
df.columns = ['statement', 'reply']

In [3]:
# Preview the first rows to check the renamed columns.
df.head()

Unnamed: 0,statement,reply
0,you're asking me out. that's so cute. what's y...,forget it.
1,"no, no, it's my fault we didn't have a proper ...",cameron.
2,"gosh, if only we could find kat a boyfriend...",let me see what i can do.
3,c'esc ma tete. this is my head,right. see? you're ready for the quiz.
4,how is our little find the wench a date plan p...,"well, there's someone i think might be"


In [4]:
# Parallel lists: the i-th statement in input_texts pairs with the i-th reply
# in target_texts.
input_texts = []
target_texts = []
# Sets that accumulate every distinct character seen in the input and target
# texts respectively.
input_vocabulary, output_vocabulary = set(), set()


In [9]:
# Sequence delimiters for every target text:
#   start_token -> SOS (start of sequence), a tab character
#   stop_token  -> EOS (end of sequence), a newline character
start_token, stop_token = '\t', '\n'

In [10]:
# Upper bound on the number of training samples: 25,000, or len(df) - 1 when
# the dataset is smaller than that.
# NOTE(review): none of the visible cells reads max_training_samples, so this
# cap does not appear to be applied anywhere — confirm whether that is intended.
max_training_samples = min(25000, len(df) -1)

In [11]:
# Collect the usable (statement, reply) pairs and record every character they
# contain.
for input_text, target_text in zip(df.statement, df.reply):
    # Skip pairs where either side is not a real string. Missing cells
    # typically arrive as float NaN; checking for `str` also protects against
    # any other non-text value, which could be neither concatenated with the
    # tokens nor iterated character by character.
    if not isinstance(input_text, str) or not isinstance(target_text, str):
        continue
    # Wrap the reply in the start/stop tokens so each target sequence carries
    # explicit begin and end markers.
    target_text = start_token + target_text + stop_token
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds each character of the string, ignoring ones already seen.
    input_vocabulary.update(input_text)
    output_vocabulary.update(target_text)


### 2.  Building your character dictionary

In [12]:
# Convert each character of the input and target texts into one-hot vectors

In [13]:
# Inspect one target string; repr() makes the tab/newline start and stop tokens visible.
repr(target_texts[10])

"'\\ttons\\n'"

In [14]:
# Replace each vocabulary with a sorted list of its characters so that every
# character has a stable position that can be used as its index.
input_vocabulary, output_vocabulary = (
    sorted(input_vocabulary),
    sorted(output_vocabulary),
)

In [15]:
# Number of distinct characters on each side; used as the width of the
# one-hot vectors.
input_vocab_size, output_vocab_size = (
    len(input_vocabulary),
    len(output_vocabulary),
)

In [16]:
# Display the number of distinct input characters.
input_vocab_size

44

In [17]:
# Display the number of distinct target characters.
output_vocab_size

46

In [18]:
# Longest input text and longest target text, measured in characters; these
# set the time dimension of the one-hot tensors.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [19]:
# Display the length of the longest input text.
max_encoder_seq_length

100

In [20]:
# Display the length of the longest target text.
max_decoder_seq_length

102

In [32]:
# Map each input character to its integer index (its position in the sorted
# input vocabulary).
input_token_index = {str(char): i for i, char in enumerate(input_vocabulary)}
# Map each target character to its integer index. This must be built from
# output_vocabulary, not input_vocabulary: the target side has its own
# character set and its indices must line up with the output_vocab_size-wide
# one-hot vectors.
target_token_index = {str(char): i for i, char in enumerate(output_vocabulary)}

In [33]:
# Inverse lookups (integer index -> character) for the input and target
# character indices.
reverse_input_char_index = {
    index: str(char) for char, index in input_token_index.items()
}
reverse_target_char_index = {
    index: str(char) for char, index in target_token_index.items()
}

In [34]:
# Generate one-hot vectors that represent each character
encoder_input_data = np.zeros((len(input_texts), 
                               max_encoder_seq_length,
                               input_vocab_size),
                               dtype='float32')
decoder_input_data = np.zeros((len(input_texts),
                               max_decoder_seq_length,
                               output_vocab_size),
                               dtype='float32')
decoder_target_data = np.zeros((len(input_texts),
                                max_decoder_seq_length,
                                output_vocab_size),
                                dtype='float32')

In [42]:
# Fill the one-hot tensors: for sample i at timestep t, set the slot that
# corresponds to the character's index to 1.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # Keep every character, including the start/stop tokens, whenever the
        # target index knows it, so sequence boundaries survive in the data.
        # Only a character that has no entry in target_token_index is
        # replaced by a space (instead of raising KeyError).
        if char not in target_token_index:
            char = ' '
        char_index = target_token_index[char]
        decoder_input_data[i, t, char_index] = 1.
        # The target is the decoder input shifted one step earlier: at
        # step t-1 the expected output is the character fed in at step t.
        if t > 0:
            decoder_target_data[i, t - 1, char_index] = 1.

## 3. Train your model

In [25]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

Using TensorFlow backend.


In [None]:
# Training hyperparameters.
batch_size, epochs = 64, 100
num_neurons = 256  # number of units in each LSTM layer

# Encoder: consumes one-hot input sequences of any length.
encoder_inputs = Input(shape=(None, input_vocab_size))
encoder = LSTM(num_neurons, return_state=True)
# The layer is created with return_state=True, so calling it yields its
# output followed by two state tensors.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two states are kept; they seed the decoder below.
encoder_states = [state_h, state_c]

# Decoder: an LSTM that starts from the encoder states and emits an output at
# every timestep (return_sequences=True). Its own states are discarded here.
decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(num_neurons, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax layer sized to the target vocabulary, applied to the decoder output.
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> decoder output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Hold out 10% of the samples for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)


Train on 57823 samples, validate on 6425 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

In [None]:
# Stand-alone encoder: maps an input sequence to the two encoder state tensors.
encoder_model = Model(encoder_inputs, encoder_states)
# Placeholders for the two states that are fed to the decoder as its initial
# state, each of width num_neurons.
thought_input = [
    Input(shape=(num_neurons,)), Input(shape=(num_neurons,))
]
# Re-apply the already-built decoder layers, this time starting from the
# externally supplied states and keeping the decoder's resulting states.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
decoder_states = [state_h, state_c]
# Fixed the misspelled name `decoder_ouputs`, which raised NameError.
decoder_outputs = decoder_dense(decoder_outputs)