## Experiments / Todos:
 - share the embedding layer between encoder and decoder
 - plot training loss vs. validation loss
 - use a pretrained vocabulary/embedding vectors
 - use dropout
 - more epochs
 - Deeper network
 - add special tokens for: start of string, and unknown token
 - Run on GPU (oculus machine?)
 - gracefully handle words not in the vocab
 - try lstm
 - use characters instead of words
 
## Resources
 - https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
 - https://machinelearningmastery.com/define-encoder-decoder-sequence-sequence-model-neural-machine-translation-keras/
 - http://colah.github.io/posts/2015-08-Understanding-LSTMs/
 - https://github.com/oswaldoludwig/Seq2seq-Chatbot-for-Keras

In [2]:
################################################
#################### Util ######################
################################################
import pickle
from collections import OrderedDict
from nltk.tokenize.casual import TweetTokenizer
import re

def get_vocab():
    """Return the token -> index vocabulary.

    The mapping is read from 'vocab.pickle' in the current working
    directory. When that file does not exist, a new ``OrderedDict`` is
    returned that contains only the 'end of string' marker at index 0.
    """
    try:
        with open('vocab.pickle', 'rb') as pickled:
            loaded = pickle.load(pickled)
    except FileNotFoundError:
        # Nothing saved yet: start with just the end-of-string marker.
        loaded = OrderedDict([('end of string', 0)])
    return loaded
    
tknzr = TweetTokenizer()
# Matches ':name:' style shortcodes (a colon, word characters, a colon).
_SHORTCODE_RE = re.compile(r'(:[\w_]+:)')

def tokenize(str):
    """Split a piece of text into a list of tokens.

    Every ``:name:`` shortcode is first wrapped in angle brackets
    (``<:name:>``) and the result is handed to the shared TweetTokenizer.
    NOTE(review): the wrapping presumably makes the tokenizer keep each
    shortcode as one token -- confirm against TweetTokenizer's rules.
    """
    wrapped = _SHORTCODE_RE.sub(r'<\1>', str)
    return tknzr.tokenize(wrapped)


In [74]:
################################################
################ Build Vocab ###################
################################################
inputFiles = ['answers_simple.txt', 'context_simple.txt']

# Start from the vocabulary already on disk (or a fresh one) and extend it.
vocab = get_vocab()

for filePath in inputFiles:
    with open(filePath, encoding='utf8') as file:
        for line in file:
            for token in tokenize(line):
                # A token seen for the first time gets the next free index;
                # known tokens keep the index they already have.
                vocab.setdefault(token, len(vocab))

# Persist the extended vocabulary and report its size.
with open('vocab.pickle', 'wb') as file:
    pickle.dump(vocab, file, protocol=pickle.HIGHEST_PROTOCOL)
    print('{0} words in vocab'.format(len(vocab)))

4140 words in vocab


In [76]:
################################################
################ encode words ##################
################################################
import os

input_files = ['answers_simple.txt', 'context_simple.txt']

vocab = get_vocab()

# For each input file, write '<name>.encoded<ext>' in which every line is the
# space-separated vocabulary indices of that line's tokens, right-padded with
# '0' so that all rows of the file have the same number of columns.
for file_path in input_files:
    file_name, ext = os.path.splitext(file_path)
    output_file_path = file_name + '.encoded' + ext

    # Tokenize each line exactly once and reuse the result both for finding
    # the longest line and for encoding (no second pass over the file).
    with open(file_path, encoding='utf8') as input_file:
        tokenized_lines = [tokenize(line) for line in input_file]

    # The padding width is the longest line of *this* file.
    max_tokens = max((len(tokens) for tokens in tokenized_lines), default=0)

    # Encode everything before opening the output file, so a token missing
    # from the vocabulary (KeyError) cannot leave a half-written file behind.
    encoded_rows = []
    for tokens in tokenized_lines:
        encoded_tokens = [str(vocab[token]) for token in tokens]
        encoded_tokens += ['0'] * (max_tokens - len(encoded_tokens))
        encoded_rows.append(' '.join(encoded_tokens) + '\n')

    with open(output_file_path, 'w') as output_file:
        output_file.writelines(encoded_rows)
    print('Encoded "{0}" to "{1}".'.format(file_path, output_file_path))

Encoded "answers_simple.txt" to "answers_simple.encoded.txt".
Encoded "context_simple.txt" to "context_simple.encoded.txt".


In [3]:
################################################
################# train model ##################
################################################
import numpy as np
from keras.models import Model
from keras.layers import Input, GRU, Dense, Embedding
from keras.utils import to_categorical
import pickle

# Token -> index mapping; its size sets the embedding input dimension and the
# width of the softmax output layer below.
vocab = get_vocab()

# NOTE(review): both paths point at the *context* file, so the network is
# trained to reproduce its own input rather than to map contexts to answers.
# Confirm this is intentional.
model_input_file = 'context_simple.encoded.txt'
model_output_file = 'context_simple.encoded.txt'
# Used both as the embedding width and as the GRU state size that is handed
# from the encoder to the decoder.
THOUGHT_VECTOR_SIZE = 100

# Each encoded file is loaded as a numeric matrix, one row per line.
encoder_input_data = np.loadtxt(model_input_file)
decoder_target_data = np.loadtxt(model_output_file)
# Decoder input = target shifted right by one time step: drop the last
# column, then insert index 0 as the first column.
decoder_input_data = decoder_target_data[:, :-1]
decoder_input_data = np.insert(decoder_input_data, 0, values=0, axis=1)
# One-hot targets, as required by the 'categorical_crossentropy' loss below.
# NOTE(review): this adds a vocab-sized axis to the targets, which may be
# memory-hungry for larger vocabularies -- confirm it fits.
decoder_target_one_hot = to_categorical(decoder_target_data, len(vocab))


#Create layers
# Encoder: token indices -> embeddings -> GRU. return_state=True makes the
# GRU return its final state in addition to its output.
encoder_input_layer = Input(shape=(None,))
encoder_embedding_layer = Embedding(len(vocab), THOUGHT_VECTOR_SIZE)
encoder_gru_layer = GRU(THOUGHT_VECTOR_SIZE, return_state=True)

# Decoder: token indices -> embeddings -> GRU (one output per time step)
# -> softmax over the vocabulary.
decoder_input_layer = Input(shape=(None,))
decoder_embedding_layer = Embedding(len(vocab), THOUGHT_VECTOR_SIZE)
decoder_gru_layer = GRU(THOUGHT_VECTOR_SIZE, return_sequences=True)
decoder_dense_layer = Dense(len(vocab), activation='softmax')


#connect network
encoder = encoder_embedding_layer(encoder_input_layer)
# Only encoder_state is used further on; the GRU output itself is not.
encoder, encoder_state = encoder_gru_layer(encoder)

decoder = decoder_embedding_layer(decoder_input_layer)
# The decoder GRU starts from the encoder's final state.
decoder = decoder_gru_layer(decoder, initial_state=encoder_state)
decoder = decoder_dense_layer(decoder)


# Training model: inputs are [encoder tokens, shifted decoder tokens], the
# output is a per-step distribution over the vocabulary.
model = Model([encoder_input_layer, decoder_input_layer], decoder)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot,
          batch_size=32,
          epochs=10,
          validation_split=0.2)

# Everything needed to rebuild the layers elsewhere and load the trained
# weights into them: the sizes plus each layer's weight arrays.
network_config = {
    'vocab_size': len(vocab),
    'thought_vector_size': THOUGHT_VECTOR_SIZE,
    'sequence_length': encoder_input_data.shape[1],
    'weights': {
        'encoder_embedding': encoder_embedding_layer.get_weights(),
        'encoder_gru': encoder_gru_layer.get_weights(),
        'decoder_embedding': decoder_embedding_layer.get_weights(),
        'decoder_gru': decoder_gru_layer.get_weights(),
        'decoder_dense': decoder_dense_layer.get_weights()        
    }
}

with open('network_config.pickle', 'wb') as file:
    pickle.dump(network_config, file)

print('saved network config to "{}". Vocab size: {}. Thought vector size: {}. Sequence length: {}.'
    .format(
        'network_config.pickle', 
        network_config['vocab_size'], 
        network_config['thought_vector_size'], 
        network_config['sequence_length']
    )
)  

Train on 5536 samples, validate on 1384 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
saved network config to "network_config.pickle". Vocab size: 4140. Thought vector size: 100. Sequence length: 63.


In [4]:
################################################
############## inference models ################
################################################
import numpy as np
from keras.models import Model
from keras.layers import Input, GRU, Dense, Embedding
import pickle

# Load the sizes and trained weights written to 'network_config.pickle'.
# pickle.load must only be used on files you trust.
with open('network_config.pickle', 'rb') as file:
    config = pickle.load(file)
    
#Encoder
# Stand-alone encoder: token indices in, final GRU state out.
encoder_input_layer = Input(shape=(None,))
encoder_embedding_layer = Embedding(config['vocab_size'], config['thought_vector_size'])
encoder_gru_layer = GRU(config['thought_vector_size'], return_state=True)

encoder = encoder_embedding_layer(encoder_input_layer)
encoder, encoder_state = encoder_gru_layer(encoder)

encoder_model = Model([encoder_input_layer], encoder_state)
# NOTE(review): compile() is probably unnecessary for a predict-only model --
# confirm for the Keras version in use.
encoder_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Weights are set after the layers have been called (and therefore built).
encoder_embedding_layer.set_weights(config['weights']['encoder_embedding'])
encoder_gru_layer.set_weights(config['weights']['encoder_gru'])

#Decoder
# Stand-alone decoder: takes the previous token(s) plus a GRU state as
# explicit inputs, and returns the vocabulary distribution together with the
# updated state so that a decode loop can feed the state back in.
decoder_input_layer = Input(shape=(None,))
decoder_thought_vector_input_layer = Input(shape=(config['thought_vector_size'],))
decoder_embedding_layer = Embedding(config['vocab_size'], config['thought_vector_size'])
decoder_gru_layer = GRU(config['thought_vector_size'], return_sequences=True, return_state=True)
decoder_dense_layer = Dense(config['vocab_size'], activation='softmax')

decoder = decoder_embedding_layer(decoder_input_layer)
decoder, decoder_state = decoder_gru_layer(decoder, initial_state=decoder_thought_vector_input_layer)
decoder = decoder_dense_layer(decoder)

decoder_model = Model([decoder_input_layer, decoder_thought_vector_input_layer], [decoder, decoder_state])
decoder_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
decoder_embedding_layer.set_weights(config['weights']['decoder_embedding'])
decoder_gru_layer.set_weights(config['weights']['decoder_gru'])
decoder_dense_layer.set_weights(config['weights']['decoder_dense'])

print('finished creating models')

finished creating models


In [13]:
vocab = get_vocab()
# Index -> token lookup. This relies on the vocabulary preserving insertion
# order and on indices having been assigned as 0, 1, 2, ... in that order.
vocab_list = list(vocab.keys())

def reply(input_str):
    """Generate a reply to ``input_str`` with greedy step-by-step decoding.

    The input is tokenized, mapped to vocabulary indices, right-padded with
    0 up to ``config['sequence_length']`` and run through ``encoder_model``.
    The resulting state seeds ``decoder_model``, which is then called one
    token at a time; at every step the most probable token is taken and fed
    back in, until 'end of string' is predicted or
    ``config['sequence_length']`` tokens have been produced.

    Args:
        input_str: the text to reply to.

    Returns:
        The decoded tokens as one string, each token preceded by a single
        space (so a non-empty reply starts with a space). An empty string
        if 'end of string' is predicted first.

    Raises:
        KeyError: if ``input_str`` contains a token that is not in ``vocab``.
    """
    max_tokens = config['sequence_length']

    #TODO: doesn't support words not in the vocabulary
    encoded_tokens = [vocab[token] for token in tokenize(input_str)]
    # Pad with 0 up to the training sequence length; longer inputs are
    # passed through unpadded.
    encoded_tokens += [0] * (max_tokens - len(encoded_tokens))

    # Encode the input as the decoder's initial state.
    decoder_state = encoder_model.predict(np.array([encoded_tokens]))

    # Seed the decoder with index 0 (end of string). Should maybe use a
    # dedicated start token for this.
    last_token = np.array([[0]])

    # Decode one token per iteration (batch of size 1).
    decoded_words = []
    while True:
        predicted_token_one_hot, decoder_state = decoder_model.predict([last_token, decoder_state])

        # Greedy choice: take the most probable token.
        predicted_token_index = np.argmax(predicted_token_one_hot[0, 0])
        predicted_word = vocab_list[predicted_token_index]

        if predicted_word == 'end of string':
            break
        decoded_words.append(predicted_word)

        # The length limit counts decoded *tokens*; comparing the character
        # length of the reply to the token limit cut replies short.
        if len(decoded_words) >= max_tokens:
            break

        last_token = np.array([[predicted_token_index]])

    # Keep the established output format: a space before every word.
    return ''.join(' ' + word for word in decoded_words)

reply('is there an airport in your area ?')

' what is the matter ? i am not a lot .'