In [1]:
"""
The bi-directional RNN that will form the basis of all our future dialogue
models.

The goal is to create a bi-directional encoder-decoder that can be either used independently
for next-response generation, or integrated into a hierarchical (or more complicated)
model.

To that end, the BidirectionalRNN class should support the same interface as our
other dialogue models: it should take a Config() object, it should take a DialogueCorpus()
object, and it should support the same fit() and predict() methods that (along with SciPy
classifiers) all our models support.
"""

# Keras packages
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, GRU, Dense, Bidirectional

from nltk import word_tokenize

import numpy as np

import math
import re

# Our packages
from config import Config
from dialogue_corpus import DialogueCorpus

class Encoder:
    """
    Encoder half of the encoder-decoder.

    Embeds an integer token sequence, runs it through a recurrent layer and
    exposes the layer's final state so a decoder can be conditioned on it.

    Attributes set by build():
        encoder_input          the Keras Input node
        encoder_embed          the embedded input tensor
        encoder_rnn            the recurrent layer (LSTM or GRU)
        encoder_hidden_state   [h, c] for an LSTM, a single state tensor for a GRU
        training_model         Model mapping encoder_input -> encoder_hidden_state
    """

    def __init__(self, data, config=None):
        """
        Store the corpus and configuration, then build the Keras graph.

        data:   corpus object; `vocab_size` is read from it.
        config: object indexable by 'encoding-layer-width' and
                'recurrent-unit-type'. When omitted, a fresh Config() is
                created per instance (a `config=Config()` default would be
                evaluated once at definition time and shared by all instances).
        """
        self.config = Config() if config is None else config
        self.data = data

        self.build()  # this will be a Keras model for now

    def build(self):
        """
        The encoder computational graph consists of three components:
        (1) the input node            `encoder_input`
        (2) the Recurrent part        `encoder_rnn`
        (3) the hidden state output   `encoder_hidden_state`

        Raises:
            ValueError: if 'recurrent-unit-type' is neither 'lstm' nor 'gru'.
        """

        # Grab hyperparameters from self.config:
        hidden_dim = self.config['encoding-layer-width']
        recurrent_unit = self.config['recurrent-unit-type']
        # Bidirectional encoding is not wired up yet: the config key
        # 'encoding-layer-bidirectional' is deliberately not read.
        bidirectional = False
        vocab_size = self.data.vocab_size
        # Embedding width grows with log2 of the vocabulary size
        # (the config key 'embedding-dim' is not used).
        embedding_dim = math.ceil(math.log(vocab_size, 2))

        # Assemble the network components:
        encoder_input = Input(shape=(None,))
        encoder_embed = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_input)

        # The branches must be mutually exclusive: with two independent `if`
        # statements the 'lstm' case would fall into the final `else` and raise.
        if recurrent_unit == 'lstm':
            encoder_rnn = LSTM(hidden_dim, return_state=True)
            # discard the encoder output, keeping only the hidden and cell states
            _, encoder_state_h, encoder_state_c = encoder_rnn(encoder_embed)
            encoder_hidden_state = [encoder_state_h, encoder_state_c]
        elif recurrent_unit == 'gru':
            encoder_rnn = GRU(hidden_dim, return_state=True)
            _, encoder_hidden_state = encoder_rnn(encoder_embed)
        else:
            raise ValueError('Invalid recurrent unit type: {}'.format(recurrent_unit))

        # make the RNN component bidirectional, if desired
        if bidirectional:
            # NOTE(review): this wraps the layer after it has already been
            # applied above, so encoder_hidden_state would be unaffected; this
            # needs rework before the flag is ever switched on.
            encoder_rnn = Bidirectional(encoder_rnn, merge_mode='ave')

        # save the Encoder components as class state
        self.encoder_input = encoder_input
        self.encoder_embed = encoder_embed
        self.encoder_rnn = encoder_rnn
        self.encoder_hidden_state = encoder_hidden_state

        # finally, build the training model
        self.training_model = Model(self.encoder_input, self.encoder_hidden_state)
        
        
        
class Decoder:
    """
    Decoder half of the encoder-decoder.

    Embeds the decoder-side token sequence, runs it through a recurrent layer
    whose initial state is the encoder's final state, and projects every
    timestep onto the vocabulary with a softmax Dense layer.

    Attributes set by build():
        decoder_input    the Keras Input node
        decoder_embed    the embedded input tensor
        decoder_rnn      the recurrent layer (LSTM or GRU)
        decoder_dense    the softmax projection layer
        decoder_output   the per-timestep softmax output tensor
    """

    def __init__(self, data, encoder, config=None):
        """
        Store the corpus, encoder and configuration, then build the Keras graph.

        data:    corpus object; `vocab_size` is read from it.
        encoder: object exposing `encoder_hidden_state`, used as the initial
                 state of the decoder RNN.
        config:  object indexable by 'encoding-layer-width' and
                 'recurrent-unit-type'. When omitted, a fresh Config() is
                 created per instance (a `config=Config()` default would be
                 evaluated once at definition time and shared by all instances).
        """
        self.config = Config() if config is None else config
        self.data = data
        self.encoder = encoder

        self.build()  # this will be a Keras model for now

    def build(self):
        """
        The decoder computational graph consists of three components:
        (1) the input node            `decoder_input`
        (2) the Recurrent part        `decoder_rnn`
        (3) the decoder output        `decoder_output`

        Raises:
            ValueError: if 'recurrent-unit-type' is neither 'lstm' nor 'gru'.
        """

        # Grab hyperparameters from self.config:
        # (the decoder width reuses 'encoding-layer-width' so that the encoder
        # state can be fed in directly as the initial state)
        hidden_dim = self.config['encoding-layer-width']
        recurrent_unit = self.config['recurrent-unit-type']
        # Not wired to the config yet; always off.
        bidirectional = False
        vocab_size = self.data.vocab_size
        # Embedding width grows with log2 of the vocabulary size
        # (the config key 'embedding-dim' is not used).
        embedding_dim = math.ceil(math.log(vocab_size, 2))

        # Assemble the network components:
        decoder_input = Input(shape=(None,))
        decoder_embed = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_input)

        initial_state = self.encoder.encoder_hidden_state
        if recurrent_unit == 'lstm':
            decoder_rnn = LSTM(hidden_dim, return_sequences=True, return_state=True)
            # the returned states are not needed while training
            decoder_output, _, _ = decoder_rnn(decoder_embed, initial_state=initial_state)
        elif recurrent_unit == 'gru':
            decoder_rnn = GRU(hidden_dim, return_sequences=True, return_state=True)
            decoder_output, _ = decoder_rnn(decoder_embed, initial_state=initial_state)
        else:
            raise ValueError('Invalid recurrent unit type: {}'.format(recurrent_unit))

        # make the RNN component bidirectional, if desired
        if bidirectional:
            # NOTE(review): this wraps the layer after it has already been
            # applied above, so decoder_output would be unaffected; this needs
            # rework before the flag is ever switched on.
            decoder_rnn = Bidirectional(decoder_rnn, merge_mode='ave')

        # project every timestep onto a distribution over the vocabulary
        decoder_dense = Dense(vocab_size, activation='softmax')
        decoder_output = decoder_dense(decoder_output)

        # save the Decoder components as class state
        self.decoder_input = decoder_input
        self.decoder_embed = decoder_embed
        self.decoder_rnn = decoder_rnn
        self.decoder_dense = decoder_dense
        self.decoder_output = decoder_output

    
class EncoderDecoder:
    """
    Ties an Encoder and a Decoder together into three Keras models:

        training_model   [encoder_input, decoder_input] -> decoder_output
        encoder_model    encoder_input -> encoder state (inference);
                         also available as `inference_encoder`
        decoder_model    [decoder_input] + state inputs ->
                         [token probabilities] + state outputs (inference)
    """

    def __init__(self, data, encoder, decoder, config=None):
        """
        Build the training and inference models and save them to disk.

        data:    corpus object; fit() and predict() read training arrays and
                 vocabulary helpers from it.
        encoder: an already-built Encoder.
        decoder: an already-built Decoder conditioned on that encoder.
        config:  object indexable by the hyperparameter keys used below. When
                 omitted, a fresh Config() is created per instance (a
                 `config=Config()` default would be evaluated once at
                 definition time and shared by all instances).
        """
        self.config = Config() if config is None else config
        self.data = data
        self.encoder = encoder
        self.decoder = decoder

        # build the training and inference models; save them
        self.build_training_model()
        self.build_inference_model()

        self.save_models()

    def build_training_model(self):
        """
        Assemble the teacher-forced training model from the encoder/decoder graphs.

        Raises:
            NotImplementedError: if config['hierarchical'] is truthy, since no
                hierarchical model is implemented and silently continuing would
                leave `training_model` undefined.
        """
        self.encoder_input = self.encoder.encoder_input
        self.encoder_embed = self.encoder.encoder_embed
        self.decoder_input = self.decoder.decoder_input
        self.decoder_embed = self.decoder.decoder_embed
        self.decoder_output = self.decoder.decoder_output

        if self.config['hierarchical']:
            raise NotImplementedError('Hierarchical models are not implemented yet')

        self.training_model = Model([self.encoder_input, self.decoder_input], self.decoder_output)

    def build_inference_model(self):
        """
        Build the step-wise inference models.

        The encoder model maps an input sequence to its final state. The decoder
        model takes decoder tokens plus explicit state inputs and returns the
        token distribution plus updated states, so predict() can feed the state
        back in on the next step.

        Raises:
            ValueError: if 'recurrent-unit-type' is neither 'lstm' nor 'gru'.
        """
        # grab some important hyperparameters
        hidden_dim = self.config['encoding-layer-width']
        recurrent_unit = self.config['recurrent-unit-type']

        # Read straight from the decoder so this method does not depend on
        # build_training_model() having run first.
        decoder_input = self.decoder.decoder_input
        decoder_embed = self.decoder.decoder_embed

        # build the encoder model; predict() and save_models() use the name
        # `encoder_model`, `inference_encoder` is kept as an alias
        self.inference_encoder = Model(self.encoder.encoder_input, self.encoder.encoder_hidden_state)
        self.encoder_model = self.inference_encoder

        # build the decoder model
        if recurrent_unit == 'lstm':
            state_inputs = [Input(shape=(hidden_dim,)), Input(shape=(hidden_dim,))]
            # The sequence output must be kept: it feeds the softmax layer below.
            decoder_output, state_h, state_c = self.decoder.decoder_rnn(
                decoder_embed, initial_state=state_inputs)
            state_outputs = [state_h, state_c]
        elif recurrent_unit == 'gru':
            state_inputs = [Input(shape=(hidden_dim,))]
            decoder_output, state_h = self.decoder.decoder_rnn(
                decoder_embed, initial_state=state_inputs)
            state_outputs = [state_h]
        else:
            raise ValueError('Invalid recurrent unit type: {}'.format(recurrent_unit))

        decoder_output = self.decoder.decoder_dense(decoder_output)
        self.decoder_model = Model([decoder_input] + state_inputs,
                                   [decoder_output] + state_outputs)

    def fit(self):
        """
        Compile and train the training model on the corpus, then save all models.

        Optimizer, loss, batch size, epoch count and validation split are read
        from the config; the training arrays come from `self.data.train`.
        """
        # grab some hyperparameters from our config
        optimizer = self.config['optimizer']
        loss = self.config['loss']
        batch_size = self.config['batch-size']
        num_epochs = self.config['num-epochs']
        validation_split = self.config['validation-split']

        # grab the training data
        encoder_x = self.data.train.encoder_x
        decoder_x = self.data.train.decoder_x
        decoder_y = self.data.train.decoder_y_ohe

        self.training_model.compile(optimizer=optimizer, loss=loss)
        self.training_model.fit([encoder_x, decoder_x], decoder_y,
                                batch_size=batch_size,
                                epochs=num_epochs,
                                validation_split=validation_split)

        self.save_models()

    def predict(self, x):
        """
        Take in an integer-vectorized (i.e., index) vector, and predict the maximally
        likely response, returning it as a list of token indices.

        Decoding is greedy (argmax at every step) and stops after the stop token
        has been emitted or once `self.data.max_utterance_length` tokens have
        been produced, whichever comes first. The stop token, if emitted, is
        included in the returned list.

        Raises:
            ValueError: if 'recurrent-unit-type' is neither 'lstm' nor 'gru'.
        """
        recurrent_unit = self.config['recurrent-unit-type']
        if recurrent_unit not in ('lstm', 'gru'):
            raise ValueError('Invalid recurrent unit type: {}'.format(recurrent_unit))

        # The models expect a leading batch axis; a bare index vector is
        # treated as a batch containing one sequence.
        encoder_batch = np.atleast_2d(np.asarray(x))

        # encode the input seq into a context vector
        context_state = self.encoder_model.predict(encoder_batch)
        if recurrent_unit == 'lstm':
            context_state = list(context_state)   # [h, c]
        else:
            context_state = [context_state]       # single GRU state

        # create the target sequence, seeded with the start token
        # (assumes vectorize_utterance returns a flat index sequence - TODO confirm)
        y = np.atleast_2d(np.asarray(self.data.vectorize_utterance([self.data.start])))

        # Loop invariants: look these up once instead of on every step.
        stop_token = np.asarray(self.data.ie.transform([self.data.stop])).ravel()[0]
        max_length = self.data.max_utterance_length

        response = []
        while True:
            # decode the current token + current state into a conditional
            # distribution over the next token, plus the updated state
            outputs = self.decoder_model.predict([y] + context_state)
            output_token_probs = outputs[0]
            context_state = list(outputs[1:])

            # sample a token from the output distribution
            # (currently maximum-likelihood, i.e. argmax)
            sampled_token = np.argmax(output_token_probs[0, -1, :])
            response.append(sampled_token)

            # exit condition: decoded the stop token or hit the max length
            if sampled_token == stop_token or len(response) >= max_length:
                break

            # the state carries the history, so only the newest token is fed
            # back in, as a batch of one sequence of length one
            y = np.array([[sampled_token]])

        return response

    def save_models(self):
        """
        Save the training, inference-encoder and inference-decoder models.

        Paths are `<name>_train`, `<name>_inference_encoder` and
        `<name>_inference_decoder`, where <name> is config['model-name'] or
        'model' when that entry is None.
        """
        name = self.config['model-name']
        if name is None:
            name = 'model'

        self.training_model.save(name + '_train')
        self.encoder_model.save(name + '_inference_encoder')
        self.decoder_model.save(name + '_inference_decoder')

Using TensorFlow backend.


In [2]:
# Build the dialogue corpus with no explicit arguments; the log printed below
# shows it loading, filtering, splitting, vectorizing and padding the dialogues.
data = DialogueCorpus()

Logger initialized
Configuration loaded
Preparing to process the dialogue corpus ...
Loading the dataset ...
Filtering out long samples ...
Initializing vocabulary ...
Splitting the corpus into train/test subsets ...
Recording sequence lengths ...
Initializing the encoders ...
Vectorizing the dialogues (this may take a while) ...
Padding the dialogues ...
Padding the dialogues ...
Corpus succesfully loaded! Ready for training.


In [20]:
# NOTE(review): `convo` is not defined in any cell visible here, and this cell's
# execution count (20) jumps ahead of the previous cell (2) - confirm the
# notebook still works after Restart Kernel -> Run All.
convo.converse("Where ye from?")

Attempting to predict x='[445 458 157  22]' of type=<class 'numpy.ndarray'>
i because because because because n't kiss two thirty a.m. a.m. a.m. dell dell night dressed dressed dressed dressed dressed dressed the the dressed the dressed the dressed dressed ' the the dressed dressed dressed
> What did you say?
Attempting to predict x='[443 112 464 344  22]' of type=<class 'numpy.ndarray'>
i because because because because n't n't kiss two thirty a.m. a.m. dell dell night night dressed dressed dressed dressed dressed the the the dressed the dressed the dressed ' the ' the dressed dressed
> Doesn't want you thinking too much, huh?
Attempting to predict x='[122 273 432 464 397 408 270   9 204  22]' of type=<class 'numpy.ndarray'>
i because because because n't n't kiss two thirty a.m. a.m. dell dell dell night dressed dressed dressed dressed dressed dressed the dressed the the dressed the dressed dressed ' the the dressed dressed dressed
> #exit
