In [1]:
import numpy as np #matrix math 
import tensorflow as tf #machine learningt
import helpers #for formatting data into batches and generating random sequence data
from tensorflow.python.ops.rnn_cell import LSTMCell, LSTMStateTuple
from tensorflow.python.layers.core import Dense
from datasets.twitter import data
import data_utils
from tensorlayer.layers import *
import tensorlayer as tl


In [2]:
import time, sys


In [33]:
# Clear the default graph so that re-running the model-building cell below starts from an
# empty graph instead of adding a second copy of every op and variable.
tf.reset_default_graph()


In [32]:
class chatbot_seq2seq(object):
    '''
    Chatbot implemented as a seq2seq model (TF1 graph mode, tf.contrib.seq2seq).

    Encoder: bidirectional LSTM with independent forward / backward cells.
    Decoder: a single LSTM initialised with the concatenated encoder final state.
             Three decoders share that cell and one output projection:
             teacher-forced training, greedy inference and beam-search inference.

    No attention mechanism is wired into the decoders built by this class.

    The whole graph is built inside __init__, in this order: placeholders,
    embeddings, encoder, decoders, loss / train op. All sequence placeholders
    are time-major ([max_time, batch_size]).

    Args:
        vocab_size: number of rows of the embedding matrix and width of the logits.
        input_embedding_size: embedding dimension.
        encoder_hidden_units: LSTM units per encoder direction.
        decoder_hidden_units: LSTM units of the decoder; must equal
            2 * encoder_hidden_units because the decoder starts from the
            concatenated forward + backward encoder state.
        beam_width: beam size of the beam-search decoder (default 5).
        max_train_steps: cap on decoding steps of the training decoder (default 26).
        max_infer_steps: cap on decoding steps of the greedy and beam-search
            decoders (default 25).

    Raises:
        ValueError: if decoder_hidden_units != 2 * encoder_hidden_units.

    NOTE: the forward and backward encoder directions now own separate weights,
    so checkpoints written by a version that shared one cell between both
    directions will not restore into this graph.
    '''

    #define init function for model
    def __init__(self, vocab_size, input_embedding_size, encoder_hidden_units, decoder_hidden_units,
                 beam_width = 5, max_train_steps = 26, max_infer_steps = 25):

        # Fail early with a readable message instead of a shape error deep inside
        # the decoder: its initial state is concat(fw_state, bw_state).
        if decoder_hidden_units != 2 * encoder_hidden_units:
            raise ValueError(
                'decoder_hidden_units (%s) must equal 2 * encoder_hidden_units (%s) because the '
                'decoder is initialised with the concatenated bidirectional encoder state'
                % (decoder_hidden_units, encoder_hidden_units))

        '''Initialize Hparams'''
        self.vocab_size = vocab_size
        self.input_embedding_size = input_embedding_size
        self.encoder_hidden_units = encoder_hidden_units
        self.decoder_hidden_units = decoder_hidden_units
        self.beam_width = beam_width
        self.max_train_steps = max_train_steps
        self.max_infer_steps = max_infer_steps

        #Define placeholder
        self.build_placeholder()

        #Define embedding matrix
        self.build_emb_matrix()

        #Define Encoder as Bidirectional LSTM Cell
        self.encoder_cell, self.encoder_final_state = self.build_encoder()

        #Define Decoder as Basic LSTM Cell
        self.build_decoder()

        #Define Loss and Prediction
        self.build_op()

    def build_placeholder(self) :
        '''Create the input placeholders (all sequences are time-major).'''
        #encoder inputs with shape [encoder_max_time, batch_size]
        self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
        #retrieve_seq_length_op2 takes [batch_size, max_seq_len], hence the transpose;
        #encoder_inputs_length has shape [batch_size]
        self.encoder_inputs_length = retrieve_seq_length_op2(tf.transpose(self.encoder_inputs))
        #decoder_inputs with shape [max_seq_len, batch_size], in the form [start_id, how, are, you, pad_id...]
        self.decoder_inputs = tf.placeholder(shape = (None, None), dtype = tf.int32, name = 'decoder_inputs')
        #decoder targets with shape [max_seq_len, batch_size], in the form [how, are, you, end_id, pad_id ...]
        self.decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
        #target masks with shape [max_seq_len, batch_size]
        self.decoder_masks = tf.placeholder(shape = (None, None), dtype = tf.int32, name = 'decoder_masks')
        #go_tokens used for inference decoding, with shape [batch_size]
        self.go_tokens = tf.placeholder(shape = (None,), dtype = tf.int32, name = 'go_tokens')
        #end-of-sequence token for inference, scalar
        self.end_token = tf.placeholder(shape = (), dtype = tf.int32, name = 'end_token')

    def build_encoder(self):
        '''Build the bidirectional LSTM encoder.

        Sets self.encoder_cell (forward cell), self.encoder_cell_bw, self.outputs,
        self.outputs_states, self.encoder_outputs and self.encoder_final_state.

        Returns:
            (forward encoder cell, LSTMStateTuple with fw/bw final states concatenated).
        '''
        # One cell object per direction: passing the same object as cell_fw and
        # cell_bw would make both directions use the same weights.
        encoder_cell_fw = LSTMCell(self.encoder_hidden_units, initializer = tf.truncated_normal_initializer(stddev = 0.01))
        encoder_cell_bw = LSTMCell(self.encoder_hidden_units, initializer = tf.truncated_normal_initializer(stddev = 0.01))
        self.encoder_cell = encoder_cell_fw
        self.encoder_cell_bw = encoder_cell_bw

        # Build the RNN once; self.outputs / self.outputs_states keep the raw
        # (fw, bw) tuples for callers that want them.
        self.outputs, self.outputs_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell_fw,
                cell_bw=encoder_cell_bw,
                inputs=self.encoder_inputs_embedded,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32, time_major=True)
        encoder_fw_outputs, encoder_bw_outputs = self.outputs
        encoder_fw_final_state, encoder_bw_final_state = self.outputs_states

        #Concatenate both directions along the feature axis (time-major, so axis 2).
        self.encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

        #letters h and c are commonly used to denote "output value" and "cell state".
        #http://colah.github.io/posts/2015-08-Understanding-LSTMs/
        #Those tensors represent the combined internal state of the cell, and should be passed together.
        encoder_final_state_c = tf.concat(
            (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

        encoder_final_state_h = tf.concat(
            (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

        #TF tuple used by LSTM cells for state_size, zero_state, and output state.
        self.encoder_final_state = LSTMStateTuple(
            c=encoder_final_state_c,
            h=encoder_final_state_h
        )

        return self.encoder_cell, self.encoder_final_state

    def build_emb_matrix(self) :
        '''Create the embedding matrix and embed encoder / decoder inputs.'''
        #the lookup is pinned to the CPU (the original author reports bugs in the GPU implementation)
        with tf.device("/cpu:0"):
            #randomly initialised [vocab_size, input_embedding_size] matrix,
            #shared by the encoder and the decoder
            self.embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.input_embedding_size], -0.1, 0.1), dtype=tf.float32)
            self.encoder_inputs_embedded = tf.nn.embedding_lookup(self.embeddings, self.encoder_inputs)
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(self.embeddings, self.decoder_inputs)

    def build_decoder(self) :
        '''Build the training, greedy-inference and beam-search decoders.

        All three share self.decoder_cell and self.projection_layer and start
        from self.encoder_final_state.
        '''
        self.decoder_cell = LSTMCell(self.decoder_hidden_units, initializer = tf.truncated_normal_initializer(stddev = 0.01))

        self.encoder_max_time, self.batch_size = tf.unstack(tf.shape(self.encoder_inputs))
        # Every sample in the batch is decoded for the same number of steps: the
        # longest unmasked target length in the batch. Padding positions are
        # removed from the loss by decoder_masks in build_op.
        max_decoder_length = tf.reduce_max(retrieve_seq_length_op2(tf.transpose(self.decoder_masks)))
        self.decoder_lengths = tf.fill([self.batch_size], max_decoder_length)
        #Projection from decoder outputs to vocabulary logits
        self.projection_layer = Dense(units = self.vocab_size, use_bias = True)

        #Training Helper (teacher forcing on the embedded decoder inputs)
        self.train_helper = tf.contrib.seq2seq.TrainingHelper(
            self.decoder_inputs_embedded, self.decoder_lengths, time_major = True)
        #Training Decoder
        self.train_decoder = tf.contrib.seq2seq.BasicDecoder(
                self.decoder_cell, self.train_helper, self.encoder_final_state, output_layer = self.projection_layer)
        #Training Output; decoding stops after at most max_train_steps steps
        self.train_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder = self.train_decoder, output_time_major = True, maximum_iterations = self.max_train_steps)

        #Inference Helper (greedy: feeds back the argmax token)
        self.infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embeddings, start_tokens = self.go_tokens, end_token = self.end_token)
        #Inference Decoder
        self.infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                self.decoder_cell, self.infer_helper, self.encoder_final_state, output_layer = self.projection_layer)
        #Inference Output
        self.infer_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder = self.infer_decoder, output_time_major = True, maximum_iterations = self.max_infer_steps)

        #Beam Search: the encoder state is tiled beam_width times per sample
        self.beam_encoder_state = tf.contrib.seq2seq.tile_batch(
            self.encoder_final_state, multiplier = self.beam_width)
        self.beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
            cell = self.decoder_cell,
            embedding = self.embeddings,
            start_tokens = self.go_tokens,
            end_token = self.end_token,
            initial_state = self.beam_encoder_state,
            beam_width = self.beam_width,
            output_layer = self.projection_layer)
        self.beam_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder = self.beam_decoder, output_time_major = True, maximum_iterations = self.max_infer_steps)

    def build_op(self) :
        '''Define the greedy prediction, the masked loss and the train op.'''
        self.train_logits = self.train_decoder_outputs.rnn_output
        self.decoder_prediction = tf.argmax(self.infer_decoder_outputs.rnn_output, 2)
        #cross_entropy with shape [max_seq_len, batch_size]; the sparse form takes the
        #integer targets directly instead of materialising a one-hot tensor
        self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = self.decoder_targets,
                logits = self.train_logits)

        #loss: sum of the per-token cross entropy over unmasked positions (not averaged)
        self.loss = tf.reduce_sum(self.cross_entropy * tf.cast(self.decoder_masks, tf.float32))
        #train it
        self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

In [34]:
# Build the model graph.
# NOTE(review): `xvocab_size` and `emb_dim` are only assigned in the data-loading cell further
# down, so on a fresh kernel that cell has to run before this one.
# decoder_hidden_units is 2 * emb_dim because the decoder is initialised with the concatenation
# of the forward and backward encoder final states (each emb_dim wide).
model = chatbot_seq2seq(
    vocab_size = xvocab_size, 
    input_embedding_size = emb_dim, 
    encoder_hidden_units = emb_dim, 
    decoder_hidden_units = 2 * emb_dim
)

In [37]:
# Inspect the concatenated encoder final state (an LSTMStateTuple with c and h).
model.encoder_final_state

LSTMStateTuple(c=<tf.Tensor 'concat_1:0' shape=(?, 2048) dtype=float32>, h=<tf.Tensor 'concat_2:0' shape=(?, 2048) dtype=float32>)

In [20]:
# Bahdanau attention over the encoder outputs.
# model.encoder_outputs is time-major ([max_time, batch, features]) because the encoder is run
# with time_major=True, while BahdanauAttention expects batch-major memory
# ([batch, max_time, features]); transpose before handing it over.
# NOTE(review): this is attached after the model's decoders were already built, so none of the
# training / inference decoders created in the class uses it.
model.attn_mechanism = tf.contrib.seq2seq.BahdanauAttention(
    num_units = model.decoder_hidden_units, 
    memory = tf.transpose(model.encoder_outputs, [1, 0, 2]),
    memory_sequence_length = model.encoder_inputs_length)


In [21]:
# Wrap the decoder LSTM cell with the attention mechanism created in the previous cell.
# NOTE(review): the wrapped cell is only stored on the model here; nothing visible in this
# notebook builds a decoder from it.
model.attn_decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
    cell = model.decoder_cell,
    attention_mechanism = model.attn_mechanism,
    attention_layer_size = model.decoder_hidden_units,
    name = 'attn_decoder_cell')

In [24]:
# First field of the LSTMStateTuple, i.e. the concatenated cell state `c`.
model.encoder_final_state[0]

<tf.Tensor 'concat_1:0' shape=(?, 2048) dtype=float32>

In [6]:
# Location handed to tf.train.checkpoint_exists / tf.train.latest_checkpoint by the restore cell.
checkpoint_prefix = 'test_ckpt_dir'

In [38]:
# Load the corpus from the pickle / npy files and split it into train / test / validation pairs.
metadata, idx_q, idx_a = data.load_data(PATH='datasets/twitter/')
(trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(idx_q, idx_a)

# Training pairs as plain Python lists, passed through tl.prepro.remove_pad_sequences.
trainX = tl.prepro.remove_pad_sequences(trainX.tolist())
trainY = tl.prepro.remove_pad_sequences(trainY.tolist())

# Number of training questions / answers; they must pair up one-to-one.
xseq_len, yseq_len = len(trainX), len(trainY)
assert xseq_len == yseq_len

# Hyper-parameters.
BATCH_SIZE = 32
emb_dim = 1024
encoder_max_time = 20
decoder_max_time = encoder_max_time

# Vocabulary look-up tables and the ids of the unknown / padding tokens.
w2idx = metadata['w2idx']
idx2w = metadata['idx2w']
unk_id = w2idx['unk']
pad_id = w2idx['_']

# Extend the vocabulary with two special tokens placed right after the existing entries.
start_id = len(idx2w)
end_id = start_id + 1
w2idx.update({'start_id': start_id, 'end_id': end_id})
idx2w = idx2w + ['start_id', 'end_id']
xvocab_size = yvocab_size = len(idx2w)

# One training example for the seq2seq model looks like this:
# input_seqs  : ['how', 'are', 'you', '<PAD_ID>']
# decode_seqs : ['<START_ID>', 'I', 'am', 'fine', '<PAD_ID>']
# target_seqs : ['I', 'am', 'fine', '<END_ID>', '<PAD_ID>']
# target_mask : [1, 1, 1, 1, 0]

In [39]:
# Vocabulary size after the start / end tokens were appended.
xvocab_size

8004

In [6]:
def chatbot_train_next_batch(model, iterator, go_id, end_id):
    """Pull the next minibatch from `iterator` and turn it into a feed dict for `model`.

    Args:
        model: object exposing the placeholders encoder_inputs, decoder_inputs,
            decoder_targets, decoder_masks, go_tokens and end_token.
        iterator: iterator yielding (X, Y) pairs of question / answer id sequences.
        go_id: start-of-sequence id; prepended to every decoder input sequence and
            fed as the inference start token.
        end_id: end-of-sequence id; appended to every target sequence and fed as
            the inference end token.

    Returns:
        dict mapping the model's placeholders to values. Sequence arrays are
        transposed to time-major ([max_seq_len, batch_size]).

    Raises:
        StopIteration: when `iterator` is exhausted.
    """
    X, Y = next(iterator)
    # Everything below is [batch_size, max_seq_len] until the final transpose.
    _encoder_seqs = tl.prepro.pad_sequences(X)
    # targets: [how, are, you, end_id, pad...]
    _target_seqs = tl.prepro.pad_sequences(
        tl.prepro.sequences_add_end_id(Y, end_id = end_id))
    # decoder inputs: [go_id, how, are, you, pad...]; uses the go_id argument rather
    # than whatever `start_id` happens to be defined at module level.
    _decode_seqs = tl.prepro.pad_sequences(
        tl.prepro.sequences_add_start_id(Y, start_id = go_id, remove_last = False))
    _target_masks = tl.prepro.sequences_get_mask(_target_seqs)
    return {
        model.encoder_inputs: np.array(_encoder_seqs).T,
        model.decoder_inputs: np.array(_decode_seqs).T,
        model.decoder_targets: np.array(_target_seqs).T,
        model.decoder_masks: np.asarray(_target_masks).T,
        # int32 to match the placeholder dtype (zeros + go_id would be float64)
        model.go_tokens: np.full(len(_encoder_seqs), go_id, dtype = np.int32),
        model.end_token: end_id
    }

In [7]:
# Unshuffled minibatch iterator over the training pairs.
iterator = tl.iterate.minibatches(inputs = trainX, targets = trainY, batch_size = BATCH_SIZE, shuffle = False)

In [8]:
# NOTE(review): this pulls one raw (X, Y) minibatch off the iterator and binds it to `fd`;
# the next cell overwrites `fd` with a real feed dict, so this batch is consumed and dropped.
fd = iterator.__next__()

In [11]:
# Build a feed dict from the next minibatch.
fd = chatbot_train_next_batch(model, iterator, start_id, end_id)

In [14]:
# Decoder input ids of the sample at batch index 1 (the array is [max_seq_len, batch_size]).
fd[model.decoder_inputs][:,1]

array([8002,   37,    2,  409, 1715,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])

In [17]:
# Session limited to one GPU; allow_soft_placement lets TensorFlow place an op on another
# device when the requested one cannot run it.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,device_count={'GPU':1}))

In [18]:
# Restore the newest checkpoint found in the `checkpoint_prefix` directory, or start from
# freshly initialised variables when there is none.
saver = tf.train.Saver()
# tf.train.latest_checkpoint takes a directory and returns None when it holds no checkpoint.
# (tf.train.checkpoint_exists expects a checkpoint file prefix, not a directory, so it cannot
# be used to probe the directory itself.)
latest_ckpt = tf.train.latest_checkpoint(checkpoint_prefix)
if latest_ckpt is not None :
    saver.restore(sess, latest_ckpt)
    print('Restoring model parameters successful')
else :
    sess.run(tf.global_variables_initializer())
    print('Model initialized')

Model initialized


In [8]:
# Initialise only the variables that do not have a value yet. An unconditional
# tf.global_variables_initializer() here would overwrite weights that the restore cell
# above may just have loaded from a checkpoint.
uninitialized_vars = [
    var for var in tf.global_variables()
    if not sess.run(tf.is_variable_initialized(var))
]
if uninitialized_vars:
    sess.run(tf.variables_initializer(uninitialized_vars))

In [19]:
# Training: num_epoch unshuffled passes over the training pairs, with a single-line progress
# read-out (step counter, summed batch loss, throughput) rewritten in place via '\r'.
num_epoch = 2
batchs_per_epoch = int(len(trainX) / BATCH_SIZE)

for epoch in range(num_epoch):
    # A fresh iterator per epoch; the throughput clock restarts with it.
    iterator = tl.iterate.minibatches(inputs = trainX, targets = trainY, batch_size = BATCH_SIZE, shuffle = False)
    time_start = time.time()
    for step in range(1, batchs_per_epoch + 1):
        fd = chatbot_train_next_batch(model, iterator, start_id, end_id)
        _, batch_loss = sess.run([model.train_op, model.loss], fd)
        elapsed = time.time() - time_start
        speed = step / elapsed
        sys.stdout.write(
            '\rStep: %i/%i -- loss: %.10f -- Speed: %.3f batches/second'
            % (step, batchs_per_epoch, batch_loss, speed))
        sys.stdout.flush()

Step: 2863/2863 -- loss: 1083.6490478516 -- Speed: 5.359 batches/second

In [12]:
# Save into the directory the restore cell searches (`checkpoint_prefix`) rather than the
# filesystem root, so a later run can pick the checkpoint up again.
tf.gfile.MakeDirs(checkpoint_prefix)  # no-op when the directory already exists
saver = tf.train.Saver()
save_path = saver.save(sess, checkpoint_prefix + '/model.ckpt')
print("\nModel training finished and saved!")


Model training finished and saved!


In [42]:
# Release the session's resources; any later sess.run needs a new session.
sess.close()

In [10]:
# Fresh unshuffled minibatch iterator for the shape checks below.
iterator = tl.iterate.minibatches(inputs = trainX, targets = trainY, batch_size = BATCH_SIZE, shuffle = False)

In [27]:
# Feed dict for the next minibatch (only inspected below, not run through the session).
fd = chatbot_train_next_batch(model, iterator, start_id, end_id)

In [35]:
# Encoder input shape, time-major: (max_seq_len, batch_size).
fd[model.encoder_inputs].shape

(19, 32)

In [33]:
# Pull one raw (questions, answers) minibatch to step through the batch preparation by hand.
X, Y = iterator.__next__()

In [34]:
# Manual walk-through of the steps chatbot_train_next_batch performs on a raw (X, Y) batch.
# Every result here is laid out as [batch_size, max_seq_len].
_encoder_seqs = tl.prepro.pad_sequences(X)
# targets: answer + end_id, then padded
_target_seqs = tl.prepro.pad_sequences(
    tl.prepro.sequences_add_end_id(Y, end_id = end_id))
# decoder inputs: start_id + answer, then padded
_decode_seqs = tl.prepro.pad_sequences(
    tl.prepro.sequences_add_start_id(Y, start_id = start_id, remove_last = False))
# mask derived from the padded targets
_target_masks = tl.prepro.sequences_get_mask(_target_seqs)

In [35]:
# Shape of the encoder batch after transposing to time-major.
np.array(_encoder_seqs).T.shape

(19, 32)

In [36]:
# Shape of the target batch after transposing to time-major.
np.array(_target_seqs).T.shape

(26, 32)