# Bidirectional RNN bot with Attention

In [1]:
import numpy as np
import tensorflow as tf

In [3]:
with open('resources/training_dataset.txt') as f:
    data = f.readlines()

In [5]:
sentences = []
for sent in data:
    sentences.append(sent.strip().split(' '))

In [6]:
person1 = []
person2 = []
for i in range(len(sentences)):
    if i%2 == 0:
        person1.append(sentences[i])
    else:
        person2.append(sentences[i])

In [7]:
person1

[['hi'],
 ['hello'],
 ['hey'],
 ["what's", 'up'],
 ['greetings'],
 ['how', 'are', 'you'],
 ["what's", 'your', 'name'],
 ['what', 'can', 'you', 'do'],
 ['when', 'can', 'I', 'check', 'in'],
 ['when', 'can', 'I', 'check', 'out'],
 ['who', 'are', 'you'],
 ["what's", 'your', 'address'],
 ["what's", 'your', 'phone', 'number'],
 ['how', 'can', 'I', 'call', 'you'],
 ['can', 'you', 'call', 'someone', 'for', 'me'],
 ['can', 'I', 'talk', 'to', 'someone'],
 ['how', 'to', 'get', 'to', 'your', 'office'],
 ['how',
  'to',
  'get',
  'to',
  'your',
  'office',
  'if',
  'I',
  'come',
  'from',
  'Bordeaux'],
 ['how',
  'to',
  'get',
  'to',
  'your',
  'office',
  'if',
  'I',
  'come',
  'from',
  'Lyon'],
 ['how',
  'to',
  'get',
  'to',
  'your',
  'office',
  'if',
  'I',
  'come',
  'from',
  'Marseille'],
 ['how',
  'to',
  'get',
  'to',
  'your',
  'office',
  'if',
  'I',
  'come',
  'from',
  'Switzerland'],
 ['how', 'to', 'get', 'to', 'your', 'office', 'if', 'I', 'come', 'by', 'car'],
 

In [8]:
person2

[['hi'],
 ['hi'],
 ['hi'],
 ['nothing', 'much'],
 ['greetings'],
 ["I'm", 'doing', 'good'],
 ["I'm", 'nameless'],
 ['anything', 'that', 'you', 'want'],
 ['the', 'date', 'specified', 'on', 'your', 'reservation'],
 ['whenever', 'you', 'want', 'to'],
 ['I', 'am', 'your', 'little', 'assistant'],
 ['I', "don't", 'have', 'an', 'address'],
 ['I', "don't", 'have', 'phone', 'number', 'either'],
 ['you',
  "can't",
  'call',
  'me,',
  'you',
  'can',
  'only',
  'talk',
  'to',
  'me',
  'here'],
 ['sure,', 'what', 'is', 'their', 'phone', 'number'],
 ['sure,', "I'm", 'getting', 'you', 'someone', 'to', 'talk', 'with', 'you'],
 ['take',
  'the',
  'metro',
  'and',
  'stop',
  'at',
  'the',
  'terminus,',
  'we',
  'are',
  '5mins',
  'away',
  'from',
  'there'],
 ['you',
  'need',
  'to',
  'take',
  'the',
  'TGV',
  'then',
  'at',
  'Paris',
  'Train',
  'Station',
  'you',
  'take',
  'the',
  'subway'],
 ['you',
  'need',
  'to',
  'take',
  'the',
  'TGV',
  'then',
  'at',
  'Paris',
  

In [9]:
words = []
[[words.append(w) for w in sent] for sent in sentences];
vocab = list(set(words))
vocab_size = len(vocab)

In [10]:
vocab_size

120

In [11]:
id_to_word = dict(enumerate(vocab))
word_to_id = {v:k for k,v in id_to_word.items()}

In [12]:
id_to_word[vocab_size] = ''

In [13]:
seq_len = max([len(s) for s in sentences])

In [14]:
seq_len

15

In [15]:
for i in range(len(person1)):
    person1[i] = [word_to_id[w] for w in person1[i]] + [vocab_size] * (seq_len - len(person1[i]))
    person2[i] = [word_to_id[w] for w in person2[i]] + [vocab_size] * (seq_len - len(person2[i]))

In [16]:
person1 = np.array(person1)

In [17]:
person2 = np.array(person2)

In [18]:
vec = np.zeros((vocab_size+1, vocab_size+1), dtype=np.float64)
for i in range(vocab_size+1):
    vec[i,i] = 1.0

In [26]:
input_shape = output_shape = vocab_size+1
hidden_shape = 64
learning_rate = 0.001
batch_size = person1.shape[0]

In [28]:
tf.reset_default_graph()
with tf.Graph().as_default() as graph:
    
     # placeholders for input and output
    X = tf.placeholder(shape=[None,seq_len], dtype=tf.int32, name="input")
    Y = tf.placeholder(shape=[None,seq_len], dtype=tf.int32, name="target")
    
    # embedding tensor
    embed = tf.constant(vec, name="embeddings", dtype=tf.float64)
    
    # input embeddings
    X_embed = tf.nn.embedding_lookup(embed, X, name="input_embeddings")
    X_embed = tf.transpose(X_embed, perm=[1,0,2])
    
    # output embeddings
    Y_embed = tf.nn.embedding_lookup(embed, Y, name="output_embeddings")
    
    # initial hidden state
    h_in = tf.placeholder(shape=[None, hidden_shape], dtype=tf.float64, name="hidden_init")
    
    # inital context vector
    c_in = tf.placeholder(shape=[None, hidden_shape*2], dtype=tf.float64, name="context_init")
    
    # RNN Cell
    def RNN(x_t, 
            h_prev, 
            input_shape=input_shape, 
            hidden_shape=hidden_shape, 
            output_shape=output_shape):
        with tf.variable_scope('RNN'):
            
            # RNN input weight
            W_xh = tf.get_variable(name="W_xh", shape=[input_shape, hidden_shape], 
                                   initializer=tf.random_normal_initializer(mean=0.0, 
                                                                            stddev=0.1), 
                                   dtype=tf.float64)
            
            # RNN hidden state weight
            W_hh = tf.get_variable(name="W_hh", shape=[hidden_shape, hidden_shape], 
                                   initializer=tf.random_normal_initializer(mean=0.0, 
                                                                            stddev=0.1), 
                                   dtype=tf.float64)
            
            # RNN output weight
            W_yh = tf.get_variable(name="W_yh", shape=[hidden_shape, output_shape], 
                                   initializer=tf.random_normal_initializer(mean=0.0, 
                                                                            stddev=0.1), 
                                   dtype=tf.float64)
            
            # hidden state
            h_t = tf.tanh(tf.matmul(x_t, W_xh) + tf.matmul(h_prev, W_hh))
            
            # output
            y_t = tf.nn.softmax(tf.matmul(h_t, W_yh))
            
            # reshape hidden state
            h_t = tf.reshape(h_t, shape=[-1, hidden_shape])
            
            y_t = tf.reshape(y_t, shape=[-1, output_shape])
            
            # return list of hidden state and output
            return [h_t, y_t]


    # helper function  for encoder
    def encoder_helper(h_prev, x_t):
        with tf.variable_scope("encoder_helper"):
            
            # unpack hidden variables of stacked layers
            h_prev_1, h_prev_2 = h_prev[0], h_prev[1]
            
            # pass current input and previous hidden state to RNN Cell of layer 1
            with tf.variable_scope("encoder_layer_1"):
                h_t_1,y_t_1 = RNN(x_t, h_prev_1)
            
            # pass current output from layer 1 and previous hidden state to RNN Cell of layer 2
            with tf.variable_scope("encoder_layer_2"):
                h_t_2, _ = RNN(y_t_1, h_prev_2)
            
            # return hidden states from both the layers
            return [h_t_1, h_t_2]
    
    def encoder(inputs, h_in):
        with tf.variable_scope("encoder"):
            
            with tf.variable_scope("left_to_right"):
                out_encoder_lr = tf.scan(encoder_helper, 
                                      inputs, 
                                      initializer=[h_in, h_in])
            
            inputs_reversed = tf.reverse(inputs, axis=[0])
            
            with tf.variable_scope("right_to_left"):
                out_encoder_rl = tf.scan(encoder_helper, 
                                      inputs_reversed, 
                                      initializer=[h_in, h_in])
            
            # concatenate the hidden layer outputs for the ultimate layer
            out_encoder = tf.concat([out_encoder_lr[-1], out_encoder_rl[-1]], 2)
            
            # return the hidden layer output for all time step
            return out_encoder
     
    
    # helper function for decoder
    def decoder_helper(inputs, # a list of previous hidden state and current input i.e. previous output
                       x_t): # no use extra variable just to iterate over the decoder
        with tf.variable_scope("decoder_helper"):
            
            # decoder RNN output weight
            W_yh = tf.get_variable(name="W_yh", shape=[hidden_shape*4, output_shape], 
                                   initializer=tf.random_normal_initializer(mean=0.0, 
                                                                            stddev=0.1), 
                                   dtype=tf.float64)
            
            # previous hidden state, previous softmax probabilities and previous one-hot vec
            h_prev, y_prev, y_prev_one_hot, c_prev = inputs[0], inputs[1], inputs[2], inputs[3] 
            
            # reshape previous hidden state
            h_prev = tf.reshape(h_prev, shape=[-1, hidden_shape*2])
            
            # reshape previous output
            y_prev_one_hot = tf.reshape(y_prev_one_hot, [-1, input_shape])
            
            # reshape previous context vector
            c_prev = tf.reshape(c_prev, shape=[-1, hidden_shape*2])
            
            # concatenate context vector with input
            x_t = tf.concat([c_prev, y_prev_one_hot], axis=1)
            
            # input to RNN Cell
            h_t, y_t = RNN(x_t, h_prev, input_shape=input_shape+hidden_shape*2, hidden_shape=hidden_shape*2)
            
            # reshape hidden states
            H = tf.transpose(hidden_states, perm=[1,0,2])
            
            # calculate attention score
            a_t = tf.matmul(H, tf.expand_dims(h_t, 2))
            
            alpha_t = tf.nn.softmax(a_t)
            
            H = tf.transpose(hidden_states, perm=[1,2,0])
            
            c_t = tf.matmul(H, alpha_t)
            
            c_t = tf.reshape(c_t, shape=[-1, hidden_shape*2])
            
            h_new = tf.concat([h_t, c_t], axis=1)
            
            y_t = tf.nn.softmax(tf.matmul(h_new, W_yh))
            
            # convert previous output to one hot vectors
            y_out = tf.nn.embedding_lookup(embed, tf.argmax(y_t, axis=1))
            
            # outputs for next time step
            outputs = [h_t, y_t, y_out, c_prev]
            
            return outputs
    
    def decoder(h_in, x_in, x_in_one_hot, c_in):
        with tf.variable_scope('decoder'):
            # scan deoder helper
            out_decoder = tf.scan(decoder_helper, 
                                  X_embed, 
                                  initializer=[h_in, x_in, x_in_one_hot, c_in])
            
            return out_decoder[1:]
    
    # thought vector output from encoder
    hidden_states = encoder(X_embed, h_in)
    
    thought_vector = hidden_states[-1]
    
    # weight to calculate encoder output 
    W_y = tf.get_variable(name="W_y", shape=[hidden_shape*2, output_shape], 
                          initializer=tf.random_normal_initializer(mean=0.0, 
                                                                   stddev=0.1), 
                          dtype=tf.float64)
    
    # encoder output
    encoder_output = tf.nn.softmax(tf.matmul(thought_vector, W_y))
    
    # convert previous output to one hot vectors
    encoder_output_one_hot = tf.nn.embedding_lookup(embed, tf.argmax(encoder_output, axis=1))
    
    # send thought vectorm, softmax output vector, one-hot vector and context vector to the decoder
    decoder_output, decoder_output_one_hot, _ = decoder(thought_vector, encoder_output, encoder_output_one_hot, c_in)
    
    seq_output = tf.transpose(decoder_output, perm=[1, 0, 2])
    
    
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y_embed, 
                                                                      logits=seq_output))
    
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    output = tf.argmax(decoder_output, axis=2)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        
        for i in range(20001):
            
            _,l = sess.run([optimizer, loss], 
                           {X:person1, 
                            Y:person2, 
                            h_in: np.zeros((batch_size, hidden_shape)), 
                            c_in: np.zeros((batch_size, hidden_shape*2))})
            
            if i%1000 == 0:
                print(l)
            
            if i%5000 == 0:
                pred = sess.run(output, {X:person1, 
                                         Y:person2, 
                                         h_in: np.zeros((batch_size, hidden_shape)), 
                                         c_in: np.zeros((batch_size, hidden_shape*2))})
                for j in range(5, 15):
                    print("person1 : ", ' '.join([[id_to_word[w] for w in sent] for sent in person1][j]))
                    print("person2 : ", ' '.join([[id_to_word[w] for w in sent] for sent in pred.T][j]))
                    print("====================================================")
                    
        
#         out = sess.run(output, {X:a, 
#                                 Y:person2, 
#                                 h_in: np.zeros((1, hidden_shape)), 
#                                 c_in: np.zeros((1, hidden_shape*2))})
#         print("personA : ", [[id_to_word[w] for w in sent] for sent in a])
#         print("bot     : ", [[id_to_word[w] for w in sent] for sent in out.T])
        writer = tf.summary.FileWriter('tmp/1')
        writer.add_graph(sess.graph)

4.79446102689
person1 :  how are you            
person2 :  much an take depends we please take their fill bike make need their an bike
person1 :  what's your name            
person2 :  much an bike depends we child take depends please bike take need do child bike
person1 :  what can you do           
person2 :  much an take reservation we please take their an bike take need depends in 5mins
person1 :  when can I check in          
person2 :  much an take depends we please take their fill bike make need their an bike
person1 :  when can I check out          
person2 :  much an take depends we please take their fill bike make need their an bike
person1 :  who are you            
person2 :  much an bike depends we child take depends please bike take need do child bike
person1 :  what's your address            
person2 :  much an bike depends we child take depends please bike take need do child bike
person1 :  what's your phone number           
person2 :  much an bike depends we child t

KeyboardInterrupt: 

In [40]:
! tensorboard --logdir ./tmp/1

Starting TensorBoard b'41' on port 6006
(You can navigate to http://127.0.1.1:6006)
^CTraceback (most recent call last):
  File "/home/shivam/anaconda3/bin/tensorboard", line 11, in <module>
    sys.exit(main())
  File "/home/shivam/anaconda3/lib/python3.5/site-packages/tensorflow/tensorboard/tensorboard.py", line 151, in main
    tb_server.serve_forever()
  File "/home/shivam/anaconda3/lib/python3.5/socketserver.py", line 232, in serve_forever
    ready = selector.select(poll_interval)
  File "/home/shivam/anaconda3/lib/python3.5/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
KeyboardInterrupt

