# Lab04

# RNN


## Predict the last character of a word

In [0]:
import tensorflow as tf
import numpy as np


# Vocabulary: the 26 lowercase letters. A character's position in this list is
# its integer id, so char_arr[i] turns id i back into a character.
char_arr = ['a', 'b', 'c', 'd', 'e', 'f', 'g',
            'h', 'i', 'j', 'k', 'l', 'm', 'n',
            'o', 'p', 'q', 'r', 's', 't', 'u',
            'v', 'w', 'x', 'y', 'z']

# Character -> id lookup used to build the one-hot encodings:
# {'a': 0, 'b': 1, 'c': 2, ..., 'j': 9, 'k': 10, ...}
num_dic = {n: i for i, n in enumerate(char_arr)}
# Vocabulary size; also the width of every one-hot vector.
dic_len = len(num_dic)


# Training words. make_batch() uses every character but the last as the input
# sequence and the last character as the prediction target.
seq_data = ['word', 'wood', 'deep', 'dive', 'cold', 'cool', 'load', 'love', 'kiss', 'kind']

def make_batch(seq_data):
    """Split each word into a one-hot input sequence and an integer target.

    All characters of a word except the last form the input and the last
    character is the label, e.g. 'word' -> X = 'wor', Y = 'd' and
    'deep' -> X = 'dee', Y = 'p'.

    Args:
        seq_data: iterable of words whose characters are all keys of the
            module-level ``num_dic``.

    Returns:
        A tuple ``(input_batch, target_batch)``. ``input_batch[i]`` is a float
        array of shape ``(len(word) - 1, dic_len)`` with one one-hot row per
        input character; ``target_batch[i]`` is the integer id of the word's
        last character.

    Raises:
        KeyError: if a word contains a character that is not in ``num_dic``.
        IndexError: if a word is empty.
    """
    # Row i of the identity matrix is the one-hot vector for id i. Build the
    # matrix once instead of once per word.
    one_hot = np.eye(dic_len)

    input_batch = []
    target_batch = []

    for seq in seq_data:
        # Ids of the input characters, e.g.
        #     wor           woo        dee       div
        # [22, 14, 17] [22, 14, 14] [3, 4, 4] [3, 8, 21] ...
        input_data = [num_dic[n] for n in seq[:-1]]

        # Id of the target character, e.g. d -> 3, p -> 15, e -> 4
        target = num_dic[seq[-1]]

        # Select one identity row per input id. For [3, 4, 4] this yields
        # [[ 0,  0,  0,  1,  0,  0,  0, ... 0]
        #  [ 0,  0,  0,  0,  1,  0,  0, ... 0]
        #  [ 0,  0,  0,  0,  1,  0,  0, ... 0]]
        # Indexing with a list copies the rows, so each word gets its own array.
        input_batch.append(one_hot[input_data])

        # The target stays an integer id: sparse_softmax_cross_entropy_with_logits()
        # takes class indices, so no one-hot conversion is needed.
        target_batch.append(target)

    return input_batch, target_batch


### [softmax_cross_entropy_with_logits_v2()](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2) vs [sparse_softmax_cross_entropy_with_logits()](https://www.tensorflow.org/api_docs/python/tf/nn/sparse_softmax_cross_entropy_with_logits)

**softmax_cross_entropy_with_logits_v2()**: While the classes are mutually exclusive, their probabilities need not be. All that is required is that each row of labels is a valid probability distribution. If they are not, the computation of the gradient will be incorrect.

**sparse_softmax_cross_entropy_with_logits()**: For this operation, the probability of a given label is considered exclusive. That is, soft classes are not allowed, and the labels vector must provide a single specific index for the true class for each row of logits (each minibatch entry). 


In [0]:
### Setting hyperparameters

# Step size passed to the Adam optimizer.
learning_rate = 0.01
# Number of units in each LSTM cell.
n_hidden = 128
# Number of training iterations; every iteration feeds the whole data set.
total_epoch = 50

# Number of time steps per input sequence (the three leading characters of
# each four-letter word).
n_step = 3

# number of inputs (dimension of input vector) = 26
n_input = dic_len
# number of classes = 26
n_class = dic_len


In [0]:
### Neural Network Model
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the first.
tf.reset_default_graph()

# Input: [batch size, n_step, n_input], one one-hot vector per time step.
X = tf.placeholder(tf.float32, [None, n_step, n_input])

# Labels are integer class ids of shape [batch size], because
# sparse_softmax_cross_entropy_with_logits() takes class indices.
# With one-hot labels the shape would instead be [None, n_class].
Y = tf.placeholder(tf.int32, [None])


# Output projection from the final hidden output to per-class logits.
W = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

# First LSTM layer
cell1 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

# Apply dropout to the first layer's outputs to reduce overfitting.
# NOTE(review): the keep probability is a constant 0.5, so dropout is also
# applied whenever this graph is run for prediction, not only during training.
cell1 = tf.nn.rnn_cell.DropoutWrapper(cell1, output_keep_prob=0.5)
# Second LSTM layer (no dropout)
cell2 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
# Stack the two layers: the output of cell1 is the input of cell2.
multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell1, cell2])

# Unroll the stacked cell over the n_step time steps of X.
# time_major is left at its default (False), so X and outputs are batch-major:
# outputs is [batch size, n_step, n_hidden].
outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)

# Keep only the output of the last time step:
# [batch size, n_step, n_hidden] -> [n_step, batch size, n_hidden] -> [batch size, n_hidden]
outputs = tf.transpose(outputs, [1, 0, 2])
outputs = outputs[-1]
# Logits of shape [batch size, n_class]
model = tf.matmul(outputs, W) + b

# Mean cross-entropy between the logits and the integer labels.
cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=model, labels=Y))

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### Dropout
[DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/DropoutWrapper)

Dropout makes each hidden unit more robust and drives it towards creating useful features on its own, without relying on other hidden units to correct its mistakes.

![dropout](https://cdn-images-1.medium.com/max/800/1*D8jriroKkjno8RztHKmMnA.png)

In [0]:
### Train Model
# NOTE(review): the session is never closed in this cell.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

input_batch, target_batch = make_batch(seq_data)

# Full-batch training: every iteration feeds all words at once.
for epoch in range(total_epoch):
    _, loss = sess.run([optimizer, cost],
                       feed_dict={X: input_batch, Y: target_batch})

    print('Epoch:', '%04d' % (epoch + 1),
          'cost =', '{:.6f}'.format(loss))

print('Training completed')

### Prediction Result
# Most likely class id per example, cast to int32 to match the dtype of Y.
prediction = tf.cast(tf.argmax(model, 1), tf.int32)
# Compare predicted result with actual result
prediction_check = tf.equal(prediction, Y)
# Fraction of examples whose predicted id equals the label.
accuracy = tf.reduce_mean(tf.cast(prediction_check, tf.float32))

# The model is evaluated on the same words it was trained on.
input_batch, target_batch = make_batch(seq_data)

# NOTE(review): the DropoutWrapper in the model uses a fixed
# output_keep_prob=0.5, so dropout is still active during this run.
predict, accuracy_val = sess.run([prediction, accuracy],
                                 feed_dict={X: input_batch, Y: target_batch})

# Rebuild each word from its input prefix plus the predicted last character.
# The literal 3 below matches n_step (the input length of a four-letter word).
predict_words = []
for idx, val in enumerate(seq_data):
    last_char = char_arr[predict[idx]]
    predict_words.append(val[:3] + last_char)

print('\n=== Prediction Result ===')
print('Input:', [w[:3] + ' ' for w in seq_data])
print('Predicted:', predict_words)
print('Accuracy:', accuracy_val)


Epoch: 0001 cost = 4.129886
Epoch: 0002 cost = 3.138178
Epoch: 0003 cost = 1.817054
Epoch: 0004 cost = 1.625509
Epoch: 0005 cost = 0.878741
Epoch: 0006 cost = 0.996808
Epoch: 0007 cost = 0.600500
Epoch: 0008 cost = 0.592670
Epoch: 0009 cost = 0.354496
Epoch: 0010 cost = 0.359257
Epoch: 0011 cost = 0.352133
Epoch: 0012 cost = 0.250323
Epoch: 0013 cost = 0.317002
Epoch: 0014 cost = 0.364188
Epoch: 0015 cost = 0.252094
Epoch: 0016 cost = 0.103315
Epoch: 0017 cost = 0.112947
Epoch: 0018 cost = 0.125758
Epoch: 0019 cost = 0.128380
Epoch: 0020 cost = 0.153149
Epoch: 0021 cost = 0.066520
Epoch: 0022 cost = 0.065893
Epoch: 0023 cost = 0.036547
Epoch: 0024 cost = 0.031514
Epoch: 0025 cost = 0.015861
Epoch: 0026 cost = 0.016400
Epoch: 0027 cost = 0.090019
Epoch: 0028 cost = 0.049094
Epoch: 0029 cost = 0.051274
Epoch: 0030 cost = 0.018310
Epoch: 0031 cost = 0.010033
Epoch: 0032 cost = 0.011302
Epoch: 0033 cost = 0.008801
Epoch: 0034 cost = 0.008848
Epoch: 0035 cost = 0.002120
Epoch: 0036 cost = 0

# Seq2Seq Model (N to M)

We are going to implement a sequence to sequence model that translates playing card symbols (Ace, Jack, Queen, King) to their associated number.

## Preprocess data

In [0]:
import tensorflow as tf
import numpy as np


# Sequence data: [input word, target digits]
seq_data = [['ace', '01'], ['jack', '11'],
            ['queen', '12'], ['king', '13']]

# Every character that occurs in an input word or a target, in order of
# appearance (duplicates included).
chars = [ch for word, digits in seq_data for ch in word + digits]

# Unique tokens list. Sorting pins the character -> id mapping: iterating a
# bare set of strings can yield a different order in every Python process,
# which would change the ids from one kernel restart to the next.
char_arr = sorted(set(chars))

# special tokens are required (none of them occurs in the data above)
# B: Beginning of Sequence - first decoder input
# E: Ending of Sequence - last decoder target
# P: Padding of Sequence - fills input words shorter than the longest one
# U: Unknown element of Sequence - stands in for decoder inputs that are not
#    known at prediction time
char_arr += ['B', 'E', 'P', 'U']

# Character -> id lookup and vocabulary size (= width of a one-hot vector).
num_dic = {n: i for i, n in enumerate(char_arr)}

dic_len = len(num_dic)

# Encoder inputs are padded to the length of the longest input word
# (5 for the data above: 'queen').
max_input_words_amount = max(len(word) for word, _ in seq_data)



## Generate batch

In [0]:

# add paddings if the word is shorter than the maximum input length
def add_paddings(word, max_len=None):
    """Right-pad ``word`` with the padding token 'P' up to ``max_len`` characters.

    Args:
        word: string to pad.
        max_len: target length. Defaults to the module-level
            ``max_input_words_amount`` (looked up at call time).

    Returns:
        ``word`` followed by as many 'P' characters as are needed to reach
        ``max_len``. A word that is already ``max_len`` characters or longer
        is returned unchanged (it is never truncated).
    """
    if max_len is None:
        max_len = max_input_words_amount
    # str.ljust leaves strings that are already long enough untouched.
    return word.ljust(max_len, 'P')
    

# generate a batch data for training/testing
def make_batch(seq_data):
    """Build encoder inputs, decoder inputs and decoder targets for seq2seq.

    Args:
        seq_data: iterable of ``[input_word, output_string]`` pairs whose
            characters are all keys of the module-level ``num_dic``.

    Returns:
        A tuple ``(input_batch, output_batch, target_batch)`` with one entry
        per pair:
          * ``input_batch[i]``: one-hot rows of the input word after padding
            with ``add_paddings``.
          * ``output_batch[i]``: one-hot rows of ``'B' + output_string``
            (what the decoder is fed).
          * ``target_batch[i]``: integer ids of ``output_string + 'E'``
            (what the decoder should emit). Left as ids because
            sparse_softmax_cross_entropy_with_logits() takes class indices.

    Raises:
        KeyError: if a character is not in ``num_dic``.
    """
    # Row i of the identity matrix is the one-hot vector for id i. Build the
    # matrix once instead of twice per sequence.
    one_hot = np.eye(dic_len)

    input_batch = []
    output_batch = []
    target_batch = []

    for seq in seq_data:
        # Input for encoder cell: padded word converted to ids
        input_word = add_paddings(seq[0])
        input_data = [num_dic[n] for n in input_word]

        # Input for decoder cell: 'B' marks the beginning of the sequence
        output_data = [num_dic[n] for n in ('B' + seq[1])]

        # Expected decoder output: 'E' marks the end of the sequence
        target = [num_dic[n] for n in (seq[1] + 'E')]

        # Indexing with a list copies the selected rows, so every sequence
        # gets its own arrays.
        input_batch.append(one_hot[input_data])
        output_batch.append(one_hot[output_data])

        target_batch.append(target)

    return input_batch, output_batch, target_batch

## Build training model

In [0]:
### Neural Network Model

### Setting Hyperparameters
# Step size passed to the Adam optimizer.
learning_rate = 0.01
# Number of units in the encoder and decoder RNN cells.
n_hidden = 128
# Number of training iterations; every iteration feeds the whole data set.
total_epoch = 100

# Input vectors are one-hot over the vocabulary and the model predicts one
# vocabulary entry per step, so both sizes equal the vocabulary size.
n_class = dic_len
n_input = dic_len

# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the first.
tf.reset_default_graph()

# encoder/decoder shape = [batch size, time steps, input size]
enc_input = tf.placeholder(tf.float32, [None, None, n_input])
dec_input = tf.placeholder(tf.float32, [None, None, n_input])

# target shape = [batch size, time steps]; integer class ids, not one-hot
targets = tf.placeholder(tf.int64, [None, None])


# Encoder Cell
with tf.variable_scope('encode'):
    enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    # NOTE(review): the keep probability is a constant 0.5, so dropout is also
    # applied when this graph is run for prediction.
    enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)

    # Only enc_states is used below; `outputs` is overwritten by the decoder.
    outputs, enc_states = tf.nn.dynamic_rnn(enc_cell, enc_input,
                                            dtype=tf.float32)
# Decoder Cell
with tf.variable_scope('decode'):
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=0.5)

    # [IMPORTANT] Setting enc_states as initial_state of decoder cell: this is
    # the only link through which the decoder sees the input word.
    outputs, dec_states = tf.nn.dynamic_rnn(dec_cell, dec_input,
                                            initial_state=enc_states,
                                            dtype=tf.float32)

# Project every decoder output to n_class logits (no activation).
model = tf.layers.dense(outputs, n_class, activation=None)

# Mean cross-entropy over all batch entries and time steps.
cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=model, labels=targets))

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)




## Train model

In [0]:
### Training Model
# NOTE(review): the session is never closed in this cell.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Generate a batch data
input_batch, output_batch, target_batch = make_batch(seq_data)

# Debugging aid (disabled): print the targets and their lengths.
# print(target_batch)
# for target_each in target_batch : 
#     print(len(target_each))

# Full-batch training: every iteration feeds all sequence pairs at once.
for epoch in range(total_epoch):
    _, loss = sess.run([optimizer, cost],
                       feed_dict={enc_input: input_batch,
                                  dec_input: output_batch,
                                  targets: target_batch})
    # Report the cost on every 10th iteration only.
    if epoch % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1),
              'cost =', '{:.6f}'.format(loss))

print('Training completed')

Epoch: 0001 cost = 2.811764
Epoch: 0011 cost = 0.100158
Epoch: 0021 cost = 0.003316
Epoch: 0031 cost = 0.002131
Epoch: 0041 cost = 0.000990
Epoch: 0051 cost = 0.000265
Epoch: 0061 cost = 0.000108
Epoch: 0071 cost = 0.000176
Epoch: 0081 cost = 0.000454
Epoch: 0091 cost = 0.000064
Training completed


## Evaluation

In [0]:
### Evaluation

# Predict the result
def predict(word):
    """Translate ``word`` with the trained seq2seq model.

    Args:
        word: input word; its characters must all be keys of ``num_dic``.

    Returns:
        The decoded characters up to, but not including, the first 'E'
        (end-of-sequence) token. If the model emits no 'E', all decoded
        characters are returned instead of raising.
    """
    # The real output is unknown at prediction time, so the decoder is fed 'B'
    # followed by 'U' (Unknown) placeholders, e.g. ['king', 'UU'].
    # make_batch() pads the input word itself, so no padding is done here.
    seq_data = [word, 'U' * 2]

    input_batch, output_batch, target_batch = make_batch([seq_data])

    # Fetch the logits and take the argmax in NumPy. Building tf.argmax() here
    # would add a new op to the graph on every call.
    logits = sess.run(model,
                      feed_dict={enc_input: input_batch,
                                 dec_input: output_batch,
                                 targets: target_batch})
    # logits is indexed [batch, time step, class]: pick the best class per step
    result = np.argmax(logits, axis=2)

    # convert index number to actual character
    decoded = [char_arr[i] for i in result[0]]

    # Remove 'E' and anything after it. list.index() raises ValueError when
    # the token is absent, so check first.
    if 'E' in decoded:
        decoded = decoded[:decoded.index('E')]

    return ''.join(decoded)


print('=== Prediction result ===')
print('ace ->', predict('ace'))
print('jack ->', predict('jack'))
print('queen ->', predict('queen'))
print('king ->', predict('king'))

=== Prediction result ===
ace -> 01
jack -> 11
queen -> 12
king -> 13


# Exercise

You are required to implement a Seq2Seq chatbot model. We are going to use the [Microsoft Personality Chat Datasets](https://github.com/Microsoft/BotBuilder-PersonalityChat/tree/63dd818cc22ed5a84f7b77c88076809c0b77a88d/CSharp/Datasets) (the Google Drive id is provided below).

Use the "Question" and "Answer" data in the tsv file. We will implement a many-to-one Seq2Seq model that takes word-based (tokenised) input rather than character-based input.

Fill in the blanks to complete the program.




### Downloading dataset

In [0]:
# You should submit "ipynb" file (You can download it from "File" > "Download .ipynb") to Canvas
import json
import re

import tensorflow as tf
import numpy as np
import pandas as pd
import nltk

nltk.download('punkt')


# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Uses the credentials of the user signed in to Colab.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive file id of the dataset.
# NOTE(review): this name shadows the built-in id() for the rest of the notebook.
id = '1zMofHuFOuc6FJ3ndj19VO5lTyWGA0QDJ'
downloaded = drive.CreateFile({'id':id}) 
# Save the file under the local name that the preprocessing cell reads.
downloaded.GetContentFile('qna_chitchat_the_professional.tsv')  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocess data

In [0]:
# Sequence data
# Generate unique tokens list from qna_chitchat_the_professional.tsv
seq_data = []
whole_words = []
max_input_words_amount = 0
max_output_words_amount = 1


df = pd.read_csv('qna_chitchat_the_professional.tsv', sep="\t")
for index, row in df.iterrows():
    
    ###<You need to fill here>###
    question = 
    answer = 
    
    seq_data.append(      )
    ###</You need to fill here>###
    
    
    ###<You need to fill here>###
    # we need to tokenise question    
    tokenized_q = 
    
    # we do not need to tokenise answer (because we implement N to One model)
    # make a list with only one element (whole sentence)
    tokenized_a = 
    
    ###</You need to fill here>###
    
    
    # add question list and answer list (one element)
    whole_words += tokenized_q
    whole_words += tokenized_a
    
    # we need to decide the maximum size of input word tokens
    max_input_words_amount = max(len(tokenized_q), max_input_words_amount)
    

# we now have a vocabulary list
unique_words = list(set(whole_words))

# adding special tokens in the vocabulary list    
# _B_: Beginning of Sequence
# _E_: Ending of Sequence
# _P_: Padding of Sequence - for different size input
# _U_: Unknown element of Sequence - for different size input

unique_words.append('_B_')
unique_words.append('_E_')
unique_words.append('_P_')
unique_words.append('_U_')


num_dic = {n: i for i, n in enumerate(unique_words)}
dic_len = len(num_dic)


## Generate batch

In [0]:
# get token index vector of questions and add paddings if the word is shorter than the maximum number of words
def get_vectors_q(sentence):
    
    # tokenise the sentence
    ###<You need to fill here>###
    tokenized_sentence = 
    ###</You need to fill here>###    
    
    diff = max_input_words_amount - len(tokenized_sentence)
    
    # add paddings if the word is shorter than the maximum number of words    
    for x in range(diff):
        ###<You need to fill here>###
        tokenized_sentence.append(          )
        
        ###</You need to fill here>###            
        
    ###<You need to fill here>###
    data = 
    ###</You need to fill here>###      
    
        
    return data

# get token index vector of answer
def get_vectors_a(sentence):
    """Return the id list for an answer, treating the whole sentence as one token.

    The answer is not tokenised (N-to-one model), so the result of
    ``tokens_to_ids`` for the single-element list ``[sentence]`` is returned.
    """
    return tokens_to_ids([sentence])
    

# convert tokens to index
def tokens_to_ids(tokenized_sentence):
    ids = []

    for token in tokenized_sentence:
        ###<You need to fill here>###
        if token in num_dic:
            ids.append(               )
        else:
            ids.append(               )
        ###</You need to fill here>###      

    return ids


# generate a batch data for training/testing
def make_batch(seq_data):
    input_batch = []
    output_batch = []
    target_batch = []

    for seq in seq_data:        
        # Input for encoder cell, convert question to vector
        ###<You need to fill here>###
        input_data = 
        ###</You need to fill here>###      
        
        # Input for decoder cell, Add '_B_' at the beginning of the sequence data
        ###<You need to fill here>###
        output_data = [num_dic[                     ]]
        ###</You need to fill here>###   
        output_data += get_vectors_a(seq[1])
        
        # Output of decoder cell (Actual result), Add '_E_' at the end of the sequence data
        ###<You need to fill here>###
        target = 
        target.append(                             )
        ###</You need to fill here>###   
        
        # Convert each token vector to one-hot encode data
        input_batch.append(np.eye(dic_len)[input_data])
        output_batch.append(np.eye(dic_len)[output_data])
        
        target_batch.append(target)

    return input_batch, output_batch, target_batch



## Build training model

In [0]:
### Setting Hyperparameters
# Step size passed to the Adam optimizer.
learning_rate = 0.002
# Number of units in the encoder and decoder RNN cells.
n_hidden = 128

# Input vectors are one-hot over the vocabulary and the model predicts one
# vocabulary entry per step, so both sizes equal the vocabulary size.
n_class = dic_len
n_input = dic_len

### Neural Network Model
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the first.
tf.reset_default_graph()

# encoder/decoder shape = [batch size, time steps, input size]
enc_input = tf.placeholder(tf.float32, [None, None, n_input])
dec_input = tf.placeholder(tf.float32, [None, None, n_input])

# target shape = [batch size, time steps]; integer class ids, not one-hot
targets = tf.placeholder(tf.int64, [None, None])


# Encoder Cell
with tf.variable_scope('encode'):
    enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    # NOTE(review): the keep probability is a constant 0.5, so dropout is also
    # applied when this graph is run for prediction.
    enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)

    # Only enc_states is used below; `outputs` is overwritten by the decoder.
    outputs, enc_states = tf.nn.dynamic_rnn(enc_cell, enc_input,
                                            dtype=tf.float32)
# Decoder Cell
with tf.variable_scope('decode'):
    dec_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
    dec_cell = tf.nn.rnn_cell.DropoutWrapper(dec_cell, output_keep_prob=0.5)

    # [IMPORTANT] Setting enc_states as initial_state of decoder cell: this is
    # the only link through which the decoder sees the question.
    outputs, dec_states = tf.nn.dynamic_rnn(dec_cell, dec_input,
                                            initial_state=enc_states,
                                            dtype=tf.float32)

# Project every decoder output to n_class logits (no activation).
model = tf.layers.dense(outputs, n_class, activation=None)

# Mean cross-entropy over all batch entries and time steps.
cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=model, labels=targets))

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)


### Training Model
# The session and batch are created here; the training loop lives in the next
# cell, so it can be re-run to continue training without resetting the weights.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Generate a batch data
input_batch, output_batch, target_batch = make_batch(seq_data)


## Train model


In [0]:
# Number of training iterations; every iteration feeds the whole data set.
total_epoch = 5000

for epoch in range(total_epoch):
    _, loss = sess.run([optimizer, cost],
                       feed_dict={enc_input: input_batch,
                                  dec_input: output_batch,
                                  targets: target_batch})
    # Report the cost on every 100th iteration only.
    if epoch % 100 == 0:
        print('Epoch:', '%04d' % (epoch + 1),
              'cost =', '{:.6f}'.format(loss))

# Also report the cost of the final iteration, which the modulo check above
# does not print.
print('Epoch:', '%04d' % (epoch + 1),
      'cost =', '{:.6f}'.format(loss))
print('Training completed')




### Output:
```
Epoch: 0001 cost = 6.520396
Epoch: 0101 cost = 2.285382
Epoch: 0201 cost = 2.246728
Epoch: 0301 cost = 2.245017
Epoch: 0401 cost = 2.246025
Epoch: 0501 cost = 2.240861
Epoch: 0601 cost = 2.243242
Epoch: 0701 cost = 2.242677
Epoch: 0801 cost = 2.237896
Epoch: 0901 cost = 2.231663
Epoch: 1001 cost = 2.235167
Epoch: 1101 cost = 2.241715
Epoch: 1201 cost = 2.231011
Epoch: 1301 cost = 2.239064
Epoch: 1401 cost = 2.239094
Epoch: 1501 cost = 2.238503
Epoch: 1601 cost = 2.240443
Epoch: 1701 cost = 1.363001
Epoch: 1801 cost = 1.003514
Epoch: 1901 cost = 0.825378
Epoch: 2001 cost = 0.740003
Epoch: 2101 cost = 0.740753
Epoch: 2201 cost = 0.584820
Epoch: 2301 cost = 0.520495
Epoch: 2401 cost = 0.475480
Epoch: 2501 cost = 0.451525
Epoch: 2601 cost = 0.447824
Epoch: 2701 cost = 0.391890
Epoch: 2801 cost = 0.372204
Epoch: 2901 cost = 0.364561
Epoch: 3001 cost = 0.325550
Epoch: 3101 cost = 0.328401
Epoch: 3201 cost = 0.278636
Epoch: 3301 cost = 0.287389
Epoch: 3401 cost = 0.242130
Epoch: 3501 cost = 0.226778
Epoch: 3601 cost = 0.197740
Epoch: 3701 cost = 0.351003
Epoch: 3801 cost = 0.170044
Epoch: 3901 cost = 0.138550
Epoch: 4001 cost = 0.139902
Epoch: 4101 cost = 0.119088
Epoch: 4201 cost = 0.111903
Epoch: 4301 cost = 0.117569
Epoch: 4401 cost = 0.115386
Epoch: 4501 cost = 0.089837
Epoch: 4601 cost = 0.107535
Epoch: 4701 cost = 0.094628
Epoch: 4801 cost = 0.079053
Epoch: 4901 cost = 0.762524
Epoch: 5000 cost = 0.083978
Training completed
```

## Evaluation

In [0]:
### Evaluation

# Answer the question using the trained model
def answer(sentence):
    """Return the trained model's answer to ``sentence``.

    Args:
        sentence: the question as a plain string; ``make_batch`` converts it
            to model input.

    Returns:
        The decoded tokens joined by single spaces, cut off before the first
        '_E_' (end-of-sequence) token. If no '_E_' was produced, all decoded
        tokens are returned.
    """
    # The real answer is unknown at prediction time, so the decoder side of
    # the pair is filled with '_U_' (Unknown) placeholders.
    seq_data = [sentence, '_U_' * max_output_words_amount]

    input_batch, output_batch, target_batch = make_batch([seq_data])

    # Fetch the logits and take the argmax in NumPy. Building tf.argmax() here
    # would add a new op to the graph on every call.
    logits = sess.run(model,
                      feed_dict={enc_input: input_batch,
                                 dec_input: output_batch,
                                 targets: target_batch})
    # logits is indexed [batch, time step, class]: pick the best class per step
    result = np.argmax(logits, axis=2)

    # convert index number to actual token
    decoded = [unique_words[i] for i in result[0]]

    # Remove '_E_' and anything after it
    if '_E_' in decoded:
        decoded = decoded[:decoded.index('_E_')]

    return ' '.join(decoded)

questions = ["Hello","I am so lonely", "Can you sleep?", "What is your age?", "I hate you", "Do you like me?", "You're so mean", "Can you drive?", "That's so bad", "what do you mean?", "oh my god"]
for q in questions:
    print(q , ' ->', answer(q))




### Output 

```
Hello  -> Hello.
I am so lonely  -> Okay.
Can you sleep?  -> I don't have a body.
What is your age?  -> Age doesn't really apply to me. 
I hate you  -> I'm sorry to hear that.
Do you like me?  -> I do like you.
You're so mean  -> I aim for efficiency.
That's so bad  -> It's nice to have things you love.
what do you mean?  -> Sorry about that.
oh my god  -> I hope you're able to get some rest soon.
```

