# Tutorial of implementing Sequence classification with RNN series
Sequence를 modeling 할 수 있는 RNN series에서는 기본적인 Recurrent Neural Network, Long Short-Term Memory Units (LSTM), Gated Recurrent Unit (GRU) 등 여러 Cell을 활용할 수 있지만, ***이 Tutorial에서는 many to one의 예제로 영어단어의 알파벳을 하나씩 보고 (영어단어의 길이는 모두 다르다.), 단어의 긍/부정을 예측하는 Character level RNN을 학습한다. cell은 GRU를 활용한다.*** 특히 알파벳을 벡터로 표현하는 방법은 아래의 논문에서 소개된 ***Character quantization*** 방법을 참고하였다.   

* Paper : https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf 
* Reference : https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/02%20-%20Autocomplete.py

### Setup

In [1]:
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

### Character quantization
Paper에서는 영어로 쓰인 어떤 글이라도 One hot vector의 모음으로 표현할 수 있도록, 아래와 같은 문자들을 모두 one hot encoding에 포함시키지만 ***이 Tutorial에서는 영어 알파벳 소문자와 공백 문자만 활용한다.***

Paper : "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}"

In [2]:
# Vocabulary used in this tutorial: the 26 lowercase letters followed by the space character.
char_arr = "abcdefghijklmnopqrstuvwxyz "

In [3]:
# Character -> one-hot index lookup; indices follow the character order in char_arr.
char_dic = dict(zip(char_arr, range(len(char_arr))))
# Size of the vocabulary, i.e. the width of every one-hot vector.
dic_len = len(char_dic)

In [4]:
# char_dic lets us turn each English letter into a 27-dimensional one-hot vector.
char_dic

{' ': 26,
 'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [5]:
# Show the characters that the lookup table covers.
char_dic.keys()

dict_keys(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' '])

### Define user function
***GRU의 input을 생성하는 make_batch function***을 작성한다. max_len 이라는 argument가 존재하는 이유는 RNN은 실제로 variable length sequence를 처리할 수 있지만, Tensorflow에서는 RNN의 input의 sequence는 고정된 길이로 주어야하기 때문에, 일단 max_len의 길이만큼 padding을 한다. 코드로 확인!

In [6]:
def make_batch(seq_data, max_len, char_to_idx=None):
    """Convert strings into padded one-hot batches for the RNN.

    Each string is mapped character by character to an index, right-padded
    with index 0 up to ``max_len`` and expanded into one-hot rows.  Because
    the padding uses index 0, padded rows look exactly like the one-hot row
    of the character whose index is 0; the true lengths are returned so the
    caller can tell the real steps from the padded ones.

    Args:
        seq_data: Iterable of strings to encode.
        max_len: Number of time steps every encoded sequence is padded to.
        char_to_idx: Optional mapping from character to one-hot index.
            Defaults to the module-level ``char_dic``.  Indices are expected
            to lie in ``range(len(char_to_idx))``.

    Returns:
        A tuple ``(seq_len, seq_batch)``: ``seq_len`` is a list holding the
        unpadded length of each string, ``seq_batch`` is a nested list of
        shape ``(len(seq_data), max_len, len(char_to_idx))``.

    Raises:
        ValueError: If a string is longer than ``max_len`` or contains a
            character that is missing from the mapping.
    """
    if char_to_idx is None:
        char_to_idx = char_dic

    # Build the identity matrix once; row i is the one-hot vector of index i.
    one_hot = np.eye(len(char_to_idx))

    seq_len = []
    seq_batch = []
    for seq in seq_data:
        # A longer sequence would not be padded and would yield a ragged batch.
        if len(seq) > max_len:
            raise ValueError(
                'sequence {!r} has length {} which exceeds max_len {}'.format(
                    seq, len(seq), max_len))
        try:
            seq_idx = [char_to_idx[char] for char in seq]
        except KeyError as err:
            # Fail loudly instead of letting a None index reach NumPy.
            raise ValueError(
                'unknown character {!r} in sequence {!r}'.format(
                    err.args[0], seq)) from err
        seq_idx += (max_len - len(seq_idx)) * [0]
        seq_len.append(len(seq))
        seq_batch.append(one_hot[seq_idx].tolist())
    return seq_len, seq_batch

In [7]:
# Without padding: max_len equals the length of the word itself.
test_len, test_batch =  make_batch(['test'], len('test'))
print('length : {}, \n batch : {}'.format(test_len, np.array(test_batch)))

length : [4], 
 batch : [[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  1.  0.  0.  0.  0.  0.  0.  0.]]]


최대 길이를 10으로 하면 아래와 같이 길이가 4인 "test"라는 단어 뒤에 padding (index 0, 즉 'a'와 동일한 one hot vector)이 붙어서 우리가 생각한 것과 batch가 다르게 생성되지만, ***해당 문자열의 실제 길이를 저장한 후, tf.placeholder를 이용해서 tf.nn.dynamic_rnn의 sequence_length argument에 값을 전달하여, variable sequence length를 처리할 수 있다.***

In [8]:
# With padding: max_len (the maximum length) is set to 10.
test_len, test_batch =  make_batch(['test'], 10)
print('length : {}, \n batch : {}'.format(test_len, np.array(test_batch)))

length : [4], 
 batch : [[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
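  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.]]]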

### Define CharRNN class
가변길이의 영어단어가 긍정 단어인지 부정단어인지 예측하는 CharRNN 모형을 정의

In [9]:
class CharRNN():
    """Character-level GRU classifier for padded, variable-length sequences.

    The graph consumes a one-hot encoded batch together with the true length
    of every sequence, runs a GRU over it and feeds the state returned by
    ``tf.nn.dynamic_rnn`` into a dense layer that produces one score per
    label.
    """

    def __init__(self, n_label, max_len, input_dim, hidden_dim):
        """Build the placeholders, the GRU, the output layer and the loss.

        Args:
            n_label: Number of target classes (width of the one-hot labels).
            max_len: Number of time steps of every padded input sequence.
            input_dim: Size of the one-hot vector of a single character.
            hidden_dim: Number of units of the GRU cell.
        """
        with tf.variable_scope('input_layer'):
            # True sequence lengths, forwarded to dynamic_rnn as sequence_length.
            self._x_len = tf.placeholder(dtype=tf.int32)
            self._x_batch = tf.placeholder(dtype=tf.float32,
                                           shape=[None, max_len, input_dim])
            self._y = tf.placeholder(dtype=tf.float32, shape=[None, n_label])

        with tf.variable_scope('gru_cell'):
            gru = tf.contrib.rnn.GRUCell(num_units=hidden_dim)
            rnn_result = tf.nn.dynamic_rnn(cell=gru,
                                           inputs=self._x_batch,
                                           sequence_length=self._x_len,
                                           dtype=tf.float32)
            # Only the returned state is kept; the per-step outputs are unused.
            self._hidden = rnn_result[1]

        with tf.variable_scope('output_layer'):
            self._score = tf.layers.dense(self._hidden, units=n_label)

        with tf.variable_scope('loss'):
            per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self._score, labels=self._y)
            self._ce_loss = tf.reduce_mean(per_example_loss)

        with tf.variable_scope('predict'):
            self._prediction = tf.argmax(input=self._score, axis=1)

    def _input_feed(self, seq_len, seq_batch):
        """Return the feed dict shared by the label-free graph evaluations."""
        return {self._x_len: seq_len, self._x_batch: seq_batch}

    def predict(self, sess, seq_len, seq_batch):
        """Return the index of the highest-scoring label for every sequence."""
        return sess.run(fetches=self._prediction,
                        feed_dict=self._input_feed(seq_len, seq_batch))

    def encode(self, sess, seq_len, seq_batch):
        """Return the GRU state (the embedding) of every sequence."""
        return sess.run(fetches=self._hidden,
                        feed_dict=self._input_feed(seq_len, seq_batch))

### Define Solver class

In [10]:
class Solver:
    """Attach an optimizer to a model's cross-entropy loss and run it."""

    def __init__(self, model, optimizer = tf.train.AdamOptimizer, var_list = None):
        """Create the learning-rate placeholder and the training op.

        Args:
            model: Object exposing ``_x_len``, ``_x_batch``, ``_y`` and
                ``_ce_loss`` (e.g. a ``CharRNN`` instance).
            optimizer: Optimizer class, called with ``learning_rate``.
            var_list: Variables to update, passed through to ``minimize``.
        """
        self._model = model
        # The learning rate is fed at every step instead of being fixed here.
        self._lr = tf.placeholder(dtype=tf.float32)
        self._optimizer = optimizer(learning_rate=self._lr)
        self._training_op = self._optimizer.minimize(loss=model._ce_loss,
                                                     var_list=var_list)

    def _data_feed(self, seq_len, seq_batch, y_data):
        """Return the feed dict for the model's inputs and labels."""
        net = self._model
        return {net._x_len: seq_len, net._x_batch: seq_batch, net._y: y_data}

    def train(self, sess, seq_len, seq_batch, y_data, lr):
        """Run one optimization step; return ``[training_op result, loss]``."""
        feed = self._data_feed(seq_len, seq_batch, y_data)
        feed[self._lr] = lr
        return sess.run(fetches=[self._training_op, self._model._ce_loss],
                        feed_dict=feed)

    def evaluate(self, sess, seq_len, seq_batch, y_data):
        """Return the cross-entropy loss on the given batch without training."""
        return sess.run(fetches=self._model._ce_loss,
                        feed_dict=self._data_feed(seq_len, seq_batch, y_data))

### Example : word sentiment classification

In [11]:
# One-hot labels aligned with `words`: index 0 marks the positive words
# ('good', 'amazing', 'so good'), index 1 the negative ones.
y_data = [[1.,0.], [0.,1.], [1.,0.], [1., 0.],[0.,1.]]
words = ['good', 'bad', 'amazing', 'so good', 'bull shit']

In [12]:
# Convert words into the input format that CharRNN accepts.
words_len, word_batch = make_batch(words, max_len = 10)

In [13]:
# True (unpadded) length of every word.
words_len

[4, 3, 7, 7, 9]

In [14]:
# Shape of the padded one-hot batch.
np.shape(word_batch)

(5, 10, 27)

### Training

In [15]:
# Open a session, then build the model graph and its training op
# (Solver uses tf.train.AdamOptimizer unless told otherwise).
sess = tf.Session()
char_gru = CharRNN(n_label = 2, max_len = 10, input_dim = len(char_dic), hidden_dim = 20)
adam_solver = Solver(model = char_gru)

In [16]:
# Run the global variable initializer before the first training step.
sess.run(tf.global_variables_initializer())

In [17]:
# hyper parameters
n_epochs = 20  # number of passes of the training loop
loss_history = []  # collects the training loss of every epoch

In [18]:
# Full-batch training: every epoch feeds all words in a single step.
for epoch in range(n_epochs):
    _, tr_loss = adam_solver.train(
        sess = sess,
        seq_len = words_len,
        seq_batch = word_batch,
        y_data = y_data,
        lr = 1e-2,
    )
    loss_history.append(tr_loss)
    print('epochs : {:3}, tr_loss : {:.3f}'.format(epoch, tr_loss))

epochs :   0, tr_loss : 0.648
epochs :   1, tr_loss : 0.598
epochs :   2, tr_loss : 0.553
epochs :   3, tr_loss : 0.513
epochs :   4, tr_loss : 0.475
epochs :   5, tr_loss : 0.438
epochs :   6, tr_loss : 0.400
epochs :   7, tr_loss : 0.360
epochs :   8, tr_loss : 0.317
epochs :   9, tr_loss : 0.274
epochs :  10, tr_loss : 0.233
epochs :  11, tr_loss : 0.194
epochs :  12, tr_loss : 0.159
epochs :  13, tr_loss : 0.128
epochs :  14, tr_loss : 0.101
epochs :  15, tr_loss : 0.078
epochs :  16, tr_loss : 0.058
epochs :  17, tr_loss : 0.043
epochs :  18, tr_loss : 0.031
epochs :  19, tr_loss : 0.022


In [19]:
# Predict the label index of the training words and compare it with the
# index of the 1 in each one-hot label to get the training accuracy.
yhat = char_gru.predict(sess = sess, seq_len = words_len, seq_batch = word_batch)
print('Accuracy : {:.2%}'.format(np.mean(np.argmax(y_data, axis = 1) == yhat)))

Accuracy : 100.00%


### Analyze embeddings of words
영어단어를 왼쪽부터 오른쪽으로 알파벳 한글자씩보고, 긍/부정을 예측하는 모형을 CharRNN으로 학습한 후, 이 때 특정 영어단어를 넣었을 때의 hidden state를 비교하면 아래와 같은 결과를 얻을 수 있음

In [20]:
# The words whose embeddings are analyzed below.
words

['good', 'bad', 'amazing', 'so good', 'bull shit']

In [21]:
# Fetch the GRU state of every word and put it into a DataFrame with one row per word.
embedding_words = char_gru.encode(sess = sess, seq_len = words_len, seq_batch = word_batch)
embedding = pd.DataFrame(embedding_words, index = words)

In [22]:
# 20-dimensional embedding of each word (transposed: one column per word).
embedding.T

Unnamed: 0,good,bad,amazing,so good,bull shit
0,-0.09062,0.383285,0.357408,-0.126503,0.689105
1,-0.142947,0.140611,-0.120752,-0.174374,0.74
2,0.481865,-0.201977,0.569966,0.677262,-0.394927
3,-0.490165,0.012184,-0.646111,-0.665153,0.374838
4,0.408214,-0.08171,0.662239,0.600297,-0.258617
5,0.552937,-0.156794,0.67202,0.731237,-0.69599
6,0.251143,-0.242771,0.5637,0.399336,-0.654192
7,0.167962,-0.33325,0.383378,0.284998,-0.73236
8,0.355938,-0.166141,0.671152,0.543169,-0.490243
9,-0.268323,0.256304,-0.575321,-0.463283,0.471359


embedding된 각 단어간의 euclidean distance를 계산해보면,***같은 범주 (긍정 또는 부정)에 속한 단어들 끼리 가깝고, 서로 다른 범주에 속한 단어들 끼리는 상대적으로 먼것을 확인 가능***

In [23]:
# Pairwise distances between the word embeddings, labelled with the words
# on both axes (pairwise_distances uses the Euclidean metric by default).
from sklearn.metrics.pairwise import pairwise_distances
pd.DataFrame(pairwise_distances(X = embedding), index = words, columns= words)

Unnamed: 0,good,bad,amazing,so good,bull shit
good,0.0,2.417702,1.118611,0.67733,3.980407
bad,2.417702,0.000345,2.979147,3.04703,1.678281
amazing,1.118611,2.979147,0.0,0.871289,4.414713
so good,0.67733,3.04703,0.871289,0.000977,4.5736
bull shit,3.980407,1.678281,4.414713,4.5736,0.0


### Encode unseen words
Input을 Character 단위로 받으므로 학습에 이용되지않은 영어단어에 대해서도 embedding을 구할 수 있으며, 긍/부정을 판단할 수 있음

In [24]:
# A word that does not appear in the training list `words`.
unseen = ['great']

In [25]:
# Encode the unseen word with the same max_len (10) the model was built with,
# then fetch its GRU state.
unseen_len, unseen_batch = make_batch(unseen, 10)
unseen_vector = char_gru.encode(sess = sess, seq_len = unseen_len, seq_batch = unseen_batch)

In [26]:
# Embedding of the unseen word.
unseen_vector

array([[ 0.36647743, -0.01373214,  0.10402276, -0.1385455 , -0.00311775,
         0.09648833, -0.14816383, -0.11707296,  0.2829062 , -0.02193693,
         0.10115166,  0.08888885,  0.00755516, -0.02079342,  0.18498814,
         0.04492051,  0.04183176, -0.15769707, -0.01859909,  0.01101905]], dtype=float32)

In [27]:
# Predicted label index for the unseen word; index 0 is the position that
# marks the positive words in y_data.
char_gru.predict(sess = sess, seq_len = unseen_len, seq_batch = unseen_batch)

array([0], dtype=int64)