# Classification with an RNN (recurrent neural network)

Using an RNN allows us to use information about the order (the "sequence") of the input words, which makes the classifier more accurate.

The example data for the current setup is for sentiment analysis. Sentiment analysis is basically classification with two classes. The model can easily be extended to classify any number of classes.

The example is trained on a dataset of labeled movie reviews.

<img src="graph_diagram.png" width=400px>

The embedding layer gives a more efficient representation of our input data than one-hot encoded vectors. The embedding can be pre-trained with word2vec for even better results, but for the example data it is good enough to just have an embedding layer and let the network learn the embedding table on its own.

From the embedding layer, the new representations will be passed to LSTM cells. These will add recurrent connections to the network so we can include information about the sequence of words in the data.

Finally, the output of the LSTM cells is passed to a fully connected output layer.

Only the LSTM output at the very last time step is fed to the fully connected layer; the outputs at all earlier time steps are ignored.

In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from time import time
import pickle

## Load the preprocessed data
Run preprocess.ipynb first to preprocess the example dataset.

In [2]:
# Load the preprocessed feature and label arrays, then hold out 10% of the
# samples for validation (train_size=.9).
X, y = np.load('data/X_train.npy'), np.load('data/y_train.npy')
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=.9)

# Vocabulary dictionary; only its size is used further down (for n_words).
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading -- only load files
# produced locally by the preprocessing step.
with open('data/word_to_id.p', 'rb') as vocab_file:
    word_to_id = pickle.load(vocab_file)

def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a whole
    batch are dropped, so every yielded slice of ``x`` has exactly
    ``batch_size`` elements.
    """
    # Number of leading items that fit into complete batches.
    usable = (len(x) // batch_size) * batch_size
    for start in range(0, usable, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

## Hyperparameters

In [3]:
import os

# Model and training hyperparameters.
lstm_size = 256        # size passed to each BasicLSTMCell
lstm_layers = 1        # number of stacked LSTM layers
batch_size = 500       # samples per batch (also fixes the RNN zero-state size)
learning_rate = 0.001  # Adam learning rate
embed_size = 300       # width of each row of the embedding table
epochs = 50            # upper bound on passes over the training data
# Early-stopping patience. NOTE: the training loop bumps its counter once per
# validation check (every 25 steps), not once per epoch.
max_epochs_without_improvement = 5

checkpoints_dir = 'checkpoints'

# Create the checkpoint directory if it is missing. Only the "already exists"
# case is tolerated; any other failure (e.g. permissions) is re-raised so it
# is not discovered later when the model is saved.
try:
    os.makedirs(checkpoints_dir)
except OSError:
    if not os.path.isdir(checkpoints_dir):
        raise

n_words = len(word_to_id) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

## Graph

In [4]:
# inputs
# `inputs` receives batches of X and `labels` batches of y from the training
# loop; `keep_prob` is the dropout keep probability for the LSTM outputs.
# Labels are presumably one-hot rows (accuracy below takes argmax over axis 1)
# -- TODO confirm against preprocess.ipynb.
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
labels = tf.placeholder(tf.int32, [None, None], name='labels')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# embedding layer: a trainable (n_words, embed_size) table, initialised
# uniformly in [-1, 1), indexed by the integer ids in `inputs`
embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embedding, inputs)

# Stack up multiple LSTM layers, for deep learning.
# Every layer gets its OWN cell object (basic LSTM cell with dropout on its
# outputs). Repeating one object with `[lstm] * lstm_layers` would place the
# same cell in every layer, which shares/reuses its variables and fails or
# misbehaves as soon as lstm_layers > 1.
lstm_cells = []
for _ in range(lstm_layers):
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    lstm_cells.append(lstm)
cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)

# initial state of all zeros (this fixes the batch dimension to batch_size)
initial_state = cell.zero_state(batch_size, tf.float32)

# run the data through the RNN nodes
outputs, final_state = tf.nn.dynamic_rnn(
    cell, embed, initial_state=initial_state)

# name the state tensors
# NOTE: dynamic_rnn above was wired to the zero-state tensors themselves;
# these identities are created afterwards and only expose the states under
# stable names.
initial_state = tf.identity(initial_state, 'initial_state')
final_state = tf.identity(final_state, 'final_state')

# output
# only care about the final output: the RNN output at the last time step is
# projected to one logit per column of y (no activation)
logits = tf.contrib.layers.fully_connected(
    outputs[:, -1],
    num_outputs=y.shape[-1],
    activation_fn=None
)

# cost
# Mean squared error between the labels and the raw logits is the loss that
# is minimised.
# NOTE(review): for a classifier, tf.nn.softmax_cross_entropy_with_logits is
# the usual choice; switching changes training results, so confirm before
# replacing this loss.
cost = tf.losses.mean_squared_error(labels, logits)

# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# accuracy: fraction of rows whose arg-max logit matches the arg-max label
correct_prediction = tf.equal(tf.argmax(labels, 1), tf.argmax(logits, 1))

accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
accuracy = tf.identity(accuracy, 'accuracy')

## Training

In [5]:
import time

config = tf.ConfigProto()
# enable JIT optimizer
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

# Fetch the all-zero RNN state once and reuse it for every epoch and every
# validation pass. Building `cell.zero_state(...)` inside the loop would add
# new ops to the graph each time validation runs.
zero_state_value = sess.run(initial_state)

best_accuracy = -np.inf
# Counts consecutive validation checks (one every 25 steps) that did not
# produce a new best accuracy; training stops once it reaches
# max_epochs_without_improvement.
epochs_without_improvement = 0
step = 0
for epoch in range(epochs):
    if epochs_without_improvement >= max_epochs_without_improvement:
        break

    state = zero_state_value

    # Batch variables are deliberately NOT called `x`/`y`: the graph cell reads
    # the global `y` (the full label array) and it must not be overwritten.
    for x_batch, y_batch in get_batches(X_train, y_train, batch_size):
        step += 1
        # keep_prob 0.3 -> dropout is active on the LSTM outputs while training.
        # NOTE(review): `initial_state` is the tf.identity created after
        # dynamic_rnn was connected to the zero state, so feeding it most
        # likely does not change the RNN's starting state -- confirm before
        # relying on state being carried over between batches.
        feed = {inputs: x_batch, labels: y_batch, keep_prob: 0.3,
                initial_state: state}
        time_start = time.time()
        loss, state, _ = sess.run(
            [cost, final_state, optimizer], feed_dict=feed)
        step_time = time.time() - time_start

        if step % 5==0:
            print("Epoch:", epoch, "Step:", step, "Train loss:", loss, 'time for 1 step', step_time)

        if step % 25==0:
            # Validation pass: dropout disabled (keep_prob 1), accuracy
            # averaged over all full validation batches.
            val_acc = []
            val_state = zero_state_value
            for x_val, y_val in get_batches(X_valid, y_valid, batch_size):
                feed = {inputs: x_val, labels: y_val, keep_prob: 1,
                        initial_state: val_state}
                batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                val_acc.append(batch_acc)
            mean_val_acc = np.mean(val_acc)
            print("Validation accuracy:", mean_val_acc)
            if mean_val_acc > best_accuracy:
                # New best model: checkpoint it and reset the patience counter.
                best_accuracy = mean_val_acc
                print('Best model found. Saving ...')
                saver.save(sess, checkpoints_dir + '/model.ckpt')
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            # Leave the batch loop; the check at the top of the epoch loop
            # then ends training.
            if epochs_without_improvement >= max_epochs_without_improvement:
                break

print('Trining done.')

('Epoch:', 0, 'Step:', 5, 'Train loss:', 0.3617655, 'time for 1 step', 0.16616296768188477)
('Epoch:', 0, 'Step:', 10, 'Train loss:', 0.34606618, 'time for 1 step', 0.16635513305664062)
('Epoch:', 0, 'Step:', 15, 'Train loss:', 0.31412193, 'time for 1 step', 0.18479394912719727)
('Epoch:', 0, 'Step:', 20, 'Train loss:', 0.31515089, 'time for 1 step', 0.16063284873962402)
('Epoch:', 0, 'Step:', 25, 'Train loss:', 0.28616226, 'time for 1 step', 0.16037607192993164)
('Validation accuracy:', 0.59549999)
Best model found. Saving ...
('Epoch:', 0, 'Step:', 30, 'Train loss:', 0.27816761, 'time for 1 step', 0.15356898307800293)
('Epoch:', 0, 'Step:', 35, 'Train loss:', 0.26705703, 'time for 1 step', 0.15573596954345703)
('Epoch:', 0, 'Step:', 40, 'Train loss:', 0.25758621, 'time for 1 step', 0.19456696510314941)
('Epoch:', 1, 'Step:', 45, 'Train loss:', 0.23615013, 'time for 1 step', 0.15041303634643555)
('Epoch:', 1, 'Step:', 50, 'Train loss:', 0.22171262, 'time for 1 step', 0.184091806411743

('Epoch:', 9, 'Step:', 400, 'Train loss:', 0.042710666, 'time for 1 step', 0.1577320098876953)
('Validation accuracy:', 0.83099997)
('Epoch:', 10, 'Step:', 405, 'Train loss:', 0.047190767, 'time for 1 step', 0.16694307327270508)
('Epoch:', 10, 'Step:', 410, 'Train loss:', 0.043301947, 'time for 1 step', 0.1621711254119873)
('Epoch:', 10, 'Step:', 415, 'Train loss:', 0.041268766, 'time for 1 step', 0.1515331268310547)
('Epoch:', 10, 'Step:', 420, 'Train loss:', 0.047427244, 'time for 1 step', 0.15753483772277832)
('Epoch:', 10, 'Step:', 425, 'Train loss:', 0.034123875, 'time for 1 step', 0.1969301700592041)
('Validation accuracy:', 0.80899996)
('Epoch:', 10, 'Step:', 430, 'Train loss:', 0.039133232, 'time for 1 step', 0.16880488395690918)
('Epoch:', 10, 'Step:', 435, 'Train loss:', 0.052304257, 'time for 1 step', 0.15459513664245605)
('Epoch:', 10, 'Step:', 440, 'Train loss:', 0.036825299, 'time for 1 step', 0.16773104667663574)
('Epoch:', 11, 'Step:', 445, 'Train loss:', 0.037221536, '