In [132]:
import pandas as pd
import numpy as np
import tensorflow as tf
import glob
import math
import random
import re
from six.moves import xrange
from keras.preprocessing.sequence import pad_sequences

vocab_file = "aclImdb/imdb.vocab"
train_file = "aclImdb/train/labeledBow.feat"
test_file = "aclImdb/test/labeledBow.feat"
train_pos_dir = "aclImdb/train/neg/*.txt"
train_neg_dir = "aclImdb/train/pos/*.txt"
test_pos_dir = "aclImdb/test/neg/*.txt"
test_neg_dir = "aclImdb/test/pos/*.txt"
stopwords_file = "aclImdb/stopwords.txt"

In [177]:
train_file_list = glob.glob(train_neg_dir, recursive=True) + glob.glob(train_pos_dir, recursive=True)
test_file_list = glob.glob(test_neg_dir, recursive=True) + glob.glob(test_pos_dir, recursive=True)

random.shuffle(train_file_list) 
random.shuffle(test_file_list)

print("Train files number:",len(train_file_list))
test_file_list = test_file_list[:1000]
print("Test files number:",len(test_file_list))

vocab = open(vocab_file, "r").readlines()
stopwords = open(stopwords_file, "r").read()
vocab = [x for x in vocab if x not in stopwords]
vocab_size = len(vocab)
print("Vocab size:",vocab_size)
    
id_to_word = {}
for i in range(vocab_size):
    id_to_word[i] = vocab[i][:-1]
word_to_id = {v: k for k, v in id_to_word.items()}

def clean(line):
    line = line.replace("<br />","")
    line = re.sub('[,.!?]', 'a', line)
    line = line.split()
    line = [x for x in line]
    return line


Train files number: 25000
Test files number: 1000
Vocab size: 89302


In [264]:
#TEXT UTILS
current_file = 0
words_per_review = 50

def get_tokenized(file_list):
    global current_file
    file = file_list[current_file]
    current_file += 1
    
    batch_x = []
    line = [s for s in re.split("[._]",file)]
    val_y = int(line[-2])
#     batch_y = [0, 1] if val_y>5 else [1, 0]
    batch_y = [0] if val_y>5 else [1]

    lines = open(file, "r").readlines()
    for line in lines:
        line = clean(line)
        
        for i, word in enumerate(line):
            if word in word_to_id:
                batch_x.append(word_to_id[word])
                        
#     batch_x = np.array(batch_x)
    batch_x = np.array(pad_sequences([batch_x], maxlen = words_per_review)).reshape(words_per_review)
    batch_y = np.array(batch_y)
    return [batch_x, batch_y]


def get_batch(file_list, batch_size):
    x = []
    y = []
    for step in range(batch_size):
        batch_x, batch_y = get_tokenized(file_list)
        x.append(batch_x)
        y.append(batch_y)
    return [x,y]

In [265]:
batch_x, batch_y = get_tokenized(train_file_list)
print(batch_x.shape)
print(batch_y.shape)
print(batch_x)
print(batch_y)

(50,)
(1,)
[    0     0     0     0   254     1    33    61    40     1   723    92
  6032   254    14     1   106     8 29171   734   774   543   236   969
  2690   723  1802 21525    80     7  2391  2411  1715  2030 11088  1198
  1777   519     9   481   119   119  1257  2015 84790   400     1  8996
 70606   882]
[0]


In [266]:
def lstm_cell(lstm_size, keep_prob):
    return tf.contrib.rnn.DropoutWrapper( tf.contrib.rnn.BasicLSTMCell(lstm_size), output_keep_prob=keep_prob)


def build_rnn(n_words, embed_size, batch_size, lstm_size, num_layers, dropout, learning_rate, multiple_fc, fc_units):

    tf.reset_default_graph()

    # placeholders
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(tf.int32, [None, None], name='inputs')

    with tf.name_scope('labels'):
        labels = tf.placeholder(tf.int32, [None, None], name='labels')

    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    
    
    # embeddings
    with tf.name_scope("embeddings"):
        embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
        embed = tf.nn.embedding_lookup(embedding, inputs)


    # rnn layers
    with tf.name_scope("RNN_layers"):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(lstm_size, keep_prob) for _ in range(num_layers)])
        
    # Set the initial state
    with tf.name_scope("RNN_init_state"):
        initial_state = cell.zero_state(batch_size, tf.float32)

    # Run the data through the RNN layers
    with tf.name_scope("RNN_forward"):
        outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)    
    
    # Create the fully connected layers
    with tf.name_scope("fully_connected"):
        
        # Initialize the weights and biases
        weights = tf.truncated_normal_initializer(stddev=0.1)
        biases = tf.zeros_initializer()
        
        dense = tf.contrib.layers.fully_connected(outputs[:, -1],
                                                  num_outputs = fc_units,
                                                  activation_fn = tf.sigmoid,
                                                  weights_initializer = weights,
                                                  biases_initializer = biases)
        dense = tf.contrib.layers.dropout(dense, keep_prob)

    
    with tf.name_scope('predictions'):
        predictions = tf.contrib.layers.fully_connected(dense, 
                                                        num_outputs = 1, 
                                                        activation_fn=tf.sigmoid,
                                                        weights_initializer = weights,
                                                        biases_initializer = biases)
#         tf.summary.histogram('predictions', predictions)

    with tf.name_scope('cost'):
        cost = tf.losses.mean_squared_error(labels, predictions)
#         tf.summary.scalar('cost', cost)
        

    # Train the model
    with tf.name_scope('train'):    
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    # Determine the accuracy
    with tf.name_scope("accuracy"):
        correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('accuracy', accuracy)
    
    # Merge all of the summaries
#     merged = tf.summary.merge_all()    

    # Export the nodes
    export_nodes = ['inputs', 'labels', 'keep_prob', 'initial_state', 'final_state','accuracy',
                    'predictions', 'cost', 'optimizer']#, 'merged']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    
    return graph

In [267]:
n_words = len(vocab)
embed_size = 300
batch_size = 250
lstm_size = 128
num_layers = 2
dropout = 0.5
learning_rate = 0.001
multiple_fc = False
fc_units = 256

model = build_rnn(n_words = n_words, 
                  embed_size = embed_size,
                  batch_size = batch_size,
                  lstm_size = lstm_size,
                  num_layers = num_layers,
                  dropout = dropout,
                  learning_rate = learning_rate,
                  multiple_fc = multiple_fc,
                  fc_units = fc_units)  

In [268]:
epochs = 5

train_iterations = len(train_file_list) // batch_size
test_iterations = len(test_file_list) // batch_size

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for e in range(epochs):
        state = sess.run(model.initial_state)
        current_file = 0

        # Record progress with each epoch
        train_loss = []
        train_acc = []
        val_acc = []
        val_loss = []


        for it in range(train_iterations):
            x, y = get_batch(train_file_list, batch_size)
            
            feed = {model.inputs: x,
                    model.labels: y,
                    model.keep_prob: dropout,
                    model.initial_state: state}
            loss, acc, state, _ = sess.run([model.cost, 
                                             model.accuracy, 
                                             model.final_state, 
                                             model.optimizer], 
                                            feed_dict=feed)                

            # Record the loss and accuracy of each training batch
            train_loss.append(loss)
            train_acc.append(acc)
            if it%20 == 0:
                print(loss,acc)


        # Average the training loss and accuracy of each epoch
        avg_train_loss = np.mean(train_loss)
        avg_train_acc = np.mean(train_acc) 

        val_state = sess.run(model.initial_state)
        current_file = 0
        
        for it in range(test_iterations):
            x, y = get_batch(test_file_list, batch_size)
            
            feed = {model.inputs: x,
                    model.labels: y,
                    model.keep_prob: 1,
                    model.initial_state: val_state}
            batch_loss, batch_acc, val_state = sess.run([model.cost, 
                                                          model.accuracy, 
                                                          model.final_state], 
                                                         feed_dict=feed)

            # Record the validation loss and accuracy of each epoch
            val_loss.append(batch_loss)
            val_acc.append(batch_acc)

        # Average the validation loss and accuracy of each epoch
        avg_valid_loss = np.mean(val_loss)    
        avg_valid_acc = np.mean(val_acc)

        # Print the progress of each epoch
        print("Epoch: {}/{}".format(e, epochs),
              "Train Loss: {:.3f}".format(avg_train_loss),
              "Train Acc: {:.3f}".format(avg_train_acc),
              "Valid Loss: {:.3f}".format(avg_valid_loss),
              "Valid Acc: {:.3f}".format(avg_valid_acc))


0.322898 0.456
0.232558 0.64
0.227968 0.64
0.185928 0.744
0.20292 0.736
Epoch: 0/5 Train Loss: 0.209 Train Acc: 0.682 Valid Loss: 0.145 Valid Acc: 0.791
0.153856 0.78
0.140713 0.808
0.150389 0.788
0.128356 0.82
0.123197 0.828
Epoch: 1/5 Train Loss: 0.132 Train Acc: 0.818 Valid Loss: 0.139 Valid Acc: 0.814
0.133732 0.824
0.101578 0.86
0.118474 0.84
0.0890032 0.88
0.0960845 0.872
Epoch: 2/5 Train Loss: 0.099 Train Acc: 0.872 Valid Loss: 0.144 Valid Acc: 0.806
0.107395 0.856
0.0681248 0.908
0.0798445 0.904
0.0643577 0.916
0.061133 0.932
Epoch: 3/5 Train Loss: 0.075 Train Acc: 0.906 Valid Loss: 0.151 Valid Acc: 0.810
0.0711755 0.912
0.052582 0.944
0.0704055 0.924
0.0714792 0.904
0.0429966 0.948
Epoch: 4/5 Train Loss: 0.065 Train Acc: 0.921 Valid Loss: 0.153 Valid Acc: 0.813
