In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re

In [2]:
# Upper bound on vocabulary indices: load_vocab() assigns ids 1..MAX_VOCAB-1
# and reserves id 0 for the unknown-word entry.
MAX_VOCAB = 20000
# Fixed number of tokens per review; used as `maxlen` when padding/truncating
# and as the second dimension of the input placeholder.
SEQ_LEN = 100
# Dataset roots passed to load_data(), which reads their 'pos' and 'neg'
# sub-directories.
TRAIN_DIR = './aclImdb/train'
TEST_DIR = './aclImdb/test'
# NOTE: despite the "_DIR" suffix this is a file path; load_vocab() opens it
# and reads one word per line.
VOCAB_DIR = './aclImdb/imdb.vocab'

In [3]:
def load_vocab():
    """Build the word -> integer-id mapping from the vocabulary file.

    Reads the file at ``VOCAB_DIR`` one word per line. The first line gets
    id 1, the second id 2, and so on, up to id ``MAX_VOCAB - 1``. Id 0 is
    reserved for the unknown-word placeholder entry.

    Reading stops early if the file has fewer than ``MAX_VOCAB - 1`` lines,
    so a short vocabulary file no longer inserts an empty-string key.

    Returns:
        dict: mapping from word (str) to id (int).
    """
    word_index = {r'\unknow': 0}
    with tf.gfile.GFile(VOCAB_DIR) as f:
        for i in range(1, MAX_VOCAB):
            line = f.readline()
            # readline() returns '' only at end of file (blank lines are '\n').
            if not line:
                break
            word_index[line.strip()] = i
    return word_index

In [4]:
def load_texts(directory, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
    """Read and normalise every text file found directly inside `directory`.

    Each file is read in full, short self-closing tags (matched by the
    pattern below, e.g. ``<br />``) are replaced with a space, the text is
    stripped and lower-cased, and every character listed in `filters` is
    deleted.

    Args:
        directory: path of the folder whose entries are read.
        filters: string of characters to remove from each text.

    Returns:
        list[str]: one cleaned text per file, ordered by sorted file name.
    """
    # Map every filtered character to None so str.translate() drops it.
    trans_table = str.maketrans(dict.fromkeys(filters))

    texts = []
    # os.listdir() returns entries in arbitrary order; sort so that repeated
    # runs produce the samples in the same order.
    for fname in sorted(os.listdir(directory)):
        with tf.gfile.GFile(os.path.join(directory, fname)) as f:
            text = f.read()
        text = re.sub(r'<.{,6}?/>', ' ', text).strip().lower()
        texts.append(text.translate(trans_table))

    return texts

In [5]:
def load_data(directory):
    """Load the positive and negative reviews under `directory`.

    Texts are read with load_texts() from the 'pos' and 'neg' sub-folders.
    Positive samples come first in the returned arrays, followed by the
    negative ones.

    Args:
        directory: dataset root containing 'pos' and 'neg' folders.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the concatenation of the cleaned
        texts and ``y`` is a boolean array (True = positive) of equal length.
    """
    x_pos = load_texts(os.path.join(directory, 'pos'))
    x_neg = load_texts(os.path.join(directory, 'neg'))

    # The `np.bool` alias was removed from NumPy; the builtin `bool` yields
    # the same boolean dtype on every NumPy version.
    y_pos = np.ones(len(x_pos), dtype=bool)
    y_neg = np.zeros(len(x_neg), dtype=bool)

    x = np.concatenate((x_pos, x_neg))
    y = np.concatenate((y_pos, y_neg))

    return x, y

In [6]:
def texts_to_sequences(texts, word_index):
    """Convert whitespace-tokenised texts into lists of word ids.

    Args:
        texts: iterable of strings.
        word_index: mapping from word to id.

    Returns:
        list[list]: for each text, the ids of its tokens in order; a token
        missing from `word_index` is encoded as 0.
    """
    return [
        [word_index.get(token, 0) for token in text.split()]
        for text in texts
    ]

In [7]:
def pad_sequences(sequences, maxlen, value=0):
    """Pack variable-length id sequences into a fixed-width uint16 matrix.

    Sequences longer than `maxlen` keep only their first `maxlen` items;
    shorter ones are right-padded with `value`.

    Args:
        sequences: sized collection of id sequences.
        maxlen: number of columns in the result.
        value: fill value for padded positions.

    Returns:
        numpy.ndarray: array of shape ``(len(sequences), maxlen)`` and dtype
        ``uint16``.
    """
    padded = np.empty((len(sequences), maxlen), dtype=np.uint16)
    padded.fill(value)

    for row, seq in enumerate(sequences):
        # Slicing caps the length at maxlen, so one assignment handles both
        # the truncation case and the padding case.
        head = seq[:maxlen]
        padded[row, :len(head)] = head

    return padded

In [8]:
def batch(x, y, batch_size=32, shuffle=True):
    """Split paired arrays into consecutive mini-batches.

    Args:
        x: NumPy array of samples, indexed along its first axis.
        y: NumPy array of labels with the same length as `x`.
        batch_size: maximum number of samples per batch; must be >= 1.
        shuffle: when True, apply one shared random permutation to `x` and
            `y` (using NumPy's global random state) before splitting.

    Returns:
        tuple: ``(x_batched, y_batched)``, two lists of arrays. Every batch
        holds `batch_size` samples except possibly the last, which holds the
        remainder. Empty input yields two empty lists.

    Raises:
        ValueError: if `batch_size` is smaller than 1 (which previously
            caused an endless loop) or if `x` and `y` differ in length.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    n_samples = x.shape[0]
    if len(y) != n_samples:
        raise ValueError('x and y must have the same number of samples '
                         '({} != {})'.format(n_samples, len(y)))

    if shuffle:
        # A single permutation keeps each sample aligned with its label.
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        x = x[indices]
        y = y[indices]

    # range() yields no start offsets for empty input, so no empty batch is
    # emitted; slicing past the end naturally shortens the final batch.
    starts = range(0, n_samples, batch_size)
    x_batched = [x[i:i + batch_size] for i in starts]
    y_batched = [y[i:i + batch_size] for i in starts]

    return x_batched, y_batched

In [9]:
def my_model(inputs, vocab_size=None, embedding_dim=128, rnn_units=128,
             dense_units=1024, l2_scale=None):
    """Build the bidirectional-LSTM classifier graph and return its logits.

    Pipeline: embedding lookup -> bidirectional LSTM -> concatenate both
    directions and flatten all time steps -> ReLU dense layer -> 2-unit
    linear output layer.

    Args:
        inputs: integer tensor of word ids. The time dimension must be
            statically known because all time steps are flattened before the
            dense layer.
        vocab_size: number of rows in the embedding matrix. Defaults to the
            notebook-level ``MAX_VOCAB`` so the table always matches the ids
            produced by the vocabulary (previously hard-coded to 20000).
        embedding_dim: width of each word embedding.
        rnn_units: number of units in each LSTM direction.
        dense_units: width of the hidden dense layer.
        l2_scale: optional L2 penalty scale for the dense kernels. The
            default ``None`` attaches no regularizer. The original code
            passed the ``l2_regularizer`` factory itself rather than
            ``l2_regularizer(scale)``, so it never produced a penalty; the
            default keeps that effective behaviour. When a scale is given,
            the penalties are only collected as regularization losses; the
            caller must add them to the training loss (e.g. via
            ``tf.losses.get_regularization_loss()``) for them to take effect.

    Returns:
        Tensor: unnormalised class scores with 2 columns.
    """
    if vocab_size is None:
        vocab_size = MAX_VOCAB

    def _init():
        # Fresh initializer per variable, same distribution everywhere.
        return tf.truncated_normal_initializer(stddev=0.05)

    regularizer = tf.contrib.layers.l2_regularizer(l2_scale) if l2_scale else None

    embedding_matrix = tf.get_variable('embedding_matrix',
                                       [vocab_size, embedding_dim],
                                       initializer=_init())
    word_embeddings = tf.nn.embedding_lookup(embedding_matrix, inputs)

    cell_fw = tf.nn.rnn_cell.LSTMCell(rnn_units, initializer=_init())
    cell_bw = tf.nn.rnn_cell.LSTMCell(rnn_units, initializer=_init())

    # `x` is the (forward, backward) pair of per-step outputs; the final
    # states are not used.
    x, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, word_embeddings,
                                               dtype=tf.float32)

    # Join both directions on the feature axis, then flatten time x features
    # into a single vector per example.
    x = tf.concat(x, -1)
    x = tf.reshape(x, [-1, x.shape[1]*x.shape[2]])
    x = tf.layers.dense(x, dense_units, activation=tf.nn.relu,
                        kernel_initializer=_init(),
                        kernel_regularizer=regularizer)

    logits = tf.layers.dense(x, 2,
                             kernel_initializer=_init(),
                             kernel_regularizer=regularizer)

    return logits

In [10]:
# Vocabulary shared by both splits.
word_index = load_vocab()

# Training split: texts -> id sequences -> fixed-length matrix -> shuffled batches of 128.
train_texts, train_labels = load_data(TRAIN_DIR)
train_ids = pad_sequences(texts_to_sequences(train_texts, word_index), maxlen=SEQ_LEN)
x_train, y_train = batch(train_ids, train_labels, 128)

# Test split: same pipeline, but batches keep the loaded order (no shuffling).
test_texts, test_labels = load_data(TEST_DIR)
test_ids = pad_sequences(texts_to_sequences(test_texts, word_index), maxlen=SEQ_LEN)
x_test, y_test = batch(test_ids, test_labels, 128, False)

In [15]:
# Start from an empty graph so re-running this cell does not clash with
# variables created by a previous run.
tf.reset_default_graph()

# Inputs: a batch of id sequences of length SEQ_LEN and one integer label each.
x = tf.placeholder(tf.int32, shape=[None, SEQ_LEN], name='x')
y = tf.placeholder(tf.int32, shape=[None,], name='y')

# Give the model output a stable name in the graph.
logits = tf.identity(my_model(x), name='logits')

# Loss, per-example correctness and the training op.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
cost = tf.reduce_mean(per_example_loss)
predicted_class = tf.argmax(logits, axis=1, output_type=tf.int32)
correct_pred = tf.equal(predicted_class, y)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-5).minimize(cost)

In [16]:
epochs = 10

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    for epoch in range(1, epochs + 1):

        # Per-batch mean losses and per-example correctness flags for this epoch.
        train_losses, train_hits = [], []
        val_losses, val_hits = [], []

        # Training pass: one optimizer step per batch.
        for batch_x, batch_y in zip(x_train, y_train):
            feed = {x: batch_x, y: batch_y}
            _, batch_loss, batch_hits = sess.run((optimizer, cost, correct_pred), feed_dict=feed)
            train_losses.append(batch_loss)
            train_hits.append(batch_hits)

        # Evaluation pass on the test batches: no optimizer step is run.
        for batch_x, batch_y in zip(x_test, y_test):
            feed = {x: batch_x, y: batch_y}
            batch_loss, batch_hits = sess.run((cost, correct_pred), feed_dict=feed)
            val_losses.append(batch_loss)
            val_hits.append(batch_hits)

        # Accuracy is the fraction of correct predictions over all examples;
        # the reported loss is the plain mean of the per-batch losses.
        acc_train = np.concatenate(train_hits, axis=0).mean()
        acc_val = np.concatenate(val_hits, axis=0).mean()
        train_report = 'epoch {}: loss_train: {:.4f} , acc_train: {:.4f}'.format(
            epoch, np.mean(train_losses), acc_train)
        val_report = 'loss_val: {:.4f} , acc_val: {:.4f}'.format(np.mean(val_losses), acc_val)
        print(train_report, end=' , ')
        print(val_report)

epoch 1: loss_train: 0.6872 , acc_train: 0.5530 , loss_val: 0.6800 , acc_val: 0.5923
epoch 2: loss_train: 0.6629 , acc_train: 0.6337 , loss_val: 0.6515 , acc_val: 0.6364
epoch 3: loss_train: 0.6144 , acc_train: 0.6788 , loss_val: 0.5992 , acc_val: 0.6796
epoch 4: loss_train: 0.5510 , acc_train: 0.7204 , loss_val: 0.5519 , acc_val: 0.7156
epoch 5: loss_train: 0.4956 , acc_train: 0.7602 , loss_val: 0.5179 , acc_val: 0.7389
epoch 6: loss_train: 0.4502 , acc_train: 0.7902 , loss_val: 0.4942 , acc_val: 0.7586
epoch 7: loss_train: 0.4139 , acc_train: 0.8116 , loss_val: 0.4786 , acc_val: 0.7696
epoch 8: loss_train: 0.3845 , acc_train: 0.8272 , loss_val: 0.4683 , acc_val: 0.7772
epoch 9: loss_train: 0.3598 , acc_train: 0.8435 , loss_val: 0.4613 , acc_val: 0.7824
epoch 10: loss_train: 0.3381 , acc_train: 0.8544 , loss_val: 0.4567 , acc_val: 0.7872
