In [1]:
import tensorflow as tf
import numpy as np
import random
import re

# Directory handed to the TensorFlow summary FileWriter in the training cell.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
logs_path = "/home/ape/Jupyter/logs"
# Input files, given as relative paths (resolved against the working directory).
# vocab_file is only used for its line count; the two .feat files are read line
# by line and parsed in get_batch().
vocab_file = "aclImdb/imdb.vocab"
train_file = "aclImdb/train/labeledBow.feat"
test_file = "aclImdb/test/labeledBow.feat"


In [2]:
# Data loading and batching state.
# Each file is read fully into memory inside a `with` block so the handle is
# closed as soon as its lines have been read.
with open(train_file, "r") as fh:
    train_lines = fh.readlines()
random.shuffle(train_lines)
print("train size:", len(train_lines))

with open(test_file, "r") as fh:
    test_lines = fh.readlines()
random.shuffle(test_lines)
test_lines = test_lines[:1000]  # keep a random 1000-line subset for evaluation
print("test size:", len(test_lines))

# Only the number of lines in the vocabulary file is used: it fixes the length
# of every feature vector built by get_batch().
with open(vocab_file, "r") as fh:
    vocab = fh.readlines()
vocab_size = len(vocab)
print("Vocab size:",vocab_size)

# Read cursor shared with get_batch(); callers reset it to 0 before each pass.
current_line = 0

def get_batch(lines, batch_size):
    """Build the next `batch_size` examples from `lines`.

    Reading starts at the module-level cursor `current_line`, which is advanced
    by one for every line consumed. Each line is split on spaces, colons and
    newlines; only all-digit tokens are kept. The first number decides the
    label, the remaining numbers are consumed as (index, value) pairs that
    fill a vector of length `vocab_size`.

    Args:
        lines: sequence of text lines to read from.
        batch_size: number of lines to consume.

    Returns:
        A two-element list `[features, labels]` of numpy arrays: one int32 row
        of length `vocab_size` per example, and one single-element row per
        example holding 0 when the line's first number is greater than 5,
        otherwise 1.

    Raises:
        IndexError: if fewer than `batch_size` lines remain after the cursor
            (there is no wrap-around), or if a pair's index is outside the
            vector.
    """
    global current_line
    features = []
    labels = []

    for _ in range(batch_size):
        numbers = [int(tok) for tok in re.split("[ :\n]", lines[current_line]) if tok.isdigit()]
        current_line += 1

        # numbers[0] is the label source; after it come index/value pairs.
        vector = np.zeros(vocab_size, np.int32)
        for index, value in zip(numbers[1::2], numbers[2::2]):
            vector[index] = value

        features.append(vector)
        if numbers[0] > 5:
            labels.append([0])
        else:
            labels.append([1])

    return [np.asarray(features), np.asarray(labels)]

# example
# x, y = get_batch(train_lines, 5)
# print("batch x format:\n", x)
# print("batch y format:\n", y)

train size: 25000
test size: 1000
Vocab size: 89527


In [3]:
learning_rate = 0.01
num_labels = 1

# Start from an empty default graph. Without this, re-running the cell keeps
# adding ops to the same graph, and merge_all() below would also pick up the
# summaries of the previous run, which depend on placeholders that are no
# longer fed by the training loop.
tf.reset_default_graph()

# Inputs: one row of length vocab_size per example, one label column per example.
# with tf.name_scope('model'):
x = tf.placeholder(tf.float32, [None, vocab_size], name = 'x')
y = tf.placeholder(tf.int32, [None, num_labels], name = 'y')

# Parameters of a single linear layer, initialised from N(0, 0.1).
W = tf.Variable(tf.random_normal([vocab_size, num_labels], mean=0, stddev=0.1), name="W")
b = tf.Variable(tf.random_normal([1, num_labels], mean=0, stddev=0.1), name="b")

# Earlier softmax formulation, kept for reference (not executed):
# h = tf.nn.softmax(tf.matmul(x, W) + b)

# loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=h, labels=y))
# loss = tf.reduce_mean( -tf.reduce_sum( y*tf.log(h), reduction_indices=1))
# optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

# pred = tf.equal(tf.argmax(h, 1), tf.argmax(y, 1))
# accuracy = tf.reduce_mean(tf.cast(pred, tf.float32))

# Active model: sigmoid of the linear layer, trained with mean squared error.
h = tf.nn.sigmoid(tf.matmul(x, W) + b)

loss = cost = tf.losses.mean_squared_error(y, h)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# A prediction counts as correct when the rounded sigmoid output equals the label.
pred = tf.equal(tf.cast(tf.round(h), tf.int32), y)
accuracy = tf.reduce_mean(tf.cast(pred, tf.float32))


init = tf.global_variables_initializer()

# Scalar summaries written by the training loop.
tf.summary.scalar("loss", loss)
tf.summary.scalar("accuracy", accuracy)
merged = tf.summary.merge_all()

In [6]:
epochs = 5
batch_size = 250
# Integer division: a trailing partial batch is dropped in both passes.
train_iterations = len(train_lines)//batch_size
test_iterations = len(test_lines)//batch_size

log_period = 10  # print the training loss every `log_period` steps

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter(logs_path, sess.graph)
    # try/finally guarantees the event file is flushed and closed even when a
    # training or evaluation step raises.
    try:
        print("Initialized")

        for epoch in range(epochs):
            print("\nEpoch: {}/{}".format(epoch+1, epochs))
            current_line = 0  # rewind the cursor used by get_batch()
            avg_loss = 0.
            avg_acc = 0.

            for i in range(train_iterations):
                batch_x, batch_y = get_batch(train_lines, batch_size)

                _, l, a, s = sess.run([optimizer, loss, accuracy, merged], feed_dict={x: batch_x, y: batch_y})

                # Running means over the epoch.
                avg_loss += l / train_iterations
                avg_acc += a / train_iterations

                # Global step index keeps summaries from different epochs in order.
                writer.add_summary(s, i+epoch*train_iterations)

                if (i+1)%log_period==0:
                    print("step", i+1, "loss:", "{0:.5f}".format(l))

            print("Train accuracy:", avg_acc, "Average loss:", avg_loss)

            # Evaluation pass: only accuracy is computed, no optimizer step is run.
            current_line = 0
            avg_acc = 0.

            for i in range(test_iterations):
                batch_x, batch_y = get_batch(test_lines, batch_size)

                a = accuracy.eval(feed_dict={x: batch_x, y: batch_y})
                avg_acc += a / test_iterations

            print("Test accuracy:", avg_acc)
    finally:
        writer.close()
    

Initialized

Epoch 1
step 10 loss: 0.26713
step 20 loss: 0.16851
step 30 loss: 0.13145
step 40 loss: 0.14038
step 50 loss: 0.09475
step 60 loss: 0.10111
step 70 loss: 0.08782
step 80 loss: 0.08510
step 90 loss: 0.09846
step 100 loss: 0.08844
Train accuracy: 0.814440011382 average loss: 0.132092470415
Test accuracy: 0.866000026464

Epoch 2
step 10 loss: 0.06284
step 20 loss: 0.07092
step 30 loss: 0.05778
step 40 loss: 0.05822
step 50 loss: 0.04830
step 60 loss: 0.05041
step 70 loss: 0.04252
step 80 loss: 0.03865
step 90 loss: 0.05494
step 100 loss: 0.04366
Train accuracy: 0.94340005219 average loss: 0.0490690544061
Test accuracy: 0.879999995232

Epoch 3
step 10 loss: 0.03470
step 20 loss: 0.04574
step 30 loss: 0.03457
step 40 loss: 0.03456
step 50 loss: 0.02444
step 60 loss: 0.04106
step 70 loss: 0.02509
step 80 loss: 0.03604
step 90 loss: 0.03381
step 100 loss: 0.02928
Train accuracy: 0.970440072417 average loss: 0.0304984384682
Test accuracy: 0.867999970913

Epoch 4
step 10 loss: 0.02