In [1]:
import numpy as np
import tensorflow as tf

In [2]:
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()

In [3]:
from string import punctuation
all_text = ''.join([c for c in reviews if c not in punctuation])
reviews = all_text.split('\n')

all_text = ' '.join(reviews)
words = all_text.split()

In [4]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

In [5]:
labels = labels.split('\n')
labels = np.array([1 if each == 'positive' else 0 for each in labels])

In [6]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [7]:
# Filter out that review with 0 length
reviews_ints = [each for each in reviews_ints if len(each) > 0]

In [8]:
seq_len = 200
features = np.zeros((len(reviews), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [9]:
split_frac = 0.8
split_idx = int(len(features)*0.8)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2501, 200)


In [10]:
lstm_size = 6
lstm_layers = 1
batch_size = 500
learning_rate = 0.001

In [11]:
n_words = len(vocab)

# Create the graph object
# Add nodes to the graph
inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [12]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 15

embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embedding, inputs_)

In [13]:

    # Your basic LSTM cell
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    
    # Getting an initial state of all zeros
initial_state = cell.zero_state(batch_size, tf.float32)

In [14]:

outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [15]:
predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
cost = tf.losses.mean_squared_error(labels_, predictions)
    
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [16]:
correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [18]:
epochs = 1

#saver = tf.train.Saver()

#saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                pred = []
                y_label = []
                x_input = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state, predict = sess.run([accuracy, final_state, predictions], feed_dict=feed)
                    val_acc.append(batch_acc)
                    x_input.append(x)
                    pred.append(predict)
                    y_label.append(y)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
                print("First Input Example", x_input[0][0])
                print("First Target Label",y_label[0][0])
                print("First Predicted Label", pred[0][0])
            iteration +=1
    #saver.save(sess, "./sentiment.ckpt")

Epoch: 0/1 Iteration: 5 Train loss: 0.252
Epoch: 0/1 Iteration: 10 Train loss: 0.253
Epoch: 0/1 Iteration: 15 Train loss: 0.252
Epoch: 0/1 Iteration: 20 Train loss: 0.252
Epoch: 0/1 Iteration: 25 Train loss: 0.253
Val acc: 0.522
First Input Example [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0    10    65   418   775  9318    13 43897    21
    43    28     5   278     8   214   119    21     2   353     8   136
     8  6186     2    89    23   528   231     8    13    39  1239    22
     3 10125 19391    17     3  3595     9     1  1454  2508     2   148
     1    78   138    34     8   402   378    23   395    21 12661  3486
    12    13   280    19    12    13    24     3 19322    16     3    50
    20     8    13    43     1   573   312     1    18    12    13    85
     7     7    10   

In [47]:
example2 = []
input_examples = x_input[0][0]
for i in range(200):
    if input_examples[i] != 0:
        
        example2.append(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(input_examples[i])])
print(example2,)

['i', 'really', 'liked', 'tom', 'barman', 's', 'awtwb', 'you', 'just', 'have', 'to', 'let', 'it', 'come', 'over', 'you', 'and', 'enjoy', 'it', 'while', 'it', 'lasts', 'and', 'don', 't', 'expect', 'anything', 'it', 's', 'like', 'sitting', 'on', 'a', 'caf', 'terrace', 'with', 'a', 'beer', 'in', 'the', 'summer', 'sun', 'and', 'watching', 'the', 'people', 'go', 'by', 'it', 'definitely', 'won', 't', 'keep', 'you', 'pondering', 'afterwards', 'that', 's', 'true', 'but', 'that', 's', 'not', 'a', 'prerequisite', 'for', 'a', 'good', 'film', 'it', 's', 'just', 'the', 'experience', 'during', 'the', 'movie', 'that', 's', 'great', 'br', 'br', 'i', 'felt', 'there', 'were', 'a', 'few', 'strands', 'that', 'could', 'have', 'been', 'worked', 'out', 'a', 'little', 'more', 'but', 'being', 'a', 'lynch', 'fan', 'i', 'don', 't', 'care', 'that', 'much', 'anymore', 'br', 'br', 'and', 'i', 'loved', 'the', 'style', 'or', 'flair', 'of', 'this', 'movie', 'it', 's', 'slick', 'but', 'fresh', 'and', 'the', 'soundtrack

In [29]:
#vocab_to_int["w"]
print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(10)])
print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(65)])
print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(419)])
print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(775)])
print(list(vocab_to_int.keys())[list(vocab_to_int.values()).index(9153)])


i
really
liked
tom
barman
