In [23]:
import tensorflow as tf
import numpy as np
import glob
import math
import random
import re
from six.moves import xrange

vocab_file = "aclImdb/imdb.vocab"
train_file = "aclImdb/train/labeledBow.feat"
test_file = "aclImdb/test/labeledBow.feat"
train_pos_dir = "aclImdb/train/neg/*.txt"
train_neg_dir = "aclImdb/train/pos/*.txt"
test_pos_dir = "aclImdb/test/neg/*.txt"
test_neg_dir = "aclImdb/test/pos/*.txt"
stopwords_file = "aclImdb/stopwords.txt"


In [24]:
train_file_list = glob.glob(train_neg_dir, recursive=True) + glob.glob(train_pos_dir, recursive=True)
test_file_list = glob.glob(test_neg_dir, recursive=True) + glob.glob(test_pos_dir, recursive=True)

random.shuffle(train_file_list) 
random.shuffle(test_file_list)
print("Train files number:",len(train_file_list))
test_file_list = test_file_list[:1000]
print("Test files number:",len(test_file_list))

vocab = open(vocab_file, "r").readlines()
vocab_size = len(vocab)
print("Vocab size:",vocab_size)

id_to_word = {}
for i in range(vocab_size):
    id_to_word[i] = vocab[i][:-1]
word_to_id = {v: k for k, v in id_to_word.items()}

def clean(line):
    line = line.replace("<br />","")
    line = re.sub('[,.!?]', 'a', line)
    line = line.split()
    line = [x for x in line]
    return line


Train files number: 25000
Test files number: 1000
Vocab size: 89527


In [25]:
#TEXT UTILS
current_file = 0

def get_batch_for_embedding(file_list):
    global current_file
    file = file_list[current_file]
    current_file += 1
    
    batch_x = []
    batch_y = []
    
    lines = open(file, "r").readlines()
    for line in lines:
        line = clean(line)

        for i, word in enumerate(line):
            if word in word_to_id:
                if i != 0 and i < len(line)-1:
                    if line[i-1] in word_to_id and line[i+1] in word_to_id:
                        batch_x.append(word_to_id[word])
                        batch_x.append(word_to_id[word])
                        batch_y.append(word_to_id[line[i-1]])
                        batch_y.append(word_to_id[line[i+1]])
    batch_x = np.array(batch_x)#.reshape((len(batch_x), 1))
    batch_y = np.array(batch_y).reshape((len(batch_y), 1))
    return [batch_x, batch_y]

In [29]:
batch_x, batch_y = get_batch_for_embedding(train_file_list)
print(batch_x.shape)
print(batch_y.shape)

(54,)
(54, 1)


In [30]:
# EMBEDDING GRAPH

embed_size = 32    # Dimension of the embedding vector
num_sampled = 16    # Number of negative examples to sample.
emb_learning_rate = 1.0

# Validation samples: most frequent words
valid_size = 8      # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

emb_graph = tf.Graph()
with emb_graph.as_default():

    # input
    emb_x = tf.placeholder(tf.int32, shape=[None])
    emb_y = tf.placeholder(tf.int32, shape=[None, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # embeddings
    embeddings = tf.Variable( tf.random_uniform( [vocab_size, embed_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup( embeddings, emb_x)

    # variables for the NCE loss
    emb_W = tf.Variable( tf.truncated_normal( [vocab_size, embed_size], stddev=1.0/math.sqrt(embed_size)))
    emb_b = tf.Variable( tf.zeros( [vocab_size]))
    
    # trainning
    # avg NCE loss for a batch (automatically draws a new sample of the neg labels each time we eval the loss)
    emb_loss = tf.reduce_mean( tf.nn.nce_loss(weights=emb_W, biases=emb_b, labels=emb_y, inputs=embed, num_sampled=num_sampled, num_classes=vocab_size))
    emb_optimizer = tf.train.GradientDescentOptimizer(emb_learning_rate).minimize(emb_loss)

    # similarity
    norm = tf.sqrt( tf.reduce_sum( tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup( normalized_embeddings, valid_dataset)
    similarity = tf.matmul( valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()
    

In [38]:
print("\nTRAINING EMBEDDINGS\n")
training_epochs = 2
num_steps = 25000
print_step = 1000

with tf.Session(graph=emb_graph) as session:
    init.run()
    print('Initialized')

    for epoch in range(training_epochs):
        
        avg_loss = 0
        current_file = 0

        for step in xrange(num_steps):
            batch_x, batch_y = get_batch_for_embedding(train_file_list)
            feed_dict = {emb_x: batch_x, emb_y: batch_y}

            _, loss = session.run([emb_optimizer, emb_loss], feed_dict=feed_dict)
            if not math.isnan(loss/print_step):
                avg_loss += loss/print_step

            if (step+1) % print_step == 0:
                print('Avg loss at step', step+1, ':', avg_loss)
                avg_loss = 0

            if (step+1) % 5000 == 0:
                print("\nStep", step+1, "similarity eval:")
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = id_to_word[valid_examples[i]]
                    top_k = 6  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str=""
                    for k in xrange(top_k):
                        log_str += id_to_word[nearest[k]]+" "
                    print("Nearest to", valid_word, ":", log_str)
                print("")

    final_embeddings = normalized_embeddings.eval()



TRAINING EMBEDDINGS

Initialized
Avg loss at step 1000 : 66.8660494633
Avg loss at step 2000 : 51.5613375311
Avg loss at step 3000 : 44.1701982212
Avg loss at step 4000 : 40.1181295128
Avg loss at step 5000 : 36.341986237

Step 5000 similarity eval:
Nearest to been : must thin overall started seen familiar 
Nearest to than : feature feeling action exceptional fun forced 
Nearest to if : will why wanted kill told unique 
Nearest to them : out car once ita role exciting 
Nearest to all : turns final probably plot familiar tried 
Nearest to don't : can thema would leave loved preview 
Nearest to ? : kadal accomplish observably theoretically non-actors defensive 
Nearest to me : won indie need throws turns crap 

Avg loss at step 6000 : 34.2281350021
Avg loss at step 7000 : 32.0809573855
Avg loss at step 8000 : 29.8505212302
Avg loss at step 9000 : 28.3975401983
Avg loss at step 10000 : 26.2993593471

Step 10000 similarity eval:
Nearest to been : must thin hang seen overall familiar 
Near

In [39]:
# VISUALIZE EMBEDDINGS
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

# def plot_with_labels(low_dim_embs, labels, filename='embed.png'):
#     assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
#     plt.figure(figsize=(18, 18))    # in inches
#     for i, label in enumerate(labels):
#         x, y = low_dim_embs[i, :]
#         plt.scatter(x, y)
#         plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
#     plt.savefig(filename)

# tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
# plot_only = 500
# low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
# labels = [id_to_word[i] for i in xrange(plot_only)]
# plot_with_labels(low_dim_embs, labels)


In [128]:
#TEXT UTILS
from keras.preprocessing.sequence import pad_sequences

words_per_review = 50
batch_size = 250
current_file = 0

def get_tokenized(file_list):
    global current_file
    file = file_list[current_file]
    current_file += 1
    
    batch_x = []
    lines = open(file, "r").readlines()
    for line in lines:
        for i, word in enumerate(clean(line)):
            if word in word_to_id and word not in stopwords:
                batch_x.append(word_to_id[word])
    
    batch_x = np.array(pad_sequences([batch_x], maxlen = words_per_review)).reshape(words_per_review)
    
    file = [s for s in re.split("[._]",file)]
    val_y = int(file[-2])
    batch_y = [0] if val_y>5 else [1]
    batch_y = np.array(batch_y)
    return [batch_x, batch_y]

def get_batch_for_classification(file_list):
    x = []
    y = []
    for step in range(batch_size):
        batch_x, batch_y = get_tokenized(file_list)
        x.append(batch_x)
        y.append(batch_y)
    return [np.array(x), np.array(y)]

# example
# batch_x, batch_y = get_batch_for_classification(train_file_list)
# print("batch x format:", batch_x.shape)
# print("batch y format:", batch_y.shape)

In [131]:
# CLASSIFICATION GRAPH
cls_learning_rate = 0.01
num_labels = 1

cls_graph = tf.Graph()
with cls_graph.as_default():
   
    # input 
    cls_x = tf.placeholder(tf.int32, [None, words_per_review])
    cls_y = tf.placeholder(tf.float32, [None, num_labels])

    embed = tf.nn.embedding_lookup(final_embeddings, cls_x)
    embed_avg = tf.reduce_mean(embed, 1)
    
    # variables
    cls_W = tf.Variable(tf.random_normal([embed_size, num_labels], mean=0, stddev=0.1))
    cls_b = tf.Variable(tf.random_normal([num_labels], mean=0, stddev=0.1))

    cls_h = tf.nn.sigmoid(tf.matmul(embed_avg,cls_W) + cls_b)

    cls_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=cls_h, labels=cls_y))
    cls_optimizer = tf.train.GradientDescentOptimizer(cls_learning_rate).minimize(cls_loss)

#     cls_h = tf.nn.softmax(tf.matmul(embed_avg,cls_W) + cls_b)
#     cls_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=cls_h, labels=cls_y))
#     cls_loss = tf.reduce_mean(-tf.reduce_sum( cls_y*tf.log(cls_h), reduction_indices=1))
#     cls_pred = tf.equal(tf.argmax(cls_h, 1), tf.argmax(cls_y, 1))

    cls_pred = tf.equal(tf.round(cls_h), cls_y)
    cls_accuracy = tf.reduce_mean(tf.cast(cls_pred, tf.float32))

    init = tf.global_variables_initializer()


In [132]:
print("\nTRAINING CLASSIFICATION\n")
epochs = 5
train_iterations = len(train_file_list)//batch_size
test_iterations = len(test_file_list)//batch_size

with tf.Session(graph = cls_graph) as sess:
    sess.run(init)
    print("Initialized")
    
    for epoch in range(epochs):
        print("\nEpoch: {}/{}".format(epoch+1, epochs))
        
        current_file = 0
        avg_loss = 0.
        avg_acc = 0.
        
        for i in range(train_iterations):
            batch_x, batch_y = get_batch_for_classification(train_file_list)
#             print(batch_x)
            _,l, a = sess.run([cls_optimizer,cls_loss, cls_accuracy], feed_dict={cls_x: batch_x, cls_y: batch_y})
            
            avg_loss += l/train_iterations if not math.isnan(l) else 0
            avg_acc += a / train_iterations

            if (i+1)%10==0:
                print("step", i+1, "loss:", "{0:.5f}".format(l))
                    
        print("Train accuracy:", avg_acc, "Average loss:", avg_loss)
        
        current_file = 0
        avg_acc = 0.
        
        for i in range(test_iterations):
            batch_x, batch_y = get_batch_for_classification(test_file_list)
            
            a = cls_accuracy.eval(feed_dict={cls_x: batch_x, cls_y: batch_y})
            avg_acc += a/test_iterations if not math.isnan(a) else 0
            
        print("Test accuracy:", avg_acc, "\n")



TRAINING CLASSIFICATION

Initialized

Epoch: 1/5
step 10 loss: 0.70440
step 20 loss: 0.69986
step 30 loss: 0.73716
step 40 loss: 0.72376
step 50 loss: 0.71480
step 60 loss: 0.70594


KeyboardInterrupt: 

In [127]:
print(final_embeddings[0])
print(id_to_word[0])

[-0.04803156  0.30272895  0.30717245 -0.0146778   0.06636044 -0.05380407
 -0.09432425 -0.07076189  0.07958436 -0.12901697 -0.0738842  -0.35219225
  0.4181647  -0.15704927 -0.2716797  -0.07507171 -0.31720325  0.14018102
 -0.10465492  0.02995977  0.14486913  0.04291577  0.35885289  0.11590518
  0.00678463  0.06071221 -0.1491105  -0.08209922  0.05052676  0.0713219
  0.0322271  -0.14242215]
the
