In [47]:
import collections
import glob
from itertools import chain
import numpy as np
import tensorflow as tf

In [2]:
files = glob.glob('*.txt')

words = []
for f in files:
    file = open(f)
    words.append(file.read())
    file.close()

words = list(chain.from_iterable(words))
words = ''.join(words)[:-1]
sentences = words.split('\n')

In [3]:
count = [['UNK', -1]]
count.extend(collections.Counter(''.join(sentences).split()).most_common(10))
count

[['UNK', -1],
 ('the', 667723),
 ('and', 324431),
 ('a', 322940),
 ('of', 289409),
 ('to', 268111),
 ('is', 211076),
 ('it', 190624),
 ('in', 186745),
 ('i', 175021),
 ('this', 150651)]

In [4]:
vocabulary_size = 50000

def build_dataset(sentences):
    words = ''.join(sentences).split()
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    
    unk_count = 0
    sent_data = []
    for sentence in sentences:
        data = []
        for word in sentence.split():
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count = unk_count + 1
            data.append(index)
        sent_data.append(data)
    
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return sent_data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(sentences)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:2])
# del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 63666], ('the', 667723), ('and', 324431), ('a', 322940), ('of', 289409)]
Sample data [[279, 174, 436, 7043, 46, 3381, 43, 3, 15, 16, 223, 1129, 72, 1706, 1168, 37, 1, 1308, 1604, 2142, 836, 4, 61, 39, 25, 53, 172, 9, 41, 117, 23, 453, 44, 99, 4, 1, 101, 89, 4, 178, 27, 2988, 8, 1, 4177, 2, 7043, 12, 104, 25, 1654, 399, 22, 2, 92, 1499, 364, 73, 304, 33, 61, 57, 9, 117, 23, 453, 1, 104, 70, 142, 64, 453, 44, 6, 3, 53, 9632, 34861, 9793, 9265, 1, 444, 6, 24, 269, 122, 14, 516, 35, 1236, 24, 12, 127, 72, 244, 322, 184, 86, 2, 273, 56, 3788, 4, 3, 17393, 4177, 26, 62, 13243, 721, 5, 29, 1705, 121, 7043, 414, 51, 70, 25, 69, 501, 1, 302, 95, 218, 4, 10, 3876, 7043, 703, 178, 30, 44, 9265, 12, 2988, 70, 25, 556, 134, 9265, 6, 2100, 5, 29, 1, 118, 17, 56, 2517, 15439, 40, 11238, 56, 1314, 129, 7, 13, 30, 9, 96, 78, 5, 387, 37, 1592, 7, 122, 34, 519, 8], [10, 6, 34, 464, 4, 134, 1, 2219, 4, 205, 105, 25, 1, 168, 3860, 2, 347, 39, 12, 64, 163, 276, 149, 129, 3,

In [5]:
skip_window = 2
instances = 0
for sentence  in data:
    instances += len(sentence)-2*skip_window
print(instances)    

11508957


In [68]:
skip_window = 2
context = np.zeros((instances,skip_window*2),dtype=np.int32)
labels = np.zeros((instances,1),dtype=np.int32)
doc = np.zeros((instances,1),dtype=np.int32)

k = 0
for doc_id, sentence  in enumerate(data):
    for i in range(skip_window, len(sentence)-skip_window):
        buffer = sentence[i-skip_window:i+skip_window+1]
        del buffer[skip_window]
        labels[k] = sentence[i]
        context[k] = buffer
        doc[k] = doc_id
        k += 1
        
shuffle_idx = np.random.permutation(k)
labels = labels[shuffle_idx]
doc = doc[shuffle_idx]
context = context[shuffle_idx]

In [124]:
batch_size = 256
context_window = 2*skip_window
embedding_size = 100 # Dimension of the embedding vector.
softmax_width = embedding_size # +embedding_size2+embedding_size3
num_sampled = 50 # Number of negative examples to sample.
sum_ids = np.repeat(np.arange(batch_size),context_window)

len_docs = len(data)

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    train_word_dataset = tf.placeholder(tf.int32, shape=[batch_size*context_window])
    train_doc_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

    segment_ids = tf.constant(sum_ids, dtype=tf.int32)

    word_embeddings = tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
    doc_embeddings = tf.Variable(tf.random_uniform([len_docs,embedding_size],-1.0,1.0))

    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, softmax_width],
                             stddev=1.0 / np.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed_words = tf.segment_sum(tf.nn.embedding_lookup(word_embeddings, train_word_dataset),segment_ids)
    embed_docs = tf.nn.embedding_lookup(doc_embeddings, train_doc_dataset)
    embed = embed_words+embed_docs#+embed_hash+embed_users

    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(tf.nn.nce_loss(softmax_weights, softmax_biases, embed,
                                   train_labels, num_sampled, vocabulary_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(0.5).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embed_docs), 1, keep_dims=True))
    normalized_doc_embeddings = embed_docs / norm

In [82]:
############################
# Chunk the data to be passed into the tensorflow Model
###########################
data_idx = 0
def generate_batch(batch_size):
    global data_idx

    if data_idx+batch_size<len_docs:
        batch_labels = labels[data_idx:data_idx+batch_size]
        batch_doc_data = doc[data_idx:data_idx+batch_size]
        batch_word_data = context[data_idx:data_idx+batch_size]
        data_idx += batch_size
    else:
        overlay = batch_size - (len_docs-data_idx)
        batch_labels = np.vstack([labels[data_idx:len_docs],labels[:overlay]])
        batch_doc_data = np.vstack([doc[data_idx:len_docs],doc[:overlay]])
        batch_word_data = np.vstack([context[data_idx:len_docs],context[:overlay]])
        data_idx = overlay
    batch_word_data = np.reshape(batch_word_data,(-1,1))

    return batch_labels, batch_word_data, batch_doc_data

In [83]:
num_steps = 200001
step_delta = int(num_steps/20)

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_labels, batch_word_data, batch_doc_data\
        = generate_batch(batch_size)
        feed_dict = {train_word_dataset : np.squeeze(batch_word_data),
                     train_doc_dataset : np.squeeze(batch_doc_data),
                     train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % step_delta == 0:
            if step > 0:
                average_loss = average_loss / step_delta
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0

    # Get the weights to save for later
    final_doc_embeddings = normalized_doc_embeddings.eval()
    final_word_embeddings = word_embeddings.eval()
    final_word_embeddings_out = softmax_weights.eval()

Initialized
Average loss at step 0: 243.351120
Average loss at step 10000: 38.311708
Average loss at step 20000: 6.042729
Average loss at step 30000: 1.952032
Average loss at step 40000: 0.872490
Average loss at step 50000: 0.570455
Average loss at step 60000: 0.502472
Average loss at step 70000: 0.480899
Average loss at step 80000: 0.468121
Average loss at step 90000: 0.462165
Average loss at step 100000: 0.457372
Average loss at step 110000: 0.456215
Average loss at step 120000: 0.454574
Average loss at step 130000: 0.452886
Average loss at step 140000: 0.450593
Average loss at step 150000: 0.449832
Average loss at step 160000: 0.448379
Average loss at step 170000: 0.447042
Average loss at step 180000: 0.446207
Average loss at step 190000: 0.446497
Average loss at step 200000: 0.445484


In [134]:
rand_doc = np.random.randint(len_docs)
closest_doc = np.argmax(np.delete(final_doc_embeddings,rand_doc,0)
                        .dot(final_doc_embeddings[rand_doc][:,None]))
if closest_doc >= rand_doc:
    closest_doc += 1
    
print(final_doc_embeddings[rand_doc].dot(final_doc_embeddings[closest_doc][:,None])[0])

0.414069


In [135]:
sentences[rand_doc]

'wow my first review of this movie was so negative that it was not excepted i will try to tone this one down lets be real no one wants to see a chuck norris movie where he is not the main character there was a good fight scene at the end but the rest of the movie stank i have to wonder if old chuck just can t hang with the best any more has he slowed down so much that he has to turn out junk like this and hope that his reputation will carry him through the entire movie chuck is an awesome martial artist and as we have seen from walker texas ranger a fairly good actor but the trick is to combine both of these qualities in his movies and this one does not very disappointing for us norris fans chuck stay as the main character in your movies because this does not work for you gary'

In [136]:
sentences[closest_doc]

