In [1]:
import tensorflow as tf
import numpy as np
import collections
from math import ceil
import os
import nltk
import random
import math
import csv
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from matplotlib import pylab
import matplotlib.pyplot as plt

In [2]:
# Relative path of the corpus text file that the next cell opens and reads.
filename = "processed_data.txt"

In [3]:
# Read the corpus file; at this point `words` is a list of raw lines
# (each line still carries its trailing newline).
with open(filename, mode='r') as corpus_file:
    words = list(corpus_file)

In [4]:
# Merge the list of lines into one space-separated string.
# NOTE(review): `words` is rebound from a list to a str here, so this cell is
# not idempotent -- running it a second time would join individual characters.
words = " ".join(words)

In [5]:
def read_data(data_file):
    """Split corpus text into word tokens with NLTK.

    Args:
        data_file: the text to tokenize; it is handed unchanged to
            ``nltk.word_tokenize``.

    Returns:
        A new list holding the tokens in the order the tokenizer produced them.
    """
    print("READING DATA........")
    return list(nltk.word_tokenize(data_file))

# Tokenize the corpus text; from here on `words` is a list of tokens.
words = read_data(words)
# Sanity check: number of tokens and the first few of them.
print('Data size %d' % len(words))
print('First 10 words: ', words[:10])
    

READING DATA........
Data size 6868221
First 10 words:  ['description', 'of', 'farmer', 'oak', 'an', 'incident', 'when', 'farmer', 'oak', 'smiled']


## Storing data in different manners
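- dictionary: maps each word to an integer ID
- reverse_dictionary: maps each ID back to its word
- count: a list of (word, frequency) pairs, with the `UNK` entry first
- data: the corpus with every word replaced by its ID
 
These structures are used in the later cells to build training batches and to turn IDs back into readable words.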

In [7]:
vocabulary_size = 10000

def create_dataset(words, vocab_size=None):
    """Encode a tokenized corpus as integer IDs and build the lookup tables.

    The ``vocab_size - 1`` most frequent words each get their own ID, assigned
    in order of decreasing frequency starting at 1. Every other word is mapped
    to ID 0, which stands for the placeholder token ``'UNK'``.

    Args:
        words: list of word tokens.
        vocab_size: total number of IDs including ``'UNK'``. Defaults to the
            module-level ``vocabulary_size``.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the corpus as a list of IDs; ``count`` is a list whose
        first entry is ``['UNK', n_unknown]`` followed by ``(word, frequency)``
        tuples; ``dictionary`` maps word -> ID; ``reverse_dictionary`` maps
        ID -> word.

    Raises:
        AssertionError: if the corpus does not contain enough distinct words
            to fill the requested vocabulary.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    frequencies = collections.Counter(words)
    # A literal 'UNK' token in the corpus must not claim a second slot in the
    # vocabulary: that would overwrite ID 0 and hand out a duplicate ID.
    # Such tokens are simply treated as unknown words.
    frequencies.pop('UNK', None)

    # Only the most frequent words are kept; the rest are replaced with UNK.
    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocab_size - 1))

    # Word -> ID; the position in `count` is the ID, so UNK is always 0.
    dictionary = {word: word_id for word_id, (word, _) in enumerate(count)}

    # Corpus as IDs; anything outside the vocabulary falls back to 0 (UNK).
    data = [dictionary.get(word, 0) for word in words]

    # Total number of tokens that ended up as UNK.
    count[0][1] = data.count(0)

    # ID -> word, the inverse of `dictionary`.
    reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}

    # The embedding tables are sized by the vocabulary, so the sizes must agree.
    assert len(dictionary) == vocab_size, (
        'corpus has only %d distinct vocabulary entries, expected %d'
        % (len(dictionary), vocab_size))

    return data, count, dictionary, reverse_dictionary
    
# Build the ID-encoded corpus and the lookup tables from the token list.
data, count, dictionary, reverse_dictionary = create_dataset(words)
# Peek at the first encoded tokens and the head of the frequency table.
print('Data: ', data[:10])
print('Most common words: ' ,count[:10])


Data:  [0, 4, 162, 52, 42, 2138, 49, 162, 52, 1481]
Most common words:  [['UNK', 113980], ('the', 391991), ('and', 216126), ('a', 196601), ('of', 191339), ('to', 178043), ('in', 118910), ('i', 101234), ('was', 99654), ('it', 78347)]


## Create Batches of data for CBOW

In [9]:
data_index = 0

def create_cbow_batch(batch_size, window_size):
    """Produce one CBOW training batch from the module-level ``data`` list.

    A window of ``2 * window_size + 1`` consecutive IDs slides over ``data``
    (wrapping around at the end). For every window position the centre ID is
    the label and the remaining IDs, in order, are the context.

    The module-level cursor ``data_index`` is read and updated, so successive
    calls continue where the previous batch stopped.

    Args:
        batch_size: number of (context, target) examples to generate.
        window_size: number of context words taken on each side of the target.

    Returns:
        A tuple ``(batch, labels)`` of ``np.int32`` arrays with shapes
        ``(batch_size, 2 * window_size)`` and ``(batch_size, 1)``.
    """
    global data_index

    span = 2 * window_size + 1  # [window_size context, target, window_size context]

    batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # The deque drops its oldest element on append once it holds `span` items,
    # which is what makes the window slide.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size):
        window = list(buffer)
        # Everything except the centre element is context.
        batch[i, :] = window[:window_size] + window[window_size + 1:]
        labels[i, 0] = window[window_size]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # The window now sitting in `buffer` was loaded but never emitted. Step the
    # cursor back by `span` so the next call starts with exactly that window;
    # otherwise `span` target words are skipped between consecutive batches.
    data_index = (data_index - span) % len(data)
    return batch, labels

# Show a small batch for two window sizes, decoded back into words.
# Note that this loop rebinds the module-level `window_size`, `batch`,
# `labels` and `data_index`.
for window_size in (1, 2):
    data_index = 0  # restart at the beginning of the corpus for each demo run
    batch, labels = create_cbow_batch(batch_size=8, window_size=window_size)
    context_words = [[reverse_dictionary[word_id] for word_id in row] for row in batch]
    target_words = [reverse_dictionary[word_id] for word_id in labels.reshape(8)]
    print('\nwith window_size = %d:' % (window_size))
    print('    batch:', context_words)
    print('    labels:', target_words)
    
    


with window_size = 1:
    batch: [['UNK', 'farmer'], ['of', 'oak'], ['farmer', 'an'], ['oak', 'incident'], ['an', 'when'], ['incident', 'farmer'], ['when', 'oak'], ['farmer', 'smiled']]
    labels: ['of', 'farmer', 'oak', 'an', 'incident', 'when', 'farmer', 'oak']

with window_size = 2:
    batch: [['UNK', 'of', 'oak', 'an'], ['of', 'farmer', 'an', 'incident'], ['farmer', 'oak', 'incident', 'when'], ['oak', 'an', 'when', 'farmer'], ['an', 'incident', 'farmer', 'oak'], ['incident', 'when', 'oak', 'smiled'], ['when', 'farmer', 'smiled', 'the'], ['farmer', 'oak', 'the', 'corners']]
    labels: ['farmer', 'oak', 'an', 'incident', 'when', 'farmer', 'oak', 'smiled']


In [25]:
# Hyperparameters
batch_size = 128      # examples per training step
embedding_size = 128  # length of each word vector
window_size = 2       # context words taken on each side of the target

# Validation words used for the nearest-neighbour report.
# IDs are handed out by decreasing frequency, so the first draw picks among
# the 50 most frequent entries and the second among IDs 1000..1049.
valid_size = 16
valid_window = 50
valid_examples = np.concatenate([
    np.array(random.sample(range(valid_window), valid_size)),
    np.array(random.sample(range(1000, 1000 + valid_window), valid_size)),
])

num_sampled = 32 # Number of negative examples to sample.


In [26]:
# Clear the default graph before (re)building the model.
tf.reset_default_graph()
# Context word IDs: one row per example, 2*window_size columns.
train_dataset = tf.placeholder(dtype=tf.int32, shape=[batch_size, 2 * window_size])
# Target word ID for each example.
train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])
# Fixed set of word IDs used for the similarity report.
valid_dataset = tf.constant(value=valid_examples, dtype=tf.int32)

In [27]:
# Trainable variables.
# Embedding table: one row of length embedding_size per vocabulary entry,
# initialised uniformly in [-1, 1).
embeddings = tf.Variable(
    tf.random_uniform(shape=[vocabulary_size, embedding_size],
                      minval=-1.0, maxval=1.0, dtype=tf.float32))

# Output-layer weights and biases consumed by the sampled softmax loss.
softmax_weights = tf.Variable(
    tf.truncated_normal(shape=[vocabulary_size, embedding_size],
                        stddev=0.5 / math.sqrt(embedding_size),
                        dtype=tf.float32))
softmax_biases = tf.Variable(
    tf.random_uniform(shape=[vocabulary_size], minval=0.0, maxval=0.01))

In [28]:
# Model.
# Each column of the input placeholder holds one context position. We look up
# the embeddings column by column, stack them along a new trailing axis and
# average over that axis, giving one embedding_size vector per example.
print('Defining %d embedding lookups representing each word in the context'%(2*window_size))
# tf.stack replaces the manual reshape + incremental concat; the resulting
# tensor has the same [batch_size, embedding_size, 2*window_size] layout.
stacked_embedings = tf.stack(
    [tf.nn.embedding_lookup(embeddings, train_dataset[:, i])
     for i in range(2 * window_size)],
    axis=2)

assert stacked_embedings.get_shape().as_list()[2]==2*window_size
print("Stacked embedding size: %s"%stacked_embedings.get_shape().as_list())
mean_embeddings = tf.reduce_mean(stacked_embedings, 2, keepdims=False)
print("Reduced mean embedding size: %s"%mean_embeddings.get_shape().as_list())

# Sampled softmax loss: each step draws `num_sampled` negative classes instead
# of evaluating all `vocabulary_size` outputs. The inputs are the averaged
# context embeddings; minimising this loss updates the softmax weights, the
# biases and the embeddings.
loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=mean_embeddings,
                           labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

Defining 4 embedding lookups representing each word in the context
Stacked embedding size: [128, 128, 4]
Reduced mean embedding size: [128, 128]


In [29]:
# Training op: Adagrad with learning rate 1.0 applied to `loss`.
optimizer = tf.train.AdagradOptimizer(learning_rate=1.0).minimize(loss)

In [30]:
# Cosine similarity between the validation words and every vocabulary word.
# Each embedding row is divided by its L2 norm, so the dot product of two
# rows is their cosine similarity.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True))
normalized_embeddings = tf.divide(embeddings, norm)
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# transpose_b multiplies by the transposed embedding table, giving one row of
# similarities per validation word.
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [33]:
num_steps = 100001
cbow_losses = []

# ConfigProto carries the session configuration; allow_soft_placement is
# enabled here.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as session:

    # Initialize the variables in the graph
    tf.global_variables_initializer().run()
    print('Initialized')

    average_loss = 0
    top_k = 8  # number of nearest neighbours reported per validation word

    # Train the Word2vec model for num_steps iterations
    for step in range(num_steps):

        # Generate a single batch of data
        batch_data, batch_labels = create_cbow_batch(batch_size, window_size)

        # Run one optimizer step and fetch the loss for this batch
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, batch_loss = session.run([optimizer, loss], feed_dict=feed_dict)

        average_loss += batch_loss

        if (step + 1) % 2000 == 0:
            # Mean loss over the last 2000 batches.
            average_loss = average_loss / 2000
            cbow_losses.append(average_loss)
            print('Average loss at step %d: %f' % (step+1, average_loss))
            average_loss = 0

        # Evaluating validation set word similarities
        if (step + 1) % 10000 == 0:
            sim = similarity.eval()
            # For every validation word, list the top_k most similar words.
            # `valid_examples` (and therefore `sim`) holds more rows than
            # `valid_size`, so iterate over all of them; looping only up to
            # `valid_size` never showed the second half of the words.
            # Note: This is an expensive step
            for i in range(len(valid_examples)):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity; index 0 is skipped because
                # the most similar entry is normally the word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for neighbour_id in nearest:
                    log = '%s %s,' % (log, reverse_dictionary[neighbour_id])
                print(log)
    cbow_final_embeddings = normalized_embeddings.eval()


np.save('cbow_embeddings',cbow_final_embeddings)

# newline='' is what the csv module expects for files it writes; without it
# some platforms emit an extra blank line after the row.
with open('cbow_losses.csv', 'wt', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(cbow_losses)

Initialized
Average loss at step 2000: 0.888345
Average loss at step 4000: 0.814543
Average loss at step 6000: 0.783365
Average loss at step 8000: 0.786896
Average loss at step 10000: 0.789523
Nearest to were: buttons, joe, corrected, informant, marketday, bess, compounds, attending,
Nearest to for: culminated, blooming, creaks, density, narrowness, generalities, wandering, tides,
Nearest to one: convulsions, repliedi, paths, privilege, warm, olbscured, shame, twilight,
Nearest to was: changed, instincts, trained, murmuring, purse, villains, threatening, screen,
Nearest to and: pulse, profits, guessing, rarest, snakes, ho, resentment, francis,
Nearest to his: wilinot, plainly, neck, tripped, strangulation, picking, prepara, bliss,
Nearest to as: stalactite, wool, lanterns, affairs, weighted, delicate, hireling, holding,
Nearest to be: touched, have, laughing, tomb, leastwise, endeavours, corresponding, unassumingly,
Nearest to so: beg, conver, icicles, habit, dun, negative, impeachable

Average loss at step 62000: 0.577793
Average loss at step 64000: 0.575684
Average loss at step 66000: 0.559593
Average loss at step 68000: 0.570851
Average loss at step 70000: 0.548289
Nearest to were: are, upwards, enemy, fooled, ness, impressions, marketday, elderly,
Nearest to for: sluices, overtaken, spring, augured, fitness, culminated, hensible, eyeletholes,
Nearest to one: addition, repliedi, tophet, easier, fine, another, milestone, stalls,
Nearest to was: is, phases, wasnt, freshening, sowed, carrier, trained, lodged,
Nearest to and: guessing, shook, pore, incurable, purification, profits, bachelorship, lance,
Nearest to his: your, strangulation, tripped, ascent, sensations, liddys, plainly, hats,
Nearest to as: quenched, weighted, stalactite, doesnt, onward, terminated, affairs, curve,
Nearest to be: have, nonfulfilment, complements, use, tracks, sexajessamine, mystified, moth,
Nearest to so: dun, bond, gallantries, mainly, groom, garish, amble, rude,
Nearest to at: minddwelt