In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

In [2]:
import numpy as np

In [39]:
from six.moves import xrange

In [None]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [36]:
# Give a folder path as an argument with '--log_dir' to save
# TensorBoard summaries. Default is a 'log' folder in the current working
# directory.
# NOTE: this cell previously derived the base path from an undefined name
# (`path_to_give`), which raised NameError on a fresh kernel. The working
# directory is used instead, matching the documented default.
current_path = os.getcwd()

parser = argparse.ArgumentParser()
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'log'),
    help='The log directory for TensorBoard summaries.')
# parse_known_args() does not fail on arguments it does not recognise (for
# example the ones a notebook kernel is started with); they end up in `unparsed`.
FLAGS, unparsed = parser.parse_known_args()

# Create the directory for TensorBoard variables if it does not exist yet.
if not os.path.exists(FLAGS.log_dir):
    os.makedirs(FLAGS.log_dir)

The following steps are from Tensorflow's example word2vec implementation

Useful Links: <br>
    1. https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py 
    2. https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/
    3. https://github.com/adventuresinML/adventures-in-ml-code

__Step 1: Read the data into a list of strings__

In [4]:
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path to a zip archive. Only its first member (in
            `namelist()` order) is read.

    Returns:
        A list of strings, obtained by converting the member's bytes with
        `tf.compat.as_str` and splitting the result on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

In [13]:
# Load the corpus as one flat list of word tokens; 'text8.zip' is looked up
# relative to the current working directory.
vocabulary = read_data('text8.zip')

In [8]:
# Peek at the first five tokens of the corpus.
vocabulary[0:5]

['anarchism', 'originated', 'as', 'a', 'term']

In [6]:
# Total number of word tokens in the corpus (duplicates included, so this is
# not the number of unique words).
len(vocabulary)

17005207

In [7]:
# Vocabulary size = number of unique words in the corpus.
len(set(vocabulary))

253854

__Step 2: Build the dictionary and replace rare words with UNK token.__

In [8]:
vocabulary_size = 20000  # Upper limit on the vocabulary size (20,000 entries).
# build_dataset, defined in the next cell, keeps the (vocabulary_size - 1) most
# common words and marks every other word with the "UNK" ("unknown") token,
# which takes the remaining slot.

In [10]:
def build_dataset(words, n_words):
    """Encode a token list as integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for the 'UNK' token; the `n_words - 1` most frequent
    words receive ids 1, 2, ... in order of decreasing frequency. Every other
    word is encoded as 0.

    Args:
        words: Iterable of word tokens (iterated twice, so pass a list).
        n_words: Total vocabulary size, including the 'UNK' entry.

    Returns:
        A tuple `(data, count, dictionary, reversed_dictionary)`:
        data: list with the id of each token in `words`.
        count: `[['UNK', unk_count], (word, frequency), ...]`.
        dictionary: mapping word -> id.
        reversed_dictionary: mapping id -> word.
    """
    # The UNK frequency is not known yet; -1 is a placeholder patched below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Ids follow the order of `count`, so smaller ids mean more frequent words.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    # Tokens outside the dictionary fall back to id 0, i.e. dictionary['UNK'].
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [14]:
# Encode the corpus as integer ids over the capped vocabulary.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
# Hint to reduce memory. Note that this makes the cell non-re-runnable: the
# corpus has to be loaded again before build_dataset can be called a second time.
del vocabulary
print('Most common words (+UNK)', count[:5])  # common words including UNK token
# Blank separator line. With `print_function` imported, a bare `print` is just
# a reference to the function and prints nothing, so it must be called.
print()
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Read position into `data`; generate_batch() reads and advances it.
data_index = 0

Most common words (+UNK) [['UNK', 996665], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


__Step 3: Function to generate a training batch for the skip-gram model__

In other words: <br>
We create a data set of input words and their associated context words (the skip-grams), which is used to train our Word2Vec embeddings

__num_skips__ = How many times to reuse an input word to generate a label, i.e. how many (input, context) pairs are produced per input word <br>
__num_skips__ restricts the number of context words we use as output words <br>
The context words are drawn at random from the surrounding window; how many are drawn is given by __num_skips__ <br>
__skip_window__ = How many words to consider to the left and to the right of the input word <br>
__batch_size__ = The number of (input word, context word) pairs in one batch; it must be a multiple of __num_skips__ <br>
__buffer__ holds at most span (= 2 * skip_window + 1) elements and acts as a moving window over the data from which the samples are drawn <br>

In [18]:
# a function to generate mini-batches during our training
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one mini-batch of (input word, context word) id pairs.

    Reads from the module-level `data` list, starting at the module-level
    `data_index`, and moves `data_index` forward so that consecutive calls
    walk through the data (wrapping around at the end).

    Args:
        batch_size: Number of pairs to generate; must be a multiple of
            `num_skips`.
        num_skips: Number of context words sampled (without replacement) for
            each input word; at most `2 * skip_window`.
        skip_window: Number of words considered on each side of the input word.

    Returns:
        A tuple `(batch, labels)` of int32 arrays with shapes `(batch_size,)`
        and `(batch_size, 1)`: `batch[k]` is an input word id and
        `labels[k, 0]` is one of its context word ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    # E.g. with skip_window = 1 there are only 2 context words to pick from.
    assert num_skips <= 2 * skip_window

    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Sliding window over `data`; appending to a full deque drops the oldest id.
    window = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    window.extend(data[data_index:data_index + span])
    data_index += span

    # Positions inside the window that are context (everything but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for center in range(batch_size // num_skips):
        first_row = center * num_skips
        sampled_positions = random.sample(context_positions, num_skips)
        for offset, pos in enumerate(sampled_positions):
            batch[first_row + offset] = window[skip_window]  # the input word
            labels[first_row + offset, 0] = window[pos]  # one context word
        # Slide the window one word to the right, restarting at the beginning
        # of `data` once the end has been reached.
        if data_index == len(data):
            window.extend(data[0:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

__Step 4: Build a skip-gram model__

In [41]:
# Hyperparameters of the skip-gram model and of batch generation.
# generate_batch asserts that batch_size is a multiple of num_skips and that
# num_skips <= 2 * skip_window.
batch_size = 128  # Number of (input, label) pairs per training step.
embedding_size = 300  # Dimension of the embedding vector.
skip_window = 2  # How many words to consider left and right.
num_skips = 4  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

- The code __below__ randomly chooses 16 distinct integers from 0–99 – these are integer indexes among the 100 most frequent entries of our vocabulary (index 0 is the UNK token) <br>
- These will be the words we examine to assess how our learning is progressing in associating related words together in the vector-space

In [20]:
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# printing the nearest neighbours of a few words during training; they don't
# affect the calculation.
# NOTE: no random seed is set in the cells shown here, so a different set of
# words is drawn on every run.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draws valid_size distinct ids from range(valid_window), i.e. from 0..99.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

Overall Flow (similar to Andrew Ng's course): 
1. Do Forward Propagation (compute softmax)
2. Compute Cost (nce_loss)
3. Do Backward Propagation (here gradient descent optimizer)
4. Update Weights

Detailed Steps we do in below code: 
1. Set up __TensorFlow placeholders__ that will hold the integer indexes of our input words and 
    the context words which we are trying to predict <br>
2. TF Embedding Lookup fn: tf.nn.embedding_lookup(__embeddings__, __train_inputs__) <br>
    tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None, validate_indices=True, max_norm=None) <br> The function retrieves __ids__ rows of the __params__ tensor <br> The __partitioning strategy__ is useful for larger scale problems when the matrix might be too large to keep in one piece
3. Construct the variables for the softmax (here noise contrastive estimation softmax)
4. Construct the loss function (computed via nce_loss function in tensorflow) and keep track of the loss function
5. Construct the SGD optimizer using a learning rate of 1.0
6. Construct the cosine similarity between minibatch examples and all embeddings and performing validation 

In [24]:
# Only the computation graph is defined in this cell; nothing is computed yet.
# The graph is executed later, inside a tf.Session, by the training cell.
graph = tf.Graph()
with graph.as_default():

    # Input data.
    # train_inputs holds batch_size input word ids and train_labels the matching
    # context word ids (one per row); both are fed at every training step.
    # valid_dataset is a constant holding the validation word ids.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        # `embeddings` has one row of embedding_size values per vocabulary
        # entry, initialised uniformly in [-1, 1); `embed` picks the rows of
        # the words in the current batch.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        # (one weight row and one bias per vocabulary entry).
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                    tf.truncated_normal(
                            [vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #     http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
                tf.nn.nce_loss(
                        weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed,
                        num_sampled=num_sampled,
                        num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation words and all embeddings.
    # Each embedding row is divided by its L2 norm, so the matrix product below
    # yields cosine similarities: one row per validation word, one column per
    # vocabulary entry.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

__Step 5: Begin training__ the skip gram model

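1. How to open the TensorBoard summaries: 
    a. In a terminal, run `tensorboard --logdir=<log_dir>`, where `<log_dir>` is the directory the summaries were written to (`FLAGS.log_dir`, by default the `log` folder). Note that TensorBoard's flag is `--logdir`; `--log_dir` is this notebook's own argument.
    b. Then open 'localhost:6006' in a browser; the embedding projector is at "http://localhost:6006/#projector" 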

In [42]:
%%time
num_steps = 100001 #100K steps

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips,skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
        # Feed metadata variable to session for visualizing the graph in TensorBoard.
        _, summary, loss_val = session.run(
                [optimizer, merged, loss],
                feed_dict=feed_dict,
                run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8    # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(reverse_dictionary[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)
    

writer.close()

Initialized
Average loss at step  0 :  254.97625732421875
Nearest to as: constantine, harassment, istanbul, chlorophyll, eris, cartoons, ram, cheek,
Nearest to than: cohen, features, stoker, infancy, jerry, linguistics, reverend, seldom,
Nearest to they: zohar, wessex, deity, calls, sparrow, inter, especially, jose,
Nearest to but: plausible, preparing, egoism, wise, conrad, tenn, mmorpgs, charismatic,
Nearest to some: consisting, codes, flamenco, fueled, iss, bulgarian, honda, include,
Nearest to was: pearls, tariffs, thebes, angles, sacrificial, aleph, sensory, breeding,
Nearest to up: deism, hittites, drag, resort, illustrated, yielding, suit, bunny,
Nearest to over: quotas, counterpart, muppet, result, sketch, interpreters, ampere, snorri,
Nearest to years: weekend, aquaculture, lizard, enduring, king, guido, eduardo, embryo,
Nearest to four: roses, lex, corpse, companion, lasted, dynamically, turning, lawsuits,
Nearest to states: ceiling, evangelicals, parish, topography, chrono, 

Average loss at step  52000 :  4.898776931524277
Average loss at step  54000 :  4.906833098888397
Average loss at step  56000 :  4.925198348999023
Average loss at step  58000 :  4.773573890924454
Average loss at step  60000 :  4.86030411028862
Nearest to as: operatorname, for, UNK, gland, or, circ, by, is,
Nearest to than: or, for, but, and, three, four, operatorname, six,
Nearest to they: he, it, to, operatorname, that, not, acacia, bpp,
Nearest to but: operatorname, and, gland, coke, it, that, which, or,
Nearest to some: many, the, operatorname, emirates, two, that, and, this,
Nearest to was: is, were, be, operatorname, had, are, has, coke,
Nearest to up: operatorname, coke, deism, which, four, eight, albuquerque, seven,
Nearest to over: coke, five, two, three, to, four, zero, in,
Nearest to years: three, ariane, two, one, weekend, nine, coke, emirates,
Nearest to four: five, seven, three, six, eight, two, zero, nine,
Nearest to states: s, UNK, epistles, in, acacia, and, operatorname

__Step 6: Visualize the embeddings__

In [43]:
# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter-plot 2-D points, annotate each with its label and save the figure.

    Args:
        low_dim_embs: Array indexed as `low_dim_embs[i, :]`, where each row
            unpacks into an (x, y) pair; needs at least `len(labels)` rows.
        labels: Sequence of annotation texts; the i-th label belongs to row i.
        filename: Destination handed to `plt.savefig`.

    Raises:
        AssertionError: If there are more labels than rows in `low_dim_embs`.
    """
    n_points = low_dim_embs.shape[0]
    assert n_points >= len(labels), 'More labels than embeddings'

    # Keyword arguments shared by every annotation: the text is offset by
    # (5, 2) points from the point it labels.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')

    plt.figure(figsize=(18, 18))  # in inches
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)

    plt.savefig(filename)


# Project the embeddings of the first `plot_only` vocabulary entries to 2-D
# with t-SNE and save the labelled scatter plot as 'tsne.png'.
try:
    # The plotting dependencies are optional, so they are imported here and a
    # hint is printed when they are missing.
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(
            perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    # The image goes to the system temp directory. The previous placeholder
    # directory ("path to add") does not exist, so saving the figure failed.
    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))

except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(ex)

__Rough__

__Other ways of extracting Text8Corpus__

In [31]:
#from gensim.models.word2vec import Text8Corpus
#corpus=Text8Corpus('text8')
#corpus is a list of words similar to vocabulary that we produced above

In [12]:
def get_text8(filename='text8'):
    """Read a whitespace-tokenised corpus and encode it as integer word ids.

    Ids are assigned in order of first appearance, starting at 0. Unlike
    build_dataset, no frequency cut-off and no UNK token are applied.

    Args:
        filename: Path of the plain-text corpus. Defaults to 'text8',
            resolved relative to the current working directory.

    Returns:
        A tuple `(sents, word2idx)`: `sents` is a list containing a single
        list with the id of every token (the whole corpus is treated as one
        sentence), and `word2idx` maps each distinct word to its id.
    """
    # The context manager closes the file as soon as it has been read; the
    # handle was previously left open.
    with open(filename) as corpus_file:
        words = corpus_file.read()

    word2idx = {}
    token_ids = []
    for word in words.split():
        if word not in word2idx:
            # The next free id equals the number of words seen so far.
            word2idx[word] = len(word2idx)
        token_ids.append(word2idx[word])
    print("count:", len(word2idx))
    return [token_ids], word2idx

In [23]:
# Read the whole text8 corpus into memory; the context manager closes the
# file handle once reading is done (it was previously left open).
with open('text8') as text8_file:
    words = text8_file.read()

In [30]:
# Print the first 25 whitespace-separated tokens of the corpus, one per line.
for word in words.split()[0:25]:
    print (word)

anarchism
originated
as
a
term
of
abuse
first
used
against
early
working
class
radicals
including
the
diggers
of
the
english
revolution
and
the
sans
culottes


In [13]:
# Encode the corpus as word ids; get_text8 also prints the number of distinct words.
sentences, word2idx = get_text8()

count: 253854


In [26]:
# Number of tokens in the corpus: get_text8 returns the whole text as a single
# "sentence" (sentences[0]), so this is a token count, not a sentence count.
len(sentences[0])

17005207

In [21]:
# Number of distinct words in the corpus (one id per word).
len(word2idx)

253854