# TensorFlow Tutorial: word2vec skip-gram word embeddings
https://www.tensorflow.org/tutorials/word2vec

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random

from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

  return f(*args, **kwds)


In [2]:
# Base URL of the download site; maybe_download() appends the requested filename to it.
url = 'http://mattmahoney.net/dc/'

In [3]:
def maybe_download(filename, expected_bytes):
    """Fetch `filename` into the temp directory if absent and check its size.

    The file is looked up under `gettempdir()`. When it is not there, it is
    retrieved from the module-level `url` joined with `filename`.

    Args:
        filename: name of the file, relative to both `url` and the temp dir.
        expected_bytes: size in bytes the local file must have.

    Returns:
        Path of the verified local file.

    Raises:
        Exception: if the local file size differs from `expected_bytes`.
    """
    target_path = os.path.join(gettempdir(), filename)
    already_present = os.path.exists(target_path)
    if not already_present:
        target_path, _ = urllib.request.urlretrieve(url + filename, target_path)

    actual_bytes = os.stat(target_path).st_size
    if actual_bytes != expected_bytes:
        # Show the size we actually got before failing, to help diagnose a bad download.
        print(actual_bytes)
        raise Exception('Failed to verify '+target_path+'. Can you get to it with a browser?')

    print('Found and verified', filename)
    return target_path
    

In [4]:
# filename = maybe_download('text8.zip', 31344016)

In [5]:
def read_data(filename):
    """Return the first member of the zip archive `filename` as a list of words.

    The member's bytes are converted with `tf.compat.as_str` and split on
    whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
        text = tf.compat.as_str(raw_bytes)
        data = text.split()
    return data

In [6]:
# Load the corpus from a local text8.zip, resolved relative to the working directory.
# NOTE(review): the maybe_download() call in the previous cell is commented out,
# so the archive must already exist locally for this cell to run.
filename = 'text8.zip'
vocabulary = read_data(filename)

# Number of words read from the archive.
print('Data size', len(vocabulary))

Data size 17005207


In [7]:
# Number of distinct word codes: passed to build_dataset() as n_words and used
# as the first dimension of the embedding and NCE weight matrices.
vocabulary_size = 50000

In [10]:
def build_dataset(words, n_words):
    """Encode a word sequence as integer codes over a capped vocabulary.

    The `n_words - 1` most frequent words receive codes 1, 2, ... in
    descending frequency order; every other word is mapped to code 0 ('UNK').

    Args:
        words: sequence of word strings.
        n_words: vocabulary size including the 'UNK' entry.

    Returns:
        (data, count, dictionary, reversed_dictionary):
        data -- list of integer codes, one per input word;
        count -- list starting with ['UNK', <number of code-0 words>] followed
            by (word, occurrences) tuples for the kept words;
        dictionary -- map from word to code;
        reversed_dictionary -- map from code to word.
    """
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(n_words - 1)

    # Codes are handed out in the order words appear in `count`.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    # Words missing from the dictionary fall back to code 0, i.e. 'UNK'.
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    reversed_dictionary = {code: token for token, code in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [11]:
# data - list of codes (each word is replaced by its integer code)
# count - list of [word, occurrence count] entries, with the 'UNK' entry first
# dictionary - map of words (strings) to their codes (integers)
# reverse_dictionary - map of codes (integers) to words (strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)

In [12]:
# The raw word list is not referenced by the cells below (they use the coded
# `data`); drop the name so the list can be garbage-collected.
del vocabulary

In [13]:
# Show the 'UNK' entry followed by the four most frequent words with their counts.
print('Most common words (+UNK)', count[:5])

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]


In [14]:
# Show the first ten word codes next to the words they decode to.
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [15]:
# Cursor into `data`; generate_batch() reads and advances it via `global data_index`.
data_index = 0

In [16]:
# Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level `data` list.

    Reads and advances the module-level cursor `data_index`, so consecutive
    calls walk through `data` and wrap around to its start at the end.

    Args:
        batch_size: number of (target, context) pairs to emit; must be a
            multiple of `num_skips`.
        num_skips: number of context words sampled for each target word; at
            most `2 * skip_window`.
        skip_window: number of words considered on each side of the target.

    Returns:
        (batch, labels): `batch` is an int32 array of shape (batch_size,)
        holding target word codes; `labels` is an int32 array of shape
        (batch_size, 1) holding the matching context word codes.

    Raises:
        AssertionError: if the arguments violate the constraints above.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [skip_window target skip_window]
    # Window positions other than the centre (target) slot; identical for every
    # iteration, so computed once.
    context_words = [w for w in range(span) if w != skip_window]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            # Wrap around to the start of the data. A deque does not support
            # slice assignment (`buffer[:] = ...` raises TypeError); extending a
            # deque bounded by maxlen=span with `span` items replaces its
            # whole content instead.
            buffer.extend(data[0:span])
            data_index = span
        else:
            # Slide the window one word to the right.
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words at the end of a batch.
    data_index = (data_index + len(data) - span) % len(data)

    return batch, labels
    

In [17]:
# Smoke-test generate_batch(): build a small batch and print every
# (target -> context) pair as both integer code and decoded word.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
         '->', labels[i, 0], reverse_dictionary[labels[i,0]])

3081 originated -> 5234 anarchism
3081 originated -> 12 as
12 as -> 3081 originated
12 as -> 6 a
6 a -> 195 term
6 a -> 12 as
195 term -> 2 of
195 term -> 6 a


In [18]:
# Hyperparameters for batch generation and the model.
batch_size = 128  # (target, context) pairs fed per training step
embedding_size = 128  # dimension of each word embedding vector
skip_window = 1  # words considered on each side of the target word
num_skips = 2  # context words sampled for each target word
num_sampled = 64  # passed to tf.nn.nce_loss as num_sampled (negative labels drawn)

In [20]:
# Pick `valid_size` distinct word codes at random from the first `valid_window`
# codes; their nearest neighbours are printed during training. build_dataset()
# assigns codes in descending frequency order, so low codes are common words.
# NOTE(review): no random seed is set, so the chosen words differ between runs.
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [21]:
# Dedicated graph that holds every op of the model; the training session is bound to it.
graph = tf.Graph()

In [22]:
# Build the skip-gram model inside `graph`: embedding lookup, NCE loss, SGD
# step, and a cosine-similarity op used to inspect nearest neighbours.
with graph.as_default():
    # Inputs: a batch of target word codes and one context word code per target.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Fixed word codes whose nearest neighbours are reported during training.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Variables and the embedding lookup are pinned to the CPU.
    with tf.device('/cpu:0'):
        # One embedding_size-dimensional row per vocabulary entry, initialised
        # uniformly between -1.0 and 1.0.
        embeddings = tf.Variable(tf.random_uniform(
            [vocabulary_size, embedding_size], -1.0, 1.0))
        # Rows of `embeddings` selected by the input word codes.
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        # Weights and biases of the NCE output layer, one row/entry per vocabulary word.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                               stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                      biases=nce_biases,
                      labels=train_labels,
                      inputs=embed,
                      num_sampled=num_sampled,
                      num_classes=vocabulary_size)
        )
    
    # SGD optimizer using learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Compute the cosine similarity between the validation examples and all
    # embeddings: rows are scaled to unit L2 norm, so the dot product below is
    # the cosine of the angle between them.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1, keep_dims=True))
    normalized_embeddings = embeddings/norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    # Result has one row per validation word and one column per vocabulary word.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    
    # Add variable initializer
    init = tf.global_variables_initializer()
    

In [23]:
# Number of training steps (one batch per step). The training loop reports on
# steps that are multiples of 2000 and 10000, so the extra step makes step
# 100000 itself get reported.
num_steps = 100001

In [29]:
# Train the model: run one SGD step per generated batch, report the running
# average loss every 2000 steps and the nearest neighbours of the validation
# words every 10000 steps, then keep the normalized embeddings.
with tf.Session(graph=graph) as session:
    # All variables must be initialized before they are used.
    init.run()
    print('initialized.')
    
    # Sum of the losses seen since the last report.
    average_loss = 0
    
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        # Evaluating `optimizer` performs one update step; `loss` is fetched
        # in the same run call.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += loss_val
        
        if step % 2000 == 0:
            # At step 0 only a single loss has been accumulated, so it is
            # printed undivided; afterwards the sum covers the last 2000 batches.
            if step > 0:
                average_loss /= 2000
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
        
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Sort by descending similarity; position 0 is skipped because
                # it is presumably the word itself (cosine similarity 1 with
                # its own embedding).
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s' % (log_str, close_word)
                    
                print(log_str)
    # Evaluated once after the loop, while the session is still open; used by
    # the visualization cell.
    final_embeddings = normalized_embeddings.eval()

initialized.
Average loss at step  0 :  286.504394531
Nearest to has: malaria bethlen cistercian giacomo omer rite gyeonggi guayaquil
Nearest to seven: bonuses structure toss lenten tsarevich merchandise snowmobile andr
Nearest to may: flywheels kaf prospectors michaux avogadro deflate loft eoin
Nearest to on: assay afb filial machined exquisite humus newspaper dumont
Nearest to four: collapses suitor pavlov chimneys venerable uniqueness schirra jacobitism
Nearest to his: writable tendencies saloon irons plot damien lena tombstone
Nearest to more: uss email purse hobbit automation outpost horn whitney
Nearest to this: kyle bouvines descriptions poem dalit bipedalism lapis sandhi
Nearest to who: underdogs diverges gepids stiff dixon transistors honoring answer
Nearest to d: khalil align webmineral feat despotism personification browsing fu
Nearest to some: pharmacy coriander ben undoing distributist hoyle storks stanislavski
Nearest to there: mena scrimmage comparably sorcerers celesta 

Nearest to from: in at between after into and on eight
Average loss at step  62000 :  4.96239105809
Average loss at step  64000 :  4.88086432087
Average loss at step  66000 :  4.78974966991
Average loss at step  68000 :  4.82754983079
Average loss at step  70000 :  4.86707739472
Nearest to has: had have was is eugenic cc became dsn
Nearest to seven: six eight five four three nine zero two
Nearest to may: can would must will could should might cannot
Nearest to on: in at during wootz mtu parakeet striping glas
Nearest to four: six five seven three eight two nine one
Nearest to his: their her its the kiang s some modula
Nearest to more: less most adventurer mode brie writers really desire
Nearest to this: which it the that brie nf one crescent
Nearest to who: he which they and there conforming entities it
Nearest to d: b khalil o novello pope UNK township fenrir
Nearest to some: many their all these other several each certain
Nearest to there: it they he which brie usually often modula
N

In [30]:
def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    Relies on the module-level name `plt` (matplotlib.pyplot) being bound
    before the function is called.

    Args:
        low_dim_embs: 2-D array indexed as `[i, :]`, with two values (x, y)
            per row and at least `len(labels)` rows.
        labels: sequence of text labels; `labels[i]` annotates row `i`.
        filename: path the figure is written to with `plt.savefig`.

    Raises:
        AssertionError: if there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'

    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # Put the label text slightly offset from the point it describes.
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    # Save once, after every point has been drawn. Saving inside the loop
    # rewrote the output file for every single label.
    plt.savefig(filename)


In [31]:
# Visualize the embeddings: project the first `plot_only` rows of
# `final_embeddings` to 2-D with t-SNE and save a labelled scatter plot to the
# temp directory. sklearn and matplotlib are treated as optional: if an import
# fails, an install hint is printed instead of raising.
try:
    from sklearn.manifold import TSNE
    # plot_with_labels() relies on the `plt` name bound by this import.
    import matplotlib.pyplot as plt
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    # Number of embedding rows (word codes 0 .. plot_only - 1) to plot.
    plot_only= 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    # NOTE: rebinds `labels`, which earlier held the context-word array
    # returned by generate_batch().
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
    
except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy')
    print(ex)