In [48]:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import matplotlib as plt
import os
import random
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

In [2]:

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


# pylint: disable=redefined-outer-name
def maybe_download(filename, expected_bytes):
  """Return the local path of `filename`, fetching it first when it is absent.

  The file is looked for in the system temporary directory and retrieved from
  `url + filename` only when no file of that name exists there. Its size on
  disk must equal `expected_bytes`; otherwise the actual size is printed and
  an Exception is raised.
  """
  target_path = os.path.join(gettempdir(), filename)
  already_cached = os.path.exists(target_path)
  if not already_cached:
    target_path, _ = urllib.request.urlretrieve(url + filename, target_path)
  actual_bytes = os.stat(target_path).st_size
  if actual_bytes != expected_bytes:
    # Size mismatch: report what was found on disk, then give up.
    print(actual_bytes)
    raise Exception('Failed to verify ' + target_path +
                    '. Can you get to it with a browser?')
  print('Found and verified', filename)
  return target_path

In [3]:
# Fetch text8.zip into the temp directory (or reuse an existing copy) and check
# that it is exactly 31344016 bytes; `filename` holds the resulting local path.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [4]:

# Read the data into a list of strings.
def read_data(filename):
  """Return the whitespace-separated tokens of the first member of a zip file."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  text = tf.compat.as_str(raw_bytes)
  return text.split()

# Load the whole corpus into memory as one flat list of word tokens and report
# how many tokens it contains.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

Data size 17005207


In [7]:
# Peek at the first ten tokens of the corpus.
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [8]:
vocabulary_size = 50000
def build_dataset(words, n_words):
  """Encode a token sequence as integer ids over a frequency-capped vocabulary.

  Only the `n_words - 1` most frequent tokens receive their own id; every
  other token is encoded as 0, the id given to the 'UNK' entry.

  Returns a 4-tuple:
    data: ids of `words`, in the original order.
    count: [['UNK', n_unknown]] followed by (word, frequency) pairs, most
      frequent first.
    dictionary: word -> id, with ids handed out in `count` order.
    reversed_dictionary: id -> word.
  """
  count = [['UNK', -1]]  # Placeholder tally, patched once `data` is known.
  count += collections.Counter(words).most_common(n_words - 1)
  dictionary = {}
  for entry in count:
    dictionary[entry[0]] = len(dictionary)
  data = [dictionary.get(token, 0) for token in words]
  count[0][1] = data.count(0)  # Every 0 in `data` is an out-of-vocabulary hit.
  reversed_dictionary = {idx: token for token, idx in dictionary.items()}
  return data, count, dictionary, reversed_dictionary

# Encode the corpus: `data` is the id sequence, `count` the per-word tallies,
# and the two dicts translate between words and ids in either direction.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)

In [9]:
# First ten word ids of the encoded corpus.
data[:10]

[5240, 3084, 12, 6, 195, 2, 3135, 46, 59, 156]

In [10]:
# The UNK tally followed by the nine most frequent words and their counts.
count[:10]

[['UNK', 418391],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [13]:
# Word -> id mapping.
# NOTE(review): this displays the entire dict (one entry per vocabulary word);
# showing a small slice would keep the notebook output readable.
dictionary

{'rear': 4027,
 'grapple': 36748,
 'sichuan': 27350,
 'encircles': 44612,
 'natively': 19751,
 'valves': 9552,
 'scharnhorst': 30323,
 'tastes': 10477,
 'foreman': 22701,
 'kingston': 10713,
 'pannonia': 22515,
 'problems': 640,
 'groundbreaking': 16023,
 'convention': 1186,
 'unending': 46467,
 'onboard': 18395,
 'guerrilla': 5715,
 'sidewise': 46283,
 'zealand': 1637,
 'vers': 37838,
 'cochran': 33801,
 'szczecin': 49334,
 'vorkosigan': 48602,
 'ign': 41018,
 'insular': 13675,
 'vivekananda': 45234,
 'convoy': 17567,
 'lantau': 31660,
 'khanate': 27040,
 'lamp': 8322,
 'theosis': 45851,
 'cavaliers': 40562,
 'heroine': 19101,
 'laval': 25637,
 'lodging': 23737,
 'sharpe': 29191,
 'harpoon': 24046,
 'canisters': 47603,
 'sugarcane': 17627,
 'thucydides': 24173,
 'unhappy': 12177,
 'cruz': 9226,
 'jockeys': 27625,
 'enable': 6973,
 'araki': 43003,
 'homeopaths': 29192,
 'direct': 873,
 'quantities': 3883,
 'linden': 34346,
 'cessation': 17996,
 'abdomen': 14710,
 'degeneration': 23099,

In [14]:
# Id -> word mapping; ids follow frequency rank, with 0 reserved for 'UNK'.
# NOTE(review): this displays the entire dict (one entry per vocabulary word);
# showing a small slice would keep the notebook output readable.
reverse_dictionary

{0: 'UNK',
 1: 'the',
 2: 'of',
 3: 'and',
 4: 'one',
 5: 'in',
 6: 'a',
 7: 'to',
 8: 'zero',
 9: 'nine',
 10: 'two',
 11: 'is',
 12: 'as',
 13: 'eight',
 14: 'for',
 15: 's',
 16: 'five',
 17: 'three',
 18: 'was',
 19: 'by',
 20: 'that',
 21: 'four',
 22: 'six',
 23: 'seven',
 24: 'with',
 25: 'on',
 26: 'are',
 27: 'it',
 28: 'from',
 29: 'or',
 30: 'his',
 31: 'an',
 32: 'be',
 33: 'this',
 34: 'which',
 35: 'at',
 36: 'he',
 37: 'also',
 38: 'not',
 39: 'have',
 40: 'were',
 41: 'has',
 42: 'but',
 43: 'other',
 44: 'their',
 45: 'its',
 46: 'first',
 47: 'they',
 48: 'some',
 49: 'had',
 50: 'all',
 51: 'more',
 52: 'most',
 53: 'can',
 54: 'been',
 55: 'such',
 56: 'many',
 57: 'who',
 58: 'new',
 59: 'used',
 60: 'there',
 61: 'after',
 62: 'when',
 63: 'into',
 64: 'american',
 65: 'time',
 66: 'these',
 67: 'only',
 68: 'see',
 69: 'may',
 70: 'than',
 71: 'world',
 72: 'i',
 73: 'b',
 74: 'would',
 75: 'd',
 76: 'no',
 77: 'however',
 78: 'between',
 79: 'about',
 80: 'over'

In [15]:
# NOTE(review): once `vocabulary` is deleted, re-running this cell (or any
# earlier cell that reads `vocabulary`) raises NameError until the corpus is
# loaded again.
del vocabulary  # Hint to reduce memory.
# Sanity check: the head of the frequency table, then the first ten ids of the
# corpus next to the words they decode to.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])


Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5240, 3084, 12, 6, 195, 2, 3135, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [17]:

data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
  """Generate one skip-gram training batch from the global `data` sequence.

  A window of `2 * skip_window + 1` consecutive ids slides over `data`,
  starting at the global cursor `data_index`. For each window position the
  centre id is emitted `num_skips` times, each time paired with a different
  randomly chosen id from the rest of the window. When the cursor reaches the
  end of `data` the window restarts at the beginning.

  Args:
    batch_size: number of (input, label) pairs to produce; must be a multiple
      of `num_skips`.
    num_skips: number of context ids sampled per centre id; at most
      `2 * skip_window`.
    skip_window: number of ids considered on each side of the centre id.

  Returns:
    batch: int32 array of shape (batch_size,) holding the centre ids.
    labels: int32 array of shape (batch_size, 1) holding the context ids.

  Side effects:
    Advances the global `data_index` so the next call continues where this
    one stopped (wrapping around modulo len(data)).
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    # Not enough ids left for a full window: start over from the beginning.
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  # Window offsets of all context positions; identical for every centre id,
  # so compute them once instead of on every iteration.
  context_words = [w for w in range(span) if w != skip_window]
  for i in range(batch_size // num_skips):
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      # End of the corpus: restart the window at the beginning. A deque does
      # not support slice assignment (`buffer[:] = ...` raises TypeError), so
      # refill it with extend(); maxlen=span pushes the old contents out.
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Draw a small demo batch (8 pairs, 2 context words per centre word, window of
# 1 on each side). This also advances the global `data_index` cursor.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

In [18]:
# Centre-word ids of the demo batch; each id appears num_skips (= 2) times.
batch

array([3084, 3084,   12,   12,    6,    6,  195,  195])

In [19]:
# Context-word id paired with each entry of `batch`, as a column vector.
labels

array([[  12],
       [5240],
       [   6],
       [3084],
       [  12],
       [ 195],
       [   6],
       [   2]])

In [20]:
# The start of the id sequence again, for comparison with the demo batch.
data[:10]

[5240, 3084, 12, 6, 195, 2, 3135, 46, 59, 156]

In [21]:
# Decode the demo batch: print each (centre word -> context word) pair with
# its ids. The 8 matches the batch_size used for the demo batch.
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

3084 originated -> 12 as
3084 originated -> 5240 anarchism
12 as -> 6 a
12 as -> 3084 originated
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


In [22]:
# Step 4: Build and train a skip-gram model.

batch_size = 128      # Number of (input, label) pairs fed per training step.
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

In [26]:
# Validation words are drawn without replacement from ids [0, valid_window).
# Ids are assigned in descending frequency order by build_dataset, so these
# are all among the most frequent entries.
# NOTE(review): no random seed is set in the visible cells, so the chosen ids
# change from run to run.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [27]:
# The validation word ids that were drawn.
valid_examples

array([32, 16, 77, 48, 58, 93, 18, 73, 57, 43, 61, 67, 78, 60, 11,  1])

In [25]:
# Input data.
# Placeholders for one batch of centre-word ids and the matching context-word
# ids; both are fed on every training step.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# The validation ids are baked into the graph as a constant.
# NOTE(review): this cell's execution count (25) is lower than that of the
# cell defining `valid_examples` (26), so the constant may hold an earlier
# random draw than the array the training loop uses for its labels. Re-run
# the cells in order to be sure they agree.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)


In [28]:
# Trainable embedding matrix: one `embedding_size`-dimensional row per
# vocabulary id, initialised uniformly between -1.0 and 1.0.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

In [29]:
# Rows of `embeddings` selected by the ids in `train_inputs`.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [30]:
# Inspect the symbolic tensor (shape and dtype only; nothing is evaluated).
embed

<tf.Tensor 'embedding_lookup:0' shape=(128, 128) dtype=float32>

In [31]:
# Weights used by the NCE loss: one `embedding_size`-dimensional row per
# vocabulary id, truncated-normal init with stddev 1 / sqrt(embedding_size).
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))

In [32]:
# Biases used by the NCE loss: one per vocabulary id, initialised to zero.
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [33]:
# Training objective: the noise-contrastive estimation loss averaged over the
# batch. `num_sampled` negative classes are sampled out of `vocabulary_size`
# rather than scoring every vocabulary entry.
loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

In [34]:
# Op that applies one gradient-descent step on `loss` with learning rate 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

In [36]:
# L2 norm of every embedding row. `keepdims=True` keeps the reduced axis so
# the result has shape [vocabulary_size, 1] and the division below scales each
# row by its own norm. (`keep_dims` is the deprecated spelling of this
# argument and triggers a deprecation warning.)
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / norm
# Unit-length embeddings of the validation words ...
valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
# ... and their cosine similarity to every word in the vocabulary:
# row i holds the similarities of validation word i.
similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [37]:
# Op that initialises all global variables; it must be run inside a session
# before any training step.
init = tf.global_variables_initializer()

In [50]:
# Total number of training steps (batches) to run.
num_steps = 100000

In [51]:
# Train the model for `num_steps` batches, periodically reporting the running
# average loss and the nearest neighbours of the validation words, then keep
# the final unit-length embeddings in `final_embeddings`.
with tf.Session() as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  # Running sum of the per-batch losses since the last report.
  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(
        batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    # Report every 2000 steps. At step 0 only one batch has been seen, so the
    # raw loss of that batch is printed without dividing.
    if step % 2000 == 0:
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step ', step, ': ', average_loss)
      average_loss = 0

    # Every 10000 steps, print the nearest neighbours of each validation word.
    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        # NOTE(review): the label comes from the Python-side `valid_examples`,
        # while row i of `sim` comes from `valid_dataset` inside the graph;
        # they match only if the graph was built from the current draw.
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # Sort by descending similarity and drop position 0, which is normally
        # the query word itself (cosine similarity 1 with its own embedding).
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
  # Evaluated while the session is still open, after the last training step.
  final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  256.52764892578125
Nearest to be: moulds, glottalized, oak, falsification, inexperienced, diamonds, trichomes, vyacheslav,
Nearest to five: lubbock, sprayed, anonymously, chip, brushes, resembles, sport, battlefront,
Nearest to however: printers, created, narrative, basset, langston, quieter, cloisters, chips,
Nearest to some: caplan, siliceous, lingerie, partitioning, mews, bahri, buffett, ati,
Nearest to new: tissue, emir, pokey, niva, spinning, consultations, stockade, lemay,
Nearest to often: const, littoral, elegans, considers, sib, marinetti, multitasking, niels,
Nearest to was: incorruptible, upwards, damon, crookes, talladega, acker, antigens, sharpen,
Nearest to b: lysine, oro, lcd, calgary, boyfriend, elliott, harming, keanu,
Nearest to who: victorians, leontopithecus, contemplating, insolvency, sherlock, suffixed, esaf, tempore,
Nearest to other: timeline, etienne, madhu, yourdon, lt, urbanized, poses, merits,
Nearest to after: position

Average loss at step  52000 :  5.100784014582634
Average loss at step  54000 :  5.081681323051453
Average loss at step  56000 :  5.095704735755921
Average loss at step  58000 :  5.029109515786171
Average loss at step  60000 :  5.018202128648758
Nearest to be: but, lymphoma, batting, vma, though, that, moulds, operatorname,
Nearest to five: any, battlefront, transferred, agouti, kapoor, this, ursus, haer,
Nearest to however: printers, kapoor, narrative, mukherjee, archie, digital, complete, created,
Nearest to some: circ, dasyprocta, operatorname, kapoor, in, abet, microcebus, abakan,
Nearest to new: these, some, agouti, tissue, three, archie, two, gigantopithecus,
Nearest to often: four, five, six, two, seven, eight, one, ursus,
Nearest to was: agouti, their, kapoor, its, this, michelob, ursus, dasyprocta,
Nearest to b: after, but, agave, circ, or, however, ursus, aquaculture,
Nearest to who: was, be, were, by, tempering, had, asterism, tempore,
Nearest to other: advocates, timeline, c