In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import collections
import math
import os
import sys
import argparse
import random
from tempfile import gettempdir
import zipfile

In [3]:
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [4]:
print(tf.__version__)

1.4.0


In [5]:
from tensorflow.contrib.tensorboard.plugins import projector

In [6]:
current_path = os.path.dirname(os.path.realpath(sys.argv[0]))

In [7]:
parser = argparse.ArgumentParser()

In [8]:
parser.add_argument(
    '--log_dir',
    type=str,
    default=os.path.join(current_path, 'log'),
    help='The log directory for TensorBoard summaries.')

_StoreAction(option_strings=['--log_dir'], dest='log_dir', nargs=None, const=None, default='/home/dayou/anaconda3/lib/python3.6/site-packages/log', type=<class 'str'>, choices=None, help='The log directory for TensorBoard summaries.', metavar=None)

In [9]:
FLAGS, unparsed = parser.parse_known_args()

In [10]:
if not os.path.exists(FLAGS.log_dir):
    os.makedirs(FLAGS.log_dir)

In [11]:
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'

In [12]:
# pylint: disable=redefined-outer-name
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  local_filename = os.path.join(gettempdir(), filename)
  if not os.path.exists(local_filename):
    local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                   local_filename)
  statinfo = os.stat(local_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + local_filename +
                    '. Can you get to it with a browser?')
  return local_filename

In [14]:
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [15]:
# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data

In [18]:
vocabulary = read_data(filename)

In [21]:
type(vocabulary),len(vocabulary)

(list, 17005207)

In [22]:
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [24]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 71290

In [25]:
def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary

In [26]:
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)

In [27]:
type(data),type(count),type(dictionary),type(reverse_dictionary)

(list, list, dict, dict)

In [28]:
len(data),len(count),len(dictionary),len(reverse_dictionary)

(17005207, 71290, 71290, 71290)

In [31]:
data[:10]

[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [36]:
dictionary['of']

2

In [42]:
dictionary.get('aaaaa')

In [43]:
del vocabulary

In [44]:
print('Most common words (+UNK)', count[:5])

Most common words (+UNK) [['UNK', 286368], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]


In [45]:
print('Most common words (+UNK)', data[:5])

Most common words (+UNK) [5234, 3081, 12, 6, 195]


In [46]:
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [47]:
data_index = 0

In [48]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != skip_window]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels