In [4]:
import collections
import math
import os
import errno
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

## Data

In [5]:
data_dir = "word2vec_data/words/"

In [6]:
data_url = "http://mattmahoney.net/dc/text8.zip"

In [7]:
def fetch_data(url = data_url, words_data = data_dir):
    os.makedirs(words_data, exist_ok=True)
    
    zip_path = os.path.join(words_data, "words.zip")
    
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
        
    return data.decode("ascii").split()

In [8]:
words = fetch_data()

In [9]:
len(words)

17005207

In [10]:
words[10000:10020]

['reciprocity',
 'qualitative',
 'impairments',
 'in',
 'communication',
 'as',
 'manifested',
 'by',
 'at',
 'least',
 'one',
 'of',
 'the',
 'following',
 'delay',
 'in',
 'or',
 'total',
 'lack',
 'of']

In [11]:
for w in words[10000:10020]:
    print(w, end=" ")

reciprocity qualitative impairments in communication as manifested by at least one of the following delay in or total lack of 

## Build Word Counts

In [12]:
from collections import Counter

In [14]:
#mylist = ["one", "two", "two", "four"]

In [15]:
#Counter(mylist)

Counter({'one': 1, 'two': 2, 'four': 1})

In [18]:
#Counter(mylist).most_common(1)

[('two', 2)]

# Create Vocab

In [30]:
def create_counts(vocab_size = 50000):
    vocab = [] + Counter(words).most_common(vocab_size)
    
    vocab = np.array([word for word, _ in vocab])
    
    dictionary = {word: code for code, word in enumerate(vocab)}

    data = np.array([dictionary.get(word,0) for word in words])
    return data, vocab

In [31]:
data, vocabulary = create_counts()

In [32]:
data[0]

5233

In [33]:
words[0]

'anarchism'

## Batch Function

In [None]:
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
    if data_index == len(data):
        buffer[:] = data[:span]
        data_index = span
    else:
        buffer.append(data[data_index])
        data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [37]:
data_index = 0
batch, labels = generate_batch(8, 2, 1)

In [38]:
batch

array([3080, 3080, 3080, 3080, 3080, 3080, 3080, 3080])

In [39]:
labels

array([[5233],
       [  11],
       [5233],
       [  11],
       [5233],
       [  11],
       [  11],
       [5233]])

## Constants

In [41]:
batch_size = 128
embedding_size = 150
skip_window = 1
num_skips = 2

In [42]:
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64
learning_rate = 0.01