In [45]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [46]:
import collections
import math
import os
import random
import zipfile

In [47]:
from six.moves import urllib
from six.moves import xrange

In [48]:
import numpy as np
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

In [49]:
print(np.__version__)
print(tf.__version__)

1.20.1
2.3.0


In [50]:
DOWNLOADED_FILENAME = 'SampleText.zip'
def maybe_download(url_path, expected_bytes):
    if not os.path.exists(DOWNLOADED_FILENAME):
        filename, _ = urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)
        
    statinfo = os.stat(DOWNLOADED_FILENAME)
    if statinfo.st_size == expected_bytes:
        print('Found and verified file from tis path: ', url_path)
        print('Downloaded file: ', DOWNLOADED_FILENAME)
    else:
        print(statinfo.st_size)
        raise Exception(
        'Failed to verify file from: ' + url_path + '. Can you get it with a browser?')

In [51]:
def read_words():
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as f:
        firstfile = f.namelist()[0]
        filestring = tf.compat.as_str(f.read(firstfile))
        words = filestring.split()
    
    return words

In [52]:
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 31344016

maybe_download(URL_PATH, FILESIZE)

Found and verified file from tis path:  http://mattmahoney.net/dc/text8.zip
Downloaded file:  SampleText.zip


In [53]:
vocabulary = read_words()

In [54]:
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [55]:
collections.Counter(vocabulary).most_common(5000 -1)

[('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430),
 ('two', 192644),
 ('is', 183153),
 ('as', 131815),
 ('eight', 125285),
 ('for', 118445),
 ('s', 116710),
 ('five', 115789),
 ('three', 114775),
 ('was', 112807),
 ('by', 111831),
 ('that', 109510),
 ('four', 108182),
 ('six', 102145),
 ('seven', 99683),
 ('with', 95603),
 ('on', 91250),
 ('are', 76527),
 ('it', 73334),
 ('from', 72871),
 ('or', 68945),
 ('his', 62603),
 ('an', 61925),
 ('be', 61281),
 ('this', 58832),
 ('which', 54788),
 ('at', 54576),
 ('he', 53573),
 ('also', 44358),
 ('not', 44033),
 ('have', 39712),
 ('were', 39086),
 ('has', 37866),
 ('but', 35358),
 ('other', 32433),
 ('their', 31523),
 ('its', 29567),
 ('first', 28810),
 ('they', 28553),
 ('some', 28161),
 ('had', 28100),
 ('all', 26229),
 ('more', 26223),
 ('most', 25563),
 ('can', 25519),
 ('been', 25383),
 ('such', 24413),
 ('many', 24096),
 ('who', 2

In [56]:
def build_dataset(words, n_words):
    word_counts = [['UNKNOWN', -1]]
    
    counter = collections.Counter(words)
    word_counts.extend(counter.most_common(n_words -1))
    
    dictionary = dict()
    
    for word, _ in word_counts:
        dictionary[word] = len(dictionary)
        
    word_indexes = list()
        
    unknown_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0 #dictionary['UNKNOWN']
            unknown_count +=1
        word_indexes.append(index)
        
    word_counts[0][1] = unknown_count
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    
    return word_counts, word_indexes, dictionary, reversed_dictionary

In [57]:
VOCABULARY_SIZE = 5000

word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(vocabulary, VOCABULARY_SIZE)

In [58]:
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [59]:
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [60]:
import random

for key in random.sample(list(dictionary), 10):
    print(key, ":", dictionary[key])

noble : 3938
classic : 1488
edinburgh : 4203
supported : 1165
parents : 1865
tournament : 4507
principal : 2040
brought : 754
led : 281
gods : 1773


In [61]:
import random

for key in random.sample(list(reversed_dictionary), 10):
    print(key, ":", reversed_dictionary[key])

200 : own
468 : william
3712 : beauty
3277 : twentieth
2309 : fear
1397 : possibly
3041 : referring
4101 : sole
3912 : refugees
1839 : positions


In [62]:
del vocabulary

In [63]:
# Global index into words maintained across batches
global_index = 0

In [64]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    global global_index
    
    assert batch_size % num_skips == 0
    assert num_skips <=2 * skip_window
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    span = 2 * skip_window + 1 # [skip_window input_word skip_window]
    
    buffer = collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    for i in range(batch_size // num_skips):
        target = skip_window # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
                
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window] # this is the input word
            labels[i * num_skips + j, 0] = buffer[target] # these are the context words
        
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)
    
    return batch, labels

In [65]:
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [66]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156])

In [67]:
labels

array([[  59],
       [ 128],
       [  59],
       [3081],
       [   2],
       [ 195],
       [ 477],
       [ 195],
       [ 128],
       [  46]])

In [68]:
for i in range(9):
    print(reversed_dictionary[batch[i]], ":", reversed_dictionary[labels[i][0]])

of : used
of : early
abuse : used
abuse : originated
first : of
first : term
used : class
used : term
against : early


In [69]:
# Reset the global index because we updated while testing the batch code
global_index = 0

In [70]:
valid_size = 16
valid_window = 100

valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [71]:
batch_size = 128
embedding_size = 50
skip_window = 2
num_skips = 2

In [72]:
tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [73]:
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [74]:
embeddings = tf.Variable(
    tf.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [75]:
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32>

In [76]:
embed

<tf.Tensor 'embedding_lookup/Identity_1:0' shape=(128, 50) dtype=float32>

In [77]:
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, embedding_size],
                                         stddev=1.0 / math.sqrt(embedding_size)))

biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))

hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [78]:
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [79]:
train_one_hot = tf.one_hot(train_labels, VOCABULARY_SIZE)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=hidden_out,
                                                             labels=train_one_hot))

In [80]:
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [83]:
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))

normalized_embeddings = embeddings / l2_norm

In [84]:
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [85]:
valid_embeddings

<tf.Tensor 'embedding_lookup_1/Identity:0' shape=(16, 50) dtype=float32>