<a href="https://colab.research.google.com/github/shekharkoirala/machinelearning_algorithms_analysis/blob/master/stanfordcourse/word2veceager.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import warnings
warnings.filterwarnings("ignore")

# import utils
# import word2vec_utils

In [3]:
!pip install tensorboardcolab



In [0]:
# Enable eager execution!
# tfe is already imported in the first cell; the import below repeats it.
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution()

In [0]:
# Hyperparameters and settings shared by the cells below.
VOCAB_SIZE = 50000          # vocabulary size, counting the 'UNK' entry stored at id 0
BATCH_SIZE = 128            # number of (center, target) pairs per training batch
EMBED_SIZE = 128            # dimension of the word embedding vectors
SKIP_WINDOW = 1             # the context window: max words taken on each side of the center word
NUM_SAMPLED = 64            # number of negative examples to sample
# NOTE(review): LEARNING_RATE is passed to tf.train.AdamOptimizer in main__;
# 1.0 looks very large for Adam -- confirm this value is intended.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 100000    # number of optimizer steps main__ runs
VISUAL_FLD = 'visualization'  # directory that receives vocab.tsv
SKIP_STEP = 5000            # the average loss is printed every SKIP_STEP steps

In [0]:
# Location of the text8 archive; the download cell below fetches this same URL with curl.
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
# Presumably the expected size of text8.zip in bytes; not referenced by the
# code visible in this notebook -- TODO confirm or remove.
EXPECTED_BYTES = 31344016

# **Prepare Data**

In [7]:
# -p keeps this cell re-runnable: plain `mkdir data` errors once data/ exists.
!mkdir -p data
!curl -O http://mattmahoney.net/dc/text8.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.8M  100 29.8M    0     0  2139k      0  0:00:14  0:00:14 --:--:-- 2196k


In [0]:
!sudo mv text8.zip data



In [9]:
!ls data

text8.zip


In [0]:
import zipfile
# Load the corpus: decode the first member of the archive and split it on
# whitespace into a flat list of tokens.
file_path = "data/text8.zip"
with zipfile.ZipFile(file_path) as archive:
  first_member = archive.namelist()[0]
  # Decode and split in one expression so the full decoded text is not kept alive.
  words = tf.compat.as_str(archive.read(first_member)).split()

In [0]:
# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory (the only case the old `except OSError: pass` was meant to
# cover) while still surfacing real failures such as permission errors.
os.makedirs(VISUAL_FLD, exist_ok=True)

In [12]:
from collections import Counter

# Build the vocabulary: 'UNK' comes first so it gets id 0 (the id later used
# for out-of-vocabulary words), followed by the VOCAB_SIZE - 1 most frequent
# words. The -1 is only a placeholder count for 'UNK'.
count = [('UNK', -1)]
count.extend(Counter(words).most_common(VOCAB_SIZE - 1))

dictionary = dict()   # word -> integer id
index = 0
# The context manager guarantees vocab.tsv is closed even if the loop raises.
with open(os.path.join(VISUAL_FLD, "vocab.tsv"), "w") as file:
  for word, freq in count:
    # NOTE(review): this prints one line per vocabulary entry (VOCAB_SIZE
    # lines); consider limiting it to a short preview.
    print(word, freq)
    dictionary[word] = index
    index += 1
    file.write(word + '\n')

# Reverse mapping: integer id -> word.
index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

UNK -1
the 1061396
of 593677
and 416629
one 411764
in 372201
a 325873
to 316376
zero 264975
nine 250430
two 192644
is 183153
as 131815
eight 125285
for 118445
s 116710
five 115789
three 114775
was 112807
by 111831
that 109510
four 108182
six 102145
seven 99683
with 95603
on 91250
are 76527
it 73334
from 72871
or 68945
his 62603
an 61925
be 61281
this 58832
which 54788
at 54576
he 53573
also 44358
not 44033
have 39712
were 39086
has 37866
but 35358
other 32433
their 31523
its 29567
first 28810
they 28553
some 28161
had 28100
all 26229
more 26223
most 25563
can 25519
been 25383
such 24413
many 24096
who 23997
new 23770
used 22737
there 22707
after 21125
when 20623
into 20484
american 20477
time 20412
these 19864
only 19463
see 19206
may 19115
than 18807
world 17949
i 17581
b 17516
would 17377
d 17236
no 16155
however 15861
between 15737
about 15574
over 15122
years 14935
states 14916
people 14696
war 14629
during 14578
united 14494
known 14437
if 14420
called 14151
use 14011
th 13380
sys

In [0]:
# Encode the corpus as word ids; words missing from the vocabulary map to 0 ('UNK').
index_words = [dictionary.get(word, 0) for word in words]

In [0]:
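# TODO: del words once it is no longer needed (it is large, and this runs on Google Colab)

# Training samples are generated following the skip-gram model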

import random
def generate_sample(index_words , context_window_size):
  """Yield skip-gram (center, target) pairs of word ids.

  For every position in index_words a context width is drawn uniformly from
  1..context_window_size (so it is always 1 when context_window_size is 1).
  Each word within that width before the center word is paired with it first,
  then each word within that width after it. Every pair is a separate sample.

  Args:
    index_words: sequence of word ids supporting slicing.
    context_window_size: largest context width to draw.

  Yields:
    (center, target) tuples.
  """
  for position, center in enumerate(index_words):
    width = random.randint(1, context_window_size)
    left_start = max(0, position - width)
    right_stop = position + 1 + width
    # words preceding the center word
    for target in index_words[left_start:position]:
      yield center, target
    # words following the center word
    for target in index_words[position + 1:right_stop]:
      yield center, target

simple_gen = generate_sample(index_words, context_window_size = SKIP_WINDOW)

In [15]:
len(words), len(dictionary)

(17005207, 50000)

In [16]:
next(simple_gen)  # returns the next (center, target) pair of word ids from the generator

(5234, 3081)

In [17]:
def batch_gen():
  """Yield (center_batch, target_batch) arrays of skip-gram word ids.

  center_batch has shape [BATCH_SIZE] and target_batch has shape
  [BATCH_SIZE, 1]. Both are int32, matching the tf.int32 types declared for
  the dataset built from this generator.

  The generator ends when the underlying pair stream is exhausted; a trailing
  partial batch is dropped.
  """
  simple_gen = generate_sample(index_words, context_window_size= SKIP_WINDOW)
  while True:
    center_batch = np.zeros(BATCH_SIZE, dtype=np.int32)
    # Word ids are integers; keep the targets int32 rather than float64.
    target_batch = np.zeros([BATCH_SIZE, 1], dtype=np.int32)
    try:
      for index in range(BATCH_SIZE):
        center_batch[index], target_batch[index] = next(simple_gen)
    except StopIteration:
      # Under PEP 479 a StopIteration that escapes a generator body becomes a
      # RuntimeError, so finish explicitly when the corpus runs out.
      return
    yield center_batch, target_batch

batch_gen()

<generator object batch_gen at 0x7fb9a2add9e8>

**word2vec class**

In [0]:
class word2Vec(object):
  """Skip-gram word2vec model trained with noise-contrastive estimation (NCE).

  All trainable variables are created once in __init__ so they persist across
  training steps. Previously the NCE weights and biases were re-created inside
  compute_loss on every call, so the output layer was re-initialised each step
  and never learned.

  Attributes:
    embed_matrix: [vocab_size, embed_size] embedding table, uniformly initialised.
    nce_weights: [vocab_size, embed_size] NCE output weights.
    nce_biases: [vocab_size] NCE output biases, initialised to zero.
  """

  def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
    """Create the model variables.

    Args:
      vocab_size: number of words in the vocabulary (number of NCE classes).
      embed_size: dimension of each embedding vector.
      num_sampled: number of negative classes sampled per batch by the NCE loss.
    """
    self.vocab_size = vocab_size
    self.num_sampled = num_sampled
    self.embed_size = embed_size
    self.embed_matrix = tfe.Variable(tf.random_uniform([vocab_size, embed_size])) # vocab-size x embed-size
    self.nce_weights = tfe.Variable(tf.truncated_normal([vocab_size, embed_size],
                                                        stddev=1.0 / (embed_size ** 0.5)))
    self.nce_biases = tfe.Variable(tf.zeros([vocab_size]))

  def compute_loss(self, center_words, target_words):
    """Compute the mean NCE loss for one batch.

    Args:
      center_words: word ids used to look up rows of embed_matrix.
      target_words: label word ids passed to tf.nn.nce_loss (shape
        [BATCH_SIZE, 1] as fed by this notebook).

    Returns:
      Scalar tensor: the NCE loss averaged over the batch.
    """
    # Forward pass: embeddings of the center words.
    embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)

    # NCE loss against the persistent output weights and biases.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weights,
                                         biases=self.nce_biases,
                                         labels=target_words,
                                         inputs=embed,
                                         num_sampled=self.num_sampled,
                                         num_classes=self.vocab_size))
    return loss


In [0]:
def data_generator():
  """Zero-argument generator for tf.data.Dataset.from_generator; relays every batch from batch_gen()."""
  for batch in batch_gen():
    yield batch

def main__():
  """Train the word2Vec model eagerly for NUM_TRAIN_STEPS optimizer steps.

  Batches are pulled from data_generator through a tf.data Dataset. The
  average loss is printed once every SKIP_STEP steps; the step number shown
  is the zero-based index of the step that just finished.
  """
  print("dataset")
  batch_types = (tf.int32, tf.int32)
  batch_shapes = (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE,1]))
  dataset = tf.data.Dataset.from_generator(data_generator, batch_types, batch_shapes)
  # NOTE(review): LEARNING_RATE is 1.0 in this notebook, which looks very
  # large for Adam -- confirm the optimizer / learning-rate pairing.
  optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
  model = word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
  loss_and_grads = tfe.implicit_value_and_gradients(model.compute_loss)
  running_loss = 0.0
  step = 0

  print("inside while loop")
  # The outer loop restarts iteration over the dataset if it runs out before
  # the requested number of steps has been taken.
  while step < NUM_TRAIN_STEPS:
    for center_words, target_words in tfe.Iterator(dataset):
      if step >= NUM_TRAIN_STEPS:
        break
      # Forward pass plus gradients, then one optimizer update.
      batch_loss, grads_and_vars = loss_and_grads(center_words, target_words)
      running_loss += batch_loss
      optimizer.apply_gradients(grads_and_vars)

      step += 1
      if step % SKIP_STEP == 0:
        print('Average loss at step {}: {:5.1f}'.format(
                step - 1, running_loss / SKIP_STEP))
        running_loss = 0.0

In [0]:
main__()

dataset
inside while loop
