<a href="https://colab.research.google.com/github/shekharkoirala/machinelearning_algorithms_analysis/blob/master/stanfordcourse/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf



In [0]:
# Hyperparameters for the skip-gram word2vec model and its training loop.
VOCAB_SIZE = 50000          # vocabulary size, including the 'UNK' entry stored at index 0
BATCH_SIZE = 128            # number of (center, target) pairs per training batch
EMBED_SIZE = 128            # dimension of the word embedding vectors
SKIP_WINDOW = 1             # the context window
NUM_SAMPLED = 64            # number of negative examples to sample
LEARNING_RATE = 1.0         # learning rate handed to the gradient-descent optimizer
NUM_TRAIN_STEPS = 100000    # number of training steps requested from main_process
VISUAL_FLD = 'visualization'  # presumably the folder for embedding visualization; unused in the cells shown -- TODO confirm
SKIP_STEP = 5000            # the average loss is printed and a checkpoint saved every SKIP_STEP steps

In [0]:
# NOTE(review): none of these three constants is referenced by the cells shown in
# this notebook (the corpus is read from a local data/text8.zip) -- confirm they are still needed.
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016   # presumably the expected size in bytes of the downloaded archive -- TODO confirm
NUM_VISUALIZE = 3000        # presumably the number of words to show in the visualization -- TODO confirm

In [4]:
!ls

data  sample_data  visualization


# **Prepare data** 
***This works only after the "prepare data" cells of the word2vec eager notebook have been executed at least once, so that `data/text8.zip` already exists.***

In [0]:
import zipfile
file_path = "data/text8.zip"
# Only the first member of the archive is read; its decoded text is split on
# whitespace to obtain the corpus as a list of word tokens.
with zipfile.ZipFile(file_path) as f:
  first_member = f.namelist()[0]
  words = tf.compat.as_str(f.read(first_member)).split()

In [0]:
from collections import Counter
# Entry 0 is the 'UNK' placeholder; it is followed by the VOCAB_SIZE - 1 most
# common words of the corpus, in the order returned by Counter.most_common.
count = [('UNK', -1)] + Counter(words).most_common(VOCAB_SIZE - 1)
# word -> integer id (the position of the word in `count`)
dictionary = {word: word_id for word_id, (word, _) in enumerate(count)}
# integer id -> word (reverse lookup of `dictionary`)
index_dictionary = {word_id: word for word, word_id in dictionary.items()}

In [0]:
# Encode the corpus as word ids; words missing from `dictionary` map to 0 (the 'UNK' id).
index_words = [dictionary.get(word, 0) for word in words]

In [0]:
# del words , since google collab

#based on skip gram

import random
def generate_sample(index_words , context_window_size):
  """Yield skip-gram training pairs, one (center, target) tuple at a time.

  For every position in `index_words` a window radius is drawn with
  random.randint(1, context_window_size). Each word id within that radius,
  first those to the left of the center and then those to the right, is
  paired with the center word id and yielded as its own sample.

  Args:
    index_words: sliceable sequence of word ids (the corpus, in order).
    context_window_size: largest number of neighbours taken on each side.

  Yields:
    (center, target) tuples of word ids.
  """
  for position, center in enumerate(index_words):
    # With context_window_size == 1 the radius is always 1.
    radius = random.randint(1, context_window_size)
    left_neighbours = index_words[max(0, position - radius):position]
    right_neighbours = index_words[position + 1:position + 1 + radius]
    yield from ((center, target) for target in left_neighbours)
    yield from ((center, target) for target in right_neighbours)

simple_gen = generate_sample(index_words, context_window_size = SKIP_WINDOW)

In [11]:
len(words), len(dictionary)

(17005207, 50000)

In [12]:
next(simple_gen) # will print (5234,?) (5234,?) two times 

(5234, 3081)

In [13]:
def batch_gen():
  """Yield training batches built from the skip-gram sample stream.

  Each batch is a pair (center_batch, target_batch) of int32 arrays with
  shapes [BATCH_SIZE] and [BATCH_SIZE, 1], filled from
  generate_sample(index_words, SKIP_WINDOW).

  The generator ends cleanly once the sample stream is exhausted; a trailing
  partial batch is dropped.
  """
  simple_gen = generate_sample(index_words, context_window_size= SKIP_WINDOW)
  while True:
    center_batch = np.zeros(BATCH_SIZE, dtype=np.int32)
    # int32 matches the (tf.int32, tf.int32) types the dataset is declared with.
    target_batch = np.zeros([BATCH_SIZE, 1], dtype=np.int32)
    for index in range(BATCH_SIZE):
      try:
        center_batch[index], target_batch[index] = next(simple_gen)
      except StopIteration:
        # A StopIteration escaping a generator body becomes a RuntimeError on
        # Python 3.7+ (PEP 479); returning ends this generator normally instead.
        return
    yield center_batch, target_batch

batch_gen()

<generator object batch_gen at 0x7f120a61eeb8>

In [0]:
# Create the checkpoint directory if it is missing. Unlike a blanket
# `except OSError: pass`, this still surfaces real failures (e.g. permissions).
os.makedirs("checkpoints", exist_ok=True)

In [0]:
class word2Vec:
  """Skip-gram word2vec model trained with NCE loss (TensorFlow 1.x graph API).

  Usage: construct, call build_graph() once to assemble the graph, then call
  main_process(n) to train for n steps.
  """

  def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate, dataset):
    """Store the hyperparameters and create the global step variable.

    Args:
      vocab_size: number of rows of the embedding and NCE weight matrices.
      embed_size: dimension of each embedding vector.
      batch_size: batch size (stored only; batches come from `dataset`).
      num_sampled: number of negative samples passed to the NCE loss.
      learning_rate: learning rate of the gradient-descent optimizer.
      dataset: dataset whose elements are (center_words, target_words) pairs.
    """
    self.vocab_size = vocab_size
    self.embed_size = embed_size
    self.batch_size = batch_size
    self.num_sampled = num_sampled
    self.learning_rate = learning_rate
    self.dataset = dataset
    # Reporting / checkpoint interval, taken from the module-level constant.
    self.skip_step = SKIP_STEP
    self.global_step = tf.get_variable('global_step',
                                       initializer=tf.constant(0),
                                       trainable=False)

  def _import_data(self):
    """Create the initializable iterator and the tensors for one batch."""
    with tf.name_scope("data"):
      self.iterator = self.dataset.make_initializable_iterator()
      self.center_words, self.target_words = self.iterator.get_next()

  def _create_embedding(self):
    """Create the embedding matrix and look up the center-word embeddings."""
    with tf.name_scope("embed"):
      self.embed_matrix = tf.get_variable("embed_matrix",
                                          shape=[self.vocab_size, self.embed_size],
                                          initializer=tf.random_uniform_initializer())
      # Forward pass of word2vec: embeddings of the center words of the batch.
      self.embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words,
                                          name="embedding")

  def _create_loss(self):
    """Define the NCE loss, averaged over the batch."""
    with tf.name_scope("loss"):
      nce_weights = tf.get_variable("nce_weights",
                                    shape=[self.vocab_size, self.embed_size],
                                    initializer=tf.truncated_normal_initializer(
                                        stddev=1.0 / (self.embed_size ** 0.5)))
      nce_biases = tf.get_variable("nce_biases",
                                   initializer=tf.zeros([self.vocab_size]))
      self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                                biases=nce_biases,
                                                labels=self.target_words,
                                                inputs=self.embed,
                                                num_sampled=self.num_sampled,
                                                num_classes=self.vocab_size),
                                 name="loss")

  def _create_optimizer(self):
    """Define the training op; each run of it also increments global_step."""
    self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(
        self.loss, global_step=self.global_step)

  def _create_summaries(self):
    """Define the loss summaries and merge them into a single summary op."""
    with tf.name_scope("summaries"):
      tf.summary.scalar('loss', self.loss)
      tf.summary.histogram("histogram_loss", self.loss)
      self.summary_op = tf.summary.merge_all()

  def build_graph(self):
    """Phase 1: assemble the whole graph (data, embedding, loss, optimizer, summaries)."""
    # step 1: dataset iterator and batch tensors
    self._import_data()
    # step 2: embedding matrix and forward pass
    self._create_embedding()
    # step 3: loss function
    self._create_loss()
    # step 4: optimizer / training op, then summaries
    self._create_optimizer()
    self._create_summaries()

  def main_process(self, num_tain_steps):
    """Phase 2: train for `num_tain_steps` steps, resuming from a checkpoint if present.

    Summaries are written both to 'graphs/word2vec_simple' and to the writer
    obtained from the module-level `tbc` object. Every `self.skip_step` steps
    the average loss is printed and a checkpoint is saved under 'checkpoints/'.

    Args:
      num_tain_steps: number of training steps to run in this call. (The
        parameter keeps its original spelling so existing callers still work.)
    """
    saver = tf.train.Saver()

    with tf.Session() as sess:
      sess.run(self.iterator.initializer)
      sess.run(tf.global_variables_initializer())
      summary_writer = tbc.get_writer()
      ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

      # Resume from the latest checkpoint when one exists.
      if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)

      total_loss = 0.0
      # A single writer for this log directory.
      writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)
      summary_writer.add_graph(sess.graph)
      initial_step = self.global_step.eval()

      try:
        # Honour the requested step count instead of the global NUM_TRAIN_STEPS.
        for index in range(initial_step, initial_step + num_tain_steps):
          try:
            loss_batch, _, summary = sess.run([self.loss, self.optimizer, self.summary_op])
            writer.add_summary(summary, global_step=index)
            summary_writer.add_summary(summary, global_step=index)
            total_loss += loss_batch

            if (index + 1) % self.skip_step == 0:
              print('Average loss at step {}: {:5.1f}'.format(index, total_loss/self.skip_step))
              total_loss = 0.0
              saver.save(sess, 'checkpoints/skip-gram', index)

          except tf.errors.OutOfRangeError:
            # Dataset exhausted: restart the iterator and continue with the next step.
            sess.run(self.iterator.initializer)
      finally:
        # Close the event file even if training is interrupted.
        writer.close()
      


In [161]:
# Set up TensorBoard for Colab. The module-level `tbc` created here is the object
# word2Vec.main_process calls get_writer() on, so this cell must run before training.
import tensorboardcolab as tb
tbc = tb.TensorBoardColab()

Wait for 8 seconds...
TensorBoard link:
http://df54983e.ngrok.io


In [163]:
def data_generator():
  """Zero-argument generator for tf.data: re-yields every batch produced by batch_gen()."""
  for batch in batch_gen():
    yield batch

def main__():
  """Reset the default graph, build the dataset and model, and run training."""
  tf.reset_default_graph()

  # Each dataset element is (center ids [BATCH_SIZE], target ids [BATCH_SIZE, 1]).
  output_types = (tf.int32, tf.int32)
  output_shapes = (tf.TensorShape([BATCH_SIZE]),
                   tf.TensorShape([BATCH_SIZE,1]))
  dataset = tf.data.Dataset.from_generator(data_generator, output_types, output_shapes)

  model = word2Vec(vocab_size=VOCAB_SIZE,
                   embed_size=EMBED_SIZE,
                   batch_size=BATCH_SIZE,
                   num_sampled=NUM_SAMPLED,
                   learning_rate=LEARNING_RATE,
                   dataset=dataset)
  model.build_graph()
  model.main_process(NUM_TRAIN_STEPS)
  
main__()

Average loss at step 4999:  65.6
Average loss at step 9999:  18.3
Average loss at step 14999:   9.6
Average loss at step 19999:   6.6
Average loss at step 24999:   5.6
Average loss at step 29999:   5.3
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Average loss at step 34999:   5.0
Average loss at step 39999:   4.8
Average loss at step 44999:   4.8
Average loss at step 49999:   4.8
Average loss at step 54999:   4.7
Average loss at step 59999:   4.7
Average loss at step 64999:   4.6
Average loss at step 69999:   4.7
Average loss at step 74999:   4.6
Average loss at step 79999:   4.6
Average loss at step 84999:   4.7
Average loss at step 89999:   4.6
Average loss at step 94999:   4.6
Average loss at step 99999:   4.6


In [0]:
#if error, or skip
!mkdir graphs/word2vec/lr


In [160]:
!rm checkpoints/*

rm: cannot remove 'checkpoints/*': No such file or directory


In [0]:
!ls checkpoints/

In [0]:
!rm -rf graphs/*

In [0]:
!ls graphs