In [21]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime
from scipy.special import expit as sigmoid

import os
import sys
import string
import json


from nltk.corpus import brown
import operator

from utils import find_analogies, get_wikipedia_data1


In [22]:
def get_negative_sampling_distribution(sentences):
  """Return the smoothed unigram distribution used to draw negative samples.

  Pn(w) is proportional to count(w)**0.75, so words that occur more often
  are sampled more often, but less than proportionally to their raw count.

  Args:
    sentences: iterable of sentences, each a sequence of integer word
      indices. The indices that occur must be exactly 0..V-1, where V is
      the number of distinct indices seen.

  Returns:
    1-D float array of length V that sums to 1 (empty if there are no words).

  Raises:
    KeyError: if some index in 0..V-1 never occurs in `sentences`.
  """
  # count how often each word index occurs
  word_freq = {}
  for sentence in sentences:
    for word in sentence:
      word_freq[word] = word_freq.get(word, 0) + 1

  # vocab size
  V = len(word_freq)
  if V == 0:
    return np.zeros(0)

  counts = np.zeros(V)
  for j in range(V):
    if j not in word_freq:
      # a gap means the result could not be indexed by word index
      raise KeyError(
        "word index %d never occurs in `sentences`; "
        "indices must cover 0..%d without gaps" % (j, V - 1)
      )
    counts[j] = word_freq[j]

  # smooth it
  p_neg = counts ** 0.75

  # normalize it
  p_neg = p_neg / p_neg.sum()

  assert(np.all(p_neg > 0))
  return p_neg

In [23]:
def get_context(pos, sentence, window_size):
  """Return the words around position `pos`, excluding the word at `pos`.

  Given a sentence of the form  x x c c c pos c c c x x  this returns the
  `c` entries: up to `window_size` words on each side, clipped at the
  sentence boundaries.

  Args:
    pos: index of the centre word within `sentence`.
    sentence: sequence of words (word indices).
    window_size: maximum number of context words taken from each side.

  Returns:
    List of context words in sentence order.
  """
  start = max(0, pos - window_size)
  # the slice end is exclusive, so +1 is needed to get `window_size` words
  # on the right-hand side as well (without it the window is lopsided)
  end_ = min(len(sentence), pos + window_size + 1)

  return [
    ctx_word_idx
    for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start)
    if ctx_pos != pos  # don't include the centre word itself
  ]

In [34]:
def construct_dataset():
  """Build the training pairs for skip-gram with negative sampling.

  Every sentence is first subsampled (each word is kept with probability
  `1 - p_drop[word]`). For each remaining centre word, visited in random
  order, ONE negative word is drawn from `p_neg`; then for every context
  word two examples are emitted:

    (centre word,   context word) -> label 1
    (negative word, context word) -> label 0

  Returns:
    inputs:   array of input word indices.
    contexts: array of context word indices, aligned with `inputs`.
    labels:   float32 array of 1/0 labels, aligned with `inputs`.
    word2idx: the mapping returned by `get_wikipedia_data1`.
  """
  # get the data
  # NOTE(review): presumably the sentences are lists of integer word indices
  # and 2000 is the vocabulary size -- confirm against utils.get_wikipedia_data1
  sentences, word2idx = get_wikipedia_data1(None, 2000) #get_text8()

  # number of unique words
  vocab_size = len(word2idx)

  # config
  window_size = 10

  # distribution for drawing negative samples
  p_neg = get_negative_sampling_distribution(sentences)

  total_words = sum(len(sentence) for sentence in sentences)
  print("total number of words in corpus:", total_words)

  # for subsampling each sentence
  threshold = 1e-5
  p_drop = 1 - np.sqrt(threshold / p_neg)
  # loop-invariant, so compute the keep probability once for the whole vocab
  p_keep = 1 - p_drop

  inputs = []
  contexts = []
  labels = []

  print("dataset construction started ... ")

  for sentence in sentences:
    # keep only certain words based on p_neg
    sentence = [w for w in sentence if np.random.random() < p_keep[w]]
    if len(sentence) < 2:
      continue

    # randomly order words so we don't always see
    # samples in the same order
    randomly_ordered_positions = np.random.choice(
      len(sentence),
      size=len(sentence),
      replace=False,
    )

    for pos in randomly_ordered_positions:
      # the middle word
      word = sentence[pos]
      context_words = get_context(pos, sentence, window_size)
      # a single negative word is shared by all context words of this position
      neg_word = np.random.choice(vocab_size, p=p_neg)
      for target in context_words:
        # positive example
        inputs.append(word)
        contexts.append(target)
        labels.append(1)
        # negative example: same context, wrong input word
        inputs.append(neg_word)
        contexts.append(target)
        labels.append(0)

  inputs = np.asarray(inputs)
  contexts = np.asarray(contexts)
  labels = np.asarray(labels, dtype='float32')

  return inputs, contexts, labels, word2idx

In [33]:
def train_model(save_dir, inputs, contexts, labels, word2idx):
  """Train the two embedding matrices and save them to `save_dir`.

  The logit of an (input, context) pair is the dot product of the input
  word's row of W and the context word's column of V; the loss is the mean
  sigmoid cross-entropy against `labels`, minimised with a momentum
  optimizer (fixed learning rate 0.1, momentum 0.9) over mini-batches that
  are taken in the order given.

  Args:
    save_dir: directory that receives `word2idx.json` and `weights.npz`;
      created if it does not exist.
    inputs: array of input word indices.
    contexts: array of context word indices, aligned with `inputs`.
    labels: array of 1/0 labels, aligned with `inputs`.
    word2idx: word -> index mapping; its length is used as the vocab size.

  Returns:
    (word2idx, W, V) where W is a (vocab_size, D) and V a (D, vocab_size)
    numpy array.
  """
  print(inputs[0])

  print("training started ... ")

  # config
  vocab_size = len(word2idx)
  D = 50 # word embedding size
  batch_size = 100
  epochs = 10

  W = tf.Variable(tf.random.normal([vocab_size, D]))
  V = tf.Variable(tf.random.normal([D, vocab_size]))

  optimizer = tf.compat.v1.train.MomentumOptimizer(0.1, momentum=0.9)

  num_examples = len(labels)

  def get_batch(batch_counter):
    # slicing clamps at the end of the arrays, so the last batch may be short
    stop = batch_counter + batch_size
    return inputs[batch_counter:stop], contexts[batch_counter:stop], labels[batch_counter:stop]

  # train the model
  for epoch in range(epochs):
    cost = 0
    batch_counter = 0
    while batch_counter < num_examples:
      input_batch, context_batch, label_batch = get_batch(batch_counter)
      batch_counter += batch_size

      # only the forward pass needs to be recorded on the tape
      with tf.GradientTape() as t:
        # (batch, D): one embedding vector per input word
        input_embeddings = tf.nn.embedding_lookup(W, input_batch)
        # V is stored as (D, vocab_size), so transpose to look up rows -> (batch, D)
        context_embeddings = tf.nn.embedding_lookup(tf.transpose(V), context_batch)
        # row-wise dot product -> (batch,)
        similarity = tf.einsum("ij,ij->i", input_embeddings, context_embeddings)

        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
          labels=label_batch, logits=similarity))

      grads = t.gradient(loss, [W, V])
      optimizer.apply_gradients(zip(grads, [W, V]))

      # running sum of the per-batch mean losses for this epoch
      cost += loss

      if batch_counter % 10000 == 0:
        sys.stdout.write("processed %s / %s\r" % (batch_counter, num_examples))
        sys.stdout.flush()

    print("epoch complete:", epoch, "cost:", cost)

  W, V = W.numpy(), V.numpy()

  # exist_ok avoids the check-then-create race and also handles nested paths
  os.makedirs(save_dir, exist_ok=True)

  with open('%s/word2idx.json' % save_dir, 'w') as f:
    json.dump(word2idx, f)

  # positional arrays are stored as 'arr_0' (W) and 'arr_1' (V)
  np.savez('%s/weights.npz' % save_dir, W, V)
  # return the model
  return word2idx, W, V

In [43]:
def main(save_dir, train=False):
  """Load a saved model from `save_dir` and run a fixed set of analogy queries.

  Args:
    save_dir: directory containing `word2idx.json` and `weights.npz`
      (as written by `train_model`).
    train: if True, first build the dataset and train a model, writing the
      artifacts into `save_dir`. The default (False) only evaluates what is
      already on disk.
  """
  if train:
    inputs, contexts, labels, word2idx = construct_dataset()
    train_model(save_dir, inputs, contexts, labels, word2idx)

  with open('%s/word2idx.json' % save_dir) as f:
    word2idx = json.load(f)

  # use the archive as a context manager so its file handle gets closed
  with np.load('%s/weights.npz' % save_dir) as npz:
    W = npz['arr_0']
    V = npz['arr_1']

  idx2word = {i:w for w, i in word2idx.items()}
  # average the input and (transposed) output embeddings into one matrix
  We = (W + V.T) / 2

  analogies = [
    ('king', 'man', 'woman'),
    ('france', 'paris', 'london'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
    ('france', 'french', 'english'),
    ('japan', 'japanese', 'chinese'),
    ('japan', 'japanese', 'italian'),
    ('japan', 'japanese', 'australian'),
    ('december', 'november', 'june'),
  ]
  for w1, w2, w3 in analogies:
    find_analogies(w1, w2, w3, We, word2idx, idx2word)

In [44]:
# Run the analogy queries against the model files stored in this directory
# (word2idx.json and weights.npz must already exist there).
main('word2vec_wiki_2000_full')

closest match by euclidean distance: queen
king - man = queen - woman
closest match by cosine distance: queen
king - man = queen - woman
closest match by euclidean distance: england
france - paris = england - london
closest match by cosine distance: wales
france - paris = wales - london
closest match by euclidean distance: italy
france - paris = italy - rome
closest match by cosine distance: italy
france - paris = italy - rome
closest match by euclidean distance: continued
paris - france = continued - italy
closest match by cosine distance: poland
paris - france = poland - italy
closest match by euclidean distance: england
france - french = england - english
closest match by cosine distance: england
france - french = england - english
closest match by euclidean distance: china
japan - japanese = china - chinese
closest match by cosine distance: china
japan - japanese = china - chinese
closest match by euclidean distance: italy
japan - japanese = italy - italian
closest match by cosine 