In [95]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime
from scipy.special import expit as sigmoid

import os
import sys
import string
import json


from nltk.corpus import brown
import operator

from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import cosine as cos_dist


In [4]:
def remove_punctuation(s):
  return s.translate(str.maketrans('','',string.punctuation))

In [5]:
def get_wiki():
  V = 20000
  files = glob('../large_files/enwiki*.txt')
  all_word_counts = {}
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          for word in s:
            if word not in all_word_counts:
              all_word_counts[word] = 0
            all_word_counts[word] += 1
  print("finished counting")

  V = min(V, len(all_word_counts))
  all_word_counts = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)

  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w:i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']

  sents = []
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          # if a word is not nearby another word, there won't be any context!
          # and hence nothing to train!
          sent = [word2idx[w] if w in word2idx else unk for w in s]
          sents.append(sent)
  return sents, word2idx
              

In [27]:
def get_sentences():
  # returns 57340 of the Brown corpus
  # each sentence is represented as a list of individual string tokens
  return brown.sents()

def get_brown(n_vocab=2000, keep_words = []):
  sentences = get_sentences()
  indexed_sentences = []

  i = 0
  word2idx = {}
  idx2word = []

  word_idx_count = {}

  for sentence in sentences:
    indexed_sentence = []
    for token in sentence:
      token = token.lower()
      if token not in word2idx:
        idx2word.append(token)
        word2idx[token] = i
        i += 1

      # keep track of counts for later sorting
      idx = word2idx[token]
      word_idx_count[idx] = word_idx_count.get(idx, 0) + 1

      indexed_sentence.append(idx)
    indexed_sentences.append(indexed_sentence)



  # restrict vocab size

  # set all the words I want to keep to infinity
  # so that they are included when I pick the most
  # common words
  for word in keep_words:
    word_idx_count[word2idx[word]] = float('inf')

  sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
  word2idx_small = {}
  new_idx = 0
  idx_new_idx_map = {}
  for idx, count in sorted_word_idx_count[:n_vocab-1]:
    word = idx2word[idx]
    word2idx_small[word] = new_idx
    idx_new_idx_map[idx] = new_idx
    new_idx += 1
  # let 'unknown' be the last token
  word2idx_small['UNKNOWN'] = new_idx 
  unknown = new_idx

  # map old idx to new idx
  sentences_small = []
  for sentence in indexed_sentences:
    if len(sentence) > 1:
      new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
      sentences_small.append(new_sentence)

  return sentences_small, word2idx_small


In [7]:
def get_negative_sampling_distribution(sentences):
  # Pn(w) = prob of word occuring
  # we would like to sample the negative samples
  # such that words that occur more often
  # should be sampled more often

  word_freq = {}
  word_count = sum(len(sentence) for sentence in sentences)
  for sentence in sentences:
      for word in sentence:
          if word not in word_freq:
              word_freq[word] = 0
          word_freq[word] += 1
  
  # vocab size
  V = len(word_freq)

  p_neg = np.zeros(V)
  for j in range(V):
      p_neg[j] = word_freq[j]**0.75

  # normalize it
  p_neg = p_neg / p_neg.sum()

  assert(np.all(p_neg > 0))
  return p_neg

In [8]:
def get_context(pos, sentence, window_size):
  start = max(0, pos - window_size)
  end_  = min(len(sentence), pos + window_size)

  context = []
  for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start):
    if ctx_pos != pos:
      context.append(ctx_word_idx)
  return context

In [76]:
def construct_dataset():
    # get the data
  sentences, word2idx = get_brown() #get_text8()

  # number of unique words
  vocab_size = len(word2idx)
  
  print(vocab_size)


  # config
  window_size = 10
  learning_rate = 0.025
  final_learning_rate = 0.0001
  num_negatives = 5 # number of negative samples to draw per input word
  samples_per_epoch = int(1e5)
  epochs = 20
  D = 50 # word embedding size

  # learning rate decay
  learning_rate_delta = (learning_rate - final_learning_rate) / epochs

  # distribution for drawing negative samples
  p_neg = get_negative_sampling_distribution(sentences)


  # params

  # biases = tf.Variable(np.zeros(vocab_size, dtype=np.float32))

  def dot(A, B):
    C = A * B
    return tf.reduce_sum(input_tensor=C, axis=1)

  # output = hidden.dot(tfV)

  # loss
  # neither of the built-in TF functions work well
  # per_sample_loss = tf.nn.nce_loss(
  # # per_sample_loss = tf.nn.sampled_softmax_loss(
  #   weights=tfV,
  #   biases=biases,
  #   labels=tfY,
  #   inputs=hidden,
  #   num_sampled=num_negatives,
  #   num_classes=vocab_size,
  # )
  # loss = tf.reduce_mean(per_sample_loss)

  # optimizer
  # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
  # train_op = tf.train.AdamOptimizer(1e-2).minimize(loss)



  # save the costs to plot them per iteration
  costs = []


  # number of total words in corpus
  total_words = sum(len(sentence) for sentence in sentences)
  print("total number of words in corpus:", total_words)


  # for subsampling each sentence
  threshold = 1e-5
  p_drop = 1 - np.sqrt(threshold / p_neg)
  
  def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
  
  inputs = []
  contexts = []
  labels = []

  
  for sentence in sentences:
    # keep only certain words based on p_neg
    sentence = [w for w in sentence \
      if np.random.random() < (1 - p_drop[w])
    ]
    if len(sentence) < 2:
      continue


    # randomly order words so we don't always see
    # samples in the same order
    randomly_ordered_positions = np.random.choice(
      len(sentence),
      # size=np.random.randint(1, len(sentence) + 1),
      size=len(sentence),
      replace=False,
    )


    for j, pos in enumerate(randomly_ordered_positions):
      # the middle word
      word = sentence[pos]
      context_words = get_context(pos, sentence, window_size)
      neg_word = np.random.choice(vocab_size, p=p_neg)
      for target in context_words:
        inputs.append(to_one_hot(word, vocab_size))
        contexts.append(to_one_hot(target, vocab_size))
        labels.append(1)
        inputs.append(to_one_hot(neg_word, vocab_size))
        contexts.append(to_one_hot(target, vocab_size))
        labels.append(0)
        
  inputs = np.asarray(inputs, dtype='float32')
  contexts = np.asarray(contexts, dtype='float32')
  labels = np.asarray(label, dtype='float32')
  
  return inputs, contexts, labels, word2idx

In [89]:
def train_model(savedir, inputs, contexts, labels, word2idx):
  vocab_size = len(word2idx)
  D = 50
  W = tf.Variable(tf.random.normal([vocab_size, D]))
  V = tf.Variable(tf.random.normal([D, vocab_size])) 

  optimizer = tf.optimizers.SGD(learning_rate=0.01)

  def get_batch(batch_counter):
    max_counter = len(label)
    return inputs[batch_counter:min(batch_counter + 100, max_counter),], contexts[batch_counter:min(batch_counter + 100, max_counter),], labels[batch_counter:min(batch_counter + 100, max_counter)]

  costs = []
  
  # train the model
  for epoch in range(20):
    cost = 0
    batch_counter = 0
    while batch_counter < len(label):
      input_batch, context_batch, label_batch = get_batch(batch_counter)
      batch_counter += 100
      with tf.GradientTape() as t:
        input_embeddings = tf.matmul(input_batch, W) # (100, 2000) * (2000, 50) - > (100, 50) embedding vector for each input
        context_embeddings = tf.matmul(context_batch, tf.transpose(V)) # (100, 50)
        simlarity = tf.einsum("ij,ij->i", input_embeddings, context_embeddings)
        
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=label_batch, logits=simlarity)
        
        cost += tf.reduce_sum(loss)

        grads = t.gradient(loss, [W, V])
        optimizer.apply_gradients(zip(grads,[W, V]))
        
        if batch_counter % 10000 == 0:
          sys.stdout.write("processed %s / %s\r" % (batch_counter, len(labels)))
          sys.stdout.flush()
    
    print("epoch complete:", epoch, "cost:", cost)
  # return the model
  return word2idx, W, V

In [77]:
inputs, contexts, labels, word2idx = construct_dataset()

print(inputs.shape, contexts.shape, labels.shape)

2000
total number of words in corpus: 1160865
(298790, 2000) (298790, 2000) (296072,)


In [90]:
np.random.seed(0)
word2idx, W, V = train_model('w2v_tf2', inputs, contexts, labels, word2idx)
print(W)

epoch complete: 0 cost: tf.Tensor(630931.94, shape=(), dtype=float32)
epoch complete: 1 cost: tf.Tensor(433847.6, shape=(), dtype=float32)
epoch complete: 2 cost: tf.Tensor(349546.22, shape=(), dtype=float32)
epoch complete: 3 cost: tf.Tensor(298306.97, shape=(), dtype=float32)
epoch complete: 4 cost: tf.Tensor(263798.6, shape=(), dtype=float32)
epoch complete: 5 cost: tf.Tensor(239087.58, shape=(), dtype=float32)
epoch complete: 6 cost: tf.Tensor(220630.42, shape=(), dtype=float32)
epoch complete: 7 cost: tf.Tensor(206410.08, shape=(), dtype=float32)
epoch complete: 8 cost: tf.Tensor(195184.58, shape=(), dtype=float32)
epoch complete: 9 cost: tf.Tensor(186145.83, shape=(), dtype=float32)
epoch complete: 10 cost: tf.Tensor(178748.83, shape=(), dtype=float32)
epoch complete: 11 cost: tf.Tensor(172608.73, shape=(), dtype=float32)
epoch complete: 12 cost: tf.Tensor(167444.9, shape=(), dtype=float32)
epoch complete: 13 cost: tf.Tensor(163050.58, shape=(), dtype=float32)
epoch complete: 14 

In [92]:
def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W):
  V, D = W.shape

  # don't actually use pos2 in calculation, just print what's expected
  print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2))
  for w in (pos1, neg1, pos2, neg2):
    if w not in word2idx:
      print("Sorry, %s not in word2idx" % w)
      return

  p1 = W[word2idx[pos1]]
  n1 = W[word2idx[neg1]]
  p2 = W[word2idx[pos2]]
  n2 = W[word2idx[neg2]]

  vec = p1 - n1 + n2

  distances = pairwise_distances(vec.reshape(1, D), W, metric='cosine').reshape(V)
  idx = distances.argsort()[:10]

  # pick one that's not p1, n1, or n2
  best_idx = -1
  keep_out = [word2idx[w] for w in (pos1, neg1, neg2)]
  # print("keep_out:", keep_out)
  for i in idx:
    if i not in keep_out:
      best_idx = i
      break
  # print("best_idx:", best_idx)

  print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[best_idx], neg2))
  print("closest 10:")
  for i in idx:
    print(idx2word[i], distances[i])

  print("dist to %s:" % pos2, cos_dist(p2, vec))

In [98]:
idx2word = {i:w for w, i in word2idx.items()}
print(W)
for We in (W, (W + V.T) / 2):
  print("**********")

  analogy('france', 'french', 'england', 'english', word2idx, idx2word, We)

[[-0.3455732  -0.03174401  0.02158272 ...  0.09234573  0.09032178
  -0.22473188]
 [-0.06133434  0.07464878 -0.06392034 ... -0.02111472  0.08687677
  -0.04314757]
 [-0.3760927   0.00688569 -0.08449373 ...  0.07774831 -0.03719903
   0.00480525]
 ...
 [ 0.5079636  -0.38961694 -0.25100282 ... -1.211722   -0.8478748
  -1.7721817 ]
 [-0.3447626   0.00637528 -0.16311522 ...  0.24958792  1.5857931
  -0.4317737 ]
 [-0.42255065  0.03659458  0.11579508 ...  0.03455456  0.06772879
  -0.08121987]]
**********
testing: france - french = england - english
got: france - french = nature - english
closest 10:
france 0.4186057
nature 0.43574852
along 0.5771315
english 0.58281875
left 0.59090513
answered 0.6115551
`` 0.6173
taking 0.625888
remember 0.6353825
campaign 0.6405154
dist to england: 0.9129335582256317
**********
testing: france - french = england - english
got: france - french = better - english
closest 10:
france 0.3301431
better 0.58228135
greatly 0.6109736
they 0.6158257
1958 0.6187956
rememb