In [69]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime
from scipy.special import expit as sigmoid

import os
import sys
import string
import json


from nltk.corpus import brown
import operator

from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import cosine as cos_dist


In [18]:
def remove_punctuation(s):
  """Return ``s`` with every character found in ``string.punctuation`` deleted."""
  # Mapping a character to None in a translation table deletes it.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return s.translate(deletion_table)

In [39]:
def get_wiki(n_vocab=20000, files_pattern='../large_files/enwiki*.txt'):
  """Load the wiki text dumps and encode them as lists of word indices.

  The files are read twice: once to count word frequencies and once to
  encode every usable line with the resulting vocabulary.

  Args:
    n_vocab: maximum vocabulary size, *including* the ``'<UNK>'`` entry.
      The ``n_vocab - 1`` most frequent words keep their own index; every
      other word is mapped to ``'<UNK>'``.
    files_pattern: glob pattern selecting the text files to read.

  Returns:
    ``(sents, word2idx)`` where ``sents`` is a list of sentences, each a
    list of int word indices, and ``word2idx`` maps word -> index
    (most frequent word first, ``'<UNK>'`` last).

  Raises:
    ValueError: if ``n_vocab`` is smaller than 1.
  """
  if n_vocab < 1:
    raise ValueError("n_vocab must be >= 1 (it includes '<UNK>'), got %r" % (n_vocab,))

  # Lines whose first character is one of these are skipped (presumably
  # markup / list / table lines in the dump). Same character set as before,
  # with the backslash now written as a valid escape.
  skip_first_chars = '[*-|={}\\'
  # Build the punctuation-stripping table once instead of once per line.
  punct_table = str.maketrans('', '', string.punctuation)

  def tokenized_lines(path):
    # Yield the lower-cased, punctuation-free tokens of every usable line.
    # The `with` block guarantees the file handle is closed.
    with open(path, encoding="utf8") as fh:
      for line in fh:
        if line and line[0] not in skip_first_chars:
          tokens = line.translate(punct_table).lower().split()
          # a single word has no neighbours, so there is no context to train on
          if len(tokens) > 1:
            yield tokens

  # glob() order is unspecified; sort so ties in the frequency ranking and
  # the sentence order are reproducible across runs and machines.
  files = sorted(glob(files_pattern))

  # pass 1: word frequencies
  all_word_counts = {}
  for f in files:
    for tokens in tokenized_lines(f):
      for word in tokens:
        all_word_counts[word] = all_word_counts.get(word, 0) + 1
  print("finished counting")

  V = min(n_vocab, len(all_word_counts))
  ranked = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)

  # reserve the last slot for the out-of-vocabulary marker
  top_words = [w for w, count in ranked[:max(V - 1, 0)]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']

  # pass 2: encode every usable line with the final vocabulary
  sents = []
  for f in files:
    for tokens in tokenized_lines(f):
      sents.append([word2idx.get(w, unk) for w in tokens])
  return sents, word2idx
              

In [59]:
def get_sentences():
  """Return the sentences of the NLTK Brown corpus (``brown.sents()``)."""
  brown_sentences = brown.sents()
  return brown_sentences

def get_brown(n_vocab=2000, keep_words = []):
  """Encode the Brown corpus with a vocabulary limited to frequent words.

  Tokens are lower-cased. The ``n_vocab`` most frequent words keep their own
  index (0 = most frequent); all remaining words share the extra index stored
  under ``'UNKNOWN'``. Words listed in ``keep_words`` are given an infinite
  count so they rank ahead of every other word.

  Returns:
    ``(sentences_small, word2idx_small)``: the re-indexed sentences (only
    those with more than one token) and the reduced word -> index mapping.
  """
  corpus = get_sentences()

  # First pass: assign an id to every distinct token and count occurrences.
  token_to_id = {}
  id_to_token = []
  id_counts = {}
  encoded_corpus = []
  for raw_sentence in corpus:
    encoded = []
    for raw_token in raw_sentence:
      lowered = raw_token.lower()
      token_id = token_to_id.get(lowered)
      if token_id is None:
        token_id = len(id_to_token)
        token_to_id[lowered] = token_id
        id_to_token.append(lowered)
      if token_id in id_counts:
        id_counts[token_id] += 1
      else:
        id_counts[token_id] = 1
      encoded.append(token_id)
    encoded_corpus.append(encoded)

  # Words that must survive the cut are pushed to the top of the ranking.
  for forced in keep_words:
    id_counts[token_to_id[forced]] = float('inf')

  # Rank ids by count (descending) and keep the first n_vocab of them.
  ranking = sorted(id_counts.items(), key=lambda pair: pair[1], reverse=True)
  survivors = ranking[:n_vocab]

  word2idx_small = {}
  old_to_new = {}
  for rank, (old_id, _count) in enumerate(survivors):
    word2idx_small[id_to_token[old_id]] = rank
    old_to_new[old_id] = rank

  # Everything outside the kept vocabulary shares one trailing index.
  unknown = len(survivors)
  word2idx_small['UNKNOWN'] = unknown

  # Second pass: re-index, skipping sentences too short to have any context.
  sentences_small = [
    [old_to_new.get(old_id, unknown) for old_id in encoded]
    for encoded in encoded_corpus
    if len(encoded) > 1
  ]

  return sentences_small, word2idx_small


In [56]:
def get_negative_sampling_distribution(sentences):
  """Build the negative-sampling distribution p(w) ~ count(w)**0.75.

  Args:
    sentences: iterable of sentences, each an iterable of integer word ids.
      The ids that occur must be exactly ``0 .. V-1`` for some ``V``, because
      the result is indexed by word id.

  Returns:
    1-D numpy float array ``p_neg`` of length ``V`` that sums to 1, where
    ``p_neg[j]`` is the sampling probability of word id ``j``. An empty
    input yields an empty array.

  Raises:
    ValueError: if the ids seen in ``sentences`` are not contiguous from 0
      (previously this surfaced as a bare ``KeyError`` on the missing id).
  """
  # Single counting pass; the old extra pass that only computed an unused
  # total word count has been dropped.
  word_freq = {}
  for sentence in sentences:
    for word in sentence:
      word_freq[word] = word_freq.get(word, 0) + 1

  V = len(word_freq)

  # With V distinct ids, any id outside 0..V-1 leaves a gap inside 0..V-1.
  missing = [j for j in range(V) if j not in word_freq]
  if missing:
    raise ValueError(
      "word ids must be contiguous in 0..%d, but these ids never occur: %s%s"
      % (V - 1, missing[:10], " ..." if len(missing) > 10 else "")
    )

  p_neg = np.zeros(V)
  for j in range(V):
    # the 0.75 exponent flattens the distribution so rare words are
    # sampled more often than their raw frequency would allow
    p_neg[j] = word_freq[j]**0.75

  p_neg = p_neg / p_neg.sum()

  assert(np.all(p_neg > 0))
  return p_neg

In [21]:
def get_context(pos, sentence, window_size):
  """Return the words within ``window_size`` positions of ``pos``.

  For a sentence ``x x c c c P c c c x x`` and ``window_size=3`` the result
  is the six ``c`` entries, in sentence order; the word at ``pos`` itself is
  excluded. The window is clipped at the sentence boundaries.

  Args:
    pos: index of the centre word inside ``sentence``.
    sentence: sequence of word indices.
    window_size: number of words to take on each side of ``pos``.

  Returns:
    List of the context entries of ``sentence``.
  """
  start = max(0, pos - window_size)
  # Slice ends are exclusive, so +1 is needed to include the word that is
  # exactly window_size positions to the right (previously the right-hand
  # side was one word shorter than the left-hand side).
  end_ = min(len(sentence), pos + window_size + 1)

  return [
    ctx_word_idx
    for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start)
    if ctx_pos != pos
  ]

In [73]:
def train_model(savedir):
  """Train skip-gram word2vec with negative sampling using TF1-style graph ops.

  Loads the corpus with ``get_wiki()``, trains input and output embeddings,
  then writes ``word2idx.json`` and ``weights.npz`` into ``savedir``.

  Args:
    savedir: directory for the saved vocabulary and weights; created
      (including parents) if it does not exist.

  Returns:
    ``(word2idx, W, V)`` where ``W`` has shape (vocab_size, D) and ``V`` has
    shape (D, vocab_size).
  """
  sentences, word2idx = get_wiki()

  vocab_size = len(word2idx)

  # config
  window_size = 10
  epochs = 20
  D = 50  # embedding size
  min_batch_size = 128  # run a training step once this many pairs are queued
  # NOTE(review): the original code also defined a linear learning-rate decay
  # (0.025 -> 0.0001 over the epochs) plus `num_negatives` and
  # `samples_per_epoch`, none of which were ever used; the optimizer has always
  # run with the fixed values below. They were removed as dead code -- wire a
  # schedule into the optimizer explicitly if decay is actually wanted.
  optimizer_learning_rate = 0.1
  optimizer_momentum = 0.9

  # p_neg is indexed by word id; np.random.choice below requires its length
  # to equal vocab_size.
  p_neg = get_negative_sampling_distribution(sentences)

  W = np.random.randn(vocab_size, D).astype(np.float32) # input-to-hidden
  V = np.random.randn(D, vocab_size).astype(np.float32) # hidden-to-output

  tf_input = tf.compat.v1.placeholder(tf.int32, shape=(None,))
  tf_negword = tf.compat.v1.placeholder(tf.int32, shape=(None,))
  tf_context = tf.compat.v1.placeholder(tf.int32, shape=(None,)) # targets (context)
  tfW = tf.Variable(W)
  # stored transposed (vocab_size x D) so embedding_lookup can select rows
  tfV = tf.Variable(V.T)

  def dot(A, B):
    # row-wise dot product of two (N x D) tensors -> (N,)
    C = A * B
    return tf.reduce_sum(input_tensor=C, axis=1)

  # positive pairs: (input word, true context word) should score as 1
  emb_input = tf.nn.embedding_lookup(params=tfW, ids=tf_input) # N x D
  emb_output = tf.nn.embedding_lookup(params=tfV, ids=tf_context) # N x D
  correct_output = dot(emb_input, emb_output) # N
  pos_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=tf.ones(tf.shape(input=correct_output)), logits=correct_output)

  # negative pairs: the input word is replaced by a randomly drawn word while
  # the context words stay the same; these should score as 0
  emb_input = tf.nn.embedding_lookup(params=tfW, ids=tf_negword)
  incorrect_output = dot(emb_input, emb_output)
  neg_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=tf.zeros(tf.shape(input=incorrect_output)), logits=incorrect_output)

  loss = tf.reduce_mean(input_tensor=pos_loss) + tf.reduce_mean(input_tensor=neg_loss)

  train_op = tf.compat.v1.train.MomentumOptimizer(
    optimizer_learning_rate, momentum=optimizer_momentum).minimize(loss)
  init_op = tf.compat.v1.global_variables_initializer()

  total_words = sum(len(sentence) for sentence in sentences)
  print("total number of words in corpus:", total_words)

  # subsampling: frequent words are dropped with probability p_drop
  threshold = 1e-5
  p_drop = 1 - np.sqrt(threshold / p_neg)

  # The context manager releases the session's resources even when training
  # is interrupted or fails.
  with tf.compat.v1.Session() as session:
    session.run(init_op)

    # train the model
    for epoch in range(epochs):
      np.random.shuffle(sentences)

      cost = 0
      counter = 0
      inputs = []
      targets = []
      negwords = []
      t0 = datetime.now()
      for sentence in sentences:

        # randomly drop words according to p_drop
        sentence = [w for w in sentence \
          if np.random.random() < (1 - p_drop[w])
        ]
        if len(sentence) < 2:
          continue

        # visit every position of the sentence once, in random order
        randomly_ordered_positions = np.random.choice(
          len(sentence),
          size=len(sentence),
          replace=False,
        )

        for j, pos in enumerate(randomly_ordered_positions):
          word = sentence[pos]

          context_words = get_context(pos, sentence, window_size)
          # one negative word per position, shared by all its context words
          neg_word = np.random.choice(vocab_size, p=p_neg)

          n = len(context_words)
          inputs += [word]*n
          negwords += [neg_word]*n
          targets += context_words

        # pairs are queued across sentences until the batch is large enough;
        # anything still queued when the epoch ends is discarded
        if len(inputs) >= min_batch_size:
          _, c = session.run(
            (train_op, loss),
            feed_dict={
              tf_input: inputs,
              tf_negword: negwords,
              tf_context: targets,
            }
          )
          cost += c

          # reset
          inputs = []
          targets = []
          negwords = []

        counter += 1
        if counter % 100 == 0:
          sys.stdout.write("processed %s / %s\r" % (counter, len(sentences)))
          sys.stdout.flush()
      dt = datetime.now() - t0
      print("epoch complete:", epoch, "cost:", cost, "dt:", dt)

    # pull the trained weights out of the graph before the session closes
    W, VT = session.run((tfW, tfV))
  V = VT.T

  # makedirs also creates missing parents and tolerates an existing directory
  os.makedirs(savedir, exist_ok=True)

  with open(os.path.join(savedir, 'word2idx.json'), 'w') as f:
    json.dump(word2idx, f)

  np.savez(os.path.join(savedir, 'weights.npz'), W, V)

  # return the model
  return word2idx, W, V

In [74]:
# Seed numpy's global RNG before training, then train and save to 'w2v_tf1'.
np.random.seed(0)
word2idx, W, V = train_model(savedir='w2v_tf1')
print(W)

finished counting
total number of words in corpus: 86478677
processed 270100 / 1271558

KeyboardInterrupt: 

In [67]:
def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W):
  """Print how well the embeddings solve ``pos1 - neg1 = pos2 - neg2``.

  The query vector is ``W[pos1] - W[neg1] + W[neg2]``; the ten rows of ``W``
  closest to it by cosine distance are printed, along with the best match
  that is not one of ``pos1``, ``neg1`` or ``neg2``. ``pos2`` is the expected
  answer and is only used for reporting its distance to the query vector.

  Prints a message and returns early if any of the four words is missing
  from ``word2idx``. Always returns ``None``.
  """
  V, D = W.shape

  print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2))
  query_words = (pos1, neg1, pos2, neg2)
  for candidate in query_words:
    if candidate not in word2idx:
      print("Sorry, %s not in word2idx" % candidate)
      return

  p1, n1, p2, n2 = (W[word2idx[name]] for name in query_words)

  # pos2 is deliberately left out: it is what the arithmetic should recover
  vec = p1 - n1 + n2

  distances = pairwise_distances(vec.reshape(1, D), W, metric='cosine').reshape(V)
  idx = distances.argsort()[:10]

  # the answer must not be one of the words the query was built from
  keep_out = [word2idx[name] for name in (pos1, neg1, neg2)]
  best_idx = next((i for i in idx if i not in keep_out), -1)

  print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[best_idx], neg2))
  print("closest 10:")
  for neighbour in idx:
    print(idx2word[neighbour], distances[neighbour])

  print("dist to %s:" % pos2, cos_dist(p2, vec))

In [72]:
# Invert the vocabulary so word indices can be turned back into words.
idx2word = dict((i, w) for w, i in word2idx.items())

# Run the analogy twice: with the input embeddings alone, then with the
# average of the input and output embeddings.
for We in [W, (W + V.T) / 2]:
  print("**********")

  analogy('france', 'french', 'england', 'english', word2idx, idx2word, We)

**********
testing: france - french = england - english
got: france - french = powerful - english
closest 10:
france 0.19890833
powerful 0.46028405
countries 0.51231515
fat 0.54968715
english 0.55031395
trip 0.55916077
dog 0.5900334
others 0.61320984
purpose 0.6287185
got 0.62895656
dist to england: 1.1183290854096413
**********
testing: france - french = england - english
got: france - french = countries - english
closest 10:
france 0.3322333
english 0.4039594
countries 0.592022
join 0.596961
man's 0.6268618
population 0.63550496
sensitive 0.640213
eight 0.66331506
sun 0.6661253
black 0.6668997
dist to england: 1.0027547792997211
