In [1]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime
from scipy.special import expit as sigmoid

import os
import sys
import string
import json


from nltk.corpus import brown
import operator

from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import cosine as cos_dist


In [2]:
def remove_punctuation(s):
  """Return a copy of `s` with every character in `string.punctuation` deleted."""
  # Mapping a character to None in a translation table deletes it.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return s.translate(deletion_table)

In [3]:
def get_wiki(n_vocab=20000, file_pattern='../large_files/enwiki*.txt'):
  """Load wiki text files and convert them to lists of word indices.

  Every file matching `file_pattern` is read twice: once to count words and
  once to index the sentences with the resulting vocabulary.

  A line is used only if its first character is not one of `[ * - | = \\ { }`
  and, after punctuation removal, lower-casing and whitespace splitting, it
  has more than one token.

  Args:
    n_vocab: maximum vocabulary size, including the '<UNK>' entry.
    file_pattern: glob pattern selecting the input text files.

  Returns:
    (sents, word2idx): `sents` is a list of lists of int indices, one list
    per kept line; `word2idx` maps each kept word to its index. Words are
    indexed by descending count, and '<UNK>' takes the last index and stands
    in for every word outside the vocabulary.
  """
  # Same character set as the original literal '[*-|=\{\}', written without
  # the invalid escape sequences that newer Python versions warn about.
  skip_first_chars = '[*-|=\\{}'

  def iter_token_lists(path):
    # Yield the token list of every usable line; the file is closed on exit.
    with open(path, encoding="utf8") as f:
      for line in f:
        if line and line[0] not in skip_first_chars:
          tokens = remove_punctuation(line).lower().split()
          # a lone word has no neighbours, hence no context to train on
          if len(tokens) > 1:
            yield tokens

  files = glob(file_pattern)

  # pass 1: word frequencies
  all_word_counts = {}
  for path in files:
    for tokens in iter_token_lists(path):
      for word in tokens:
        all_word_counts[word] = all_word_counts.get(word, 0) + 1
  print("finished counting")

  V = min(n_vocab, len(all_word_counts))
  ranked = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)

  # One slot is reserved for '<UNK>'. The max() guard keeps V == 0 from
  # turning the slice into [:-1], which would keep almost every word.
  top_words = [w for w, count in ranked[:max(V - 1, 0)]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']

  # pass 2: index the sentences
  sents = []
  for path in files:
    for tokens in iter_token_lists(path):
      sents.append([word2idx.get(w, unk) for w in tokens])
  return sents, word2idx
              

In [4]:
def get_sentences():
  """Return the sentences of the NLTK Brown corpus, exactly as `brown.sents()` gives them."""
  corpus_sentences = brown.sents()
  return corpus_sentences

def get_brown(n_vocab=2000, keep_words=(), sentences=None):
  """Index a tokenised corpus with a vocabulary limited to the most frequent words.

  Tokens are lower-cased. The `n_vocab` most frequent tokens get indices
  0..n_vocab-1 in descending count order (ties keep first-seen order), and
  every other token maps to the extra 'UNKNOWN' index.

  Args:
    n_vocab: number of real words to keep; 'UNKNOWN' is added on top.
    keep_words: iterable of lower-cased words forced into the vocabulary by
      giving them an infinite count. Any iterable works; the default is an
      immutable empty tuple rather than a shared mutable list.
    sentences: optional iterable of token lists. When None (the default) the
      corpus comes from `get_sentences()`.

  Returns:
    (sentences_small, word2idx_small): the indexed sentences (sentences with
    fewer than two tokens are dropped) and the word -> index mapping,
    including the 'UNKNOWN' entry.

  Raises:
    KeyError: if a word in `keep_words` never occurs in the corpus.
  """
  if sentences is None:
    sentences = get_sentences()

  word2idx = {}
  idx2word = []
  word_idx_count = {}
  indexed_sentences = []

  # first pass: give every distinct token an index and count its occurrences
  for sentence in sentences:
    indexed_sentence = []
    for token in sentence:
      token = token.lower()
      idx = word2idx.get(token)
      if idx is None:
        idx = len(idx2word)
        word2idx[token] = idx
        idx2word.append(token)
      word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
      indexed_sentence.append(idx)
    indexed_sentences.append(indexed_sentence)

  # an infinite count sorts the requested words to the front of the ranking
  for word in keep_words:
    word_idx_count[word2idx[word]] = float('inf')

  sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)

  # second pass: renumber the surviving words 0..k-1 by rank
  word2idx_small = {}
  idx_new_idx_map = {}
  for new_idx, (idx, count) in enumerate(sorted_word_idx_count[:n_vocab]):
    word2idx_small[idx2word[idx]] = new_idx
    idx_new_idx_map[idx] = new_idx

  # the index after the last kept word represents every dropped word
  unknown = len(idx_new_idx_map)
  word2idx_small['UNKNOWN'] = unknown

  sentences_small = [
    [idx_new_idx_map.get(idx, unknown) for idx in sentence]
    for sentence in indexed_sentences
    if len(sentence) > 1
  ]

  return sentences_small, word2idx_small


In [12]:
def construct_matrix(cc_matrix, sentences, V, context_sz):
  """Build a distance-weighted co-occurrence matrix and cache it as a .npy file.

  If the cache file already exists nothing is computed. Otherwise a (V, V)
  matrix is filled from `sentences` and written with `np.save`.

  Each pair of positions (i, j) inside the context window adds 1/|i - j| to
  both X[wi, wj] and X[wj, wi]. The visiting rules are kept exactly as they
  were before this change.

  Args:
    cc_matrix: path of the cache file. A name without the '.npy' suffix is
      treated as `name + '.npy'`, which is the file `np.save` actually writes.
    sentences: sequence of sentences, each a sequence of int word indices
      in [0, V).
    V: vocabulary size (matrix dimension).
    context_sz: context window size, in tokens.

  Returns:
    None. The result is only written to disk.
  """
  # np.save appends '.npy' when the name lacks it, so the existence check must
  # look at that final filename. Checking the bare name never found the cache
  # and the matrix was rebuilt on every call.
  cache_path = os.fspath(cc_matrix)
  if not cache_path.endswith('.npy'):
    cache_path += '.npy'
  if os.path.exists(cache_path):
    return

  X = np.zeros((V, V))
  N = len(sentences)
  print("number of sentences to process:", N)
  for it, sentence in enumerate(sentences, start=1):
    if it % 10000 == 0:
      print("processed", it, "/", N)
    n = len(sentence)
    for i, wi in enumerate(sentence):
      start = max(0, i - context_sz)
      # NOTE(review): range(i + 1, end) below visits at most context_sz - 1
      # words to the right, while the left side visits up to context_sz.
      # Confirm whether this asymmetry is intended.
      end = min(n, i + context_sz)

      # NOTE(review): indices 0 and 1 are used here as if they were sentence
      # start/end markers, but get_brown and get_wiki assign 0 and 1 to
      # ordinary high-frequency words. Confirm against the vocabulary builder.
      if i - context_sz < 0:
        # the left window is clipped by the sentence start
        points = 1.0 / (i + 1)
        X[wi, 0] += points
        X[0, wi] += points
      if i + context_sz > n:
        # the right window is clipped by the sentence end
        points = 1.0 / (n - i)
        X[wi, 1] += points
        X[1, wi] += points

      # left context
      for j in range(start, i):
        wj = sentence[j]
        points = 1.0 / (i - j)  # distance is positive
        X[wi, wj] += points
        X[wj, wi] += points

      # right context
      for j in range(i + 1, end):
        wj = sentence[j]
        points = 1.0 / (j - i)  # distance is positive
        X[wi, wj] += points
        X[wj, wi] += points

  np.save(cache_path, X)

In [29]:
def train(cc_matrix, V, D, learning_rate=1e-4, reg=0.1, xmax=100, alpha=0.75, epochs=100):
  """Fit GloVe-style embeddings to a saved co-occurrence matrix by gradient descent.

  Minimises sum(f(X) * (W U^T + b + c + mu - log(X + 1))^2) plus an L2
  penalty on W and U. It runs full-batch momentum updates for `epochs` steps
  and prints the unregularised cost after each step.

  Args:
    cc_matrix: path of a .npy file holding the (V, V) co-occurrence matrix.
    V: vocabulary size.
    D: embedding dimensionality.
    learning_rate: step size for the momentum optimizer.
    reg: L2 penalty weight applied to W and U (the biases are not penalised).
    xmax: co-occurrence value at which the weighting f(X) saturates at 1.
    alpha: exponent of the weighting f(X) = (X / xmax) ** alpha below xmax.
    epochs: number of full-batch updates.

  Returns:
    (W, U): two float32 arrays of shape (V, D) with the learned factors.
  """
  tf.compat.v1.disable_eager_execution()
  X = np.load(cc_matrix)
  print("max in X:", X.max())

  # weighting: (X / xmax) ** alpha below xmax, exactly 1 at or above it
  fX = np.minimum(X / float(xmax), 1.0) ** alpha
  print("max in f(X):", fX.max())

  # target
  logX = np.log(X + 1)
  print("max in log(X):", logX.max())

  # initial values (W is drawn before U, as before, so a seeded run is unchanged)
  W = np.random.randn(V, D) / np.sqrt(V + D)
  U = np.random.randn(V, D) / np.sqrt(V + D)
  mu = logX.mean()

  # Build the model in its own graph so repeated calls (e.g. re-running the
  # cell) do not keep adding ops to the process-wide default graph.
  graph = tf.Graph()
  with graph.as_default():
    tfW = tf.Variable(W.astype(np.float32))
    tfb = tf.Variable(np.zeros((V, 1), dtype=np.float32))  # row bias, broadcast over columns
    tfU = tf.Variable(U.astype(np.float32))
    tfc = tf.Variable(np.zeros((1, V), dtype=np.float32))  # column bias, broadcast over rows
    tfLogX = tf.compat.v1.placeholder(tf.float32, shape=(V, V))
    tffX = tf.compat.v1.placeholder(tf.float32, shape=(V, V))

    delta = tf.matmul(tfW, tf.transpose(a=tfU)) + tfb + tfc + mu - tfLogX
    cost = tf.reduce_sum(input_tensor=tffX * delta * delta)
    regularized_cost = cost
    for param in (tfW, tfU):
      regularized_cost += reg * tf.reduce_sum(input_tensor=param * param)

    train_op = tf.compat.v1.train.MomentumOptimizer(
      learning_rate,
      momentum=0.9
    ).minimize(regularized_cost)
    init = tf.compat.v1.global_variables_initializer()

  # The context manager closes the session and releases its resources even
  # if training is interrupted.
  with tf.compat.v1.Session(graph=graph) as session:
    session.run(init)

    feed = {tfLogX: logX, tffX: fX}
    for epoch in range(epochs):
      epoch_cost, _ = session.run((cost, train_op), feed_dict=feed)
      print("epoch:", epoch, "cost:", epoch_cost)

    # pull the learned factors out before the session closes
    W, U = session.run([tfW, tfU])
  return W, U

In [30]:
# Build (or reuse) the Brown co-occurrence matrix, then fit the embeddings.
# Both calls get the same '.npy' path, so construct_matrix's existence check,
# the file np.save writes and the file train loads are all the same file.
# This lets the cell run from a fresh kernel without a pre-existing cache.
sentences, word2idx = get_brown()
V = len(word2idx)
construct_matrix('cc_matrix_brown.npy', sentences, V, 10)
W, U = train('cc_matrix_brown.npy', V, 100)


max in X: 502704.6412708048
max in f(X): 1.0
max in log(X): 13.127760071560818




epoch: 0 cost: 716149.8
epoch: 1 cost: 468802.97
epoch: 2 cost: 236526.61
epoch: 3 cost: 188934.94
epoch: 4 cost: 254416.97
epoch: 5 cost: 278433.38
epoch: 6 cost: 222999.83
epoch: 7 cost: 165718.36
epoch: 8 cost: 162992.72
epoch: 9 cost: 185571.17
epoch: 10 cost: 183009.48
epoch: 11 cost: 155528.08
epoch: 12 cost: 137868.7
epoch: 13 cost: 141035.84
epoch: 14 cost: 141300.84
epoch: 15 cost: 120410.695
epoch: 16 cost: 91344.016
epoch: 17 cost: 77912.734
epoch: 18 cost: 84179.3
epoch: 19 cost: 94306.58
epoch: 20 cost: 94951.695
epoch: 21 cost: 87087.59
epoch: 22 cost: 77832.484
epoch: 23 cost: 70386.38
epoch: 24 cost: 64786.543
epoch: 25 cost: 62155.438
epoch: 26 cost: 63125.484
epoch: 27 cost: 64530.125
epoch: 28 cost: 62164.336
epoch: 29 cost: 56565.125
epoch: 30 cost: 52702.24
epoch: 31 cost: 53319.773
epoch: 32 cost: 55320.875
epoch: 33 cost: 54212.516
epoch: 34 cost: 50115.098
epoch: 35 cost: 47107.215
epoch: 36 cost: 47439.613
epoch: 37 cost: 48873.355
epoch: 38 cost: 48237.816
epo

In [41]:
# Build (or reuse) the wiki co-occurrence matrix, then fit the embeddings.
# The path includes '.npy' in both calls. With the bare name 'cc_matrix_wiki',
# np.save wrote 'cc_matrix_wiki.npy' while the existence check looked for
# 'cc_matrix_wiki', so the whole corpus was reprocessed on every run.
sentences, word2idx = get_wiki()
V = len(word2idx)
construct_matrix('cc_matrix_wiki.npy', sentences, V, 10)
W, U = train('cc_matrix_wiki.npy', V, 100)

finished counting
number of sentences to process: 1271558
processed 10000 / 1271558
processed 20000 / 1271558
processed 30000 / 1271558
processed 40000 / 1271558
processed 50000 / 1271558
processed 60000 / 1271558
processed 70000 / 1271558
processed 80000 / 1271558
processed 90000 / 1271558
processed 100000 / 1271558
processed 110000 / 1271558
processed 120000 / 1271558
processed 130000 / 1271558
processed 140000 / 1271558
processed 150000 / 1271558
processed 160000 / 1271558
processed 170000 / 1271558
processed 180000 / 1271558
processed 190000 / 1271558
processed 200000 / 1271558
processed 210000 / 1271558
processed 220000 / 1271558
processed 230000 / 1271558
processed 240000 / 1271558
processed 250000 / 1271558
processed 260000 / 1271558
processed 270000 / 1271558
processed 280000 / 1271558
processed 290000 / 1271558
processed 300000 / 1271558
processed 310000 / 1271558
processed 320000 / 1271558
processed 330000 / 1271558
processed 340000 / 1271558
processed 350000 / 1271558
proces



epoch: 0 cost: 48124376.0
epoch: 1 cost: 107440690.0


KeyboardInterrupt: 

In [31]:
def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W):
  """Print how well the embeddings solve `pos1 - neg1 = pos2 - neg2`.

  The query vector is W[pos1] - W[neg1] + W[neg2]. The function prints the
  nearest word by cosine distance (skipping pos1, neg1 and neg2), the ten
  nearest words with their distances, and the cosine distance from the
  query to the expected answer `pos2`.

  Args:
    pos1, neg1, pos2, neg2: the four words of the analogy.
    word2idx: mapping word -> row index in W.
    idx2word: mapping row index -> word.
    W: (V, D) embedding matrix.

  Returns:
    None. If any of the four words is missing from `word2idx`, a message is
    printed and the function returns early.
  """
  V, D = W.shape

  # pos2 is only the expected answer; it never enters the query vector
  print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2))
  for word in (pos1, neg1, pos2, neg2):
    if word in word2idx:
      continue
    print("Sorry, %s not in word2idx" % word)
    return

  p1, n1, p2, n2 = (W[word2idx[word]] for word in (pos1, neg1, pos2, neg2))
  query = p1 - n1 + n2

  distances = pairwise_distances(query.reshape(1, D), W, metric='cosine').reshape(V)
  nearest = distances.argsort()[:10]

  # the answer is the closest word that is not one of the query's own inputs
  excluded = [word2idx[word] for word in (pos1, neg1, neg2)]
  best_idx = next((i for i in nearest if i not in excluded), -1)

  print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[best_idx], neg2))
  print("closest 10:")
  for i in nearest:
    print(idx2word[i], distances[i])

  print("dist to %s:" % pos2, cos_dist(p2, query))

In [40]:
# Show the shapes of the two learned factors, then average them into a single
# embedding per word and try one analogy with it.
print(W.shape, U.shape)
idx2word = dict((i, w) for w, i in word2idx.items())
We = 0.5 * (W + U)

analogy('france', 'french', 'england', 'english', word2idx, idx2word, We)

(2001, 100) (2001, 100)
testing: france - french = england - english
got: france - french = interests - english
closest 10:
english 0.38648582
france 0.4389286
interests 0.5164262
features 0.5265418
advantage 0.53877854
cells 0.5425768
actual 0.5515337
energy 0.55583745
actions 0.567492
painting 0.57363844
dist to england: 0.9251848310232162
