In [15]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime

import os
import sys
import string
import json


from nltk.corpus import brown
import operator
from utils import find_analogies, get_wikipedia_data

In [24]:
def construct_matrix(save_dir, sentences, V, context_sz):
  """Build a distance-weighted word co-occurrence matrix and cache it on disk.

  For every position ``i`` in every sentence, each neighbouring word inside
  the context window contributes ``1 / distance`` to both ``X[wi, wj]`` and
  ``X[wj, wi]``. Positions whose window reaches past the sentence start or
  end additionally add weight against index 0 or index 1 respectively
  (presumably the START/END token ids -- confirm against the tokenizer).

  The result is written to ``<save_dir>/matrix.npy``. If that file already
  exists the function returns immediately without touching it.

  Args:
    save_dir: Existing directory in which ``matrix.npy`` is stored.
    sentences: Sequence of sentences, each a sequence of integer word
      indices in ``[0, V)`` (they are used directly as matrix indices).
    V: Size of the vocabulary; the matrix has shape ``(V, V)``.
    context_sz: Context window size in words.

  Returns:
    None. The matrix is only persisted to disk.
  """
  # np.save appends ".npy" when the name lacks that suffix, so the cache file
  # is "matrix.npy" (the file train() loads). Checking for plain "matrix", as
  # was done before, never matched and forced a rebuild on every call.
  matrix_path = os.path.join(save_dir, 'matrix.npy')
  if os.path.exists(matrix_path):
    return

  X = np.zeros((V, V))
  N = len(sentences)
  print("number of sentences to process:", N)
  for it, sentence in enumerate(sentences, start=1):
    if it % 10000 == 0:
      print("processed", it, "/", N)
    n = len(sentence)
    for i, wi in enumerate(sentence):
      start = max(0, i - context_sz)
      # NOTE: `end` is exclusive in the range below, so the right-hand window
      # spans offsets 1..context_sz-1 while the left-hand window spans
      # 1..context_sz. Kept as-is so results match previously built matrices.
      end = min(n, i + context_sz)

      # The window extends past the start of the sentence: credit index 0
      # with the distance to the (virtual) position just before the sentence.
      if i - context_sz < 0:
        points = 1.0 / (i + 1)
        X[wi, 0] += points
        X[0, wi] += points
      # The window extends past the end of the sentence: credit index 1
      # with the distance to the (virtual) position just after the sentence.
      if i + context_sz > n:
        points = 1.0 / (n - i)
        X[wi, 1] += points
        X[1, wi] += points

      # left context
      for j in range(start, i):
        wj = sentence[j]
        points = 1.0 / (i - j)
        X[wi, wj] += points
        X[wj, wi] += points

      # right context
      for j in range(i + 1, end):
        wj = sentence[j]
        points = 1.0 / (j - i)
        X[wi, wj] += points
        X[wj, wi] += points

  np.save(matrix_path, X)

In [36]:
def train(save_dir, D, learning_rate=1e-4, reg=0.1, xmax=100, alpha=0.75, epochs=200):
  """Fit GloVe-style word embeddings to the cached co-occurrence matrix.

  Loads ``<save_dir>/matrix.npy`` and minimises, by full-batch gradient
  descent with momentum,

      sum( f(X) * (W @ U.T + b + c + mu - log(X + 1)) ** 2 )

  where ``f(X) = min(X / xmax, 1) ** alpha`` and ``mu`` is the mean of
  ``log(X + 1)``.

  Args:
    save_dir: Directory containing ``matrix.npy``.
    D: Embedding dimensionality.
    learning_rate: Learning rate passed to the momentum optimizer.
    reg: Currently unused -- no regularisation term is added to the cost.
      NOTE(review): either wire this into the objective or drop it; kept in
      the signature for backward compatibility.
    xmax: Co-occurrence count at which the weighting saturates at 1.
    alpha: Exponent of the weighting function.
    epochs: Number of full-batch gradient steps.

  Returns:
    Tuple ``(W, U)`` of float32 NumPy arrays, each of shape ``(V, D)``,
    where ``V`` is the side length of the loaded matrix.
  """
  X = np.load(os.path.join(save_dir, 'matrix.npy'))
  V = len(X)
  print("max in X:", X.max())

  # weighting: (X / xmax) ** alpha below xmax, exactly 1 at or above it
  fX = np.minimum(X / float(xmax), 1.0) ** alpha
  print("max in f(X):", fX.max())

  # target
  logX = np.log(X + 1)
  print("max in log(X):", logX.max())

  # initialize weights (W is drawn before U, as before)
  W = np.random.randn(V, D) / np.sqrt(V + D)
  U = np.random.randn(V, D) / np.sqrt(V + D)

  # trainable parameters; b is a column and c a row so they broadcast over
  # the (V, V) product W @ U.T
  tfW = tf.Variable(W.astype(np.float32))
  tfb = tf.Variable(np.zeros((V, 1), dtype=np.float32))
  tfU = tf.Variable(U.astype(np.float32))
  tfc = tf.Variable(np.zeros((1, V), dtype=np.float32))
  params = [tfW, tfb, tfU, tfc]

  # Convert the loop-invariant (V, V) arrays to float32 tensors once, rather
  # than letting every epoch re-convert the float64 NumPy arrays implicitly.
  tf_fX = tf.constant(fX, dtype=tf.float32)
  tf_logX = tf.constant(logX, dtype=tf.float32)
  mu = tf.constant(logX.mean(), dtype=tf.float32)

  optimizer = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=0.9)

  for epoch in range(epochs):
    # Only the forward pass needs to be recorded on the tape.
    with tf.GradientTape() as tape:
      delta = tf.matmul(tfW, tf.transpose(tfU)) + tfb + tfc + mu - tf_logX
      cost = tf.math.reduce_sum(tf_fX * delta * delta)
    # Print the scalar value rather than the Tensor repr.
    print("epoch:", epoch, "cost:", cost.numpy())
    grads = tape.gradient(cost, params)
    optimizer.apply_gradients(zip(grads, params))

  # hand the learned embeddings back as NumPy arrays
  return tfW.numpy(), tfU.numpy()

In [49]:
def main(save_dir):
  """Prepare the training artefacts for the embedding model in ``save_dir``.

  Creates ``save_dir`` if needed, loads the corpus via
  ``get_wikipedia_data(None, 2000)`` (presumably "all files" and a
  2000-word vocabulary -- confirm against ``utils``), writes the
  word-to-index mapping to ``word2idx.json`` and builds the co-occurrence
  matrix with a context size of 10.

  Args:
    save_dir: Output directory for ``word2idx.json`` and the matrix file.
  """
  # makedirs(exist_ok=True) also creates missing parent directories and
  # avoids the check-then-create race of exists() + mkdir().
  os.makedirs(save_dir, exist_ok=True)
  sentences, word2idx = get_wikipedia_data(None, 2000)
  V = len(word2idx)
  with open(os.path.join(save_dir, 'word2idx.json'), 'w') as f:
    json.dump(word2idx, f)

  construct_matrix(save_dir, sentences, V, 10)







#   with open('%s/word2idx.json' % save_dir) as f:
#     word2idx = json.load(f)
  
#   W, U = train(save_dir, 100)
#   idx2word = {i:w for w, i in word2idx.items()}
#   We = (W + U) / 2

#   find_analogies('king', 'man', 'woman', We, word2idx, idx2word)
#   find_analogies('france', 'paris', 'london', We, word2idx, idx2word)
#   find_analogies('france', 'paris', 'rome', We, word2idx, idx2word)
#   find_analogies('paris', 'france', 'italy', We, word2idx, idx2word)
#   find_analogies('france', 'french', 'english', We, word2idx, idx2word)
#   find_analogies('japan', 'japanese', 'chinese', We, word2idx, idx2word)
#   find_analogies('japan', 'japanese', 'italian', We, word2idx, idx2word)
#   find_analogies('japan', 'japanese', 'australian', We, word2idx, idx2word)
#   find_analogies('december', 'november', 'june', We, word2idx, idx2word)

In [50]:
# Run the data-preparation pipeline, writing word2idx.json and the
# co-occurrence matrix under the 'wiki_2000_full' directory.
main('wiki_2000_full')

reading: enwiki-20180401-pages-articles1.xml-p10p30302-01.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-02.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-03.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-04.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-05.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-06.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-07.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-08.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-09.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-10.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-11.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-12.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-13.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-14.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-15.txt
reading: enwiki-20180401-pages-articles1.xml-p10p30302-16.txt
reading:

KeyboardInterrupt: 