In [7]:
from glob import glob
import string
import numpy as np
import tensorflow as tf
from datetime import datetime
from scipy.special import expit as sigmoid
import nltk
from nltk.corpus import brown
nltk.data.path.append("C:\\UBS\\Dev\\nltk_data")

In [None]:
!conda install tensorflow

In [8]:
def remove_punctuation(s):
  return s.translate(str.maketrans('','',string.punctuation))

In [9]:
def get_wiki():
  V = 20000
  files = glob('large_files/enwiki*.txt')
  all_word_counts = {}
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          for word in s:
            if word not in all_word_counts:
              all_word_counts[word] = 0
            all_word_counts[word] += 1
  V = min(V, len(all_word_counts))
  all_word_counts = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)
  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']
  
  sents = []
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          sent = [word2idx[w] if w in word2idx else unk for w in s]
          sents.append(sent)
  return sents, word2idx
          
              

In [10]:
def get_wiki_small():
  V = 20
  file = 'large_files/enwiki-20180401-pages-articles1.xml-p10p30302-01.txt'
  all_word_counts = {}
  i = 0
  for line in open(file, encoding="utf8"):
    if (i < 10): 
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          i += 1
          for word in s:
            if word not in all_word_counts:
              all_word_counts[word] = 0
            all_word_counts[word] += 1
            
  V = min(V, len(all_word_counts))
  all_word_counts = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)
  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']
  
  sents = []
  i = 0
  for line in open(file, encoding="utf8"):
    if (i < 10): 
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          i += 1
          sent = [word2idx[w] if w in word2idx else unk for w in s]
          sents.append(sent)
  return sents, word2idx
          

In [11]:
def get_brown_small():
  sentences = brown.sents()
  word_count = {}
  
  V = 20
  i = 0

  for sentence in sentences:
    if i < 10:
      i += 1
      for word in sentence:
        if word in word_count:
          word_count[word] += 1
        else:
          word_count[word] = 1

  all_word_counts = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']
  
  sents = []
  
  i = 0
  
  for sentence in sentences:
    if i < 10:
      i += 1
      sent = [word2idx[word] if word in word2idx else unk for word in sentence]
      sents.append(sent)

  return sents, word2idx

In [12]:
def get_negative_sampling_distribution(sentences):
  # Pn(w) = prob of word occuring
  # we would like to sample the negative samples
  # such that words that occur more often
  # should be sampled more often

  word_freq = {}
  word_count = sum(len(sentence) for sentence in sentences)
  for sentence in sentences:
      for word in sentence:
          if word not in word_freq:
              word_freq[word] = 0
          word_freq[word] += 1
  
  # vocab size
  V = len(word_freq)

  p_neg = np.zeros(V)
  for j in range(V):
      p_neg[j] = word_freq[j]**0.75

  # normalize it
  p_neg = p_neg / p_neg.sum()

  assert(np.all(p_neg > 0))
  return p_neg

In [13]:
def get_context(pos, sentence, window_size):
  start = max(0, pos - window_size)
  end_  = min(len(sentence), pos + window_size)

  context = []
  for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start):
    if ctx_pos != pos:
      context.append(ctx_word_idx)
  return context

In [41]:
def train_model():
  sentences, word2idx = get_brown_small()
  p_neg = get_negative_sampling_distribution(sentences)
  
  # number of unique words
  vocab_size = len(word2idx)


  # config
  window_size = 5
  learning_rate = 0.025
  final_learning_rate = 0.0001
  num_negatives = 5 # number of negative samples to draw per input word
  samples_per_epoch = int(1e5)
  epochs = 5000
  D = 5 # word embedding size

  learning_rate_delta = (learning_rate - final_learning_rate) / epochs

  # distribution for drawing negative sample
  # params
  W = tf.Variable(tf.random.normal([vocab_size, D]))
  V = tf.Variable(tf.random.normal([D, vocab_size])) # hidden-to-output


  def dot(A, B):
    C = A * B
    return tf.reduce_sum(input_tensor=C, axis=1)
  
  optimizer = tf.optimizers.SGD(learning_rate=0.01)
  
  x_train = []
  y_train = []
  
  def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp
  
  for sentence in sentences:
    for pos in range(len(sentence)):
      word = sentence[pos]
      context_words = get_context(pos, sentence, window_size)
      for target in context_words:
        x_train.append(to_one_hot(word, vocab_size))
        y_train.append(to_one_hot(target, vocab_size))
        
  x_train = np.asarray(x_train, dtype='float32')
  y_train = np.asarray(y_train, dtype='float32')
  
  for _ in range(epochs):
    with tf.GradientTape() as t:
      hidden_layer = tf.matmul(x_train, W)
      output_layer = tf.nn.softmax(tf.matmul(hidden_layer, V))
      cross_entropy_loss = tf.reduce_mean(-tf.math.reduce_sum(y_train * tf.math.log(output_layer), axis=[1]))

      grads = t.gradient(cross_entropy_loss, [W, V])
      optimizer.apply_gradients(zip(grads,[W, V]))
      if(_ % 1000 == 0):
        print(cross_entropy_loss)



  return word2idx, W, V

In [42]:
np.random.seed(0)
word2idx, W, V = train_model()

tf.Tensor(5.6123095, shape=(), dtype=float32)
tf.Tensor(2.7801604, shape=(), dtype=float32)
tf.Tensor(2.4376032, shape=(), dtype=float32)
tf.Tensor(2.2505875, shape=(), dtype=float32)
tf.Tensor(2.1507685, shape=(), dtype=float32)


In [44]:
W = np.random.randn(6, 4)
V = np.random.randn(4, 6)
prob = sigmoid(W[3].dot(V[:,[2, 4]]))
print(prob)

gV = np.outer(W[3], prob - 1)
print(gV)

[0.79979108 0.33298967]
[[0.16974153 0.56550604]
 [0.15930054 0.53072116]
 [0.00803347 0.02676408]
 [0.07554102 0.2516703 ]]
