In [93]:
from glob import glob
import string
import numpy as np
from datetime import datetime
from scipy.special import expit as sigmoid

In [34]:
!pip install scipy

Collecting scipy
  Downloading scipy-1.5.2-cp37-cp37m-win_amd64.whl (31.2 MB)
Installing collected packages: scipy
Successfully installed scipy-1.5.2


You should consider upgrading via the 'c:\users\mazic\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [15]:
def remove_punctuation(s):
  return s.translate(str.maketrans('','',string.punctuation))

In [49]:
def get_wiki():
  V = 20
  files = glob('large_files/enwiki*.txt')
  all_word_counts = {}
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          for word in s:
            if word not in all_word_counts:
              all_word_counts[word] = 0
            all_word_counts[word] += 1
  V = min(V, len(all_word_counts))
  all_word_counts = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)
  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']
  
  sents = []
  for f in files:
    for line in open(f, encoding="utf8"):
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          sent = [word2idx[w] if w in word2idx else unk for w in s]
          sents.append(sent)
  return sents, word2idx
          
              

In [53]:
def get_wiki_small():
  V = 20
  file = 'large_files/enwiki-20180401-pages-articles1.xml-p10p30302-01.txt'
  all_word_counts = {}
  i = 0
  for line in open(file, encoding="utf8"):
    if (i < 10): 
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          i += 1
          for word in s:
            if word not in all_word_counts:
              all_word_counts[word] = 0
            all_word_counts[word] += 1
            
  V = min(V, len(all_word_counts))
  all_word_counts = sorted(all_word_counts.items(), key=lambda x: x[1], reverse=True)
  top_words = [w for w, count in all_word_counts[:V-1]] + ['<UNK>']
  word2idx = {w: i for i, w in enumerate(top_words)}
  unk = word2idx['<UNK>']
  
  sents = []
  i = 0
  for line in open(file, encoding="utf8"):
    if (i < 10): 
      if line and line[0] not in '[*-|=\{\}':
        s = remove_punctuation(line).lower().split()
        if len(s) > 1:
          i += 1
          sent = [word2idx[w] if w in word2idx else unk for w in s]
          sents.append(sent)
  return sents, word2idx
          

In [25]:
def get_negative_sampling_distribution(sentences, vocab_size):
  word_freq = np.zeros(vocab_size)
  
  for sentence in sentences:
      for word in sentence:
          word_freq[word] += 1

  # smooth it
  p_neg = word_freq**0.75

  # normalize it
  p_neg = p_neg / p_neg.sum()

  assert(np.all(p_neg > 0))
  return p_neg

In [26]:
def get_context(pos, sentence, window_size):
  start = max(0, pos - window_size)
  end_  = min(len(sentence), pos + window_size)

  context = []
  for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start):
    if ctx_pos != pos:
      context.append(ctx_word_idx)
  return context

In [27]:
def sgd(input_, targets, label, learning_rate, W, V):
  # W[input_] shape: D
  # V[:,targets] shape: D x N
  # activation shape: N
  # print("input_:", input_, "targets:", targets)
  activation = W[input_].dot(V[:,targets])
  prob = sigmoid(activation)

  # gradients
  gV = np.outer(W[input_], prob - label) # D x N
  gW = np.sum((prob - label)*V[:,targets], axis=1) # D

  V[:,targets] -= learning_rate*gV # D x N
  W[input_] -= learning_rate*gW # D

  # return cost (binary cross entropy)
  cost = label * np.log(prob + 1e-10) + (1 - label) * np.log(1 - prob + 1e-10)
  return cost.sum()

In [87]:
def train_model():
  sentences, word2idx = get_wiki_small()
  vocab_size = len(word2idx)
  
  window_size = 5
  learning_rate = 0.025
  final_learning_rate = 0.0001
  num_negatives = 5
  epochs = 20
  D = 5
  
  learning_rate_delta = (learning_rate - final_learning_rate) / epochs
  
  W = np.random.randn(vocab_size, D)
  V = np.random.randn(D, vocab_size)
  
  p_neg = get_negative_sampling_distribution(sentences, vocab_size)
  costs = []
  
  threshold = 1e-5
  p_drop = 1 - np.sqrt(threshold / p_neg)
  
  for epoch in range(epochs):
    np.random.shuffle(sentences)
    cost = 0
    counter = 0
    t0 = datetime.now()
    for sentence in sentences:
      sentence = [w for w in sentence if np.random.random() < (1 - p_drop[w])]
      if len(sentence) < 2:
        continue
      randomly_ordered_positions = np.random.choice(
        len(sentence),
        size=len(sentence),
        replace=False,
      )
      
      for pos in randomly_ordered_positions:
        word = sentence[pos]
        context_words = get_context(pos, sentence, window_size)
        neg_word = np.random.choice(vocab_size, p=p_neg)
        targets = np.array(context_words)
        
        c = sgd(word, targets, 1, learning_rate, W, V)
        cost += c
        c = sgd(neg_word, targets, 0, learning_rate, W, V)
        cost += c
      counter += 1
      if counter % 100 == 0:
        print("processed %s / %s\r" % (counter, len(sentences)))
    
    dt = datetime.now() - t0
    print("epoch complete:", epoch, "cost:", cost, "dt:", dt)

    # save the cost
    costs.append(cost)

    # update the learning rate
    learning_rate -= learning_rate_delta

  # return the model
  return word2idx, W, V

In [61]:
sentences, word2idx = get_wiki_small()
print(sentences, word2idx)

[[19, 3, 19, 19, 19, 19, 19, 19, 12, 19, 12, 19, 19, 19], [3, 10, 8, 12, 16, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 5, 19, 19, 19, 19, 9, 19, 19, 3, 19, 0, 19, 7, 19, 19, 19, 4, 19], [19, 19, 7, 0, 19, 10, 19, 3, 19, 19, 19, 19, 9, 19, 19, 2, 0, 19, 1, 19, 19, 19, 3, 10, 19, 19, 8, 19, 19, 4, 19, 1, 6, 19, 4, 6, 19, 16, 19, 19, 19, 1, 19, 19, 19, 19, 9, 19, 19], [3, 19, 19, 19, 8, 19, 19, 1, 19, 11, 8, 19, 19, 19, 19, 19, 19, 4, 19, 5, 8, 16, 19, 19, 4, 19, 1, 3, 19, 19, 19, 1, 19, 19, 19, 19, 6, 19, 1, 19, 19, 19, 19, 19, 19, 11, 19, 19, 7, 19, 19, 19, 1, 3, 19, 19, 19, 19, 19, 0, 19, 1, 19, 4, 19, 3, 9, 19, 19, 19], [0, 17, 3, 10, 19, 11, 0, 17, 19, 4, 0, 19, 19, 19, 19, 19, 11, 0, 19, 19, 19, 19, 11, 19, 19, 19, 19, 19, 19, 11, 0, 19, 19, 19, 19, 19, 19, 4, 19, 19, 19, 19, 19, 19, 19, 9, 19, 19, 19, 19, 19, 19, 19, 4, 0, 19, 19, 9, 19, 19, 19, 11, 0, 19, 19, 19, 19, 19, 0, 13, 19, 18, 1, 19, 17, 14, 2, 19, 19, 19, 19, 0, 19, 19, 19

In [97]:
np.random.seed(0)
word2idx, W, V = train_model()
print(W)

epoch complete: 0 cost: -15.264472271784413 dt: 0:00:00.001001
epoch complete: 1 cost: -29.614341482538222 dt: 0:00:00.002989
epoch complete: 2 cost: -10.962995073739663 dt: 0:00:00.000961
epoch complete: 3 cost: -10.437147104361781 dt: 0:00:00.000998
epoch complete: 4 cost: -4.768189741953256 dt: 0:00:00.001028
epoch complete: 5 cost: -44.334816238657446 dt: 0:00:00.001997
epoch complete: 6 cost: -1.1587330323231546 dt: 0:00:00.001031
epoch complete: 7 cost: -21.696532680766865 dt: 0:00:00.001996
epoch complete: 8 cost: -63.79827152943896 dt: 0:00:00.002944
epoch complete: 9 cost: -81.19862030117831 dt: 0:00:00.003007
epoch complete: 10 cost: -10.806632059104425 dt: 0:00:00.001005
epoch complete: 11 cost: -13.832729191596421 dt: 0:00:00.000998
epoch complete: 12 cost: -12.76264452732396 dt: 0:00:00.000983
epoch complete: 13 cost: -14.868321482498917 dt: 0:00:00.000996
epoch complete: 14 cost: -7.0466905703664064 dt: 0:00:00.001992
epoch complete: 15 cost: 0 dt: 0:00:00.001025
epoch co

In [44]:
W = np.random.randn(6, 4)
V = np.random.randn(4, 6)
prob = sigmoid(W[3].dot(V[:,[2, 4]]))
print(prob)

gV = np.outer(W[3], prob - 1)
print(gV)

[0.79979108 0.33298967]
[[0.16974153 0.56550604]
 [0.15930054 0.53072116]
 [0.00803347 0.02676408]
 [0.07554102 0.2516703 ]]
