<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
%load_ext autoreload
%load_ext line_profiler
%autoreload 2

from tqdm import tqdm_notebook, tqdm
from scipy.optimize import minimize
from math import exp, log
import numpy as np
from utils import getPairsAndVocs, getNegPairs, costFunction, gradCost, update_voc 
import string
import re

In [3]:
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    reg = re.compile('[%s]' % re.escape(string.punctuation))
    with open(path) as f:
        for l in f:
            l_wo_punct = reg.sub('', l)
            sentences.append( l_wo_punct.lower().split() )
    return sentences

In [4]:
sentences = text2sentences("data/odyssey.txt")

In [5]:
sent = sentences[80]
print(sent)

['the', 'prophesyings', 'with', 'which', 'phoebus', 'apollo', 'had', 'inspired', 'him', 'with']


In [94]:
class mSkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=5, minCount=5):
        self.nEmbed = nEmbed
        self.wc_pairs, self.word_voc, self.context_voc = getPairsAndVocs(sentences, winSize)
        self.neg_wc_pairs = getNegPairs(self.wc_pairs, self.word_voc, self.context_voc, negativeRate)
    
    def train(self, stepsize=0.1, epochs=10, batch_size=32):
        nb_words = len(list(self.word_voc.keys()))
        nb_context = len(list(self.context_voc.keys()))
        
        theta0 = np.zeros(self.nEmbed*(nb_words+nb_context))
        cost_to_optimize = lambda theta, wc_pairs, neg_wc_pairs: costFunction(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_context)
        grad = lambda theta, wc_pairs, neg_wc_pairs: gradCost(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_context)
        
        ####### STOCHASTIC GRADIENT DESCENT #######
        converged = False
        tol = 1e-6
        thetan = theta0
        fn = 1e8
        it = 0
        rho = stepsize
        nb_pairs = len(self.wc_pairs)        
        
        for epoch in range(epochs):
            print("Epoch {}".format(epoch))
            for batch in tqdm_notebook(range(nb_pairs//batch_size)):
                batch_begin = batch*batch_size
                batch_end = (batch+1)*batch_size
                if batch_end > nb_pairs:
                    batch_end = nb_pairs
                batch_pos = self.wc_pairs[batch_begin:batch_end]
                batch_neg = self.neg_wc_pairs[batch_begin:batch_end]
                gradn = grad(thetan, batch_pos, batch_neg)
                thetanp1 = thetan + rho*gradn
                fnp1 = cost_to_optimize(thetanp1, batch_pos, batch_neg)
                
                div = 0 
                while fnp1>fn:
                    rho = rho/2
                    print("rho: {} ->".format(rho), end=" ")
                    thetanp1 = thetan + rho*gradn
                    fnp1 = cost_to_optimize(thetanp1, batch_pos, batch_neg)
                    div += 1
                
                print("{:.2f} ->".format(fnp1), end = " ")
                rho = 2*rho
                thetan = thetanp1
                fn = fnp1
            print("")
        ####### STOCHASTIC GRADIENT DESCENT #######   

        
        ######## GRADIENT DESCENT #######
        #converged = False
        #tol = 1e-6
        #thetan = theta0
        #fn = 1e8
        #rho = stepsize
        #nb_pairs = len(self.wc_pairs)
        #
        #for epoch in tqdm(range(epochs)):
        #    gradn = grad(thetan, self.wc_pairs, self.neg_wc_pairs)
        #    thetanp1 = thetan + rho*gradn
        #    fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #    while fnp1>fn:
        #        rho = rho/2
        #        print("rho: {} ->".format(rho), end=" ")
        #        thetanp1 = thetan + rho*gradn
        #        fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #        
        #    print("{:.2f} ->".format(fnp1), end = " ")
        #    rho = 2*rho
        #    thetan = thetanp1
        #    fn = fnp1
        #    print("")
        ######## GRADIENT DESCENT #######
        
        return (fn, thetan, converged)

my_sg = mSkipGram(sentences)

In [96]:
my_sg.train(stepsize=1, batch_size=100000)

Epoch 0


A Jupyter Widget




Exception in thread Thread-11:
Traceback (most recent call last):
  File "/Users/sidou/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/sidou/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/sidou/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



138629.44 -> 138629.44 -> 


KeyboardInterrupt: 

In [20]:
nb_words = len(list(my_sg.word_voc.keys()))
nb_context = len(list(my_sg.context_voc.keys()))
theta0 = np.ones((nb_words+nb_context)*my_sg.nEmbed)*0.1

In [21]:
%lprun -f costFunction costFunction(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel cost function


In [27]:
%lprun -f gradCost gradCost(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel gradient
Début wc pairs
Début neg wc pairs


In [42]:
theta0 = np.concatenate([np.ones(10), np.ones(20)*2])

In [43]:
theta0

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.])

In [60]:
nEmbed = 2
nb_words = 10
nb_contexts = 5
W = theta0[:nEmbed*nb_words].reshape(nb_words, nEmbed)
C = theta0[nEmbed*nb_words:].reshape(nb_contexts, nEmbed)

In [71]:
%timeit sum(W[0,:]*(C[0,:]))

4.16 µs ± 86.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [72]:
%timeit W[0,:].dot((C[0,:]))

1.61 µs ± 13.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [75]:
W

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.]])

In [74]:
C[0,:].shape

(2,)