<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from scipy.optimize import minimize
from math import exp, log
import numpy as np
import string
import re

In [2]:
%load_ext line_profiler

In [3]:
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    reg = re.compile('[%s]' % re.escape(string.punctuation))
    with open(path) as f:
        for l in f:
            l_wo_punct = reg.sub('', l)
            sentences.append( l_wo_punct.lower().split() )
    return sentences

In [4]:
sentences = text2sentences("data/odyssey.txt")

In [5]:
sent = sentences[80]
print(sent)

['the', 'prophesyings', 'with', 'which', 'phoebus', 'apollo', 'had', 'inspired', 'him', 'with']


In [6]:
def update_voc(voc_dict, word, index):
    # (word: index)
    if word not in voc_dict:
        voc_dict[word] = index
        index += 1
    return voc_dict, index

In [51]:
def getPairsAndVocs(sentences, winSize):
    wc_pairs = []
    word_voc = {}
    word_index = 0
    context_voc = {}
    context_index = 0
    
    for sent in sentences:
        for i in range(len(sent)):
            word = sent[i]
            # Update word_voc dict
            word_voc, word_index = update_voc(word_voc, word, word_index)
            for j in range(max(0,i-winSize), min(i+winSize+1, len(sent))):
                if i != j:
                    context = sent[j]
                    context_voc, context_index = update_voc(context_voc, context, context_index)
                    wc_pairs.append((word_voc[word], context_voc[context]))
                    
    return wc_pairs, word_voc, context_voc

def getNegPairs(wc_pairs, word_voc, context_voc):
    word_voc_list = list(word_voc.keys())
    context_voc_list = list(context_voc.keys())
    
    nb_words = len(word_voc_list)
    nb_contexts = len(context_voc_list)
    
    nb_pairs = len(wc_pairs)
    neg_wc_pairs = []

    for _ in range(nb_pairs):
        ind1 = np.random.randint(0,nb_words)
        ind2 = np.random.randint(0,nb_contexts)
        neg_wc_pairs.append((ind1, ind2))
    return neg_wc_pairs
    
def costFunction(theta, nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts):
    print("Appel cost function")
    W = theta[:nEmbed*nb_words].reshape(nEmbed, nb_words)
    C = theta[nEmbed*nb_words:].reshape(nEmbed, nb_contexts)
    
    S = W.transpose().dot(C)
    
    wc_cost = sum([-log(1+exp(-S[wc_pair])) for wc_pair in wc_pairs])
    
    neg_wc_cost = 0
    for neg_wc_pair in neg_wc_pairs:
        #print(1+exp(S[neg_wc_pair]))
        neg_wc_cost += -log(1+exp(S[neg_wc_pair]))
    
    #neg_wc_cost = sum([log(exp(-S[neg_wc_pair])/(1+exp(-S[neg_wc_pair]))) for neg_wc_pair in neg_wc_pairs])
    
    return - wc_cost - neg_wc_cost

def gradCost(theta, nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts):
    print("Appel gradient")
    grad = np.zeros(theta.shape[0])
    W = theta[:nEmbed*nb_words].reshape(nEmbed, nb_words)
    C = theta[nEmbed*nb_words:].reshape(nEmbed, nb_contexts)
    
    S = W.transpose().dot(C)
    
    print("Début wc pairs")
    c = 0
    for wc_pair in wc_pairs:
        c+=1
        word_idx = wc_pair[0]
        context_idx = wc_pair[1]
        
        exp_mwdotc = exp(-S[wc_pair])
        
        # Update the derivative for word and context
        df_dw = C[:,context_idx]*exp_mwdotc/(1+exp_mwdotc)
        grad[word_idx*nEmbed:(word_idx+1)*nEmbed] += df_dw
        df_dc = W[:,word_idx]*exp_mwdotc/(1+exp_mwdotc)
        grad[context_idx*nEmbed:(context_idx+1)*nEmbed] += df_dc
    
    print("Début neg wc pairs")
    for neg_wc_pair in neg_wc_pairs:
        word_idx = neg_wc_pair[0]
        context_idx = neg_wc_pair[1]
        
        exp_wdotc = exp(S[neg_wc_pair])
        
        # Update the derivative for word and context
        df_dw = -C[:,context_idx]*exp_wdotc/(1+exp_wdotc)
        grad[word_idx*nEmbed:(word_idx+1)*nEmbed] += df_dw
        
        df_dc = -W[:,word_idx]*exp_wdotc/(1+exp_wdotc)
        grad[context_idx*nEmbed:(context_idx+1)*nEmbed] += df_dc
    
    return grad

In [56]:
class mSkipGram:
    
    def __init__(self,sentences, nEmbed=50, negativeRate=5, winSize=1, minCount=5):
        self.nEmbed = nEmbed
        self.wc_pairs, self.word_voc, self.context_voc = getPairsAndVocs(sentences, winSize)
        self.neg_wc_pairs = getNegPairs(self.wc_pairs, self.word_voc, self.context_voc)
    
    def train(self):
        nb_words = len(list(self.word_voc.keys()))
        nb_context = len(list(self.context_voc.keys()))
        
        theta0 = np.ones((nb_words+nb_context)*self.nEmbed)*0.01
        cost_to_optimize = lambda theta: costFunction(theta, self.nEmbed, self.wc_pairs, self.neg_wc_pairs, nb_words, nb_context)
        grad = lambda theta: gradCost(theta, self.nEmbed, self.wc_pairs, self.neg_wc_pairs, nb_words, nb_context)
        
        ####### GRADIENT DESCENT #######
        converged = False
        itermax = 1000
        tol = 1e-6
        thetan = theta0
        fn = cost_to_optimize(thetan)
        print(fn)
        rho = 0.01
        it = 0
        
        while not converged and it < itermax:
            it+=1
            gradn = grad(thetan)
            thetanp1 = thetan + rho*gradn
            fnp1 = cost_to_optimize(thetanp1)
            print(fnp1)
            thetan = thetanp1
            fn = fnp1
        ####### GRADIENT DESCENT #######   
        
        return (fn, thetan, converged)

my_sg = mSkipGram(sentences)

In [57]:
res = my_sg.train()

Appel cost function
389117.945933477
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388928.0479586347
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388735.33474611817
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388551.63089107396
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388408.2740327246
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388341.97652589553
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388364.4820358639
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388467.03936034505
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388643.9981963306
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
388897.22698573937
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
389232.1432096518
Appel gradient
Début wc pairs
Début neg wc pairs
Appel cost function
389654.91685382306


KeyboardInterrupt: 

In [58]:
len(my_sg.wc_pairs)

280688

In [20]:
nb_words = len(list(my_sg.word_voc.keys()))
nb_context = len(list(my_sg.context_voc.keys()))
theta0 = np.ones((nb_words+nb_context)*my_sg.nEmbed)*0.1

In [21]:
%lprun -f costFunction costFunction(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel cost function


In [27]:
%lprun -f gradCost gradCost(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel gradient
Début wc pairs
Début neg wc pairs
