<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
%load_ext autoreload
%load_ext line_profiler
%autoreload 2

from tqdm import tqdm_notebook, tqdm
from scipy.optimize import minimize
from math import exp, log
import numpy as np
from utils import getPairsAndVocs, getNegPairs, costFunction, gradCost, update_voc, remove_rare_words
import string
import re
from nltk.probability import FreqDist

In [2]:
def text2sentences(path):
    """Read a text file into a shuffled list of tokenized sentences.

    Every line of the file becomes one "sentence": characters listed in
    ``string.punctuation`` are removed, the text is lower-cased and then
    split on whitespace. Blank lines therefore yield empty token lists.

    Args:
        path: path of the text file to read.

    Returns:
        A list of token lists, one per input line, in random order
        (shuffled in place with ``np.random.shuffle``).
    """
    # feel free to make a better tokenization/pre-processing
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    with open(path) as handle:
        tokenized = [
            punct_pattern.sub('', line).lower().split()
            for line in handle
        ]
    # Randomise sentence order so downstream batches are not in file order.
    np.random.shuffle(tokenized)
    return tokenized

In [119]:
# Load the corpus as a shuffled list of lower-cased, punctuation-free token lists.
sentences = text2sentences("data/odyssey.txt")

In [120]:
# Spot-check one tokenized sentence. Index 80 is arbitrary, and because
# text2sentences shuffles without a fixed seed the sentence shown differs per run.
sent = sentences[80]
print(sent)

['yield', 'even', 'to', 'achilles', 'in', 'handtohand', 'fight', 'and', 'in', 'fleetness', 'of']


In [121]:
# NOTE(review): mSkipGram.__init__ calls remove_rare_words itself (with minCount),
# and this variable is not used again in the visible cells — possibly a leftover experiment.
sentences_w_min_count = remove_rare_words(sentences, 5)

In [122]:
class mSkipGram:
    """Skip-gram model with negative sampling, trained by mini-batch updates.

    Pair extraction, negative sampling, the cost and its gradient are all
    delegated to helpers imported from ``utils``; this class wires them
    together and runs the optimisation loop.

    Attributes:
        nEmbed: embedding dimension.
        negativeRate: value passed to ``getNegPairs``; ``train`` assumes
            ``neg_wc_pairs`` holds ``negativeRate`` entries per positive pair,
            laid out in the same order as ``wc_pairs``.
        wc_pairs, word_voc, context_voc: as returned by ``getPairsAndVocs``.
        neg_wc_pairs: as returned by ``getNegPairs``.
    """

    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=3, minCount=5):
        """Prepare the training pairs and vocabularies from ``sentences``.

        Args:
            sentences: list of token lists.
            nEmbed: embedding dimension.
            negativeRate: forwarded to ``getNegPairs``.
            winSize: forwarded to ``getPairsAndVocs``.
            minCount: forwarded to ``remove_rare_words``.
        """
        sentences_w_min_count = remove_rare_words(sentences, minCount)
        self.nEmbed = nEmbed
        self.negativeRate = negativeRate
        self.wc_pairs, self.word_voc, self.context_voc = getPairsAndVocs(sentences_w_min_count, winSize)
        self.neg_wc_pairs = getNegPairs(self.wc_pairs, self.word_voc, self.context_voc, negativeRate)

    @staticmethod
    def generate_neg_pair(word, negativeRate, wc_pairs, nb_pairs):
        """Draw ``negativeRate`` negative pairs for ``word``.

        Each negative pair couples ``word`` with the second element of an
        entry drawn uniformly among the first ``nb_pairs`` items of
        ``wc_pairs``.

        Declared as a static method because it uses no instance state; without
        the decorator, calling it on an instance would bind the instance to
        ``word`` and fail.

        Returns:
            A list of ``negativeRate`` ``(word, context)`` tuples.
        """
        return [
            (word, wc_pairs[np.random.randint(nb_pairs)][1])
            for _ in range(negativeRate)
        ]

    def train(self, stepsize=0.1, epochs=10, batch_size=1000):
        """Run mini-batch updates over all positive/negative pairs.

        Args:
            stepsize: constant step size applied to every update.
            epochs: number of full passes over ``wc_pairs``.
            batch_size: number of positive pairs per mini-batch; each batch
                also takes the matching ``negativeRate * batch_size`` slice of
                ``neg_wc_pairs``.

        Returns:
            ``(fn, thetan)`` where ``thetan`` is the flat parameter vector of
            length ``nEmbed * (nb_words + nb_contexts)`` and ``fn`` is the cost
            of the last mini-batch evaluated at the updated parameters.
            ``fn`` is ``None`` when no update was performed (``epochs == 0``
            or no training pairs).

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

        nb_words = len(self.word_voc)
        nb_contexts = len(self.context_voc)

        # Small random initialisation of the flat parameter vector.
        theta0 = np.random.random(self.nEmbed*(nb_words+nb_contexts))*1e-5

        def cost_to_optimize(theta, wc_pairs, neg_wc_pairs):
            return costFunction(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts)

        def grad(theta, wc_pairs, neg_wc_pairs):
            return gradCost(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts)

        ####### STOCHASTIC GRADIENT DESCENT #######
        thetan = theta0
        # Stays None if the loops below never execute, so the return is always defined.
        fn = None
        #fn = cost_to_optimize(thetan, self.wc_pairs, self.neg_wc_pairs)
        #print(fn)
        rho = stepsize
        nb_pairs = len(self.wc_pairs)
        # Ceiling division: the trailing partial batch is trained on as well
        # (floor division silently dropped it).
        nb_batches = (nb_pairs + batch_size - 1)//batch_size

        for epoch in range(epochs):
            print("Epoch {}".format(epoch+1))
            for batch in tqdm_notebook(range(nb_batches)):
                batch_begin = batch*batch_size
                # Clamp so the last batch stops at the end of the data.
                batch_end = min(batch_begin + batch_size, nb_pairs)
                batch_pos = self.wc_pairs[batch_begin:batch_end]
                batch_neg = self.neg_wc_pairs[self.negativeRate*batch_begin:self.negativeRate*batch_end]
                gradn = grad(thetan, batch_pos, batch_neg)
                # NOTE(review): the step is *added*, so gradCost is presumably
                # expected to return a cost-decreasing direction — confirm in utils.
                thetan = thetan + rho*gradn
                fn = cost_to_optimize(thetan, batch_pos, batch_neg)
            #print(cost_to_optimize(thetan, self.wc_pairs, self.neg_wc_pairs))
        ####### STOCHASTIC GRADIENT DESCENT #######


        ######## GRADIENT DESCENT (disabled full-batch alternative with step halving) #######
        #converged = False
        #tol = 1e-6
        #thetan = theta0
        #fn = 1e8
        #rho = stepsize
        #nb_pairs = len(self.wc_pairs)
        #
        #for epoch in range(epochs):
        #    gradn = grad(thetan, self.wc_pairs, self.neg_wc_pairs)
        #    thetanp1 = thetan + rho*gradn
        #    fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #    while fnp1>fn:
        #        rho = rho/2
        #        print("rho: {} ->".format(rho), end=" ")
        #        thetanp1 = thetan + rho*gradn
        #        fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #    print("{:.2f} ->".format(fnp1), end = " ")
        #    rho = 2*rho
        #    thetan = thetanp1
        #    fn = fnp1
        #    print(fn)
        ######## GRADIENT DESCENT #######
        return (fn, thetan)
        
        

In [123]:
# Build the model with the constructor defaults (nEmbed=100, negativeRate=5, winSize=3, minCount=5).
my_sg = mSkipGram(sentences)

In [124]:
# Train for 5 epochs; fn is the cost of the last mini-batch and thetan the flat parameter vector.
fn, thetan = my_sg.train(epochs=5, stepsize=0.02, batch_size=512)

Epoch 1


A Jupyter Widget


Epoch 2


A Jupyter Widget


Epoch 3


A Jupyter Widget


Epoch 4


A Jupyter Widget


Epoch 5


A Jupyter Widget




In [125]:
nb_words = len(my_sg.word_voc)
# Use the model's own embedding size instead of a hard-coded 100 so this cell
# stays correct when mSkipGram is built with a different nEmbed.
# NOTE(review): assumes the word embeddings occupy the first nEmbed*nb_words
# entries of theta, one row per word — confirm against utils.costFunction.
W = thetan[:my_sg.nEmbed*nb_words].reshape(nb_words, my_sg.nEmbed)

In [126]:
# Reverse lookup: vocabulary index -> word.
inv_word_voc = {index: token for token, index in my_sg.word_voc.items()}

In [127]:
def most_similar(W, word, word_voc, inv_word_voc):
    """Rank all other vocabulary words by cosine similarity to ``word``.

    Args:
        W: 2-D array whose row ``i`` is the embedding of the word with
            vocabulary index ``i``; only the first ``len(word_voc)`` rows
            are used.
        word: query word.
        word_voc: mapping word -> row index in ``W``.
        inv_word_voc: mapping row index -> word.

    Returns:
        A list of ``(cosine, word)`` tuples for every word except the query,
        sorted in descending order. A pair involving a zero-norm vector gets
        similarity 0.0 rather than NaN, so the ordering is always well defined.

    Raises:
        KeyError: if ``word`` is not in ``word_voc``.
    """
    idx_word = word_voc[word]
    nb_words = len(word_voc)

    vectors = np.asarray(W)[:nb_words]
    # One pass over the matrix instead of recomputing norms per candidate.
    norms = np.linalg.norm(vectors, axis=1)
    dots = vectors.dot(vectors[idx_word])
    denom = norms * norms[idx_word]

    # Entries with a zero denominator keep the initial 0.0 (no division by zero).
    cosines = np.zeros(dots.shape, dtype=float)
    np.divide(dots, denom, out=cosines, where=denom > 0)

    similarities = [
        (cosines[candidate], inv_word_voc[candidate])
        for candidate in range(nb_words)
        if candidate != idx_word
    ]
    return sorted(similarities, reverse=True)

In [128]:
# Rank every other vocabulary word by cosine similarity to "cat".
# Raises KeyError when the query word is not in my_sg.word_voc (as the output below shows).
most_similar(W, "cat", my_sg.word_voc, inv_word_voc)

KeyError: 'cat'

In [113]:
# Display the word -> index vocabulary held by the model.
# NOTE(review): this cell's execution count (113) precedes the cells above, and the
# displayed words may come from an earlier run on a different corpus — re-run to confirm.
my_sg.word_voc

{'two': 0,
 'people': 1,
 'on': 2,
 'a': 3,
 'group': 4,
 'of': 5,
 'chinese': 6,
 'tourists': 7,
 'scenery': 8,
 'off': 9,
 'bridge': 10,
 'there': 11,
 'are': 12,
 'four': 13,
 'walking': 14,
 'down': 15,
 'the': 16,
 'street': 17,
 'in': 18,
 'front': 19,
 'yellow': 20,
 'building': 21,
 'man': 22,
 'helping': 23,
 'an': 24,
 'older': 25,
 'woman': 26,
 'to': 27,
 'top': 28,
 'sand': 29,
 'dune': 30,
 'who': 31,
 'is': 32,
 'wearing': 33,
 'performing': 34,
 'with': 35,
 'tub': 36,
 'while': 37,
 'watches': 38,
 'male': 39,
 'ski': 40,
 'dressed': 41,
 'black': 42,
 'sits': 43,
 'under': 44,
 'lift': 45,
 'younger': 46,
 'child': 47,
 'gray': 48,
 'orange': 49,
 'and': 50,
 'white': 51,
 'jacket': 52,
 'stand': 53,
 'next': 54,
 'him': 55,
 'skateboarder': 56,
 'at': 57,
 'park': 58,
 'enjoying': 59,
 'day': 60,
 'dogs': 61,
 'running': 62,
 'grass': 63,
 'person': 64,
 'blue': 65,
 'shirt': 66,
 'pants': 67,
 'sitting': 68,
 'bench': 69,
 'outside': 70,
 'young': 71,
 'does': 72,
 