<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
%load_ext autoreload
%load_ext line_profiler
%autoreload 2

from tqdm import tqdm_notebook, tqdm
from scipy.optimize import minimize
from math import exp, log
import numpy as np
from utils import getPairsAndVocs, getNegPairs, costFunction, gradCost, update_voc, remove_rare_words
import string
import re
from nltk.probability import FreqDist

In [2]:
def text2sentences(path):
    # feel free to make a better tokenization/pre-processing
    sentences = []
    reg = re.compile('[%s]' % re.escape(string.punctuation))
    with open(path) as f:
        for l in f:
            l_wo_punct = reg.sub('', l)
            sentences.append( l_wo_punct.lower().split() )
    np.random.shuffle(sentences)
    return sentences

In [3]:
sentences = text2sentences("data/sentences.txt")[:10000]

In [4]:
sent = sentences[80]
print(sent)

['a', 'man', 'with', 'a', 'brown', 'suit', 'jacket', 'playing', 'the', 'piano', 'and', 'singing', 'into', 'a', 'microphone']


In [5]:
sentences_w_min_count = remove_rare_words(sentences, 5)

In [6]:
class mSkipGram:
    def __init__(self, sentences, nEmbed=100, negativeRate=5, winSize=3, minCount=5):
        sentences_w_min_count = remove_rare_words(sentences, minCount)
        self.nEmbed = nEmbed
        self.negativeRate = negativeRate
        self.wc_pairs, self.word_voc, self.context_voc = getPairsAndVocs(sentences_w_min_count, winSize)
        self.neg_wc_pairs = getNegPairs(self.wc_pairs, self.word_voc, self.context_voc, negativeRate)
        
        
    def train(self, stepsize=0.1, epochs=10, batch_size=1000):
        nb_words = len(list(self.word_voc.keys()))
        nb_contexts = len(list(self.context_voc.keys()))
        
        theta0 = np.random.random(self.nEmbed*(nb_words+nb_contexts))*1e-5
        cost_to_optimize = lambda theta, wc_pairs, neg_wc_pairs: costFunction(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts)
        grad = lambda theta, wc_pairs, neg_wc_pairs: gradCost(theta, self.nEmbed, wc_pairs, neg_wc_pairs, nb_words, nb_contexts)
        
        ####### STOCHASTIC GRADIENT DESCENT #######
        thetan = theta0
        fn = cost_to_optimize(thetan, self.wc_pairs, self.neg_wc_pairs)
        print(fn)
        rho = stepsize
        nb_pairs = len(self.wc_pairs)        
        
        for epoch in range(epochs):
            print("Epoch {}".format(epoch))
            for batch in tqdm_notebook(range(nb_pairs//batch_size)):
                batch_begin = batch*batch_size
                batch_end = (batch+1)*batch_size
                if batch_end > nb_pairs:
                    batch_end = nb_pairs
                batch_pos = self.wc_pairs[batch_begin:batch_end]
                batch_neg = self.neg_wc_pairs[self.negativeRate*batch_begin:self.negativeRate*batch_end]
                gradn = grad(thetan, batch_pos, batch_neg)
                thetanp1 = thetan + rho*gradn
                fnp1 = cost_to_optimize(thetanp1, batch_pos, batch_neg)
                thetan = thetanp1
                fn = fnp1
            print(cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs))
        ####### STOCHASTIC GRADIENT DESCENT #######   

        
        ######## GRADIENT DESCENT #######
        #converged = False
        #tol = 1e-6
        #thetan = theta0
        #fn = 1e8
        #rho = stepsize
        #nb_pairs = len(self.wc_pairs)
        #
        #for epoch in range(epochs):
        #    gradn = grad(thetan, self.wc_pairs, self.neg_wc_pairs)
        #    thetanp1 = thetan + rho*gradn
        #    fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #    while fnp1>fn:
        #        rho = rho/2
        #        print("rho: {} ->".format(rho), end=" ")
        #        thetanp1 = thetan + rho*gradn
        #        fnp1 = cost_to_optimize(thetanp1, self.wc_pairs, self.neg_wc_pairs)
        #    print("{:.2f} ->".format(fnp1), end = " ")
        #    rho = 2*rho
        #    thetan = thetanp1
        #    fn = fnp1
        #    print(fn)
        ######## GRADIENT DESCENT #######
        return (fn, thetan)

        def similarity(self,word1,word2):
        """
            computes similiarity between the two words. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float \in [0,1] indicating the similarity (the higher the more similar)
        """
        
        

In [7]:
my_sg = mSkipGram(sentences)

In [9]:
fn, thetan = my_sg.train(epochs=10, stepsize=0.01, batch_size=1024)

2350891.8433662285
Epoch 0


A Jupyter Widget




Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/sidou/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/sidou/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/sidou/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




1557273.6878983402
Epoch 1


A Jupyter Widget


1528558.3878623878
Epoch 2


A Jupyter Widget


1508099.2717592772
Epoch 3


A Jupyter Widget


1493312.63108951
Epoch 4


A Jupyter Widget


1473344.8169790925
Epoch 5


A Jupyter Widget


1442205.4144268006
Epoch 6


A Jupyter Widget


1422952.9978260822
Epoch 7


A Jupyter Widget


1428493.4561192438
Epoch 8


A Jupyter Widget


1430061.5928403372
Epoch 9


A Jupyter Widget


1428589.6245089094


In [10]:
nb_words = len(my_sg.word_voc)
W = thetan[:100*nb_words].reshape(nb_words, 100)

In [11]:
inv_word_voc = {}
for key in my_sg.word_voc.keys():
    value = my_sg.word_voc[key]
    inv_word_voc[value] = key

In [31]:
def most_similar(W, word, word_voc, inv_word_voc):
    idx_word = word_voc[word]
    nb_words = len(word_voc)
    similarities = []
    for candidate in range(nb_words):
        if candidate != idx_word:
            cosine = W[idx_word].dot(W[candidate])/(np.linalg.norm(W[idx_word])*np.linalg.norm(W[candidate]))
            similarities.append((dot, inv_word_voc[candidate]))
    return sorted(similarities, reverse=True)

In [47]:
most_similar(W, "cat", my_sg.word_voc, inv_word_voc)

[(0.93754775842032867, 'log'),
 (0.93587884345069028, 'bicyclist'),
 (0.9297815071162, 'repairs'),
 (0.92347075821180602, 'wire'),
 (0.92345148468256111, 'tattoo'),
 (0.92235335830855669, 'calf'),
 (0.92089621424714152, 'racer'),
 (0.92071087449004463, 'terrier'),
 (0.9199611796957079, 'seat'),
 (0.91926640383416203, 'cyclist'),
 (0.91739163289833614, 'gun'),
 (0.91694493505489705, 'balcony'),
 (0.9157905241412988, 'bull'),
 (0.91553572941164241, 'policeman'),
 (0.91435804339478988, 'lift'),
 (0.91123520524703039, 'bottle'),
 (0.90987357864244667, 'towel'),
 (0.90726481455226915, 'puts'),
 (0.90645518274550951, 'jockey'),
 (0.90518511944602076, 'stomach'),
 (0.90398515646645827, 'canvas'),
 (0.90129704686005974, 'tractor'),
 (0.90096284940616267, 'bow'),
 (0.89876208445127015, 'possibly'),
 (0.89642714244314081, 'frame'),
 (0.89590496520172413, 'pen'),
 (0.89559732090291488, 'ladder'),
 (0.89441628548343732, 'harness'),
 (0.89212311947959289, 'climber'),
 (0.89104161855331521, 'nap'),


In [18]:
my_sg.word_voc

{'two': 0,
 'people': 1,
 'on': 2,
 'a': 3,
 'group': 4,
 'of': 5,
 'chinese': 6,
 'tourists': 7,
 'scenery': 8,
 'off': 9,
 'bridge': 10,
 'there': 11,
 'are': 12,
 'four': 13,
 'walking': 14,
 'down': 15,
 'the': 16,
 'street': 17,
 'in': 18,
 'front': 19,
 'yellow': 20,
 'building': 21,
 'man': 22,
 'helping': 23,
 'an': 24,
 'older': 25,
 'woman': 26,
 'to': 27,
 'top': 28,
 'sand': 29,
 'dune': 30,
 'who': 31,
 'is': 32,
 'wearing': 33,
 'performing': 34,
 'with': 35,
 'tub': 36,
 'while': 37,
 'watches': 38,
 'male': 39,
 'ski': 40,
 'dressed': 41,
 'black': 42,
 'sits': 43,
 'under': 44,
 'lift': 45,
 'younger': 46,
 'child': 47,
 'gray': 48,
 'orange': 49,
 'and': 50,
 'white': 51,
 'jacket': 52,
 'stand': 53,
 'next': 54,
 'him': 55,
 'skateboarder': 56,
 'at': 57,
 'park': 58,
 'enjoying': 59,
 'day': 60,
 'dogs': 61,
 'running': 62,
 'grass': 63,
 'person': 64,
 'blue': 65,
 'shirt': 66,
 'pants': 67,
 'sitting': 68,
 'bench': 69,
 'outside': 70,
 'young': 71,
 'does': 72,
 

In [154]:
nwc = my_sg.neg_wc_pairs

In [20]:
nb_words = len(list(my_sg.word_voc.keys()))
nb_context = len(list(my_sg.context_voc.keys()))
theta0 = np.ones((nb_words+nb_context)*my_sg.nEmbed)*0.1

In [21]:
%lprun -f costFunction costFunction(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel cost function


In [27]:
%lprun -f gradCost gradCost(theta0, my_sg.nEmbed, my_sg.wc_pairs, my_sg.neg_wc_pairs, nb_words, nb_context)

Appel gradient
Début wc pairs
Début neg wc pairs


In [42]:
theta0 = np.concatenate([np.ones(10), np.ones(20)*2])

In [43]:
theta0

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.])

In [60]:
nEmbed = 2
nb_words = 10
nb_contexts = 5
W = theta0[:nEmbed*nb_words].reshape(nb_words, nEmbed)
C = theta0[nEmbed*nb_words:].reshape(nb_contexts, nEmbed)

In [71]:
%timeit sum(W[0,:]*(C[0,:]))

4.16 µs ± 86.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [72]:
%timeit W[0,:].dot((C[0,:]))

1.61 µs ± 13.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [75]:
W

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.],
       [ 2.,  2.]])

In [74]:
C[0,:].shape

(2,)