In [1]:
import numpy as np
from numpy import newaxis
import time
import matplotlib.pyplot as plt

% matplotlib inline

### Helper functions:

In [2]:
def ohe(X,idx2word):
    ncol = len(idx2word.keys())
    nrow = len(X)
    OHE_X = np.zeros((nrow,ncol))
    for r in range(len(X)):
        if not isinstance(X[r],list):
            OHE_X[r,X[r]] = 1
        else:
            row_val = X[r]
            for c in row_val:
                OHE_X[r,c] = 1
                
    return OHE_X
        

    
def tokenize(x_list):
    #unique tokens:
    unique_x = list(set([j for i in data for j in i]))
    idx2word = dict(enumerate(unique_x))
    word2idx = {i[1]:i[0] for i in idx2word.items()}
    # Encode:
    tokened_x_list = []
    for sentence in x_list:
        temp_sent = []
        for word in sentence:
            token = word2idx.get(word,-1)
            temp_sent.append(token)
        
        tokened_x_list.append(temp_sent)
    return tokened_x_list,idx2word,word2idx
            

    
def skipgram_prep(x_list,context_window=2):
    """
    Use Skipgram method to prepare the data.
    
    Arguments:
        x_list(list): tokenized training data
        
        context_window: the context window on each side. 
        For example, if context_window=2, we will be looking at 2 tokens on the left and 
        2 tokens on the right
    
    Returns:
        processd_data(list):  a list of tuples represents the processed data. Each pair of tuple is a (x,y) pair
        
    """
    processed_data = []
    
    for row in x_list:
        row_len = len(row)
        for i in range(row_len):
            x = row[i]
            start_idx = max(i-context_window,0)
            end_idx = min(row_len,i+context_window+1)
            y = row[start_idx:i] + row[i+1:end_idx] # skip the self
            
            
            temp_xy_pair = zip([x]*len(y),y)
            processed_data.extend(temp_xy_pair)
    
    return processed_data
            
            

        
def softmax(x):
    e_x = np.exp(x)
    return np.divide(e_x,e_x.sum(axis=1).reshape(-1,1))
            
            

def sigmoid(x):
    
    return 1/(1+np.exp(-x))
    
            
    
    

In [3]:

data = [
    'apple and banana are delicious food',
    'play video game in game studio',
    'i have orc warrior in world of warcraft',
    'super mario is the best video game',
    'do you prefer xbox or ps4 or nintentdo switch',
    'which game company is better ubisoft or blizzard',
    'play game on ps4 or xbox',
    'video game is exciting for geeks',
    'warcraft or starcraft or overwatch is best game',
    'is warcraft and world of warcraft the same game',
    'i prefer desktop game console to either xbox or ps4',
    
    
    'food for lunch is fruit apple banana icecream',
    'icecream is too sweet for a meal',
    'dove chocolate is my favorite',
    'which is sweet chocolate or icecream',
    'chocolate or banana or icecream is the most delicious food',
    'go to buy a banana bread for my lunch',
    'banana apple smoothie is the best for lunch or dinner',
    'what to eat for dinner banana or chocolate',
    'banana is less sweet icecream is more sweet',
    'chocolate icecream taste more delicious than banana',
    'chicken sandwich is different from chicken bread',
    
]

data = [i.split(" ") for i in data]


#### Tokenization:

In [4]:
tokenized_data_list,idx2word,word2idx = tokenize(data)

In [5]:
print(idx2word)

{0: 'chocolate', 1: 'delicious', 2: 'the', 3: 'ubisoft', 4: 'are', 5: 'have', 6: 'studio', 7: 'geeks', 8: 'to', 9: 'do', 10: 'bread', 11: 'dinner', 12: 'you', 13: 'nintentdo', 14: 'buy', 15: 'warrior', 16: 'best', 17: 'my', 18: 'super', 19: 'blizzard', 20: 'fruit', 21: 'from', 22: 'xbox', 23: 'on', 24: 'eat', 25: 'go', 26: 'either', 27: 'a', 28: 'console', 29: 'more', 30: 'which', 31: 'better', 32: 'orc', 33: 'same', 34: 'i', 35: 'warcraft', 36: 'favorite', 37: 'play', 38: 'or', 39: 'most', 40: 'game', 41: 'in', 42: 'exciting', 43: 'than', 44: 'chicken', 45: 'sandwich', 46: 'is', 47: 'desktop', 48: 'ps4', 49: 'for', 50: 'too', 51: 'less', 52: 'taste', 53: 'different', 54: 'apple', 55: 'world', 56: 'video', 57: 'food', 58: 'icecream', 59: 'and', 60: 'overwatch', 61: 'meal', 62: 'banana', 63: 'what', 64: 'starcraft', 65: 'sweet', 66: 'prefer', 67: 'switch', 68: 'lunch', 69: 'dove', 70: 'smoothie', 71: 'of', 72: 'company', 73: 'mario'}


#### Use Skipgram to Prepare the Training Data:

In [6]:
prep_data = skipgram_prep(tokenized_data_list,context_window=6)

In [7]:
X = [i[0] for i in prep_data]
Y = [i[1] for i in prep_data]

#### OHE:

In [8]:
ohe_X = ohe(X,idx2word)
ohe_Y = ohe(Y,idx2word)

#### Vanilla Word2vec Model:

First, let's build a naive Word2vec model, means we're gonna use softmax across all vocabularies.

In [9]:

LEARNING_RATE = 0.01
N_VOCAB = len(idx2word)
N_DIM = 10
BATCH_SIZE = len(ohe_X)
N_ITER = 100

# Weights Initialization:
target_embedding_mat = np.random.normal(size=(N_VOCAB,N_DIM)) 
context_embedding_mat = np.random.normal(size=(N_DIM,N_VOCAB))

In [10]:
start_time = time.time()
all_loss = []
for i in range(N_ITER):
    

    # forward pass:
    input_x = ohe_X
    input_y = ohe_Y
    
    target_embedding_layer = input_x.dot(target_embedding_mat)# target
    context_embedding_layer = target_embedding_layer.dot(context_embedding_mat)
    output_layer = softmax(context_embedding_layer)

    # cross entropy loss:
    loss = -np.sum(input_y*np.log(output_layer+1e-9))/BATCH_SIZE # adding smooth term
    if i%1000==0:
        print("Loss: {}".format(loss))
    all_loss.append(loss)
#     print('---')
    


    # Backward Pass
    

    # d_loss/d_dense_layer = d_loss/d_op_layer * d_op_layer/d_dense_layer
    d_context_embedding_layer = output_layer - input_y
#     print(d_context_embedding_layer.shape)

    # d_loss/d_dense_w = d_loss/d_dense_layer * d_dense_layer/d_dense_w
    d_context_embedding_mat =  d_context_embedding_layer.T.dot(target_embedding_layer)
#     print(d_context_embedding_mat.shape)

    # d_loss/x_embedding_layer = d_loss/d_dense_layer * d_dense_layer/x_embedding_layer
    d_target_embedding_layer =  d_context_embedding_layer.dot(context_embedding_mat.T)
#     print(d_emb_layer.shape)
    # d_loss/d_embedding_mat = d_loss/x_embedding_layer * x_embedding_layer/d_embedding_mat
    d_target_embedding_mat = d_target_embedding_layer.T.dot(input_x)
#     print(d_embedding_mat.shape)
#     print('~')


    
    target_embedding_mat -= LEARNING_RATE*d_target_embedding_mat.T
    context_embedding_mat -= LEARNING_RATE*d_context_embedding_mat.T

 
print("{} seconds elapsed".format(time.time()-start_time))

Loss: 8.486907978157337
0.2025301456451416 seconds elapsed


Now the mini word2vec model is ready, let build the query function to check:

In [11]:
def get_word_vector(word ,embedding = target_embedding_mat,word2idx=word2idx,vector_dim=N_DIM):
    
    query_id = word2idx.get(word,-1)
    if query_id>=0:
        return target_embedding_mat[query_id,:]
    else:
        return np.zeros((N_DIM,))-999.
        


#### Now find the most similar word to our query word:

In [12]:
from numpy import dot
from numpy.linalg import norm


def cosine_sim(vx,vy):
    return dot(vx, vy)/(norm(vx)*norm(vy))

In [13]:
def find_similar(query_word,word2idx=word2idx):
    query_vector = get_word_vector(query_word)
    
    result = {}
    for word in word2idx:
        temp_vector = get_word_vector(word)
        sim = cosine_sim(query_vector,temp_vector)
        result[word] = sim
    
    return result
        


In [14]:
result = find_similar('food')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('food', 1.0),
 ('are', 0.8420530411244231),
 ('fruit', 0.7851886539880504),
 ('delicious', 0.7467649215627195),
 ('apple', 0.7115520750711977)]

In [15]:
result = find_similar('warcraft')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('warcraft', 0.9999999999999999),
 ('of', 0.8978974950173058),
 ('world', 0.8612742236866162),
 ('same', 0.8465236418751103),
 ('game', 0.686469478986442)]

In [16]:
result = find_similar('game')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('game', 1.0),
 ('mario', 0.8354462582423213),
 ('super', 0.792367719816544),
 ('overwatch', 0.7248624149689566),
 ('warcraft', 0.686469478986442)]

In [17]:

result = find_similar('xbox')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('xbox', 1.0),
 ('prefer', 0.941457654396786),
 ('ps4', 0.8340296104429354),
 ('you', 0.8103957816646575),
 ('switch', 0.7678654775122083)]

In [18]:
del target_embedding_mat

### So what's the problem here -- the above algorithm will never work for real world problem!!!

We only have 44 vocabularies in this vanilla example. What if we have millions of vocabs? The softmax operation becomes very expensive. To tackle this  issue, several methods are proposed, in order to approximate softmax, such as: Hiearchical Softmax, Negative Sampling or NCE.



The idea of negative sampling is simply instead of training the word embedding using all vocabulary, we sample several negative samples (non-context) and construct a logistic regression to train it to distinguish between positive and negative examples

##### Negative Sampling helper function

should improve later because the sampling in this case is unweighted, but it is supposed to be weighted by the frequency

In [19]:
# def sample_negative(xy_pairs,n_negative,idx2word):

#     pos_context = {}
#     grand_negative_samples = []
#     for x,y in xy_pairs:

#         if x not in pos_context:
#             good_pair = [i[1] for i in xy_pairs if i[0]==x]
#             pos_context[x] = good_pair

#         ## Sample:
#         temp_neg_samples = []
#         while len(temp_neg_samples)< n_negative:
#             temp_idx = np.random.choice(list(idx2word.keys()))
#             if temp_idx!=x and temp_idx not in pos_context[x]:
#                 temp_neg_samples.append(temp_idx)

#         grand_negative_samples.append(temp_neg_samples)
#     return np.array(grand_negative_samples)
    

#### Hyper Parameters:

In [29]:
# N_NEGATIVE_WORD = 5
# LEARNING_RATE = 0.005
# N_VOCAB = len(idx2word)
# N_DIM = 10
# BATCH_SIZE = len(ohe_X)
# N_ITER = 1000

# # Weights Initialization:

# target_embedding_mat = np.random.normal(size=(N_VOCAB,N_DIM)) 
# context_embedding_mat = np.random.normal(size=(N_VOCAB,N_DIM)) 
# # dense_w = np.random.normal(size=(N_DIM,N_NEGATIVE_WORD)) # context

In [30]:
# ohe_X = ohe(X,idx2word)
# ohe_Y = ohe(Y,idx2word)

# # pos_words_mask = ohe_Y.copy()



#### Word2vec Model with Negative Sampling:

We use the mask above to optimize only the positive word and the negative words we sampled

In [31]:
# from IPython import display
# import pdb
# import math

In [32]:
# def sigmoid(z):
#     if z > 6:
#         return 1.0
#     elif z < -6:
#         return 0.0
#     else:
#         return 1 / (1 + math.exp(-z))

In [33]:
# # %matplotlib notebook
# %matplotlib inline
# start_time = time.time()
# all_loss = []

# for j in range(N_ITER): 
    
#     print(j)
#     neg_words = sample_negative(
#     xy_pairs=prep_data,
#     n_negative=N_NEGATIVE_WORD,
#     idx2word = idx2word)

#     agg_loss = 0
#     for i in range(len(Y)): # Iterate through each example
        
#         #Xs:
#         # target&negsamples
#         pos_x_id = [X[i]]
#         neg_id = neg_words[i].tolist() # negative id
#         # All X id:
#         all_x_id = neg_id+pos_x_id # posalways last


#         temp_y = [0. for i in range(len(all_x_id))]
#         temp_y[-1] = 1. # we always make the last element positive
        
        
#         pos_context_id = [Y[i]] # use X to predict pos context:
#         y_vector = target_embedding_mat[pos_context_id]
        
#         # Construct x,y:
#         example_loss = 0
        
#         neu1e = np.zeros((1,N_DIM))

#         for k in range(len(temp_y)):
    
#             z = np.dot(context_embedding_mat[all_x_id[k:k+1]].ravel(), target_embedding_mat[pos_context_id].ravel())
#             p = sigmoid(z)
#             g = 0.01 * (temp_y[k] - p)
            
#             #context_embedding_mat =syn1
#             neu1e += g * context_embedding_mat[all_x_id[k:k+1]]  
#             context_embedding_mat[all_x_id[k:k+1]]   += g * target_embedding_mat[pos_context_id] # Update syn1
            
        

#         target_embedding_mat[pos_x_id] += neu1e


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

#### Check the result:

In [34]:
# result = find_similar('food')
# sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]


[('food', 1.0),
 ('apple', 0.9017387986012271),
 ('lunch', 0.7784069777045173),
 ('banana', 0.7756066036098015),
 ('you', 0.6105443293498584)]

In [39]:
# result = find_similar('chocolate')

# sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('chocolate', 1.0),
 ('than', 0.6203320560361113),
 ('super', 0.6031072393171127),
 ('sweet', 0.5307124026975621),
 ('play', 0.5223128090581242)]

In [40]:
# result = find_similar('blizzard')
# sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('blizzard', 1.0),
 ('smoothie', 0.6359585385078282),
 ('favorite', 0.5456623424351543),
 ('are', 0.5405172809264033),
 ('studio', 0.5276412944275504)]

In [36]:
# result = find_similar('game')
# sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]

[('game', 1.0),
 ('dove', 0.8121263925766136),
 ('in', 0.7808958944881573),
 ('ubisoft', 0.7610482597359217),
 ('company', 0.745290968987958)]

In [37]:
# result = find_similar('xbox')
# sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:5]



[('xbox', 1.0),
 ('ps4', 0.9834315583379238),
 ('prefer', 0.9187180946389587),
 ('desktop', 0.7292426351353775),
 ('dove', 0.6776818430293864)]