In [97]:
import numpy as np

### Helper functions:

In [98]:
def ohe(X,idx2word):
    ncol = len(idx2word.keys())
    nrow = len(X)
    OHE_X = np.zeros((nrow,ncol))
    for r in range(len(X)):
        if not isinstance(X[r],list):
            OHE_X[r,X[r]] = 1
        else:
            row_val = X[r]
            for c in row_val:
                OHE_X[r,c] = 1
                
    return OHE_X
        

    
def tokenize(x_list):
    #unique tokens:
    unique_x = list(set([j for i in data for j in i]))
    idx2word = dict(enumerate(unique_x))
    word2idx = {i[1]:i[0] for i in idx2word.items()}
    # Encode:
    tokened_x_list = []
    for sentence in x_list:
        temp_sent = []
        for word in sentence:
            token = word2idx.get(word,-1)
            temp_sent.append(token)
        
        tokened_x_list.append(temp_sent)
    return tokened_x_list,idx2word,word2idx
            

    
def skipgram_prep(x_list,context_window=2):
    """
    Use Skipgram method to prepare the data.
    
    Arguments:
        x_list(list): tokenized training data
        
        context_window: the context window on each side. 
        For example, if context_window=2, we will be looking at 2 tokens on the left and 
        2 tokens on the right
    
    Returns:
        processd_data(list):  a list of tuples represents the processed data. Each pair of tuple is a (x,y) pair
        
    """
    processed_data = []
    
    for row in x_list:
        row_len = len(row)
        for i in range(row_len):
            x = row[i]
            start_idx = max(i-context_window,0)
            end_idx = min(row_len,i+context_window+1)
            y = row[start_idx:i] + row[i+1:end_idx] # skip the self
            
            
            temp_xy_pair = zip([x]*len(y),y)
            processed_data.extend(temp_xy_pair)
    
    return processed_data
            
            

        
def softmax(x):
    e_x = np.exp(x)
    return np.divide(e_x,e_x.sum(axis=1).reshape(-1,1))
            
            
    
    
            
    
    

In [99]:

data = [
    'apple and banana are delicious food',
    'play video game in game studio',
    'i have orc warrior in world of warcraft',
    'super mario is the best video game',
    'do you prefer xbox or ps4 or nintentdo switch',
    'which game company is better ubisoft or blizzard',
    'play game on ps4 or xbox',
    'video game is exciting for geeks',
    'warcraft or starcraft or overwatch is best game',
    'is warcraft and world of warcraft the same game',
    'i prefer desktop game console to either xbox or ps4',
    
    
    'food for lunch is fruit apple banana icecream',
    'icecream is too sweet for a meal',
    'dove chocolate is my favorite',
    'which is sweet chocolate or icecream',
    'chocolate or banana or icecream is the most delicious food',
    'go to buy a banana bread for my lunch',
    'banana apple smoothie is the best for lunch or dinner',
    'what to eat for dinner banana or chocolate',
    'banana is less sweet icecream is more sweet',
    'chocolate icecream taste more delicious than banana',
    'chicken sandwich is different from chicken bread',
    
]

data = [i.split(" ") for i in data]


#### Tokenization:

In [100]:
tokenized_data_list,idx2word,word2idx = tokenize(data)

In [101]:
print(idx2word)

{0: 'prefer', 1: 'starcraft', 2: 'a', 3: 'go', 4: 'warrior', 5: 'buy', 6: 'smoothie', 7: 'chicken', 8: 'sandwich', 9: 'xbox', 10: 'super', 11: 'mario', 12: 'to', 13: 'geeks', 14: 'the', 15: 'sweet', 16: 'fruit', 17: 'have', 18: 'console', 19: 'dove', 20: 'meal', 21: 'play', 22: 'on', 23: 'warcraft', 24: 'of', 25: 'ps4', 26: 'blizzard', 27: 'for', 28: 'exciting', 29: 'favorite', 30: 'same', 31: 'icecream', 32: 'eat', 33: 'more', 34: 'different', 35: 'desktop', 36: 'most', 37: 'from', 38: 'nintentdo', 39: 'orc', 40: 'in', 41: 'studio', 42: 'what', 43: 'are', 44: 'apple', 45: 'my', 46: 'or', 47: 'is', 48: 'either', 49: 'banana', 50: 'game', 51: 'too', 52: 'company', 53: 'taste', 54: 'ubisoft', 55: 'and', 56: 'best', 57: 'food', 58: 'lunch', 59: 'less', 60: 'bread', 61: 'delicious', 62: 'than', 63: 'dinner', 64: 'you', 65: 'switch', 66: 'video', 67: 'i', 68: 'overwatch', 69: 'which', 70: 'chocolate', 71: 'world', 72: 'do', 73: 'better'}


#### Use Skipgram to Prepare the Training Data:

In [102]:
prep_data = skipgram_prep(tokenized_data_list,context_window=6)

In [103]:
X = [i[0] for i in prep_data]
Y = [i[1] for i in prep_data]

#### OHE:

In [104]:
ohe_X = ohe(X,idx2word)
ohe_Y = ohe(Y,idx2word)

#### Naive Word2vec Model:

First, let's build a naive Word2vec model, means we're gonna use softmax across all vocabularies.

In [112]:
# Hyper Parameters:
LEARNING_RATE = 0.01
N_VOCAB = len(idx2word)
N_DIM = 10
BATCH_SIZE = len(ohe_X)

# Weights Initialization:
embedding_mat = np.random.normal(size=(N_VOCAB,N_DIM)) 
dense_w = np.random.normal(size=(N_DIM,N_VOCAB))

In [113]:
all_loss = []
for i in range(1500):
    

    # forward pass:
    input_x = ohe_X
    input_y = ohe_Y
    x_embedding_layer = input_x.dot(embedding_mat)# query word embedding X
#     print(x_embedding_layer.shape)
    dense_layer = x_embedding_layer.dot(dense_w)
#     print(dense_layer.shape)
    output_layer = softmax(dense_layer)

    # cross entropy loss:
    loss = -np.sum(input_y*np.log(output_layer+1e-9))/BATCH_SIZE # adding smooth term
    if i%100==0:
        print(f"Loss: {loss}")
    all_loss.append(loss)
#     print('---')
    


    # Backward Pass
    

    # d_loss/d_dense_layer = d_loss/d_op_layer * d_op_layer/d_dense_layer
    d_dense = output_layer - input_y
#     print(d_dense.shape)

    # d_loss/d_dense_w = d_loss/d_dense_layer * d_dense_layer/d_dense_w
    d_dense_w =  d_dense.T.dot(x_embedding_layer).T
#     print(d_dense_w.shape)

    # d_loss/x_embedding_layer = d_loss/d_dense_layer * d_dense_layer/x_embedding_layer
    d_emb_layer =  d_dense.dot(dense_w.T)
#     print(d_emb_layer.shape)
    # d_loss/d_embedding_mat = d_loss/x_embedding_layer * x_embedding_layer/d_embedding_mat
    d_embedding_mat = d_emb_layer.T.dot(input_x)
#     print(d_embedding_mat.shape)
#     print('~')


    
    embedding_mat -= LEARNING_RATE*d_embedding_mat.T
    dense_w -= LEARNING_RATE*d_dense_w
    

Loss: 7.939235267658686
Loss: 2.890509146952381
Loss: 2.8084968702492077
Loss: 2.784884496026286
Loss: 2.777874359768054
Loss: 2.773318426613309
Loss: 2.7703245545204656
Loss: 2.774656720494175
Loss: 2.7635576135418134
Loss: 2.766479310088463
Loss: 2.7683019537472418
Loss: 2.7595713581987953
Loss: 2.7667158065504105
Loss: 2.764545418555038
Loss: 2.7580792021073135


Now the mini word2vec model is ready, let build the query function to check:

In [114]:
def get_word_vector(word ,embedding = embedding_mat,word2idx=word2idx,vector_dim=N_DIM):
    
    query_id = word2idx.get(word,-1)
    if query_id>=0:
        return embedding_mat[query_id,:]
    else:
        return np.zeros((N_DIM,))-999.
        
    

In [115]:
# Try with in vocab word:
query_word = 'xbox'
get_word_vector(query_word)

array([-1.76500087,  1.07809326, -1.61388524,  0.63570399, -0.25055495,
       -0.42878406, -0.20876328, -0.83644503,  1.39433293, -0.60998938])

In [116]:
# Try with in Out-of_vocabulary word:
query_word = 'lol'
get_word_vector(query_word)

array([-999., -999., -999., -999., -999., -999., -999., -999., -999.,
       -999.])

#### Now find the most similar word to our query word:

In [117]:
from numpy import dot
from numpy.linalg import norm


def cosine_sim(vx,vy):
    return dot(vx, vy)/(norm(vx)*norm(vy))

In [118]:
def find_similar(query_word,word2idx=word2idx):
    query_vector = get_word_vector(query_word)
    
    result = {}
    for word in word2idx:
        temp_vector = get_word_vector(word)
#         print(word)
#         print(temp_vector)
        sim = cosine_sim(query_vector,temp_vector)
        result[word] = sim
    
    return result
        


In [119]:
result = find_similar('food')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:3]

[('food', 1.0000000000000002),
 ('delicious', 0.799574238439338),
 ('banana', 0.696540293747587)]

In [120]:
result = find_similar('warcraft')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:3]

[('warcraft', 1.0),
 ('world', 0.7461912120209175),
 ('starcraft', 0.7454474518475237)]

In [121]:
result = find_similar('game')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:3]

[('game', 0.9999999999999998),
 ('video', 0.7487766536647733),
 ('overwatch', 0.7384422164865021)]

In [122]:

result = find_similar('xbox')


sorted(list(result.items()),key=lambda x: x[1],reverse=True)[:3]

[('xbox', 1.0000000000000002),
 ('i', 0.7077697803104328),
 ('ps4', 0.6796021254332196)]

### So what's the problem here -- the above algorithm will never work for real world problem!!!

We only have 44 vocabularies in this vanilla example. What if we have millions of vocabs? The softmax operation becomes very expensive. To tackle this  issue, several algorithms are proposed, in order to do the approximation of softmax, such as: Hiearchical Softmax, Negative Sampling or NCE.



##### Negative Sampling helper function

In [186]:
def sample_negative(xy_pairs,n_negative,idx2word):

    pos_context = {}
    grand_negative_samples = []
    for x,y in xy_pairs:

        if x not in pos_context:
            good_pair = [i[1] for i in xy_pairs if i[0]==x]
            pos_context[x] = good_pair

        ## Sample:
        temp_neg_samples = []
        while len(temp_neg_samples)< n_negative:
            temp_idx = np.random.choice(list(idx2word.keys()))
            if temp_idx!=x and temp_idx not in pos_context[x]:
                temp_neg_samples.append(temp_idx)

        grand_negative_samples.append(temp_neg_samples)
    return np.array(grand_negative_samples)
    

#### Hyper Parameters:

In [187]:
N_NEGATIVE_WORD = 3
LEARNING_RATE = 0.01
N_VOCAB = len(idx2word)
N_DIM = 10
BATCH_SIZE = len(ohe_X)

# Weights Initialization:
embedding_mat = np.random.normal(size=(N_VOCAB,N_DIM)) 
dense_w = np.random.normal(size=(N_DIM,N_VOCAB))

In [188]:
ohe_X = ohe(X,idx2word)
ohe_Y = ohe(Y,idx2word)

pos_words_mask = ohe_Y.copy()



#### Word2vec Model with Negative Sampling:

We use the mask above to optimize only the positive word and the negative words we sampled

In [189]:
all_loss = []
for i in range(1000):
    neg_words = sample_negative(
    xy_pairs=prep_data,
    n_negative=N_NEGATIVE_WORD,
    idx2word = idx2word)

    neg_words_mask = ohe(neg_words.tolist(),idx2word)
    all_mask = pos_words_mask+neg_words_mask

    

    # forward pass:
    input_x = ohe_X
    input_y = ohe_Y
    x_embedding_layer = input_x.dot(embedding_mat)# query word embedding X
    dense_layer = x_embedding_layer.dot(dense_w)
    output_layer = softmax(dense_layer)

    output_layer = output_layer*all_mask
    # cross entropy loss:
#     print(output_layer)
    loss = -np.sum(input_y*np.log(output_layer+1e-9))/BATCH_SIZE # adding smooth term
    if i%2==0:
        print(f"Loss: {loss}")
    all_loss.append(loss)
#     print('---')
    


    # Backward Pass
    

    # d_loss/d_dense_layer = d_loss/d_op_layer * d_op_layer/d_dense_layer
    d_dense = output_layer - input_y
#     print(d_dense.shape)

    # d_loss/d_dense_w = d_loss/d_dense_layer * d_dense_layer/d_dense_w
    d_dense_w =  d_dense.T.dot(x_embedding_layer).T
#     print(d_dense_w.shape)

    # d_loss/x_embedding_layer = d_loss/d_dense_layer * d_dense_layer/x_embedding_layer
    d_emb_layer =  d_dense.dot(dense_w.T)
#     print(d_emb_layer.shape)

    # d_loss/d_embedding_mat = d_loss/x_embedding_layer * x_embedding_layer/d_embedding_mat
    d_embedding_mat = d_emb_layer.T.dot(input_x)
#     print(d_embedding_mat.shape)
#     print('~')


    
    embedding_mat -= LEARNING_RATE*d_embedding_mat.T
    dense_w -= LEARNING_RATE*d_dense_w
    

Loss: 8.134581618402228
Loss: 7.390820574127913
Loss: 7.146451696102426
Loss: 7.991370780662379
Loss: 9.94129644357155
Loss: 12.105866663919734
Loss: 15.110796887869324




Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan


KeyboardInterrupt: 