In [1]:
import numpy as np

### Helper functions:

In [2]:
def ohe(X,idx2word):
    """
    One-hot (or multi-hot) encode a sequence of token ids.

    Arguments:
        X(list): one entry per output row. An entry that is a ``list`` is
        treated as several column ids (every listed column is set to 1);
        any other entry is used directly as the column index of its row.

        idx2word(dict): id -> word mapping; its number of keys fixes the
        number of columns.

    Returns:
        encoded(np.ndarray): float array of shape (len(X), number of keys in
        idx2word), all zeros except for a 1 at every referenced position.
    """
    n_cols = len(idx2word.keys())
    n_rows = len(X)
    encoded = np.zeros((n_rows, n_cols))

    for row_idx in range(n_rows):
        entry = X[row_idx]
        if isinstance(entry, list):
            # several ids for this row: switch each column on individually
            for col_idx in entry:
                encoded[row_idx, col_idx] = 1
        else:
            encoded[row_idx, entry] = 1

    return encoded
        

    
def tokenize(x_list):
    """
    Build a vocabulary from ``x_list`` and encode every sentence as token ids.

    Arguments:
        x_list(list): list of sentences, each sentence being a list of words.

    Returns:
        tokened_x_list(list): the sentences with every word replaced by its id
        idx2word(dict): id -> word
        word2idx(dict): word -> id
    """
    # Unique tokens: taken from the argument itself (not from a notebook-level
    # variable) and kept in first-appearance order, so the ids are the same on
    # every run instead of depending on set iteration order.
    unique_x = list(dict.fromkeys(word for sentence in x_list for word in sentence))
    idx2word = dict(enumerate(unique_x))
    word2idx = {word: idx for idx, word in idx2word.items()}
    # Encode: every word is in the vocabulary by construction.
    tokened_x_list = [[word2idx[word] for word in sentence] for sentence in x_list]
    return tokened_x_list,idx2word,word2idx
            

    
def skipgram_prep(x_list,context_window=2):
    """
    Turn tokenized sentences into skip-gram training pairs.

    Arguments:
        x_list(list): tokenized training data, one list of token ids per sentence

        context_window: number of neighbours taken on each side of a position.
        For example, context_window=2 pairs every token with up to 2 tokens on
        its left and up to 2 tokens on its right (fewer near sentence edges).

    Returns:
        processed_data(list): list of (x, y) tuples, where x is the token at a
        position and y is one of its neighbours inside the window. The token's
        own position is never paired with itself.
    """
    pairs = []

    for sentence in x_list:
        n_tokens = len(sentence)
        for pos, center in enumerate(sentence):
            # window is clipped at both sentence boundaries
            left = sentence[max(pos - context_window, 0):pos]
            right = sentence[pos + 1:min(n_tokens, pos + context_window + 1)]
            pairs.extend((center, neighbour) for neighbour in left + right)

    return pairs
            
            

        
def softmax(x):
    """
    Row-wise softmax of a 2-D array.

    Arguments:
        x(np.ndarray): scores with one row per sample and one column per class.

    Returns:
        np.ndarray: same shape as ``x``; every row is non-negative and sums to 1.
    """
    x = np.asarray(x)
    # Subtracting each row's maximum does not change the softmax value, but it
    # keeps np.exp from overflowing to inf (which would turn the row into nan).
    shifted = x - x.max(axis=1, keepdims=True)
    e_x = np.exp(shifted)
    return np.divide(e_x, e_x.sum(axis=1, keepdims=True))
            
            
    
    
            
    
    

In [3]:

# Toy corpus: one sentence per string, words separated by single spaces.
raw_sentences = [
    'apple banana are delicious food',
    'video game go play in game studio',
    'lunch food is fruit apple banana icecream',
    'warcraft or starcraft or overwatch best game',
    'chocolate or banana or icecream the most delicious food',
    'banana apple smoothie is the best for lunch or dinner',
    'video game is good for geeks',
    'what to eat for dinner banana or chocolate',
    'which game company is better ubisoft or blizzard',
    'play game on ps4 or xbox',
    'banana is less sweet icecream is more sweet',
    'chocolate icecream taste more delicious than banana'
]

# Split every sentence into a list of words. The raw strings stay available in
# `raw_sentences`, so `data` only ever holds one kind of value (lists of words).
data = [sentence.split(" ") for sentence in raw_sentences]


#### Tokenization:

In [4]:
# Replace every word by an integer id; the two lookup tables (id -> word and
# word -> id) are reused by the cells below.
tokenized_data_list,idx2word,word2idx = tokenize(data)

#### Use Skipgram to Prepare the Training Data:

In [5]:
# Build (token, neighbour) id pairs, looking up to 2 tokens to each side of every position.
prep_data = skipgram_prep(tokenized_data_list,context_window=2)

In [6]:
# Split the pairs into two parallel lists: first elements (inputs) and second elements (targets).
X = [pair[0] for pair in prep_data]
Y = [pair[1] for pair in prep_data]

#### OHE:

In [7]:
# One-hot encode inputs and targets: one row per pair, one column per vocabulary entry.
ohe_X = ohe(X,idx2word)
ohe_Y = ohe(Y,idx2word)

#### Naive Word2vec Model:

First, let's build a naive Word2vec model, meaning that we apply a full softmax over the entire vocabulary.

In [8]:
# Hyper Parameters:
SEED = 42                 # fixes the random initialisation so a re-run reproduces the same weights
N_NEGATIVE = 3            # not used by the full-softmax model in this notebook
LEARNING_RATE = 0.01
N_VOCAB = len(idx2word)   # vocabulary size
N_DIM = 16                # embedding dimension
BATCH_SIZE = len(ohe_X)   # full batch: every training pair is used in every step

# Weights Initialization: standard-normal draws from a seeded generator.
rng = np.random.default_rng(SEED)
embedding_mat = rng.normal(size=(N_VOCAB,N_DIM))
dense_w = rng.normal(size=(N_DIM,N_VOCAB))

In [9]:
# Full-batch gradient descent on the softmax word2vec model.
# Shapes below use n = number of training pairs (rows of ohe_X / ohe_Y).
all_loss = []  # loss value of every iteration, in order
for i in range(1500):


    # forward pass:
    input_x = ohe_X
    input_y = ohe_Y
    x_embedding_layer = input_x.dot(embedding_mat)# (n, N_DIM): embedding row of each input word
    dense_layer = x_embedding_layer.dot(dense_w)  # (n, N_VOCAB): one score per vocabulary entry
    output_layer = softmax(dense_layer)  # (n, N_VOCAB): scores normalised row by row

    # cross entropy loss, averaged over the batch; 1e-9 keeps log() away from log(0)
    loss = -np.sum(input_y*np.log(output_layer+1e-9))/BATCH_SIZE # adding smooth term
    if i%100==0:
        print(f"Loss: {loss}")
    all_loss.append(loss)



    # Backward Pass
    # NOTE(review): the loss above is divided by BATCH_SIZE but the gradients
    # below are not, so the effective step size is LEARNING_RATE * BATCH_SIZE
    # relative to the reported loss -- confirm this is intended.

    # d_loss/d_dense_layer = d_loss/d_op_layer * d_op_layer/d_dense_layer
    d_dense = output_layer - input_y  # (n, N_VOCAB)

    # d_loss/d_dense_w = d_loss/d_dense_layer * d_dense_layer/d_dense_w
    d_dense_w =  d_dense.T.dot(x_embedding_layer).T  # (N_DIM, N_VOCAB), same shape as dense_w

    # d_loss/x_embedding_layer = d_loss/d_dense_layer * d_dense_layer/x_embedding_layer
    d_emb_layer =  d_dense.dot(dense_w.T)  # (n, N_DIM)
    # d_loss/d_embedding_mat = d_loss/x_embedding_layer * x_embedding_layer/d_embedding_mat
    d_embedding_mat = d_emb_layer.T.dot(input_x)  # (N_DIM, N_VOCAB); transposed in the update below



    # Both gradients were computed from the pre-update weights; the updates
    # modify embedding_mat and dense_w in place.
    embedding_mat -= LEARNING_RATE*d_embedding_mat.T
    dense_w -= LEARNING_RATE*d_dense_w
    

Loss: 9.378688108625873
Loss: 2.005298530982752
Loss: 1.9282251934894516
Loss: 1.915202004986637
Loss: 1.9103938902718987
Loss: 1.9079612745844892
Loss: 1.9067267756355368
Loss: 1.9116364898998748
Loss: 1.9093111376997192
Loss: 1.9079385961499742
Loss: 1.9070468916300318
Loss: 1.9064129969572752
Loss: 1.9059291852573528
Loss: 1.9055409406255481
Loss: 1.9052187142658792


Now that the mini word2vec model is trained, let's build a query function to inspect it:

In [10]:
def get_word_vector(word ,embedding = embedding_mat,word2idx=word2idx,vector_dim=N_DIM):
    """
    Look up the embedding vector of ``word``.

    Arguments:
        word(str): the word to look up
        embedding(np.ndarray): embedding matrix, one row per word id
        word2idx(dict): word -> id (row of ``embedding``)
        vector_dim(int): length of the sentinel vector returned for unknown words

    Returns:
        np.ndarray: row ``word2idx[word]`` of ``embedding`` when the word is
        known; otherwise a vector of length ``vector_dim`` filled with -999.

    Note: the default values are bound once, when this cell is executed.
    """
    query_id = word2idx.get(word,-1)
    if query_id>=0:
        # use the matrix that was passed in, not the notebook-level one
        return embedding[query_id,:]
    else:
        # out-of-vocabulary sentinel
        return np.zeros((vector_dim,))-999.
        
    

In [11]:
# Look up a word that is in the vocabulary:
query_word = 'xbox'
get_word_vector(query_word)

array([-1.10479476,  0.69152043,  0.0222014 ,  0.47398416,  0.85253254,
        1.29816081,  0.46473506, -0.17165976,  0.02458933, -0.58116457,
       -0.40560783,  2.78396632, -0.96417779,  2.04935327,  0.82896536,
       -0.92053599])

In [12]:
# Look up an out-of-vocabulary word (the lookup returns a vector filled with -999):
query_word = 'lol'
get_word_vector(query_word)

array([-999., -999., -999., -999., -999., -999., -999., -999., -999.,
       -999., -999., -999., -999., -999., -999., -999.])

#### Now find the most similar word to our query word:

In [13]:
from numpy import dot
from numpy.linalg import norm


def cosine_sim(vx,vy):
    """
    Cosine similarity between two vectors.

    Arguments:
        vx, vy: vectors of equal length

    Returns:
        dot(vx, vy) / (norm(vx) * norm(vy)), or 0.0 when either vector has
        zero norm (the ratio would otherwise be 0/0 and evaluate to nan).
    """
    denominator = norm(vx)*norm(vy)
    if denominator == 0:
        # cosine is undefined for a zero vector; report "no similarity"
        return 0.0
    return dot(vx, vy)/denominator

In [14]:
def find_similar(query_word,word2idx=word2idx):
    """
    Score every vocabulary word by cosine similarity to ``query_word``.

    Arguments:
        query_word(str): the word to compare against
        word2idx(dict): word -> id; its keys are the candidate words and it is
        also the mapping used for every vector lookup

    Returns:
        result(dict): word -> cosine similarity with ``query_word`` (the query
        word itself is included). Empty when ``query_word`` is not in
        ``word2idx``, because the -999 sentinel vector returned for unknown
        words would only produce meaningless scores.
    """
    if query_word not in word2idx:
        return {}

    # Forward the mapping so lookups use the same vocabulary that is iterated.
    query_vector = get_word_vector(query_word, word2idx=word2idx)

    return {
        word: cosine_sim(query_vector, get_word_vector(word, word2idx=word2idx))
        for word in word2idx
    }
        


In [15]:
result = find_similar('food')

# Three highest-scoring words, best first.
sorted(result.items(),key=lambda pair: pair[1],reverse=True)[:3]

[('food', 1.0), ('apple', 0.5053182732410727), ('less', 0.34573572640324446)]

In [16]:
result = find_similar('warcraft')

# Three highest-scoring words, best first.
sorted(result.items(),key=lambda pair: pair[1],reverse=True)[:3]

[('warcraft', 0.9999999999999999),
 ('ubisoft', 0.4935534265673933),
 ('overwatch', 0.467640114458129)]

### So what's the problem here? The above algorithm will never work for real-world problems!

We only have 44 words in the vocabulary of this vanilla example. What if we had millions of words? The softmax operation becomes very expensive, because every training step has to score every word in the vocabulary. To tackle this issue, several algorithms have been proposed to approximate the softmax, such as Hierarchical Softmax, Negative Sampling, and NCE (Noise-Contrastive Estimation).



 Coming Soon....

#### Negative Sampling:

In [17]:
# TODO: draft of the negative sampler -- disabled, nothing in this cell runs.
# For every (x, y) pair it draws `n_negative` word ids that are neither x
# itself nor one of the words paired with x in `xy_pairs`.
# NOTE(review): the while-loop would never finish if fewer than `n_negative`
# eligible ids exist for some x, and the same id can be drawn more than once --
# confirm both before enabling.

# def sample_negative(xy_pairs,n_negative,idx2word):

#     pos_context = {}
#     grand_negative_samples = []
#     for x,y in xy_pairs:

#         if x not in pos_context:
#             good_pair = [i[1] for i in xy_pairs if i[0]==x]
#             pos_context[x] = good_pair

#         ## Sample:
#         temp_neg_samples = []
#         while len(temp_neg_samples)< n_negative:
#             temp_idx = np.random.choice(list(idx2word.keys()))
#             if temp_idx!=x and temp_idx not in pos_context[x]:
#                 temp_neg_samples.append(temp_idx)

#         grand_negative_samples.append(temp_neg_samples)
#     return np.array(grand_negative_samples)
    