In [678]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm

In [685]:
class word2vec():
    """Minimal skip-gram word2vec model implemented with plain NumPy.

    Each training example pairs a one-hot target word with the one-hot
    vectors of the words around it. ``w1`` (vocabulary x dimension) holds
    the word embeddings; ``w2`` (dimension x vocabulary) maps the hidden
    layer back to vocabulary scores.
    """

    def __init__(self, config=None):
        """Store the hyper-parameters.

        Args:
            config: dict with the keys 'learning_rate', 'epochs',
                'window_size' and 'n'. When omitted, the module-level
                ``settings`` dict is used, which is what the original
                zero-argument constructor did.
        """
        if config is None:
            # Legacy path: read the notebook-level ``settings`` dict.
            config = settings
        self.lr = config['learning_rate']
        self.epoch = config['epochs']
        self.window_size = config['window_size']
        self.dimension = config['n']

    def generate_training_data(self, setting, corpus, verbose=True):
        """Build the vocabulary and the (target, context) training pairs.

        Args:
            setting: unused; kept so existing callers keep working. The
                window size comes from the constructor.
            corpus: iterable of sentences, each a sequence of tokens.
            verbose: when True (default) print the vocabulary size and the
                word -> index mapping, as the original code always did.

        Returns:
            Object ndarray of shape (n_examples, 2). Column 0 is the one-hot
            target vector, column 1 is a list of one-hot context vectors.
        """
        word_count = defaultdict(int)
        for row in corpus:
            for word in row:
                word_count[word] += 1

        self.word_list = list(word_count.keys())
        self.word_len = len(self.word_list)
        self.word_index = {word: i for i, word in enumerate(self.word_list)}
        self.index_word = {i: word for i, word in enumerate(self.word_list)}
        if verbose:
            print(self.word_len)
            print(self.word_index)

        pairs = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Symmetric window: the "+ 1" makes the right edge inclusive,
                # so the word at i + window_size is not dropped.
                first = max(0, i - self.window_size)
                last = min(sent_len, i + self.window_size + 1)
                w_context = [self.word2onehot(sentence[j])
                             for j in range(first, last) if j != i]
                pairs.append((w_target, w_context))

        # The rows are ragged (context lists differ in length), so fill an
        # object array cell by cell instead of calling np.array() on the
        # nested list, which raises ValueError on recent NumPy versions.
        training_data = np.empty((len(pairs), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(pairs):
            training_data[row, 0] = w_target
            training_data[row, 1] = w_context
        return training_data

    def word2onehot(self, word):
        """Return the one-hot vector (length ``word_len``) for ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_vec = np.zeros(self.word_len)
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, train_data, seed=None):
        """Randomly initialise ``w1``/``w2`` and run ``epoch`` passes of SGD.

        Args:
            train_data: iterable of (target_one_hot, context_one_hots) pairs
                as produced by ``generate_training_data``.
            seed: optional integer. When given, the initial weights are drawn
                from a dedicated ``RandomState(seed)`` so runs are
                repeatable; when None the global NumPy random state is used.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.w1 = rng.uniform(-1, 1, (self.word_len, self.dimension))
        self.w2 = rng.uniform(-1, 1, (self.dimension, self.word_len))

        for _ in tqdm(range(self.epoch)):
            for w_t, w_c in train_data:
                if len(w_c) == 0:
                    # No context (e.g. a one-word sentence): nothing to
                    # learn, and the summed error below would collapse to a
                    # scalar and break the weight-update shapes.
                    continue
                y_u, h, u = self.ford_prop(w_t)
                # Prediction error summed over every context word.
                EI = np.sum([np.subtract(y_u, word) for word in w_c], axis=0)
                self.back_prop(EI, h, w_t)

    def ford_prop(self, x):
        """Forward pass for the one-hot input ``x``.

        Returns:
            Tuple ``(y_u, h, u)``: softmax output, hidden layer and raw
            output scores.
        """
        h = np.dot(x, self.w1)
        u = np.dot(h, self.w2)
        y_u = self.softmax(u)
        return y_u, h, u

    def softmax(self, x):
        """Softmax of ``x``; the maximum is subtracted before exponentiating."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def back_prop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: summed prediction error over the context words.
            h: hidden-layer activations from the forward pass.
            x: one-hot input vector of the target word.
        """
        dl_dw2 = np.outer(h, e)
        # Computed from the current w2, before w2 itself is updated.
        dl_dw1 = np.outer(x, np.dot(self.w2, e))

        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, x):
        """Return the embedding (row of ``w1``) for the word ``x``.

        Raises:
            KeyError: if ``x`` is not in the vocabulary.
        """
        w_index = self.word_index[x]
        return self.w1[w_index]

    def cosinevec_sim(self, word, top_n):
        """Print the ``top_n`` words most cosine-similar to ``word``.

        The query word itself is part of the ranking.
        """
        v_w1 = self.word_vec(word)
        norm_w1 = np.linalg.norm(v_w1)  # loop-invariant
        word_sim = {}

        for i in range(self.word_len):
            v_w2 = self.w1[i]
            theta_den = norm_w1 * np.linalg.norm(v_w2)
            # A zero vector has no direction: report 0.0 instead of dividing
            # by zero, which would yield nan and a runtime warning.
            theta = np.dot(v_w1, v_w2) / theta_den if theta_den else 0.0
            word_sim[self.index_word[i]] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for other, sim in words_sorted[:top_n]:
            print(other, sim)

    def eculvec_sim(self, word, top_n):
        """Print the ``top_n`` words closest to ``word`` by Euclidean distance.

        The query word itself is part of the ranking (distance 0.0).
        """
        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.word_len):
            word_sim[self.index_word[i]] = np.linalg.norm(v_w1 - self.w1[i])

        word_sorted = sorted(word_sim.items(), key=lambda kv: kv[1])

        for other, dist in word_sorted[:top_n]:
            print(other, dist)
            
            
    
    
  
        

In [686]:
# Hyper-parameters read by word2vec.__init__.
settings = {
    'window_size': 2,        # context window +- center word
    'n': 3,                  # dimensions of word embeddings, also the hidden-layer size
    'epochs': 200,           # number of training epochs
    'learning_rate': 0.01,   # learning rate
}

In [735]:
# Toy corpus: the pattern "the day is <weekday>" joined by "and" for five weekdays.
# "Monday" is capitalised here; the corpus cell below lower-cases every token.
text = "the day is friday and the day is sunday and the day is thursday and the day is wednesday and the day is Monday"

In [736]:
# A single-sentence corpus: whitespace-split tokens of `text`, each lower-cased.
corpus = [list(map(str.lower, text.split()))]

In [737]:
# Build the model; with no arguments the constructor reads the global `settings` dict.
w2v = word2vec()

# Numpy ndarray with one-hot representation for [target_word, context_words]
# (the call also prints the vocabulary size and the word -> index mapping).
training_data = w2v.generate_training_data(settings, corpus)

# Training


9
{'the': 0, 'day': 1, 'is': 2, 'friday': 3, 'and': 4, 'sunday': 5, 'thursday': 6, 'wednesday': 7, 'monday': 8}


In [738]:
# Fit the embeddings: train() re-initialises w1/w2 randomly, then runs the
# configured number of epochs over the training pairs (tqdm shows progress).
w2v.train(training_data)

100%|██████████| 200/200 [00:01<00:00, 181.93it/s]


In [739]:
# Query word for the lookups below; lower-case because the corpus tokens were lower-cased.
x = 'monday'

In [740]:
# Embedding of the query word: its row of the input weight matrix w1.
w2v.word_vec(x)

array([0.13425783, 0.76880298, 1.03569743])

In [741]:
# Print the 7 vocabulary words with the highest cosine similarity to x (x itself is included).
w2v.cosinevec_sim(x, 7)

monday 0.9999999999999999
sunday 0.9931167574750261
wednesday 0.9861512227140041
friday 0.9843981006903764
thursday 0.9647388764063038
the 0.8952606251157212
day 0.1488610658718091


In [742]:
# Print the 7 vocabulary words closest to x by Euclidean distance, smallest first (x itself is included).
w2v.eculvec_sim(x, 7)

monday 0.0
sunday 0.41494697338446235
wednesday 0.41661141714484284
friday 0.42189882178500543
thursday 0.5589979559899514
the 0.6926240672802179
and 2.383202480009218


In [637]:
# Toy 2-d vectors for experimenting with vector addition.
good, bad, person = (np.array(pair) for pair in ([2, 2], [4, 4], [3, 2]))

In [640]:
# Element-wise sum of the `bad` and `person` toy vectors.
bad+person

array([7, 6])

In [641]:
# Element-wise sum of the `good` and `person` toy vectors.
good+person

array([5, 4])

In [635]:
# NOTE(review): the recorded output below is a 2x2 array, but `good` is defined
# above as the 1-d vector [2, 2]. This cell's execution count (635) predates
# that definition (637), so the output appears to come from an earlier value of
# `good` - restart the kernel and re-run to refresh it.
good

array([[2, 2],
       [3, 2]])

In [636]:
# Sum along axis 0.
# NOTE(review): the recorded result [5, 4] matches the 2x2 array shown in the
# previous output, not the 1-d `good` = [2, 2] defined earlier (which sums to 4);
# the output looks stale - re-run from a fresh kernel.
np.sum(good,axis=0)

array([5, 4])