In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [12]:
class wordToVector:
    """Small skip-gram style word-embedding model trained with plain SGD.

    Every training example pairs the one-hot vector of a corpus word with the
    one-hot vectors of the words surrounding it (at most ``window_size`` on
    each side).  To predict, the model takes the word's column of
    ``weights_word_matrix``, multiplies it by ``weights_context_matrix`` and
    applies a softmax over the vocabulary.

    Attributes set by the constructor:
        bag_of_words: dict mapping each distinct word to an index, numbered
            in order of first appearance in the corpus.
        index_word_map: the inverse mapping, index -> word.
        train_set: list of ``[word_one_hot, context_one_hots]`` pairs.
        weights_word_matrix: array of shape (embedding size, vocabulary size).
        weights_context_matrix: array of shape (vocabulary size, embedding size).
    """

    def __init__(self, word_embedding_size, corpus, window_size = 2):
        """Build the training set from ``corpus`` and randomly initialise the weights.

        Args:
            word_embedding_size: number of dimensions of each word vector.
            corpus: sliceable sequence of tokens (e.g. a list of strings).
            window_size: how many words on each side of a word count as context.
        """
        self.generateData(corpus, window_size)

        vocabulary_size = len(self.bag_of_words)
        # Drawn from NumPy's global RNG: call np.random.seed(...) first for reproducible runs.
        self.weights_word_matrix = np.random.randn(word_embedding_size, vocabulary_size)
        self.weights_context_matrix = np.random.randn(vocabulary_size, word_embedding_size)

    def SGD(self, epocs = 1000, eta = 0.01, verbose = True):
        """Train with stochastic gradient descent, one update per training example.

        Args:
            epocs: number of full passes over ``self.train_set``.
            eta: learning rate.
            verbose: when True (the default) print a line after every epoch.
        """
        for epoch in range(epocs):
            for each_train_example in self.train_set:
                a = each_train_example[0]  # one-hot vector of the input word
                y = each_train_example[1]  # one-hot vectors of its context words

                nabla_word_matrix, nabla_context_matrix = self.backprop(a.reshape(-1, 1), y)

                self.weights_word_matrix = self.weights_word_matrix - eta*nabla_word_matrix
                self.weights_context_matrix = self.weights_context_matrix - eta*nabla_context_matrix

            if verbose:
                print("Epoc {} Completed".format(epoch))

    def _forward(self, a):
        """Return ``(u_w, y_hat)``: the embedding selected by ``a`` and the softmax output."""
        # argmax of the one-hot vector is the word's index, i.e. its column in the word matrix.
        u_w = self.weights_word_matrix[:, np.argmax(a)]
        y_hat = self.softmax(self.weights_context_matrix @ u_w)
        return u_w, y_hat

    def forwardPass(self, a):
        """Predict context words for the word encoded by the one-hot vector ``a``.

        Returns:
            dict mapping every vocabulary word to its predicted probability,
            ordered from most to least probable.
        """
        _, y_hat = self._forward(a)

        res = {self.index_word_map[i]: y_hat[i] for i in range(len(y_hat))}

        return dict(sorted(res.items(), key=lambda x: x[1], reverse = True))

    def backprop(self, a, y):
        """Compute the gradients of both weight matrices for one training example.

        Args:
            a: one-hot input word as a column vector of shape (vocabulary size, 1).
            y: sequence of one-hot context vectors (may be empty).

        Returns:
            ``(nabla_word_matrix, nabla_context_matrix)``, each with the same
            shape as the weight matrix it belongs to.
        """
        # Forward pass
        u_w, y_hat = self._forward(a)
        u_w = u_w.reshape(-1, 1)

        # Backward pass
        delta = self.cost_derivative(y_hat, y).reshape(-1, 1)

        nabla_context_matrix = np.dot(delta, u_w.T)

        # Multiplying by a.T places the embedding gradient in the input word's column only.
        nabla_word_matrix = np.dot(self.weights_context_matrix.T, delta) @ a.T

        return nabla_word_matrix, nabla_context_matrix

    def cost_derivative(self, y_hat, y):
        """Return the prediction error summed over all context words.

        Args:
            y_hat: softmax output, shape (vocabulary size,).
            y: sequence of one-hot context vectors, shape (n_context, vocabulary size).

        Returns:
            Array of shape (vocabulary size,); all zeros when ``y`` is empty.
        """
        y = np.asarray(y, dtype=float)
        if y.size == 0:
            # A word without any context contributes no gradient.
            return np.zeros_like(y_hat, dtype=float)
        return np.sum(y_hat - y, axis=0)

    def softmax(self, z):
        """Return the softmax of ``z``, normalised over all of its elements."""
        # Shifting by the maximum does not change the result but keeps exp() from overflowing.
        exponentials = np.exp(z - np.max(z))
        return exponentials / exponentials.sum()

    def generateData(self, corpus, window_size):
        """Build the vocabulary and the training pairs from ``corpus``.

        Sets ``self.bag_of_words``, ``self.index_word_map`` and
        ``self.train_set``.  For every position in the corpus the context is
        the up-to-``window_size`` words before it followed by the
        up-to-``window_size`` words after it.

        Raises:
            ValueError: if ``window_size`` is negative.
        """
        if window_size < 0:
            raise ValueError("window_size must be non-negative, got {}".format(window_size))

        bag_of_words = {}
        for word in corpus:
            if word not in bag_of_words:
                bag_of_words[word] = len(bag_of_words)

        index_word_map = {index: word for word, index in bag_of_words.items()}

        train_set = []
        for index, word in enumerate(corpus):
            word_one_hot = oneHotEncoding(word, bag_of_words)

            # Clamp at 0: a negative slice start would wrap around to the end of the corpus.
            first = max(0, index - window_size)
            context_words = list(corpus[first:index]) + list(corpus[index + 1: index + 1 + window_size])

            context = [oneHotEncoding(context_word, bag_of_words) for context_word in context_words]

            train_set.append([word_one_hot, context])

        self.train_set = train_set
        self.bag_of_words = bag_of_words
        self.index_word_map = index_word_map
    
    
def oneHotEncoding(word, bag):
    """Return the one-hot vector that represents ``word`` in the vocabulary ``bag``.

    Args:
        word: key to look up in ``bag``.
        bag: mapping whose value for ``word`` is the position to set to 1
            (a missing key raises ``KeyError`` when ``bag`` is a dict).

    Returns:
        1-D float array of length ``len(bag)``: all zeros except a single 1
        at position ``bag[word]``.
    """
    encoded = np.zeros(shape = len(bag))
    position = bag[word]
    encoded[position] = 1
    return encoded

In [13]:
# Toy corpus: one tokenised sentence (note that 'the' occurs twice).
corpus = [
    'the', 'quick', 'brown', 'fox', 'jumped',
    'over', 'the', 'lazy', 'dog',
]

In [14]:
# The weight matrices are drawn from NumPy's global RNG (np.random.randn);
# seeding it first makes the initial weights, and so the trained model, reproducible.
np.random.seed(42)
wordToVec = wordToVector(2, corpus, 2)

In [15]:
# 500 passes over the training set with the default learning rate (eta = 0.01).
wordToVec.SGD(epocs = 500)

Epoc 0 Completed
Epoc 1 Completed
Epoc 2 Completed
Epoc 3 Completed
Epoc 4 Completed
Epoc 5 Completed
Epoc 6 Completed
Epoc 7 Completed
Epoc 8 Completed
Epoc 9 Completed
Epoc 10 Completed
Epoc 11 Completed
Epoc 12 Completed
Epoc 13 Completed
Epoc 14 Completed
Epoc 15 Completed
Epoc 16 Completed
Epoc 17 Completed
Epoc 18 Completed
Epoc 19 Completed
Epoc 20 Completed
Epoc 21 Completed
Epoc 22 Completed
Epoc 23 Completed
Epoc 24 Completed
Epoc 25 Completed
Epoc 26 Completed
Epoc 27 Completed
Epoc 28 Completed
Epoc 29 Completed
Epoc 30 Completed
Epoc 31 Completed
Epoc 32 Completed
Epoc 33 Completed
Epoc 34 Completed
Epoc 35 Completed
Epoc 36 Completed
Epoc 37 Completed
Epoc 38 Completed
Epoc 39 Completed
Epoc 40 Completed
Epoc 41 Completed
Epoc 42 Completed
Epoc 43 Completed
Epoc 44 Completed
Epoc 45 Completed
Epoc 46 Completed
Epoc 47 Completed
Epoc 48 Completed
Epoc 49 Completed
Epoc 50 Completed
Epoc 51 Completed
Epoc 52 Completed
Epoc 53 Completed
Epoc 54 Completed
Epoc 55 Completed
Ep

Epoc 456 Completed
Epoc 457 Completed
Epoc 458 Completed
Epoc 459 Completed
Epoc 460 Completed
Epoc 461 Completed
Epoc 462 Completed
Epoc 463 Completed
Epoc 464 Completed
Epoc 465 Completed
Epoc 466 Completed
Epoc 467 Completed
Epoc 468 Completed
Epoc 469 Completed
Epoc 470 Completed
Epoc 471 Completed
Epoc 472 Completed
Epoc 473 Completed
Epoc 474 Completed
Epoc 475 Completed
Epoc 476 Completed
Epoc 477 Completed
Epoc 478 Completed
Epoc 479 Completed
Epoc 480 Completed
Epoc 481 Completed
Epoc 482 Completed
Epoc 483 Completed
Epoc 484 Completed
Epoc 485 Completed
Epoc 486 Completed
Epoc 487 Completed
Epoc 488 Completed
Epoc 489 Completed
Epoc 490 Completed
Epoc 491 Completed
Epoc 492 Completed
Epoc 493 Completed
Epoc 494 Completed
Epoc 495 Completed
Epoc 496 Completed
Epoc 497 Completed
Epoc 498 Completed
Epoc 499 Completed


In [16]:
# One-hot vector of the word whose likely context words we want to inspect.
query = oneHotEncoding(word = 'fox', bag = wordToVec.bag_of_words)

In [17]:
# Every vocabulary word with its predicted probability, most probable first.
wordToVec.forwardPass(a = query)

{'quick': 0.24130559594689963,
 'jumped': 0.23338292894935525,
 'over': 0.20402902281476096,
 'brown': 0.15414204468039297,
 'dog': 0.10460490517629999,
 'lazy': 0.05617327609857089,
 'fox': 0.004968858047229815,
 'the': 0.0013933682864905323}