## Import Required Modules

In [1]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Softmax Activation Function

In [2]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_1(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=0)

## One Hot Encoding

In [3]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [4]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        
        self.window_size = 2
        self.alpha = 0.001
        
        self.words = []
        self.word_index = {}
    
    def initialize(self, V, data):
        '''Function to initialze the neural network'''
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
            
    def feed_forward(self, X):
        '''Function for feed-forward step'''
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) 
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)   
        return self.y
    
    def backpropagate(self, x, t):
        '''Function for back propagation using Stochastic Gradient Descent step'''
        e = self.y - np.asarray(t).reshape(self.V, 1) #e.shape is V X 1
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        X = np.array(x).reshape(self.V, 1)
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(X, np.dot(self.W1, e).T)
        
        # Update the weights
        self.W1 = self.W1 - self.alpha * dEdW1
        self.W = self.W - self.alpha * dEdW
    
    def train(self, epochs):
        '''Function to train the Word2Vec model'''
        
        # Generate Training data
        generate_training_data(training_data,w2v)

        # Loop through each epoch
        for x in range(1,epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # Loop through each training sample
            for j in range(len(self.X_train)):
                
                # Forward Pass
                self.feed_forward(self.X_train[j])
                
                # Backpropagation
                self.backpropagate(self.X_train[j],self.y_train[j])
                
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                        
                # Calculate Loss        
                self.loss += C*np.log(np.sum(np.exp(self.u)))
                
            print("Epoch: ", x, " Loss: ", self.loss)
            self.alpha *= 1/((1+self.alpha*x))
            
    def predict(self, word, number_of_predictions):
        '''Function to predict context words using Word2Vec model'''
        # Check if word is contained in the dictionary
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
    

## Data Preprocessing

In [5]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    stop_words = stopwords.words('english')
    # Split text corpus into sentences
    sentences = corpus.split(".")
    # Loop through each sentence
    for i in range(len(sentences)):
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        # Split sentence into list of words
        sentence = sentences[i].split()
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        # Convert to lower case
        x = [word.lower() for word in x]
        processed.append(x) 
    return processed    

## Train the Model

In [16]:
def generate_training_data(sentences, w2v):
    '''Function to generate training data for Word2Vec'''
    data = {}
    
    # Loop throuch each sentence
    for sentence in sentences:
        # Loop through each word
        for word in sentence:
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) # Size of Vocabulary
    data = sorted(list(data.keys()))
    
    vocab = {}
    # Store words into vocabulary
    for i in range(len(data)): 
        vocab[data[i]] = i 
    
    print('\nVocabulary:', vocab, '\n')
    
    # Loop through each sentence 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
            # Get the context words from the sliding window  
            for j in range(i-w2v.window_size, i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
                    
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context)
            
    w2v.initialize(V, data)
    return w2v.X_train, w2v.y_train

## Run the model

In [29]:
text_corpus = "Welcome students to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Train the model
w2v.train(epochs)

# Predict using model  
print('\nThe predicted context words are', w2v.predict("welcome", 5))


Vocabulary: {'computer': 0, 'department': 1, 'faculty': 2, 'great': 3, 'professors': 4, 'program': 5, 'science': 6, 'students': 7, 'today': 8, 'we': 9, 'welcome': 10} 

Epoch:  1  Loss:  72.69769323019779
Epoch:  2  Loss:  72.5930390857227
Epoch:  3  Loss:  72.48883010829633
Epoch:  4  Loss:  72.38516577226034
Epoch:  5  Loss:  72.28214303588062
Epoch:  6  Loss:  72.17985580507238
Epoch:  7  Loss:  72.07839443648926
Epoch:  8  Loss:  71.9778452862734
Epoch:  9  Loss:  71.87829030964802
Epoch:  10  Loss:  71.77980671532326
Epoch:  11  Loss:  71.68246667742335
Epoch:  12  Loss:  71.58633710637427
Epoch:  13  Loss:  71.4914794789621
Epoch:  14  Loss:  71.39794972661537
Epoch:  15  Loss:  71.30579817991847
Epoch:  16  Loss:  71.2150695664476
Epoch:  17  Loss:  71.12580305825394
Epoch:  18  Loss:  71.0380323647145
Epoch:  19  Loss:  70.95178586602292
Epoch:  20  Loss:  70.8670867823065
Epoch:  21  Loss:  70.78395337321435
Epoch:  22  Loss:  70.702399162815
Epoch:  23  Loss:  70.62243318475

Epoch:  305  Loss:  66.61642138279477
Epoch:  306  Loss:  66.61457614673746
Epoch:  307  Loss:  66.61274281370493
Epoch:  308  Loss:  66.61092126968713
Epoch:  309  Loss:  66.60911140211529
Epoch:  310  Loss:  66.60731309983943
Epoch:  311  Loss:  66.60552625310612
Epoch:  312  Loss:  66.60375075353689
Epoch:  313  Loss:  66.60198649410688
Epoch:  314  Loss:  66.60023336912384
Epoch:  315  Loss:  66.59849127420772
Epoch:  316  Loss:  66.59676010627035
Epoch:  317  Loss:  66.59503976349585
Epoch:  318  Loss:  66.593330145321
Epoch:  319  Loss:  66.59163115241626
Epoch:  320  Loss:  66.58994268666707
Epoch:  321  Loss:  66.58826465115538
Epoch:  322  Loss:  66.58659695014155
Epoch:  323  Loss:  66.58493948904675
Epoch:  324  Loss:  66.5832921744354
Epoch:  325  Loss:  66.58165491399814
Epoch:  326  Loss:  66.58002761653498
Epoch:  327  Loss:  66.57841019193874
Epoch:  328  Loss:  66.57680255117896
Epoch:  329  Loss:  66.5752046062859
Epoch:  330  Loss:  66.57361627033487
Epoch:  331  Los

Epoch:  607  Loss:  66.33459483205661
Epoch:  608  Loss:  66.33412599511357
Epoch:  609  Loss:  66.33365869825899
Epoch:  610  Loss:  66.33319293393018
Epoch:  611  Loss:  66.3327286946137
Epoch:  612  Loss:  66.33226597284518
Epoch:  613  Loss:  66.33180476120864
Epoch:  614  Loss:  66.3313450523365
Epoch:  615  Loss:  66.33088683890868
Epoch:  616  Loss:  66.33043011365268
Epoch:  617  Loss:  66.3299748693429
Epoch:  618  Loss:  66.32952109880038
Epoch:  619  Loss:  66.3290687948924
Epoch:  620  Loss:  66.32861795053212
Epoch:  621  Loss:  66.32816855867821
Epoch:  622  Loss:  66.32772061233453
Epoch:  623  Loss:  66.32727410454964
Epoch:  624  Loss:  66.32682902841657
Epoch:  625  Loss:  66.3263853770724
Epoch:  626  Loss:  66.32594314369796
Epoch:  627  Loss:  66.3255023215174
Epoch:  628  Loss:  66.32506290379801
Epoch:  629  Loss:  66.32462488384967
Epoch:  630  Loss:  66.32418825502461
Epoch:  631  Loss:  66.32375301071713
Epoch:  632  Loss:  66.32331914436323
Epoch:  633  Loss:

Epoch:  895  Loss:  66.24289261128477
Epoch:  896  Loss:  66.24267699995139
Epoch:  897  Loss:  66.24246187000163
Epoch:  898  Loss:  66.24224721982648
Epoch:  899  Loss:  66.2420330478241
Epoch:  900  Loss:  66.24181935239972
Epoch:  901  Loss:  66.24160613196568
Epoch:  902  Loss:  66.24139338494135
Epoch:  903  Loss:  66.24118110975309
Epoch:  904  Loss:  66.24096930483421
Epoch:  905  Loss:  66.240757968625
Epoch:  906  Loss:  66.24054709957258
Epoch:  907  Loss:  66.24033669613095
Epoch:  908  Loss:  66.24012675676084
Epoch:  909  Loss:  66.23991727992987
Epoch:  910  Loss:  66.23970826411235
Epoch:  911  Loss:  66.23949970778921
Epoch:  912  Loss:  66.23929160944816
Epoch:  913  Loss:  66.23908396758347
Epoch:  914  Loss:  66.23887678069599
Epoch:  915  Loss:  66.23867004729314
Epoch:  916  Loss:  66.23846376588887
Epoch:  917  Loss:  66.2382579350036
Epoch:  918  Loss:  66.23805255316421
Epoch:  919  Loss:  66.23784761890391
Epoch:  920  Loss:  66.23764313076242
Epoch:  921  Los