## Import Required Modules

In [1]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Softmax Activation Function

In [2]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_1(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=0)

## One Hot Encoding

In [3]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [4]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        
        self.window_size = 2
        self.alpha = 0.001
        
        self.words = []
        self.word_index = {}
    
    def initialize(self, V, data):
        '''Function to initialze the neural network'''
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
            
    def feed_forward(self, X):
        '''Function for feed-forward step'''
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) 
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)   
        return self.y
    
    def backpropagate(self, x, t):
        '''Function for back propagation using Stochastic Gradient Descent step'''
        e = self.y - np.asarray(t).reshape(self.V, 1) #e.shape is V X 1
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        X = np.array(x).reshape(self.V, 1)
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(X, np.dot(self.W1, e).T)
        
        # Update the weights
        self.W1 = self.W1 - self.alpha * dEdW1
        self.W = self.W - self.alpha * dEdW
    
    def train(self, epochs):
        '''Function to train the Word2Vec model'''
        # Loop through each epoch
        for x in range(1,epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # Loop through each training sample
            for j in range(len(self.X_train)):
                # Forward Pass
                self.feed_forward(self.X_train[j])
                
                # Backpropagation
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                        
                # Calculate Loss        
                self.loss += C*np.log(np.sum(np.exp(self.u)))
                
            print("Epoch: ", x, " Loss: ", self.loss)
            self.alpha *= 1/((1+self.alpha*x))
            
    def predict(self, word, number_of_predictions):       
        # Check if word is contained in the dictionary
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
    

## Data Preprocessing

In [5]:
def preprocessing(corpus):
    processed = []
    stop_words = stopwords.words('english')
    # Split text corpus into sentences
    sentences = corpus.split(".")
    # Loop through each sentence
    for i in range(len(sentences)):
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        # Split sentence into list of words
        sentence = sentences[i].split()
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        # Convert to lowe
        x = [word.lower() for word in x] 
        processed.append(x) 
    return processed    

## Train the Model

In [6]:
def generate_training_data(sentences, w2v):
    data = {}
    # Loop throuch each sentence
    for sentence in sentences:
        # Loop through each word
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) # Size of Vocabulary
    data = sorted(list(data.keys()))
    
    vocab = {}
    # Store words into vocabulary
    for i in range(len(data)): 
        vocab[data[i]] = i 
       
    # Loop through each sentence 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 
    return w2v.X_train,w2v.y_train

In [7]:
text_corpus = "" 
text_corpus += "Welcome to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Generate Training data
generate_training_data(training_data,w2v)

# Train the model
w2v.train(epochs)

# Predict using model  
print(w2v.predict("welcome", 5))

Epoch:  1  Loss:  61.802825656965155
Epoch:  2  Loss:  61.70624913849453
Epoch:  3  Loss:  61.6100843314518
Epoch:  4  Loss:  61.51442327288594
Epoch:  5  Loss:  61.41935566267462
Epoch:  6  Loss:  61.32496836682249
Epoch:  7  Loss:  61.23134495739262
Epoch:  8  Loss:  61.13856529489851
Epoch:  9  Loss:  61.04670515794645
Epoch:  10  Loss:  60.955835923791305
Epoch:  11  Loss:  60.86602430229532
Epoch:  12  Loss:  60.77733212460163
Epoch:  13  Loss:  60.689816186693214
Epoch:  14  Loss:  60.603528146935865
Epoch:  15  Loss:  60.518514475734484
Epoch:  16  Loss:  60.434816454583995
Epoch:  17  Loss:  60.35247022108936
Epoch:  18  Loss:  60.271506855968795
Epoch:  19  Loss:  60.19195250764474
Epoch:  20  Loss:  60.113828549763035
Epoch:  21  Loss:  60.03715176685352
Epoch:  22  Loss:  59.96193456334297
Epoch:  23  Loss:  59.88818519123573
Epoch:  24  Loss:  59.815907991975784
Epoch:  25  Loss:  59.74510364827202
Epoch:  26  Loss:  59.67576944199605
Epoch:  27  Loss:  59.60789951462478
Ep

Epoch:  317  Loss:  56.179800834437586
Epoch:  318  Loss:  56.17823003696549
Epoch:  319  Loss:  56.1766690057993
Epoch:  320  Loss:  56.175117650722115
Epoch:  321  Loss:  56.1735758826178
Epoch:  322  Loss:  56.172043613454065
Epoch:  323  Loss:  56.170520756266455
Epoch:  324  Loss:  56.169007225141975
Epoch:  325  Loss:  56.167502935203686
Epoch:  326  Loss:  56.16600780259497
Epoch:  327  Loss:  56.164521744464494
Epoch:  328  Loss:  56.1630446789512
Epoch:  329  Loss:  56.161576525169686
Epoch:  330  Loss:  56.160117203195846
Epoch:  331  Loss:  56.158666634052615
Epoch:  332  Loss:  56.157224739696126
Epoch:  333  Loss:  56.155791443002116
Epoch:  334  Loss:  56.154366667752335
Epoch:  335  Loss:  56.152950338621565
Epoch:  336  Loss:  56.15154238116452
Epoch:  337  Loss:  56.15014272180307
Epoch:  338  Loss:  56.14875128781391
Epoch:  339  Loss:  56.14736800731609
Epoch:  340  Loss:  56.14599280925903
Epoch:  341  Loss:  56.1446256234105
Epoch:  342  Loss:  56.14326638034511
Ep

Epoch:  651  Loss:  55.92285482504328
Epoch:  652  Loss:  55.922480454265504
Epoch:  653  Loss:  55.92210723099727
Epoch:  654  Loss:  55.92173514997886
Epoch:  655  Loss:  55.92136420598249
Epoch:  656  Loss:  55.920994393812336
Epoch:  657  Loss:  55.920625708303994
Epoch:  658  Loss:  55.9202581443245
Epoch:  659  Loss:  55.91989169677187
Epoch:  660  Loss:  55.91952636057505
Epoch:  661  Loss:  55.919162130693586
Epoch:  662  Loss:  55.91879900211736
Epoch:  663  Loss:  55.91843696986652
Epoch:  664  Loss:  55.918076028991024
Epoch:  665  Loss:  55.91771617457064
Epoch:  666  Loss:  55.91735740171462
Epoch:  667  Loss:  55.91699970556139
Epoch:  668  Loss:  55.91664308127852
Epoch:  669  Loss:  55.91628752406236
Epoch:  670  Loss:  55.91593302913792
Epoch:  671  Loss:  55.91557959175855
Epoch:  672  Loss:  55.91522720720589
Epoch:  673  Loss:  55.9148758707895
Epoch:  674  Loss:  55.914525577846725
Epoch:  675  Loss:  55.91417632374251
Epoch:  676  Loss:  55.91382810386913
Epoch:  

Epoch:  982  Loss:  55.840616100354026
Epoch:  983  Loss:  55.84045166023629
Epoch:  984  Loss:  55.84028755486756
Epoch:  985  Loss:  55.84012378322741
Epoch:  986  Loss:  55.839960344299556
Epoch:  987  Loss:  55.8397972370718
Epoch:  988  Loss:  55.839634460536075
Epoch:  989  Loss:  55.83947201368841
Epoch:  990  Loss:  55.83930989552886
Epoch:  991  Loss:  55.83914810506147
Epoch:  992  Loss:  55.83898664129444
Epoch:  993  Loss:  55.838825503239846
Epoch:  994  Loss:  55.838664689913756
Epoch:  995  Loss:  55.83850420033625
Epoch:  996  Loss:  55.838344033531286
Epoch:  997  Loss:  55.83818418852679
Epoch:  998  Loss:  55.8380246643545
Epoch:  999  Loss:  55.837865460050104
['department', 'we', 'program', 'computer', 'professors']
