## Import Required Modules

In [1]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Softmax Activation Function

In [2]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_1(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=0)

## One Hot Encoding

In [3]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [4]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        
        self.window_size = 2
        self.alpha = 0.001
        
        self.words = []
        self.word_index = {}
    
    def initialize(self, V, data):
        '''Function to initialze the neural network'''
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
            
    def feed_forward(self, X):
        '''Function for feed-forward step'''
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) 
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)   
        return self.y
    
    def backpropagate(self, x, t):
        '''Function for back propagation using Stochastic Gradient Descent step'''
        e = self.y - np.asarray(t).reshape(self.V, 1) #e.shape is V X 1
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        X = np.array(x).reshape(self.V, 1)
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(X, np.dot(self.W1, e).T)
        
        # Update the weights
        self.W1 = self.W1 - self.alpha * dEdW1
        self.W = self.W - self.alpha * dEdW
    
    def train(self, epochs):
        '''Function to train the Word2Vec model'''
        # Loop through each epoch
        for x in range(1,epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # Loop through each training sample
            for j in range(len(self.X_train)):
                # Forward Pass
                self.feed_forward(self.X_train[j])
                
                # Backpropagation
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                        
                # Calculate Loss        
                self.loss += C*np.log(np.sum(np.exp(self.u)))
                
            print("Epoch: ", x, " Loss: ", self.loss)
            self.alpha *= 1/((1+self.alpha*x))
            
    def predict(self, word, number_of_predictions):
        '''Function to predict context words using Word2Vec model'''
        # Check if word is contained in the dictionary
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
    

## Data Preprocessing

In [5]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    stop_words = stopwords.words('english')
    # Split text corpus into sentences
    sentences = corpus.split(".")
    # Loop through each sentence
    for i in range(len(sentences)):
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        # Split sentence into list of words
        sentence = sentences[i].split()
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        # Convert to lower case
        x = [word.lower() for word in x]
        processed.append(x) 
    return processed    

## Train the Model

In [16]:
def generate_training_data(sentences, w2v):
    '''Function to generate training data for Word2Vec'''
    data = {}
    
    # Loop throuch each sentence
    for sentence in sentences:
        # Loop through each word
        for word in sentence:
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) # Size of Vocabulary
    data = sorted(list(data.keys()))
    
    vocab = {}
    # Store words into vocabulary
    for i in range(len(data)): 
        vocab[data[i]] = i 
    
    print('\nVocabulary:', vocab, '\n')
    
    # Loop through each sentence 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
                    
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context)
            
    w2v.initialize(V, data)
    return w2v.X_train, w2v.y_train

## Run the model

In [17]:
text_corpus = "Welcome to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Generate Training data
generate_training_data(training_data,w2v)

# Train the model
w2v.train(epochs)

# Predict using model  
print(w2v.predict("welcome", 5))


Vocabulary: {'computer': 0, 'department': 1, 'faculty': 2, 'great': 3, 'professors': 4, 'program': 5, 'science': 6, 'today': 7, 'we': 8, 'welcome': 9} 

Epoch:  1  Loss:  58.85458113051705
Epoch:  2  Loss:  58.78457304208971
Epoch:  3  Loss:  58.714811439017396
Epoch:  4  Loss:  58.64536360964764
Epoch:  5  Loss:  58.57629535656976
Epoch:  6  Loss:  58.50767062753499
Epoch:  7  Loss:  58.439551170757525
Epoch:  8  Loss:  58.37199621906913
Epoch:  9  Loss:  58.30506220668431
Epoch:  10  Loss:  58.2388025215414
Epoch:  11  Loss:  58.17326729535166
Epoch:  12  Loss:  58.108503232637844
Epoch:  13  Loss:  58.04455347921334
Epoch:  14  Loss:  57.98145752976072
Epoch:  15  Loss:  57.91925117344453
Epoch:  16  Loss:  57.85796647585016
Epoch:  17  Loss:  57.79763179499536
Epoch:  18  Loss:  57.73827182871855
Epoch:  19  Loss:  57.67990769041429
Epoch:  20  Loss:  57.62255700985579
Epoch:  21  Loss:  57.566234055717516
Epoch:  22  Loss:  57.510949876373346
Epoch:  23  Loss:  57.45671245559139


Epoch:  310  Loss:  54.69744321385437
Epoch:  311  Loss:  54.69620055534294
Epoch:  312  Loss:  54.6949657760744
Epoch:  313  Loss:  54.69373880184906
Epoch:  314  Loss:  54.69251955938983
Epoch:  315  Loss:  54.69130797632777
Epoch:  316  Loss:  54.69010398118826
Epoch:  317  Loss:  54.688907503377415
Epoch:  318  Loss:  54.68771847316846
Epoch:  319  Loss:  54.68653682168871
Epoch:  320  Loss:  54.68536248090652
Epoch:  321  Loss:  54.68419538361863
Epoch:  322  Loss:  54.68303546343767
Epoch:  323  Loss:  54.68188265477983
Epoch:  324  Loss:  54.68073689285292
Epoch:  325  Loss:  54.67959811364447
Epoch:  326  Loss:  54.67846625391013
Epoch:  327  Loss:  54.677341251162275
Epoch:  328  Loss:  54.676223043658744
Epoch:  329  Loss:  54.67511157039185
Epoch:  330  Loss:  54.67400677107762
Epoch:  331  Loss:  54.67290858614504
Epoch:  332  Loss:  54.67181695672571
Epoch:  333  Loss:  54.67073182464357
Epoch:  334  Loss:  54.66965313240473
Epoch:  335  Loss:  54.668580823187696
Epoch:  3

Epoch:  648  Loss:  54.49507891025352
Epoch:  649  Loss:  54.49479234375352
Epoch:  650  Loss:  54.49450665884827
Epoch:  651  Loss:  54.49422185148218
Epoch:  652  Loss:  54.49393791762439
Epoch:  653  Loss:  54.4936548532687
Epoch:  654  Loss:  54.493372654433294
Epoch:  655  Loss:  54.49309131716067
Epoch:  656  Loss:  54.49281083751722
Epoch:  657  Loss:  54.49253121159341
Epoch:  658  Loss:  54.49225243550325
Epoch:  659  Loss:  54.49197450538431
Epoch:  660  Loss:  54.491697417397525
Epoch:  661  Loss:  54.49142116772696
Epoch:  662  Loss:  54.491145752579634
Epoch:  663  Loss:  54.490871168185414
Epoch:  664  Loss:  54.49059741079682
Epoch:  665  Loss:  54.49032447668885
Epoch:  666  Loss:  54.49005236215874
Epoch:  667  Loss:  54.48978106352593
Epoch:  668  Loss:  54.48951057713183
Epoch:  669  Loss:  54.48924089933963
Epoch:  670  Loss:  54.48897202653416
Epoch:  671  Loss:  54.48870395512176
Epoch:  672  Loss:  54.48843668153012
Epoch:  673  Loss:  54.48817020220806
Epoch:  6

Epoch:  984  Loss:  54.43158084415211
Epoch:  985  Loss:  54.43145655705449
Epoch:  986  Loss:  54.43133252229946
Epoch:  987  Loss:  54.43120873911986
Epoch:  988  Loss:  54.43108520675154
Epoch:  989  Loss:  54.43096192443356
Epoch:  990  Loss:  54.43083889140794
Epoch:  991  Loss:  54.430716106919775
Epoch:  992  Loss:  54.43059357021728
Epoch:  993  Loss:  54.43047128055166
Epoch:  994  Loss:  54.43034923717704
Epoch:  995  Loss:  54.43022743935071
Epoch:  996  Loss:  54.430105886332804
Epoch:  997  Loss:  54.429984577386456
Epoch:  998  Loss:  54.42986351177781
Epoch:  999  Loss:  54.4297426887759
['professors', 'we', 'department', 'today', 'program']
