## Import Required Modules

In [119]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Softmax Activation Function

In [135]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

## Data Preprocessing

In [112]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    
    stop_words = stopwords.words('english')
    
    # Split text corpus into sentences
    sentences = corpus.split(".")
    
    # Loop through each sentence
    for i in range(len(sentences)):
        
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        
        # Split sentence into list of words
        sentence = sentences[i].split()
        
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        
        # Convert to lower case
        x = [word.lower() for word in x]
        
        processed.append(x) 
        
    print('\nProcessed sentence is:',  processed)
        
    return processed    

## Word2Vec

In [155]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 5 # dimension of word embeddings
        self.learning_rate = 0.01 # learning rate
        self.epochs = 5000 # number of training epochs
        self.window = 2 # window size
        pass
    
    def generate_training_data(self, corpus):
        '''Function to generate training data for Word2Vec'''
    
        print('\n------GENERATE TRANING DATA------')
        
        # Generate word counts
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
                
        # Get vocabulary size
        self.V = len(word_counts.keys())
        
        # Get lookup dict
        self.words_list = sorted(list(word_counts.keys()),reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        
        training_data = []
        
        # Loop through each sentence in the corpus
        for sentence in corpus:
            sent_len = len(sentence)

            # Loop through each word in the sentence
            for i, word in enumerate(sentence):
        
                w_target = self.onehotencoding(sentence[i])

                # Loop through context window
                w_context = []
                for j in range( i-self.window, i+self.window+1 ):
                    if j!=i and j <= sent_len-1 and j >= 0:
                        w_context.append( self.onehotencoding(sentence[j]) )
                        
                training_data.append([w_target, w_context])     
        return np.array(training_data)
        
    def onehotencoding(self, word):
        '''Function to covert a word to one-hot-encoded value'''
        word_vec = [0 for i in range(0, self.V)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec
    
    def feed_forward(self, x):
        '''Function for feed-forward step'''
        h = np.dot(self.W.T, x)
        u = np.dot(self.W1.T, h)
        y = softmax(u)
        
        print('\n----FEEDFORWARD STEP------')
        print('\n Size of y is: ', y.shape)
        return y, h, u
    
    def backpropagate(self, e, h, x):
        '''Function for back propagation using Stochastic Gradient Descent step'''
    
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.outer(h, e)
        
        # Calculate partial derivative of loss function wrt W
        dEdW = np.outer(x, np.dot(self.W1, e.T))
        
        # Update the weights
        self.W = self.W - (self.learning_rate * dEdW)
        self.W1 = self.W1 - (self.learning_rate * dEdW1)
        
        print('\n----BACKPROPAGATION STEP------')
        print('\n Size of W and W1 is: ', self.W.shape, self.W1.shape)  
        pass          
    
    def train(self, training_data, learning_rate = 0.01, epochs= 100):
        '''Function to train the Word2Vec model'''
        
        self.epochs = epochs
        self.learning_rate = learning_rate
        
        # Initialize weight matrices
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        
        print('\n----INITIALIZE WEIGHTS------')
        print('\n Size of W and W1 are: ', self.W.shape, self.W1.shape)
        
        # Loop through each epoch
        for i in range(0, self.epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # FORWARD PASS
                y_pred, h, u = self.feed_forward(w_t)
                
                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backpropagate(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                #self.loss += -2*np.log(len(w_c)) -np.sum([u[word.index(1)] for word in w_c]) + (len(w_c) * np.log(np.sum(np.exp(u))))
                
            print( 'EPOCH:',i);
            print( '\nLOSS:', self.loss);
            
        pass
            
    def predict(self, word, number_of_predictions):
        '''Function to predict context words using Word2Vec model'''
        
        # Check if word is contained in the dictionary
        if word in self.words_list: 
            
            index = self.word_index[word]
            v_W = self.W[index]
            
            # Loop through words in vocabulary
            word_sim = {}
            for i in range(self.V):
                v_W1 = self.W[i]
                theta_num = np.dot(v_W, v_W1)
                theta_den = np.linalg.norm(v_W) * np.linalg.norm(v_W1)
                theta = theta_num / theta_den
                
                word = self.index_word[i]
                word_sim[word] = theta
                
            words_sorted = sorted(word_sim.items(), key= lambda x:x[1], reverse=True)
            
            for word, sim in words_sorted[:number_of_predictions]:
                print(word, sim)
        else:
            print("Error: Word not found in dicitonary")
            
        pass

## Train the Model

In [156]:
np.random.seed(0) # set the seed for reproducibility

corpus = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

# INITIALIZE W1V MODEL
w2v = word2vec()

# generate training data
training_data = w2v.generate_training_data(corpus)

# train word2vec model
w2v.train(training_data)


------GENERATE TRANING DATA------

----INITIALIZE WEIGHTS------

 Size of W and W1 are:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----B

EPOCH: 29

LOSS: 61.07444341301829

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----

EPOCH: 59

LOSS: 57.0911277800063

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----F

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)
EPOCH: 94

LOSS: 52.74213655314484

----FEEDFORWARD STEP------

 Size of y is:  (8,)

----BACKPROPAGATION STEP------

 Size of W and W1 is:  (8, 5) (5, 8)

----F

## Predict on the model

In [158]:
# Predict using model  
print('\nThe predicted context words are', w2v.predict("fox", 2))

fox 1.0
over 0.46526382689168677

The predicted context words are None
