## Import Required Modules

In [119]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Softmax Activation Function

In [3]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

## One Hot Encoding

In [4]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [121]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 5 # dimension of word embeddings
        self.eta = 0.01 # learning rate
        self.epochs = 5000 # number of training epochs
        self.window = 2 # window size
        pass
    
    def generate_training_data(sentences, w2v):
    '''Function to generate training data for Word2Vec'''
    
        print('\n------GENERATE TRANING DATA------')
        
        # Generate word counts
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
                
        # Get vocabulary size
        self.V = len(word_counts.keys())
        
        # Get lookup dict
        self.words_list = sorted(list(word_counts.keys()),reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        
        training_data = []
        
        # Loop through each sentence in the corpus
        for sentence in corpus:
            sent_len = len(sentence)

            # Loop through each word in the sentence
            for i, word in enumerate(sentence):
        
                w_target = self.word2onehot(sentence[i])

                # Loop through context window
                w_context = []
                for j in range(i-self.window, i+self.window+1):
                    if j!=i and j<=sent_len-1 and j>=0:
                        w_context.append(self.word2onehot(sentence[j]))
                        
                training_data.append([w_target, w_context])
                
        return np.array(training_data)
        
    
  
    def initialize(self, V, data):
        '''Function to initialze the neural network'''
        self.V = V
        
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        
        self.words = data
        
        for i in range(len(data)):
            self.word_index[data[i]] = i
        
        print('\n----INITIALIZE WEIGHTS------')
        print('\n Size of W and W1 are: ', self.W.shape, self.W1.shape)
            
            
    def feed_forward(self, x):
        '''Function for feed-forward step'''
        self.h = np.dot(self.W.T, x)
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)
        
        print('\n----FEEDFORWARD STEP------')
        print('\n Size of y is: ', self.y.shape)
        return self.y
    
    def backpropagate(self, e, h, x):
        '''Function for back propagation using Stochastic Gradient Descent step'''
    
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(x, np.dot(self.W1, e.T))
        
        # Update the weights
        self.W = self.W - (self.learning_rate * dEdW)
        self.W1 = self.W1 - (self.learning_rate * dEdW1)
        
        print('\n----BACKPROPAGATION STEP------')
        print('\n Size of W and W1 is: ', self.W.shape, self.W1.shape)  
        
        pass
    
    def train(self, training_data, learning_rate = 0.01, epochs= 100):
        '''Function to train the Word2Vec model'''
        
        self.epochs = epochs
    
        # Loop through each epoch
        for i in range(0, self.epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # FORWARD PASS
                y_pred, h, u = self.feed_forward(w_t)
                
                # CALCULATE ERROR
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backpropagate(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                #self.loss += -2*np.log(len(w_c)) -np.sum([u[word.index(1)] for word in w_c]) + (len(w_c) * np.log(np.sum(np.exp(u))))
                
            print( 'EPOCH:',i);
            
            print( '\nLOSS:', self.loss);
        pass
            
    def predict(self, word, number_of_predictions):
        
        '''Function to predict context words using Word2Vec model'''
        
        # Check if word is contained in the dictionary
        if word in self.words: 
            
            index = self.word_index[word]
            v_w1 = self.w1[index]
            # CYCLE THROUGH VOCAB
            
            word_sim = {}
            for i in range(self.V):
                v_w2 = self.W[i]
                
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
            
    
    

## Data Preprocessing

In [112]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    
    stop_words = stopwords.words('english')
    
    # Split text corpus into sentences
    sentences = corpus.split(".")
    
    # Loop through each sentence
    for i in range(len(sentences)):
        
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        
        # Split sentence into list of words
        sentence = sentences[i].split()
        
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        
        # Convert to lower case
        x = [word.lower() for word in x]
        
        processed.append(x) 
        
    print('\nProcessed sentence is:',  processed)
        
    return processed    

## Train the Model

## Run the model

In [118]:
text_corpus = "Welcome students to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Train the model
w2v.train(epochs)



Processed sentence is: [['welcome', 'students', 'department', 'computer', 'science'], ['we', 'great', 'faculty', 'professors'], ['we', 'welcome', 'program', 'today'], []]

------GENERATE TRANING DATA------

Vocabulary is: {'computer': 0, 'department': 1, 'faculty': 2, 'great': 3, 'professors': 4, 'program': 5, 'science': 6, 'students': 7, 'today': 8, 'we': 9, 'welcome': 10} 


Vocabulary size is (V): 11 


----INITIALIZE FUNCTION------

Size of W and W1 are:  (11, 10) (10, 11)

Size of X_train and y_train is: 13 13

X_train is: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]] 

y_train



In [105]:
# Predict using model  
print('\nThe predicted context words are', w2v.predict("welcome", 5))


----FEEDFORWARD FUNCTION------

Size of y is:  (11, 1)

The predicted context words are ['great', 'welcome']


In [92]:
w2v.word_index.items()

dict_items([('computer', 0), ('department', 1), ('faculty', 2), ('great', 3), ('professors', 4), ('program', 5), ('science', 6), ('students', 7), ('today', 8), ('we', 9), ('welcome', 10)])

In [96]:
len(w2v.y_train)

13