## Import Required Modules

In [2]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Softmax Activation Function

In [3]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x. This implementation provides better numerical stability.'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_1(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=0)

## One Hot Encoding

In [4]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [65]:
class word2vec():
    '''Implementation of Skip-Gram Word2Vec model'''
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        
        self.window_size = 2
        self.learning_rate = 0.01
        
        self.words = []
        self.word_index = {}
    
    def initialize(self, V, data):
        '''Function to initialze the neural network'''
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        
        for i in range(len(data)):
            self.word_index[data[i]] = i
        
        print('\n----INITIALIZE FUNCTION------')
        print('\nSize of W and W1 are: ', self.W.shape, self.W1.shape)
            
            
    def feed_forward(self, X):
        '''Function for feed-forward step'''
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) 
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u) 
        print('\n----FEEDFORWARD FUNCTION------')
        print('\nSize of y is: ', self.y.shape)
        return self.y
    
    def backpropagate(self, x, t):
        '''Function for back propagation using Stochastic Gradient Descent step'''
        e = self.y - np.asarray(t).reshape(self.V, 1) #e.shape is V X 1
        
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        X = np.array(x).reshape(self.V, 1)
        
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(X, np.dot(self.W1, e).T)
        
        # Update the weights
        self.W1 = self.W1 - self.learning_rate * dEdW1
        self.W = self.W - self.learning_rate * dEdW
    
    def train(self, learning_rate = 0.01, epochs= 100):
        
        self.learning_rate = learning_rate
        self.epochs = epochs
        
        '''Function to train the Word2Vec model'''
        
        # Generate Training data
        generate_training_data(training_data,w2v)
        
        X_real = self.X_train
        y_real = self.y_train
              
        # Loop through each epoch
        for Pass in range(1,epochs):
                
            # Pick a random index
            index = np.random.randint(y_real)
            print(index)
            
            # Pick a row from that index
            X_train = X_real[index]
            y_train = y_real[index]
            
            # Initialize Loss
            self.loss = 0
            
            # Forward Pass
            self.feed_forward(X_train)
                
            # Backpropagation
            self.backpropagate(X_train, y_train)
                
            C = 0
            for m in range(self.V): 
                if(y_train[m]): 
                    self.loss += -1*self.u[m][0] 
                    C += 1
                        
            # Calculate Loss        
            self.loss += C * np.log(np.sum(np.exp(self.u)))
                
            print("Epoch: ", Pass, " Loss: ", self.loss)
            
            #self.alpha *= 1/((1+self.alpha*Pass))
            
    def predict(self, word, number_of_predictions):
        '''Function to predict context words using Word2Vec model'''
        # Check if word is contained in the dictionary
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
    

## Data Preprocessing

In [47]:
def preprocessing(corpus):
    '''Function for data preprocessing'''
    processed = []
    stop_words = stopwords.words('english')
    
    # Split text corpus into sentences
    sentences = corpus.split(".")
    
    # Loop through each sentence
    for i in range(len(sentences)):
        
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        
        # Split sentence into list of words
        sentence = sentences[i].split()
        
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        
        # Convert to lower case
        x = [word.lower() for word in x]
        
        processed.append(x) 
        
    return processed    

## Train the Model

In [70]:
def generate_training_data(sentences, w2v):
    '''Function to generate training data for Word2Vec'''
    
    print('\n------GENERATE TRANING DATA------')
    
    data = {}
    
    # Loop throuch each sentence
    for sentence in sentences:
        
        # Loop through each word
        for word in sentence:
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
                
    V = len(data) # Size of Vocabulary
    data = sorted(list(data.keys()))
    
    vocab = {}
    
    # Store words into vocabulary
    for i in range(len(data)): 
        vocab[data[i]] = i 
    
    print('\nVocabulary is:', vocab, '\n')
    print('\nVocabulary size is (V):', len(vocab),'\n')
    
    # Loop through each sentence to find center words and context words 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            
            # Get the context words from the sliding window 
            context = [0 for x in range(V)]
            for j in range(i-w2v.window_size, i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
                    
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context)
            
    w2v.initialize(V, data)
    
    print('\nSize of X_train and y_train is:', len(w2v.X_train), len(w2v.y_train))
    
    print('\nX_train is:', w2v.X_train, '\n\ny_train is:', w2v.y_train)
    
    return w2v.X_train, w2v.y_train

## Run the model

In [71]:
text_corpus = "Welcome students to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Train the model
w2v.train(epochs)

# Predict using model  
print('\nThe predicted context words are', w2v.predict("welcome", 5))


------GENERATE TRANING DATA------

Vocabulary is: {'computer': 0, 'department': 1, 'faculty': 2, 'great': 3, 'professors': 4, 'program': 5, 'science': 6, 'students': 7, 'today': 8, 'we': 9, 'welcome': 10} 


Vocabulary size is (V): 11 


----INITIALIZE FUNCTION------

Size of W and W1 are:  (11, 10) (10, 11)

Size of X_train and y_train is: 13 13

X_train is: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]] 

y_train is: [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0

ValueError: low >= high

NameError: name 'y_real' is not defined