## Import Required Modules

In [1]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Softmax Activation Function

In [2]:
def softmax(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_1(x):
    '''Function to compute the Softmax values for each sets of scores in x'''
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=0)

## One Hot Encoding

In [3]:
def onehotencoding(self, word):
    '''Function to covert a word to one-hot-encoded value'''
    word_vec = [0 for i in range(0, self.v_count)]
    word_index = self.word_index[word]
    word_vec[word_index] = 1
    return word_vec

## Word2Vec

In [4]:
class word2vec(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        self.window_size = 2
        self.alpha = 0.001
        self.words = []
        self.word_index = {}
    
    def initialize(self,V,data):
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
            
    def feed_forward(self, X): 
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) 
        self.u = np.dot(self.W1.T, self.h)
        self.y = softmax(self.u)   
        return self.y
    
    def backpropagate(self, x, t):
        e = self.y - np.asarray(t).reshape(self.V, 1) #e.shape is V X 1
        # Calculate partial derivative of loss function wrt W1
        dEdW1 = np.dot(self.h, e.T)
        
        X = np.array(x).reshape(self.V, 1)
        # Calculate partial derivative of loss function wrt W
        dEdW = np.dot(X, np.dot(self.W1, e).T)
        
        # Update the weights
        self.W1 = self.W1 - self.alpha * dEdW1
        self.W = self.W - self.alpha * dEdW
    
    def train(self, epochs):
        # Loop through each epoch
        for x in range(1,epochs):
            
            # Initialize Loss
            self.loss = 0
            
            # Loop through each training sample
            for j in range(len(self.X_train)):
                # Forward Pass
                self.feed_forward(self.X_train[j])
                
                # Backpropagation
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                        
                # Calculate Loss        
                self.loss += C*np.log(np.sum(np.exp(self.u)))
                
            print("Epoch: ", x, " Loss: ", self.loss)
            self.alpha *= 1/((1+self.alpha*x))
            
    def predict(self, word, number_of_predictions):       
        # Check if word is contained in the dictionary
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i
                
            # Sort top context words in the output    
            sorted_output = [] 
            for k in sorted(output, reverse=True): 
                sorted_output.append(self.words[output[k]]) 
                if(len(sorted_output)>=number_of_predictions): 
                    break
            return sorted_output 
        else: 
            print("Error: Word not found in dicitonary")
    

## Data Preprocessing

In [5]:
def preprocessing(corpus):
    processed = []
    stop_words = stopwords.words('english')
    # Split text corpus into sentences
    sentences = corpus.split(".")
    # Loop through each sentence
    for i in range(len(sentences)):
        # Remove leading and trailing characters
        sentences[i] = sentences[i].strip()
        # Split sentence into list of words
        sentence = sentences[i].split()
        # Remove punctuations
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        # Convert to lowe
        x = [word.lower() for word in x] 
        processed.append(x) 
    return processed    

## Train the Model

In [6]:
def generate_training_data(sentences, w2v):
    data = {}
    # Loop throuch each sentence
    for sentence in sentences:
        # Loop through each word
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) # Size of Vocabulary
    data = sorted(list(data.keys()))
    
    vocab = {}
    # Store words into vocabulary
    for i in range(len(data)): 
        vocab[data[i]] = i 
       
    # Loop through each sentence 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 
    return w2v.X_train,w2v.y_train

In [16]:
text_corpus = "" 
text_corpus += "Welcome to the Department of Computer Science. We have great faculty and professors. We will have a welcome program today."

# Number of epochs
epochs = 1000

# Data preprocessing  
training_data = preprocessing(text_corpus)

# Word2Vec
w2v = word2vec()

# Generate Training data
generate_training_data(training_data,w2v)

# Train the model
w2v.train(epochs)

# Predict using model  
print(w2v.predict("welcome", 5))

Epoch:  1  Loss:  54.72114233862495
Epoch:  2  Loss:  54.6483665924774
Epoch:  3  Loss:  54.57589018858346
Epoch:  4  Loss:  54.503782754863494
Epoch:  5  Loss:  54.432112203572494
Epoch:  6  Loss:  54.36094435345277
Epoch:  7  Loss:  54.290342579033975
Epoch:  8  Loss:  54.22036749155833
Epoch:  9  Loss:  54.1510766552335
Epoch:  10  Loss:  54.08252434166894
Epoch:  11  Loss:  54.01476132446768
Epoch:  12  Loss:  53.947834715054206
Epoch:  13  Loss:  53.8817878399533
Epoch:  14  Loss:  53.81666015891921
Epoch:  15  Loss:  53.75248722257378
Epoch:  16  Loss:  53.68930066756233
Epoch:  17  Loss:  53.62712824669158
Epoch:  18  Loss:  53.565993891080275
Epoch:  19  Loss:  53.50591780103166
Epoch:  20  Loss:  53.44691656212924
Epoch:  21  Loss:  53.3890032829488
Epoch:  22  Loss:  53.33218775077104
Epoch:  23  Loss:  53.27647660174998
Epoch:  24  Loss:  53.22187350213465
Epoch:  25  Loss:  53.16837933734076
Epoch:  26  Loss:  53.11599240591044
Epoch:  27  Loss:  53.06470861567084
Epoch:  2

Epoch:  337  Loss:  50.45095514836132
Epoch:  338  Loss:  50.44990433356813
Epoch:  339  Loss:  50.44885967819238
Epoch:  340  Loss:  50.44782112854474
Epoch:  341  Loss:  50.4467886315545
Epoch:  342  Loss:  50.44576213476074
Epoch:  343  Loss:  50.44474158630364
Epoch:  344  Loss:  50.44372693491595
Epoch:  345  Loss:  50.442718129914596
Epoch:  346  Loss:  50.4417151211924
Epoch:  347  Loss:  50.44071785921002
Epoch:  348  Loss:  50.43972629498796
Epoch:  349  Loss:  50.43874038009866
Epoch:  350  Loss:  50.43776006665891
Epoch:  351  Loss:  50.43678530732218
Epoch:  352  Loss:  50.43581605527115
Epoch:  353  Loss:  50.434852264210406
Epoch:  354  Loss:  50.433893888359314
Epoch:  355  Loss:  50.432940882444726
Epoch:  356  Loss:  50.43199320169417
Epoch:  357  Loss:  50.431050801828945
Epoch:  358  Loss:  50.430113639057296
Epoch:  359  Loss:  50.42918167006789
Epoch:  360  Loss:  50.42825485202321
Epoch:  361  Loss:  50.42733314255311
Epoch:  362  Loss:  50.42641649974856
Epoch:  

Epoch:  683  Loss:  50.27069915280515
Epoch:  684  Loss:  50.27044238529855
Epoch:  685  Loss:  50.27018636831433
Epoch:  686  Loss:  50.26993109857128
Epoch:  687  Loss:  50.26967657280729
Epoch:  688  Loss:  50.26942278777916
Epoch:  689  Loss:  50.269169740262555
Epoch:  690  Loss:  50.26891742705165
Epoch:  691  Loss:  50.26866584495931
Epoch:  692  Loss:  50.268414990816716
Epoch:  693  Loss:  50.268164861473274
Epoch:  694  Loss:  50.26791545379663
Epoch:  695  Loss:  50.267666764672306
Epoch:  696  Loss:  50.26741879100377
Epoch:  697  Loss:  50.2671715297122
Epoch:  698  Loss:  50.266924977736444
Epoch:  699  Loss:  50.266679132032756
Epoch:  700  Loss:  50.26643398957483
Epoch:  701  Loss:  50.26618954735355
Epoch:  702  Loss:  50.265945802376955
Epoch:  703  Loss:  50.265702751670055
Epoch:  704  Loss:  50.26546039227477
Epoch:  705  Loss:  50.26521872124974
Epoch:  706  Loss:  50.26497773567029
Epoch:  707  Loss:  50.26473743262824
Epoch:  708  Loss:  50.264497809231806
Epoc