In [84]:
import numpy as np 
import string
import nltk
nltk.download('stopwords') 
from nltk.corpus import stopwords  
   
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 
   
class word2vec(object): 
    def __init__(self): 
        self.N = 10
        self.X_train = [] 
        self.y_train = [] 
        self.window_size = 2
        self.alpha = 0.025
        self.words = [] 
        self.word_index = {} 
   
    def initialize(self,V,data): 
        self.V = V 
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
           
        self.words = data 
        for i in range(len(data)): 
            self.word_index[data[i]] = i 
   
       
    def feed_forward(self,X): 
#         print(self.W.T, X)
#         print(self.W.T.shape, X)

        self.h = np.dot(self.W.T,X).reshape(self.N,1) 
        self.u = np.dot(self.W1.T,self.h) 
#         print(self.u) 
        self.y = softmax(self.u)   
        return self.y 
           
    def backpropagate(self,x,t): 
        e = self.y - np.asarray(t).reshape(self.V,1) 
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T) 
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
           
    def train(self,epochs): 
        for x in range(1,epochs):         
            self.loss = 0
            for j in range(len(self.X_train)): 
                self.feed_forward(self.X_train[j]) 
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) 
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
              
    def predict(self,word,number_of_predictions): 
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X) 
            print('prediction:--', prediction)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
            print('output:--', output)   
            top_context_words = [] 
            for k in sorted(output,reverse=True): 
                top_context_words.append(self.words[output[k]]) 
                if(len(top_context_words)>=number_of_predictions): 
                    break
       
            return top_context_words 
        else: 
            print("Word not found in dicitonary")
def preprocessing(corpus): 
    stop_words = set(stopwords.words('english'))     
    training_data = [] 
    sentences = corpus.split(".") 
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip() 
        sentence = sentences[i].split() 
        x = [word.strip(string.punctuation) for word in sentence 
                                     if word not in stop_words] 
        x = [word.lower() for word in x] 
        training_data.append(x) 
    return training_data

def prepare_data_for_training(sentences,w2v): 
    data = {} 
    for sentence in sentences: 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) 
    data = sorted(list(data.keys())) 
    print(data)
    print('data===============')
    vocab = {} 
    for i in range(len(data)): 
        vocab[data[i]] = i 
       
    #for i in range(len(words)): 
    for sentence in sentences: 
        print(sentence)
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
#                     print('Neighbours:-', sentence[j])
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context)
#             print(i)
#             print('Center word:',sentence[i])
            
#             print(center_word)
            
#             print(context)
#             print('==========================')
    w2v.initialize(V,data) 
    print(w2v.X_train)
    print(data)
   
    return w2v.X_train,w2v.y_train

corpus = "" 
corpus += "The earth revolves around the sun. The moon revolves around the earth"
epochs = 1000
  
training_data = preprocessing(corpus) 
print('training_data', training_data)
w2v = word2vec() 
  
prepare_data_for_training(training_data,w2v) 
w2v.train(epochs)  
  
print(w2v.predict("around",3))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yskcusat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


training_data [['the', 'earth', 'revolves', 'around', 'sun'], ['the', 'moon', 'revolves', 'around', 'earth']]
['around', 'earth', 'moon', 'revolves', 'sun', 'the']
['the', 'earth', 'revolves', 'around', 'sun']
['the', 'moon', 'revolves', 'around', 'earth']
[[0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]]
['around', 'earth', 'moon', 'revolves', 'sun', 'the']
epoch  1  loss =  45.32584692915351
epoch  2  loss =  43.293559568534675
epoch  3  loss =  41.637868548532985
epoch  4  loss =  40.2830161231592
epoch  5  loss =  39.16863833519433
epoch  6  loss =  38.247202828763285
epoch  7  loss =  37.48122060940387
epoch  8  loss =  36.840956765941236
epoch  9  loss =  36.30271220174109
epoch  10  loss =  35.847543416102276
epoch  11  loss =  35.46028378695405
epoch  12  loss =  35.12877496916792
epoch  13  loss =  34.843254167582245
epoch  14  

epoch  226  loss =  31.928926394388583
epoch  227  loss =  31.928415500807922
epoch  228  loss =  31.927909326385915
epoch  229  loss =  31.92740780624514
epoch  230  loss =  31.926910876688495
epoch  231  loss =  31.926418475172618
epoch  232  loss =  31.925930540281886
epoch  233  loss =  31.925447011703255
epoch  234  loss =  31.92496783020159
epoch  235  loss =  31.92449293759586
epoch  236  loss =  31.924022276735755
epoch  237  loss =  31.923555791479114
epoch  238  loss =  31.923093426669762
epoch  239  loss =  31.922635128116077
epoch  240  loss =  31.922180842570015
epoch  241  loss =  31.921730517706713
epoch  242  loss =  31.921284102104657
epoch  243  loss =  31.920841545226263
epoch  244  loss =  31.920402797399035
epoch  245  loss =  31.919967809797196
epoch  246  loss =  31.919536534423738
epoch  247  loss =  31.919108924092956
epoch  248  loss =  31.918684932413445
epoch  249  loss =  31.918264513771494
epoch  250  loss =  31.917847623314803
epoch  251  loss =  31.91743

epoch  637  loss =  31.856687357275884
epoch  638  loss =  31.85662736293266
epoch  639  loss =  31.85656756028226
epoch  640  loss =  31.8565079484079
epoch  641  loss =  31.856448526398538
epoch  642  loss =  31.856389293349043
epoch  643  loss =  31.856330248359914
epoch  644  loss =  31.856271390537422
epoch  645  loss =  31.85621271899347
epoch  646  loss =  31.856154232845526
epoch  647  loss =  31.856095931216682
epoch  648  loss =  31.856037813235524
epoch  649  loss =  31.855979878036095
epoch  650  loss =  31.855922124757903
epoch  651  loss =  31.85586455254583
epoch  652  loss =  31.85580716055012
epoch  653  loss =  31.8557499479263
epoch  654  loss =  31.85569291383516
epoch  655  loss =  31.855636057442762
epoch  656  loss =  31.85557937792031
epoch  657  loss =  31.85552287444414
epoch  658  loss =  31.85546654619572
epoch  659  loss =  31.855410392361595
epoch  660  loss =  31.855354412133305
epoch  661  loss =  31.855298604707375
epoch  662  loss =  31.855242969285303

epoch  849  loss =  31.847179210089983
epoch  850  loss =  31.847145776686766
epoch  851  loss =  31.847112423105475
epoch  852  loss =  31.847079149060665
epoch  853  loss =  31.84704595426826
epoch  854  loss =  31.84701283844555
epoch  855  loss =  31.846979801311125
epoch  856  loss =  31.846946842584956
epoch  857  loss =  31.8469139619883
epoch  858  loss =  31.846881159243765
epoch  859  loss =  31.846848434075252
epoch  860  loss =  31.846815786207987
epoch  861  loss =  31.84678321536847
epoch  862  loss =  31.846750721284472
epoch  863  loss =  31.846718303685105
epoch  864  loss =  31.846685962300697
epoch  865  loss =  31.84665369686286
epoch  866  loss =  31.84662150710448
epoch  867  loss =  31.846589392759682
epoch  868  loss =  31.846557353563824
epoch  869  loss =  31.846525389253515
epoch  870  loss =  31.84649349956657
epoch  871  loss =  31.846461684242072
epoch  872  loss =  31.846429943020286
epoch  873  loss =  31.846398275642677
epoch  874  loss =  31.8463666818

In [87]:
w2v.predict('earth', 3)

prediction:-- [[0.3329471 ]
 [0.03930083]
 [0.1299016 ]
 [0.29369915]
 [0.04515947]
 [0.15899184]]
output:-- {0.3329471043380219: 0, 0.039300833792269445: 1, 0.12990160457628439: 2, 0.04515946804601789: 4, 0.15899183636636988: 5, 0.29369915288103654: 3}


['around', 'revolves', 'the']