<a href="https://colab.research.google.com/github/sanket143/Notebooks/blob/master/Word2Vec%20-%20IE406.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np 
import pandas as pd
import string 
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
def softmax(x):
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 

class word2vec(object): 
    def __init__(self):
        self.N = 100
        self.X_train = [] 
        self.y_train = []
        self.window_size = 3
        self.alpha = 0.001
        self.words = [] 
        self.word_index = {} 

    def initialize(self,V,data): 
        self.V = V 
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 

        self.words = data 
        for i in range(len(data)): 
            self.word_index[data[i]] = i 

    
    def feed_forward(self,X): 
        self.h = np.dot(self.W.T,X).reshape(self.N,1) 
        self.u = np.dot(self.W1.T,self.h) 
        #print(self.u) 
        self.y = softmax(self.u) 
        return self.y
        
    def backpropagate(self,x,t): 
        e = self.y - np.asarray(t).reshape(self.V,1) 
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        #print(t)
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T)
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
        
    def train(self,epochs): 
        for x in range(1,epochs):
            self.loss = 0
            for j in range(len(self.X_train)): 
                self.feed_forward(self.X_train[j]) 
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) 
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
            
    def predict(self,word,number_of_predictions): 
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X) 
            print(prediction)
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
            
            top_context_words = [] 
            for k in sorted(output,reverse=True): 
                top_context_words.append(self.words[output[k]]) 
                if(len(top_context_words)>=number_of_predictions): 
                    break
    
            return top_context_words 
        else: 
            print("Word not found in dicitonary")

In [0]:
def preprocessing(corpus): 
    stop_words = set(stopwords.words('english'))
    stop_words.add('<unk>')
    training_data = []
    sentences = corpus
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip() 
        sentence = sentences[i].split() 
        x = [word.strip(string.punctuation) for word in sentence 
                                    if word not in stop_words] 
        x = [word.lower() for word in x] 
        training_data.append(x) 
    return training_data 
    

def prepare_data_for_training(sentences,w2v): 
    data = {} 
    for sentence in sentences: 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) 
    data = sorted(list(data.keys())) 
    vocab = {} 
    for i in range(len(data)): 
        vocab[data[i]] = i 
    
    #for i in range(len(words)): 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
            
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 

    return w2v.X_train,w2v.y_train

In [0]:
!wget https://raw.githubusercontent.com/sanket143/Data/master/data2.csv
df = pd.read_csv("data2.csv")

--2020-06-07 12:31:55--  https://raw.githubusercontent.com/sanket143/Data/master/data2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5185500 (4.9M) [text/plain]
Saving to: ‘data2.csv.1’


2020-06-07 12:31:55 (13.6 MB/s) - ‘data2.csv.1’ saved [5185500/5185500]



In [0]:
corpus = df["title"][0:1000]
epochs = 100


training_data = preprocessing(corpus) 
w2v = word2vec() 

prepare_data_for_training(training_data,w2v) 
w2v.train(epochs)

epoch  1  loss =  509951.98315352725
epoch  2  loss =  504422.7775331249
epoch  3  loss =  502321.612200829
epoch  4  loss =  504080.3851099931
epoch  5  loss =  508277.9313711393
epoch  6  loss =  515460.10509864555
epoch  7  loss =  526798.137082982
epoch  8  loss =  543645.2949072206
epoch  9  loss =  567388.5397416024
epoch  10  loss =  599526.9684703425
epoch  11  loss =  641986.5158582601
epoch  12  loss =  697185.53168564
epoch  13  loss =  768071.3412893066
epoch  14  loss =  858396.2602019691
epoch  15  loss =  972958.4655230884
epoch  16  loss =  1117860.400232592
epoch  17  loss =  1301161.6807556453
epoch  18  loss =  1533507.7767743156




epoch  19  loss =  inf
epoch  20  loss =  inf
epoch  21  loss =  inf
epoch  22  loss =  inf
epoch  23  loss =  inf
epoch  24  loss =  inf
epoch  25  loss =  inf
epoch  26  loss =  inf
epoch  27  loss =  inf
epoch  28  loss =  inf
epoch  29  loss =  inf
epoch  30  loss =  inf
epoch  31  loss =  inf
epoch  32  loss =  inf
epoch  33  loss =  inf


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


epoch  34  loss =  inf
epoch  35  loss =  inf
epoch  36  loss =  inf
epoch  37  loss =  inf
epoch  38  loss =  inf


In [0]:
pd.set_option("max_columns", None) # TODO this is for debug
pd.set_option('display.width', 10000) # TODO this is for debug
def search(word):
  ind = w2v.word_index[word]
  wordV = w2v.W - w2v.W[ind]
  wordV = np.abs(wordV)
  wordV = wordV.sum(axis=1)
  wordV = wordV.argsort()[1:5]
  return [w2v.words[xi] for xi in wordV]
print(search('man'))

['carla', 'skills', 'removed', 'minute']
