In [0]:
import numpy as np
from collections import defaultdict

In [0]:
class Word2Vec:
    def __init__(self,epochs):
        self.n = 15
        self.epochs = epochs
        self.learning_rate = 0.01
        self.window = 3
    
    def prep_training_data(self,corpus):
        self.word_count = defaultdict(int)
        for row in corpus:
            tokens = row.split()
            for word in tokens:
                self.word_count[word]  += 1
        
        self.vocab_count = len(self.word_count.keys())
        self.word_list = sorted(self.word_count.keys(),reverse=False)
        self.word_index = dict((word,i) for word,i in enumerate(self.word_list))
        self.index_word = dict((i,word) for word,i in enumerate(self.word_list))
        
        training_data = []
        
        for row in corpus:
            tokens = row.split()
            sentence_length = len(tokens)
            for i,word in enumerate(tokens):
                word_target = self.onehotvector(word)
                word_context = []
                for j in range(i-self.window,i+self.window+1):
                    if j!=i and j<sentence_length-1 and j>=0:
                        word_context.append(self.onehotvector(tokens[j]))
                training_data.append([word_target,word_context])
        return training_data,self.word_list
        
    
    def forward(self,x):
        h = np.dot(self.w1.T,x)
        u_c = np.dot(self.w2.T,h)
        y = self.softmax(u_c)
        return y,u_c,h
    
    def backward(self,e,h,x):
        dl_w1 = np.outer(h,e)
        dl_w2 = np.outer(x,np.dot(self.w2,e.T))
        self.w1 = self.w1 - (self.learning_rate * dl_w1)
        self.w2 = self.w2 - (self.learning_rate * dl_w2)
        pass

    
    def softmax(self,x):
        exp = np.exp(x - np.max(x))
        return exp/exp.sum(axis=0)
    
        
    def train(self,training_data):
        self.w1 = np.random.uniform(-0.8,0.8,(self.vocab_count,self.n))
        self.w2 = np.random.uniform(-0.8,0.8,(self.n,self.vocab_count))
        
        for i in range(0,self.epochs):
            self.loss = 0
            for word_target,word_context in training_data:
                y_pred,u_c,h = self.forward(word_target)
                EI = np.sum([np.subtract(y_pred,word) for word in word_context],axis=0)
                self.backward(EI,h,word_target)
                self.loss += -np.sum([u_c[word.index(1)] for word in word_context]) + len(word_context) * np.log(np.sum(np.exp(u_c)))
            print("Epoch:",i,"Loss:",self.loss)
        pass
        
    def onehotvector(self,word):
        word_vec = [0 for i in range(0,self.vocab_count)]
        index = self.index_word[word]
        word_vec[index] = 1
        return word_vec
        
    def word_vector(self,word):
        w_index = self.index_word[word]
        vector = self.w1[w_index]
        return vector
    
    def display_onehot(self):
        display_list = {}
        for word in self.word_list:
            one_hot_word = self.onehotvector(word)
            display_list[word] = one_hot_word
        return display_list
        
    
    def word_sim(self,word,topn):
        word_similarity = {}
        index = self.index_word[word]
        vector1 = self.w1[index]
        for i in range(self.vocab_count):
            vector2 = self.w1[i]
            dot_pdt = np.dot(vector1,vector2)
            magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2)
            similarity = dot_pdt/magnitude
            word_similarity[self.word_index[i]] = similarity
        sim_words = sorted(word_similarity.items(),key=lambda word :word[1],reverse=True)
        print(sim_words[:topn])

In [4]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

word2vec = Word2Vec(8)
training_data,word_list = word2vec.prep_training_data(corpus)
word2vec.train(training_data)

Epoch: 0 Loss: 175.30514012989875
Epoch: 1 Loss: 174.69866547992837
Epoch: 2 Loss: 174.23252170019964
Epoch: 3 Loss: 173.89205487148124
Epoch: 4 Loss: 173.66686427450838
Epoch: 5 Loss: 173.5499277269982
Epoch: 6 Loss: 173.53700875284775
Epoch: 7 Loss: 173.6262662982898


### One hot encoded vectors

In [5]:
print("Sample of training data:",training_data[0:3])
print("\nVocabulary:",word_list)

Sample of training data: [[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], [[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]]]]

Vocabulary: ['a', 'berlin', 'capital', 'france', 'germany', 'he', 'is', 'king', 'man', 'paris', 'poland', 'queen', 'she', 'warsaw', 'woman']


In [6]:
#Representation of vocabulary as one hot encoded vector
word2vec.display_onehot()

{'a': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'berlin': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'capital': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'france': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'germany': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'he': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'is': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 'king': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 'man': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 'paris': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 'poland': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 'queen': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 'she': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 'warsaw': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 'woman': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

### Find similar words

In [7]:
#word2vec.word_vector('he')
word2vec.word_sim('capital',3)

[('capital', 1.0000000000000002), ('germany', 0.4793138889758929), ('woman', 0.2613962270962048)]


### Using gensim for word2vec

In [8]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from matplotlib import pyplot

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [9]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [0]:
model = Word2Vec(common_texts,size=100,window=5,sg=1,min_count=1,alpha=0.025)
# input: common_texts
# size: dimension of vector
# window: size of window
# sg: 1 for skip gram, 0 for cbow
# min_count: specify the minimum frequency of words that should be considered
# alpha: learning rate

### List of words in vocab

In [11]:
list(model.wv.vocab)

['human',
 'interface',
 'computer',
 'survey',
 'user',
 'system',
 'response',
 'time',
 'eps',
 'trees',
 'graph',
 'minors']

In [13]:
print(list(model.wv.vocab)[0],vectors[0])

human [-8.4078533e-04  7.3818484e-04  2.8219493e-04 -2.5752909e-03
  3.1461527e-03  4.2359149e-03  3.7395945e-05  2.4014621e-03
 -1.6622512e-03  4.4716941e-03 -4.0361248e-03  9.4162242e-04
  1.8091237e-03  4.4012014e-03 -4.4064531e-03 -4.2356872e-03
 -2.5811817e-03  8.6736411e-04  1.7827799e-04  1.6309498e-03
  3.8797068e-03 -2.9843773e-03  4.3783961e-03  4.8892866e-03
  4.0653595e-03  4.6112523e-03  4.4137607e-03 -3.5070870e-03
 -1.4383691e-04  3.5296250e-03 -4.3766564e-03 -2.6324235e-03
  4.6682917e-03  4.5868112e-03 -4.0839841e-03 -3.9637261e-03
  4.0467991e-03 -1.0017374e-03 -4.0663057e-03 -3.2315750e-03
  3.3967344e-03  1.8175585e-03  3.0093880e-03  4.4311779e-03
 -2.8737427e-03 -2.3823194e-03  1.9071057e-03  2.2540952e-03
  2.1133767e-03  1.4397701e-03 -7.7868404e-04  3.2279287e-03
 -2.7105834e-03  4.0984307e-03  7.6502300e-04 -7.7057688e-04
  2.8253566e-03 -1.7193158e-03 -3.8816449e-03 -4.6067848e-04
  2.1390640e-03  2.7580806e-03  3.9993748e-03 -2.9359162e-03
  4.0492630e-03  9