In [1]:
# W2V-CBOW Model (note: the training pairs built below use a word as input and one of its neighbours as target)
from __future__ import absolute_import , division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
print(tf.__version__)


2.5.0-dev20201101


In [78]:
class Word2Vec:
    """Small two-layer word2vec-style network trained with softmax cross-entropy.

    An input row is projected to an ``embedding_dim``-wide hidden layer
    (``W1``, ``b1``) and then to ``vocab_size`` logits (``W2``, ``b2``).

    Args:
        vocab_size: number of distinct words (width of the input/target rows).
        embedding_dim: width of the hidden (embedding) layer.
        optimizer: ``'adam'`` selects ``tf.optimizers.Adam()``; any other value
            selects ``tf.optimizers.SGD(learning_rate=0.1)``.
        epochs: number of full-batch gradient steps run by ``train``.
    """

    def __init__(self, vocab_size=0, embedding_dim=16, optimizer='sgd', epochs=10000):
        self.vocab_size = vocab_size
        # Use the caller's value; it was previously overwritten with a constant 5.
        self.embedding_dim = embedding_dim
        self.epochs = epochs
        if optimizer == 'adam':
            self.optimizer = tf.optimizers.Adam()
        else:
            self.optimizer = tf.optimizers.SGD(learning_rate=0.1)

    def train(self, x_train=None, y_train=None):
        """Fit the weights with full-batch gradient descent.

        Fresh, randomly initialised ``W1``, ``b1``, ``W2`` and ``b2`` are created
        on every call, then updated ``self.epochs`` times using
        ``tf.GradientTape``. The loss is printed every 100 steps.

        Args:
            x_train: input rows; multiplied with ``W1``, so the last dimension
                must be ``vocab_size`` (one-hot vectors in this notebook).
            y_train: target rows with ``vocab_size`` columns (one-hot vectors
                in this notebook).

        Raises:
            ValueError: if ``x_train`` or ``y_train`` is ``None``.
        """
        if x_train is None or y_train is None:
            raise ValueError("x_train and y_train are both required")

        self.W1 = tf.Variable(tf.random.normal([self.vocab_size, self.embedding_dim]))
        self.b1 = tf.Variable(tf.random.normal([self.embedding_dim]))

        self.W2 = tf.Variable(tf.random.normal([self.embedding_dim, self.vocab_size]))
        self.b2 = tf.Variable(tf.random.normal([self.vocab_size]))

        variables = [self.W1, self.b1, self.W2, self.b2]

        for epoch in range(self.epochs):
            # Only the forward pass needs to be recorded by the tape.
            with tf.GradientTape() as tape:
                hidden_layer = tf.add(tf.matmul(x_train, self.W1), self.b1)
                logits = tf.add(tf.matmul(hidden_layer, self.W2), self.b2)
                # Same quantity as mean(-sum(y * log(softmax(logits)), axis=1)),
                # computed from the logits so a vanishing probability cannot
                # produce log(0).
                cross_entropy_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=logits))
            grads = tape.gradient(cross_entropy_loss, variables)
            self.optimizer.apply_gradients(zip(grads, variables))
            if epoch % 100 == 0:
                print(cross_entropy_loss)

    def vectorized(self, word_idx):
        """Return the vector for ``word_idx``: that row of ``W1`` plus ``b1``.

        ``train`` must have been called first, since it creates ``W1`` and ``b1``.
        """
        return self.W1[word_idx] + self.b1
    

In [79]:
# Example corpus: three short sentences, each terminated by ' . ' except the last.
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '

In [80]:
# Convert to lower case, cut the text into sentences at each '.', then
# tokenise every sentence on whitespace.
corpus_raw = corpus_raw.lower()
raw_sentences = corpus_raw.split('.')
sentences = [raw_sentence.split() for raw_sentence in raw_sentences]
sentences


[['he', 'is', 'the', 'king'],
 ['the', 'king', 'is', 'royal'],
 ['she', 'is', 'the', 'royal', 'queen']]

In [81]:
# Build [word, neighbour] pairs: every word is paired with each other token
# lying at most WINDOW_SIZE positions to its left or right in the same sentence.
WINDOW_SIZE = 2
data = []
for tokens in sentences:
    for position, word in enumerate(tokens):
        window_start = max(position - WINDOW_SIZE, 0)
        window_stop = min(position + WINDOW_SIZE, len(tokens)) + 1
        # Neighbours are filtered by value, so a repeat of `word` inside the
        # window is skipped as well.
        data.extend(
            [word, nb_word]
            for nb_word in tokens[window_start:window_stop]
            if nb_word != word
        )
data

[['he', 'is'],
 ['he', 'the'],
 ['is', 'he'],
 ['is', 'the'],
 ['is', 'king'],
 ['the', 'he'],
 ['the', 'is'],
 ['the', 'king'],
 ['king', 'is'],
 ['king', 'the'],
 ['the', 'king'],
 ['the', 'is'],
 ['king', 'the'],
 ['king', 'is'],
 ['king', 'royal'],
 ['is', 'the'],
 ['is', 'king'],
 ['is', 'royal'],
 ['royal', 'king'],
 ['royal', 'is'],
 ['she', 'is'],
 ['she', 'the'],
 ['is', 'she'],
 ['is', 'the'],
 ['is', 'royal'],
 ['the', 'she'],
 ['the', 'is'],
 ['the', 'royal'],
 ['the', 'queen'],
 ['royal', 'is'],
 ['royal', 'the'],
 ['royal', 'queen'],
 ['queen', 'the'],
 ['queen', 'royal']]

In [82]:
# For convenience, create 2 helper dictionaries, i.e. word2int and int2word.
# They do the simple mapping between words and corresponding integer values.
words = {token for token in corpus_raw.split() if token != '.'}  # unique words
vocab_size = len(words)  # gives the total number of unique words
word2int = {}
int2word = {}
# Assign ids in sorted order. Iterating the set directly gives an order that
# can change from one interpreter run to the next (string hash randomisation),
# which would reshuffle the ids on every kernel restart.
for i, word in enumerate(sorted(words)):
    word2int[word] = i
    int2word[i] = word

    


In [83]:
# Encode to x_train and y_train
# function to convert numbers to one hot vectors
def to_one_hot(data_point_index, vocab_size):
    """Return a float vector of length ``vocab_size`` that is 1 at
    ``data_point_index`` and 0 everywhere else."""
    one_hot = np.zeros(vocab_size)
    one_hot[data_point_index] = 1
    return one_hot

# One-hot encode both sides of every pair: the first word of a pair becomes an
# input row, the second word the matching target row. Stored as float32 arrays.
x_train = np.asarray(
    [to_one_hot(word2int[pair[0]], vocab_size) for pair in data],
    dtype='float32',
)  # input word
y_train = np.asarray(
    [to_one_hot(word2int[pair[1]], vocab_size) for pair in data],
    dtype='float32',
)  # output word


In [84]:
# Train the model.
# Seed TensorFlow first so the random weight initialisation, and therefore the
# printed losses, repeat from run to run.
tf.random.set_seed(42)
# embedding_dim is passed explicitly so the 5-dimensional vectors displayed
# further down are what this call asks for.
w2v = Word2Vec(vocab_size=vocab_size, embedding_dim=5, optimizer='adam', epochs=10000)
w2v.train(x_train, y_train)
# training process (the loss is printed every 100 epochs)



tf.Tensor(6.0455637, shape=(), dtype=float32)
tf.Tensor(4.504267, shape=(), dtype=float32)
tf.Tensor(3.5028133, shape=(), dtype=float32)
tf.Tensor(2.9092138, shape=(), dtype=float32)
tf.Tensor(2.523383, shape=(), dtype=float32)
tf.Tensor(2.2378201, shape=(), dtype=float32)
tf.Tensor(2.0203726, shape=(), dtype=float32)
tf.Tensor(1.8567343, shape=(), dtype=float32)
tf.Tensor(1.7347165, shape=(), dtype=float32)
tf.Tensor(1.6440392, shape=(), dtype=float32)
tf.Tensor(1.5762101, shape=(), dtype=float32)
tf.Tensor(1.5245409, shape=(), dtype=float32)
tf.Tensor(1.4843237, shape=(), dtype=float32)
tf.Tensor(1.4526641, shape=(), dtype=float32)
tf.Tensor(1.4278345, shape=(), dtype=float32)
tf.Tensor(1.4084945, shape=(), dtype=float32)
tf.Tensor(1.3933729, shape=(), dtype=float32)
tf.Tensor(1.3813945, shape=(), dtype=float32)
tf.Tensor(1.3717698, shape=(), dtype=float32)
tf.Tensor(1.3639414, shape=(), dtype=float32)
tf.Tensor(1.3575104, shape=(), dtype=float32)
tf.Tensor(1.3521835, shape=(), dtype

In [85]:
# Look up the learned vector for 'queen' (its row of W1 plus the bias b1).
w2v.vectorized(word2int['queen'])

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-2.004537  ,  0.32331777, -0.6099627 , -0.5219996 , -3.464445  ],
      dtype=float32)>