# Word2Vec tutorial

+ Take a 3-layer neural network (1 input layer + 1 hidden layer + 1 output layer).
+ Feed it a word and train it to predict its neighbouring word.
+ Remove the last layer (the output layer) and keep the input and hidden layers.
+ Now, input a word from within the vocabulary. The output given at the hidden layer is the ‘word embedding’ of the input word.




In [1]:
import numpy as np
import tensorflow as tf

# Toy corpus: three short sentences separated by a standalone '.' token.
corpus_raw = "He is the king . The king is royal . She is the royal  queen "
# Fold to lower case so that e.g. "The" and "the" become the same token.
corpus = str.lower(corpus_raw)

In [2]:
# Collect every whitespace-separated token except the sentence separator '.'.
tokens = [tok for tok in corpus.split() if tok != '.']

# De-duplicate to obtain the vocabulary. A set has no guaranteed iteration
# order, so the integer ids assigned below may differ from one run to the next.
words = set(tokens)
vocab_size = len(words)

# Build both lookup directions from a single enumeration of the vocabulary:
# int2word maps id -> word, word2int maps word -> id.
int2word = dict(enumerate(words))
word2int = {word: idx for idx, word in int2word.items()}


In [3]:
# Show the integer id assigned to 'queen'. Ids come from enumerating a set,
# so the printed value may differ from run to run.
print(word2int['queen'])

5


In [4]:
# Reverse lookup: show which word was assigned id 0.
print(int2word[0])

royal


In [5]:
# Cut the corpus at every '.' and tokenise each piece on whitespace, giving
# one list of words per sentence.
raw_sentences = corpus.split(".")
sentences = [chunk.split() for chunk in raw_sentences]
print(sentences)


[['he', 'is', 'the', 'king'], ['the', 'king', 'is', 'royal'], ['she', 'is', 'the', 'royal', 'queen']]


In [6]:
# Build [centre word, neighbour] training pairs. For every word, look at up to
# WINDOW_SZ words on each side within the same sentence.
WINDOW_SZ = 2
data = []
for sentence in sentences:
    for centre_pos, word in enumerate(sentence):
        # Clamp the window to the sentence; the +1 makes the right edge inclusive.
        window_start = max(centre_pos - WINDOW_SZ, 0)
        window_stop = min(centre_pos + WINDOW_SZ, len(sentence)) + 1
        # Neighbours are filtered by string equality, which drops the centre
        # word itself and any repeat of that same word inside the window.
        data.extend(
            [word, nb_word]
            for nb_word in sentence[window_start:window_stop]
            if nb_word != word
        )

In [7]:
# Print how many [centre word, neighbour] pairs were generated, then show the
# pairs themselves (the bare expression is rendered as the cell's output).
print(len(data))
data

34


[['he', 'is'],
 ['he', 'the'],
 ['is', 'he'],
 ['is', 'the'],
 ['is', 'king'],
 ['the', 'he'],
 ['the', 'is'],
 ['the', 'king'],
 ['king', 'is'],
 ['king', 'the'],
 ['the', 'king'],
 ['the', 'is'],
 ['king', 'the'],
 ['king', 'is'],
 ['king', 'royal'],
 ['is', 'the'],
 ['is', 'king'],
 ['is', 'royal'],
 ['royal', 'king'],
 ['royal', 'is'],
 ['she', 'is'],
 ['she', 'the'],
 ['is', 'she'],
 ['is', 'the'],
 ['is', 'royal'],
 ['the', 'she'],
 ['the', 'is'],
 ['the', 'royal'],
 ['the', 'queen'],
 ['royal', 'is'],
 ['royal', 'the'],
 ['royal', 'queen'],
 ['queen', 'the'],
 ['queen', 'royal']]

In [9]:
def to_one_hot(data_point_idx, vocab_size):
    """Return a one-hot vector of length ``vocab_size``.

    Args:
        data_point_idx: Position in the vector that is set to 1.
        vocab_size: Length of the returned vector.

    Returns:
        A 1-D NumPy array of zeros (NumPy's default float dtype) with a 1
        at ``data_point_idx``.
    """
    one_hot = np.zeros(vocab_size)
    one_hot[data_point_idx] = 1
    return one_hot

# One-hot encode every [centre word, neighbour] pair: the centre word is
# encoded into x_train and its neighbour into y_train, one row per pair.
x_train = np.asarray(
    [to_one_hot(word2int[pair[0]], vocab_size) for pair in data]
)
y_train = np.asarray(
    [to_one_hot(word2int[pair[1]], vocab_size) for pair in data]
)
print(x_train.shape,y_train.shape)

(34, 7) (34, 7)


In [10]:
# Graph inputs: float32 matrices with vocab_size columns and an unspecified
# (None) number of rows, so any batch size can be fed.
x = tf.placeholder(dtype=tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(dtype=tf.float32, shape=(None, vocab_size))

In [15]:
EMBEDDING_DIM = 5

# Hidden layer: a purely linear map (no activation) from vocab_size inputs down
# to EMBEDDING_DIM values. When a row of x is one-hot, the matmul selects a
# single row of W1.
W1 = tf.Variable(tf.random_normal([vocab_size,EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))
hidden_projection = tf.matmul(x, W1)
hidden_representation = tf.add(hidden_projection, b1)

# Output layer: map the hidden values back to vocab_size scores and turn them
# into a probability distribution over the vocabulary with softmax.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM,vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
output_scores = tf.add(tf.matmul(hidden_representation, W2), b2)
prediction = tf.nn.softmax(output_scores)
