## Word embedding techniques using Embedding Layers in Keras

In [8]:
# for tensorflow greater than 2.0 and keras
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
## initialize sentenses
sent = ['the glass of milk', 
        'the glass of juice',
        'the cup of tea',
       'I am good boy',
       'I am a good developer',
       'understand the meaning of words',
       'your videos are good',]

In [6]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [10]:
# vocabular size = size of dictionary
voc_size = 10000

## One hot representation 

In [13]:
onehot_repr = [one_hot(words, voc_size)for words in sent]

In [14]:
print(onehot_repr)

[[4839, 8986, 7330, 9150], [4839, 8986, 7330, 2866], [4839, 8603, 7330, 8983], [4075, 8817, 9449, 8930], [4075, 8817, 7842, 9449, 9079], [7378, 4839, 193, 7330, 4233], [7982, 651, 5593, 9449]]


## Word Embedding Representation

In [17]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences # to maintain same no. of words in all sentences 
from tensorflow.keras.models import Sequential

In [16]:
import numpy as np

In [19]:
sent_length = 8 # make all the sentences of words 8 
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=sent_length) # pre will add zeros in front to make word length as 8
print(embedded_docs)

[[   0    0    0    0 4839 8986 7330 9150]
 [   0    0    0    0 4839 8986 7330 2866]
 [   0    0    0    0 4839 8603 7330 8983]
 [   0    0    0    0 4075 8817 9449 8930]
 [   0    0    0 4075 8817 7842 9449 9079]
 [   0    0    0 7378 4839  193 7330 4233]
 [   0    0    0    0 7982  651 5593 9449]]


In [20]:
dim = 10 # no. of features used in word embedding

In [23]:
model = Sequential()
model.add(Embedding(voc_size,10,input_length=sent_length)) # 10 is the feature size
model.compile('adam','mse') # adam optimizer and performance matrix as mean squared error

In [24]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [25]:
print(model.predict(embedded_docs))

[[[-1.45812258e-02  2.60839500e-02  3.74434479e-02 -3.02891266e-02
   -1.92089919e-02 -1.34790055e-02 -2.08009966e-02  2.14239992e-02
   -1.32226236e-02 -6.92493841e-03]
  [-1.45812258e-02  2.60839500e-02  3.74434479e-02 -3.02891266e-02
   -1.92089919e-02 -1.34790055e-02 -2.08009966e-02  2.14239992e-02
   -1.32226236e-02 -6.92493841e-03]
  [-1.45812258e-02  2.60839500e-02  3.74434479e-02 -3.02891266e-02
   -1.92089919e-02 -1.34790055e-02 -2.08009966e-02  2.14239992e-02
   -1.32226236e-02 -6.92493841e-03]
  [-1.45812258e-02  2.60839500e-02  3.74434479e-02 -3.02891266e-02
   -1.92089919e-02 -1.34790055e-02 -2.08009966e-02  2.14239992e-02
   -1.32226236e-02 -6.92493841e-03]
  [ 1.00072399e-02 -4.49082740e-02 -8.70543718e-03  1.08242147e-02
   -1.77398920e-02  3.84515040e-02  9.16447490e-03  4.93046679e-02
   -3.24755907e-02 -1.35244131e-02]
  [-1.83848143e-02  4.44278121e-05  4.88730185e-02  4.84626032e-02
   -3.20308432e-02 -2.13057287e-02 -3.53070721e-02  3.25859450e-02
    3.79435681e-

In [26]:
embedded_docs[0]

array([   0,    0,    0,    0, 4839, 8986, 7330, 9150])

In [27]:
print(model.predict(embedded_docs[0]))

[[[-1.4581226e-02  2.6083950e-02  3.7443448e-02 -3.0289127e-02
   -1.9208992e-02 -1.3479006e-02 -2.0800997e-02  2.1423999e-02
   -1.3222624e-02 -6.9249384e-03]]

 [[-1.4581226e-02  2.6083950e-02  3.7443448e-02 -3.0289127e-02
   -1.9208992e-02 -1.3479006e-02 -2.0800997e-02  2.1423999e-02
   -1.3222624e-02 -6.9249384e-03]]

 [[-1.4581226e-02  2.6083950e-02  3.7443448e-02 -3.0289127e-02
   -1.9208992e-02 -1.3479006e-02 -2.0800997e-02  2.1423999e-02
   -1.3222624e-02 -6.9249384e-03]]

 [[-1.4581226e-02  2.6083950e-02  3.7443448e-02 -3.0289127e-02
   -1.9208992e-02 -1.3479006e-02 -2.0800997e-02  2.1423999e-02
   -1.3222624e-02 -6.9249384e-03]]

 [[ 1.0007240e-02 -4.4908274e-02 -8.7054372e-03  1.0824215e-02
   -1.7739892e-02  3.8451504e-02  9.1644749e-03  4.9304668e-02
   -3.2475591e-02 -1.3524413e-02]]

 [[-1.8384814e-02  4.4427812e-05  4.8873018e-02  4.8462603e-02
   -3.2030843e-02 -2.1305729e-02 -3.5307072e-02  3.2585945e-02
    3.7943568e-02 -1.4991391e-02]]

 [[-4.1931115e-02  1.4682200