## Word embedding techniques using the Embedding layer in Keras

In [6]:
# Requires tensorflow version > 2.0 (uses the bundled tensorflow.keras API)
from tensorflow.keras.preprocessing.text import one_hot

In [7]:
# Toy corpus of seven short sentences used throughout the notebook.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [8]:
# Echo the corpus to confirm it was defined as intended.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [10]:
# Vocabulary size: passed to one_hot() as the hashing range and to the
# Embedding layer as the number of rows in its lookup table.
voc_size = 10_000

### One-hot representation

In [11]:
# Encode each sentence as a list of integer word indices, one per word.
# Despite the function's name, the result is a list of index lists (see the
# printed output), not one-hot vectors; the same word gets the same index
# within this run (e.g. 'the' appears as one repeated value).
onehot_rep = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
print(onehot_rep)

[[8920, 1512, 5362, 7750], [8920, 1512, 5362, 5273], [8920, 1023, 5362, 2341], [5296, 3481, 1287, 5621, 5655], [5296, 3481, 1287, 5621, 9929], [1669, 8920, 8476, 5362, 6644], [98, 9123, 475, 5621]]


### Word embedding representation 

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [14]:
# Bring every encoded sentence to a common length so the batch forms a
# rectangular array: shorter sentences are filled with zeros at the front
# ('pre' padding), as the printed matrix shows.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_rep,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 8920 1512 5362 7750]
 [   0    0    0    0 8920 1512 5362 5273]
 [   0    0    0    0 8920 1023 5362 2341]
 [   0    0    0 5296 3481 1287 5621 5655]
 [   0    0    0 5296 3481 1287 5621 9929]
 [   0    0    0 1669 8920 8476 5362 6644]
 [   0    0    0    0   98 9123  475 5621]]


In [16]:
# Number of features in each word's embedding vector (passed to Embedding below).
dim = 10

In [21]:
# Single-layer model: an Embedding lookup that turns each of the sent_length
# word indices in a padded sentence into a dim-sized vector.
model = Sequential([
    Embedding(voc_size, dim, input_length=sent_length),
])
# Optimizer and loss are supplied so the model can be compiled; no fit()
# call appears in the cells shown here.
model.compile(optimizer='adam', loss='mse')

In [22]:
# Show the layer stack; the parameter count should equal voc_size * dim.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Look up the embedding vectors for every padded sentence.
# NOTE(review): the model is compiled but never fitted in the cells shown, so
# these values are presumably the layer's initial weights, not learned ones.
# This prints the full (sentences, sent_length, dim) array, which is long.
print(model.predict(embedded_docs))

[[[-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798
    0.02854453 -0.02488885 -0.00516669  0.02510669  0.04806307]
  [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798
    0.02854453 -0.02488885 -0.00516669  0.02510669  0.04806307]
  [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798
    0.02854453 -0.02488885 -0.00516669  0.02510669  0.04806307]
  [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798
    0.02854453 -0.02488885 -0.00516669  0.02510669  0.04806307]
  [ 0.03316135  0.04189768 -0.01518018 -0.01161121  0.00036918
   -0.02039734 -0.01193009  0.03565073  0.0182827  -0.02784704]
  [ 0.03556056  0.03417692 -0.00707073 -0.04980939 -0.03030684
    0.01292321  0.04729016 -0.02896677  0.01573681 -0.01189633]
  [ 0.04627365 -0.02463529  0.01213408  0.01917055  0.02938125
    0.01984915 -0.00811135 -0.00049955  0.02920249 -0.01308435]
  [-0.00500458 -0.0342152  -0.00011948 -0.01788266 -0.04758141
    0.01135888 -0.04689253 -0.01367781  0.01536569 -

In [24]:
# Padded index sequence of the first sentence, for comparison with its vectors below.
embedded_docs[0]

array([   0,    0,    0,    0, 8920, 1512, 5362, 7750], dtype=int32)

In [27]:
# Embedding vectors for the first sentence only: one row of dim values per
# position. Positions holding the padding index 0 all map to the same vector,
# which is why the leading rows of the output are identical.
# Note: this re-runs predict on the whole batch and then keeps element 0.
print(model.predict(embedded_docs)[0])

[[-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798   0.02854453
  -0.02488885 -0.00516669  0.02510669  0.04806307]
 [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798   0.02854453
  -0.02488885 -0.00516669  0.02510669  0.04806307]
 [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798   0.02854453
  -0.02488885 -0.00516669  0.02510669  0.04806307]
 [-0.04509416 -0.00434046 -0.00124819 -0.01185756 -0.0224798   0.02854453
  -0.02488885 -0.00516669  0.02510669  0.04806307]
 [ 0.03316135  0.04189768 -0.01518018 -0.01161121  0.00036918 -0.02039734
  -0.01193009  0.03565073  0.0182827  -0.02784704]
 [ 0.03556056  0.03417692 -0.00707073 -0.04980939 -0.03030684  0.01292321
   0.04729016 -0.02896677  0.01573681 -0.01189633]
 [ 0.04627365 -0.02463529  0.01213408  0.01917055  0.02938125  0.01984915
  -0.00811135 -0.00049955  0.02920249 -0.01308435]
 [-0.00500458 -0.0342152  -0.00011948 -0.01788266 -0.04758141  0.01135888
  -0.04689253 -0.01367781  0.01536569 -0.01945748]]