### Word Embedding Techniques using Embedding Layers in Keras

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
## sentences
sent = [
    'the glass of tea',
    'the glass of milk',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of work',
    'king is not poor'
]

In [3]:
sent

['the glass of tea',
 'the glass of milk',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of work',
 'king is not poor']

In [4]:
## vocabulary size
voc_size = 10000

### One Hot Representation

In [5]:
one_hot_repr = [one_hot(words,voc_size) for words in sent]
print(one_hot_repr)

[[7517, 5240, 1938, 8141], [7517, 5240, 1938, 4234], [4366, 3375, 1859, 3926, 972], [4366, 3375, 1859, 3926, 8071], [1021, 7517, 3386, 1938, 3801], [8599, 3893, 178, 7541]]


### Word Embedding Representation

In [7]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [8]:
import numpy as np

In [11]:
## adding padding to the one-hot embedding
sent_length=8
embedded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0 7517 5240 1938 8141]
 [   0    0    0    0 7517 5240 1938 4234]
 [   0    0    0 4366 3375 1859 3926  972]
 [   0    0    0 4366 3375 1859 3926 8071]
 [   0    0    0 1021 7517 3386 1938 3801]
 [   0    0    0    0 8599 3893  178 7541]]


In [14]:
dim = 10

In [15]:
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile('adam','mse')

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [17]:
print(model.predict(embedded_docs))

[[[-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123
    0.02098389 -0.00743326  0.02181137  0.00641491  0.00676714]
  [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123
    0.02098389 -0.00743326  0.02181137  0.00641491  0.00676714]
  [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123
    0.02098389 -0.00743326  0.02181137  0.00641491  0.00676714]
  [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123
    0.02098389 -0.00743326  0.02181137  0.00641491  0.00676714]
  [ 0.01172306  0.00328093 -0.03564436  0.00923502 -0.04707932
    0.02555976 -0.00678955  0.00647219  0.04087794 -0.00232433]
  [ 0.04883796  0.03918253 -0.0395037   0.00737399  0.03266025
   -0.00896648 -0.00986991 -0.00872814  0.04745665  0.03232664]
  [-0.03391016  0.00794322 -0.04591559 -0.0034533   0.01801318
    0.00038097 -0.01386524  0.0447487   0.03609313 -0.04877665]
  [-0.04675622 -0.00449618  0.03596831 -0.00319067  0.03268057
   -0.03331033 -0.03510378 -0.03914671  0.033639

In [18]:
embedded_docs[0]

array([   0,    0,    0,    0, 7517, 5240, 1938, 8141])

In [19]:
print(model.predict(embedded_docs)[0])

[[-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [ 0.01172306  0.00328093 -0.03564436  0.00923502 -0.04707932  0.02555976
  -0.00678955  0.00647219  0.04087794 -0.00232433]
 [ 0.04883796  0.03918253 -0.0395037   0.00737399  0.03266025 -0.00896648
  -0.00986991 -0.00872814  0.04745665  0.03232664]
 [-0.03391016  0.00794322 -0.04591559 -0.0034533   0.01801318  0.00038097
  -0.01386524  0.0447487   0.03609313 -0.04877665]
 [-0.04675622 -0.00449618  0.03596831 -0.00319067  0.03268057 -0.03331033
  -0.03510378 -0.03914671  0.03363978 -0.00565875]]

In [37]:
print(model.predict(embedded_docs)[0])

[[-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [-0.04108484 -0.04218247  0.00765065  0.04972715  0.01062123  0.02098389
  -0.00743326  0.02181137  0.00641491  0.00676714]
 [ 0.01172306  0.00328093 -0.03564436  0.00923502 -0.04707932  0.02555976
  -0.00678955  0.00647219  0.04087794 -0.00232433]
 [ 0.04883796  0.03918253 -0.0395037   0.00737399  0.03266025 -0.00896648
  -0.00986991 -0.00872814  0.04745665  0.03232664]
 [-0.03391016  0.00794322 -0.04591559 -0.0034533   0.01801318  0.00038097
  -0.01386524  0.0447487   0.03609313 -0.04877665]
 [-0.04675622 -0.00449618  0.03596831 -0.00319067  0.03268057 -0.03331033
  -0.03510378 -0.03914671  0.03363978 -0.00565875]]

This list is the word embedding of the first line -> 'the glass of tea' into list of 10 dimensions.
The first four list of this will be same because the value of the first 4 content is 0.

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
vec1 = [model.predict(embedded_docs)[0][-1]]  #tea
vec2 = [model.predict(embedded_docs)[1][-1]]  #milk

In [41]:
## cosine similarity between milk and tea
cosine_similarity(vec1,vec2)

array([[0.5605608]], dtype=float32)

In [42]:
## choosing same word from the list => 'the'
vec1 = [model.predict(embedded_docs)[0][0]]
vec2 = [model.predict(embedded_docs)[1][0]]

In [43]:
cosine_similarity(vec1,vec2)

array([[1.]], dtype=float32)