### Word Embedding Techniques using Embedding Layer in Keras

In [1]:
#Importing Libraries
#(in tensorflow > 2.0 keras is embedded with Tensorflow)
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
#Sentences
sent = ['the glass of milk',
       'the glass of juice',
       'the cup of tea',
       'I am a good boy',
       'I am a good developer',
       'understand the meaning of words',
       'your videos are good']

In [3]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
#vocabulary size
voc_size = 10000

### One Hot Representation

In [5]:
onehot_repr = [one_hot(words,voc_size) for words in sent]
print(onehot_repr)

[[5973, 4087, 6287, 3966], [5973, 4087, 6287, 5050], [5973, 2419, 6287, 6313], [9072, 3035, 9366, 2791, 6096], [9072, 3035, 9366, 2791, 7384], [459, 5973, 5490, 6287, 351], [9231, 3363, 1258, 2791]]


Every sentence is now converted into index of words, like "the" in first sentence represent 4376th index from the vocab size of 10000

### Word Embedding Representation (Feature Representation)

In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [7]:
import numpy as np

In [8]:
#defining sentence length
sent_length = 8

In [9]:
#Create a pad sequences, by giving the one hot representation of words
#padding = 'pre' means that when we have unequal no. of words in each sentence it applies zero at the beginningto make them equal
#Equality of words in sentence is done using sent_length, it will be for each sentence
embedded_docs = pad_sequences(onehot_repr,padding="pre",maxlen = sent_length)

In [10]:
print(embedded_docs)

[[   0    0    0    0 5973 4087 6287 3966]
 [   0    0    0    0 5973 4087 6287 5050]
 [   0    0    0    0 5973 2419 6287 6313]
 [   0    0    0 9072 3035 9366 2791 6096]
 [   0    0    0 9072 3035 9366 2791 7384]
 [   0    0    0  459 5973 5490 6287  351]
 [   0    0    0    0 9231 3363 1258 2791]]


In [11]:
#dimension(features) for embedding layer
dim = 10

In [12]:
#Creating embedding layer
model = Sequential()
model.add(Embedding(voc_size , 10, input_length = sent_length))
model.compile('adam','mse')

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.predict(embedded_docs)

array([[[ 0.02232139,  0.0225432 ,  0.04847057, -0.03510857,
          0.02724767,  0.0261192 , -0.03115741,  0.04033149,
         -0.02258241, -0.01621906],
        [ 0.02232139,  0.0225432 ,  0.04847057, -0.03510857,
          0.02724767,  0.0261192 , -0.03115741,  0.04033149,
         -0.02258241, -0.01621906],
        [ 0.02232139,  0.0225432 ,  0.04847057, -0.03510857,
          0.02724767,  0.0261192 , -0.03115741,  0.04033149,
         -0.02258241, -0.01621906],
        [ 0.02232139,  0.0225432 ,  0.04847057, -0.03510857,
          0.02724767,  0.0261192 , -0.03115741,  0.04033149,
         -0.02258241, -0.01621906],
        [ 0.04191056, -0.02273332,  0.0130606 ,  0.04986943,
          0.03904248, -0.02882572, -0.00778221, -0.01431376,
         -0.02636433,  0.02917648],
        [-0.01295085,  0.03412055, -0.03694151,  0.02249383,
         -0.0117681 , -0.01703361, -0.03395631,  0.00410701,
          0.02120705,  0.02876255],
        [-0.02042896,  0.0207788 ,  0.04886479,  0.0

In [15]:
embedded_docs[0]

array([   0,    0,    0,    0, 5973, 4087, 6287, 3966])

In [16]:
print(model.predict(embedded_docs[0]))

[[[ 0.02232139  0.0225432   0.04847057 -0.03510857  0.02724767
    0.0261192  -0.03115741  0.04033149 -0.02258241 -0.01621906]]

 [[ 0.02232139  0.0225432   0.04847057 -0.03510857  0.02724767
    0.0261192  -0.03115741  0.04033149 -0.02258241 -0.01621906]]

 [[ 0.02232139  0.0225432   0.04847057 -0.03510857  0.02724767
    0.0261192  -0.03115741  0.04033149 -0.02258241 -0.01621906]]

 [[ 0.02232139  0.0225432   0.04847057 -0.03510857  0.02724767
    0.0261192  -0.03115741  0.04033149 -0.02258241 -0.01621906]]

 [[ 0.04191056 -0.02273332  0.0130606   0.04986943  0.03904248
   -0.02882572 -0.00778221 -0.01431376 -0.02636433  0.02917648]]

 [[-0.01295085  0.03412055 -0.03694151  0.02249383 -0.0117681
   -0.01703361 -0.03395631  0.00410701  0.02120705  0.02876255]]

 [[-0.02042896  0.0207788   0.04886479  0.00454923  0.04459969
    0.03039061  0.03140981 -0.03276013  0.00227018  0.02610714]]

 [[ 0.01111563  0.03558116 -0.00549183  0.04755631 -0.00678561
    0.03880532  0.02517665 -0.02600