## Word Embedding Techniques using Embedding Layer in Keras

In [1]:
import numpy as np
import tensorflow 
import keras

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [2]:
sentences = [
    'I am sabita rajbanshi',
    'I am a software engineer',
    'I am currently learning Machine Learning',
    'Artificial intelligence is the future of humankind',
    '2021 Learnings AI ML DL DS'
]

sentences

['I am sabita rajbanshi',
 'I am a software engineer',
 'I am currently learning Machine Learning',
 'Artificial intelligence is the future of humankind',
 '2021 Learnings AI ML DL DS']

### initialize the vocabulary size=10000

In [3]:
vocab_size = 10000

### one hot representation of word

In [4]:
onehot_repre = [one_hot(words, vocab_size) for words in sentences]
onehot_repre

[[9005, 2674, 1744, 9315],
 [9005, 2674, 3704, 2529, 4806],
 [9005, 2674, 461, 692, 8110, 692],
 [1358, 803, 2149, 9980, 5358, 5226, 1692],
 [1294, 3585, 2213, 125, 4093, 7972]]

### word embedding representation

In [5]:
length_of_sentence = 8
embedded_docs = pad_sequences(onehot_repre, padding='pre', maxlen=length_of_sentence)
print(embedded_docs)

[[   0    0    0    0 9005 2674 1744 9315]
 [   0    0    0 9005 2674 3704 2529 4806]
 [   0    0 9005 2674  461  692 8110  692]
 [   0 1358  803 2149 9980 5358 5226 1692]
 [   0    0 1294 3585 2213  125 4093 7972]]


### initialize the dimension of feature

In [6]:
dim=8
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=length_of_sentence))


In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 8)              80000     
Total params: 80,000
Trainable params: 80,000
Non-trainable params: 0
_________________________________________________________________


In [8]:
embedded_docs[0]

array([   0,    0,    0,    0, 9005, 2674, 1744, 9315])

In [9]:
print(model.predict(embedded_docs[0]))

[[[ 0.0033016  -0.01605767  0.02693038 -0.02690777 -0.03101271
    0.03553862 -0.04609522 -0.04709435]]

 [[ 0.0033016  -0.01605767  0.02693038 -0.02690777 -0.03101271
    0.03553862 -0.04609522 -0.04709435]]

 [[ 0.0033016  -0.01605767  0.02693038 -0.02690777 -0.03101271
    0.03553862 -0.04609522 -0.04709435]]

 [[ 0.0033016  -0.01605767  0.02693038 -0.02690777 -0.03101271
    0.03553862 -0.04609522 -0.04709435]]

 [[ 0.02082952 -0.04860653  0.00728057  0.00022098 -0.02012602
    0.02457568 -0.02631199  0.04604   ]]

 [[-0.02315918 -0.02615763  0.00204688  0.00116062  0.00733156
   -0.01977913 -0.00104349  0.01091676]]

 [[-0.0455346  -0.04452446  0.04850023  0.0086147  -0.04118424
   -0.02724919 -0.01218033  0.03247603]]

 [[-0.02772031  0.04763781  0.03797758 -0.01002715 -0.01011012
    0.04292928 -0.01787959  0.00058595]]]


In [10]:
embedded_docs[3]

array([   0, 1358,  803, 2149, 9980, 5358, 5226, 1692])

In [11]:
print(model.predict(embedded_docs[3]))

[[[ 0.0033016  -0.01605767  0.02693038 -0.02690777 -0.03101271
    0.03553862 -0.04609522 -0.04709435]]

 [[-0.03725219 -0.01219546 -0.00835402  0.00229014  0.0344064
   -0.01500027 -0.04896067 -0.0161113 ]]

 [[ 0.03020997 -0.04469751  0.03684596  0.00619174 -0.0052335
    0.01779931  0.03512359 -0.03447522]]

 [[ 0.0408465  -0.01222742  0.03142723  0.04469771  0.01234747
   -0.00493886 -0.04436953 -0.03363061]]

 [[ 0.04926104  0.03425983 -0.04355545  0.00351394  0.03128134
    0.04983873  0.03370788 -0.03065666]]

 [[-0.03834613 -0.03809262  0.02464939 -0.02476969 -0.0321218
    0.00021293 -0.01050995  0.02176214]]

 [[-0.03546915  0.04512483 -0.00190216 -0.00581015 -0.02060771
    0.04437163  0.02019162  0.04645814]]

 [[-0.04231635  0.00809456  0.04768876  0.00191092  0.01065684
   -0.01744119 -0.01765461 -0.00984139]]]
