#**Importing libraries**

In [None]:
from keras.preprocessing.text import one_hot
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import numpy as np

#**Sentence array**

In [None]:
# Toy corpus: seven short sentences used throughout the notebook.
sentences = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

#**One-hot encoding**

In [None]:
# Despite its name, `one_hot` does not build one-hot vectors: it splits each
# sentence into words and maps every word to an integer index by hashing it
# into the range allowed by `vocabulary_size`.
# Hash collisions are NOT resolved -- two different words can receive the same
# index (in the recorded output below, 'am' and 'a' both map to 34). A larger
# vocabulary size makes collisions less likely but never rules them out.
# NOTE(review): per the Keras docs the default hash is Python's built-in
# `hash`, which is not stable across interpreter sessions, so the exact
# indices may differ from run to run -- confirm if reproducibility matters.
vocabulary_size = 100

onehot_repr = []
for sentence in sentences:
    # One list of word indices per sentence, in word order.
    onehot_repr.append(one_hot(sentence, vocabulary_size))

print(onehot_repr)

[[98, 59, 24, 29], [98, 59, 24, 57], [98, 4, 24, 89], [71, 34, 34, 50, 91], [71, 34, 34, 50, 64], [35, 98, 58, 24, 29], [48, 48, 27, 50]]


#**Padding zeros at the start of each encoded sentence so that all sequences have the same length**

In [None]:
# Every encoded sentence is brought to exactly `sent_length` entries so the
# batch forms a rectangular 2-D array. padding='pre' inserts the filler zeros
# in front of the word indices (see the leading zeros in the output below).
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[ 0  0  0  0 98 59 24 29]
 [ 0  0  0  0 98 59 24 57]
 [ 0  0  0  0 98  4 24 89]
 [ 0  0  0 71 34 34 50 91]
 [ 0  0  0 71 34 34 50 64]
 [ 0  0  0 35 98 58 24 29]
 [ 0  0  0  0 48 48 27 50]]


#**Mapping each word index to a 10-dimensional embedding vector**

In [None]:
# Size of the dense vector that every word index is mapped to.
dim = 10

# A single Embedding layer acting as a lookup table with `vocabulary_size`
# rows of `dim` values each (100 * 10 = 1000 parameters, as the summary
# below reports). `input_length` fixes the sequence length so that the output
# shape (None, sent_length, dim) is known when the summary is printed.
model = Sequential([
    Embedding(vocabulary_size, dim, input_length=sent_length),
])

# The model is only compiled here; no training call is made in this section,
# so the optimizer and loss have no effect on the predictions printed later.
model.compile(loss='mse', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             1000      
                                                                 
Total params: 1000 (3.91 KB)
Trainable params: 1000 (3.91 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#**Length of each padded sentence == length of model.predict(embedded_docs)[0] ==> 8**

In [None]:
# `predict` returns one (sent_length, dim) matrix per padded sentence; here we
# look at the first sentence, whose padded encoding is shown in the padding
# output above as [0 0 0 0 98 59 24 29].
# Row i of the printed matrix is the 10-D embedding of the i-th index in that
# sequence: rows 0-3 belong to the padding index 0 and the last row belongs
# to index 29.
# Because the layer is a pure lookup table, the four rows for the four '0's
# are exactly identical, not merely similar.
# NOTE(review): no fit() call is visible in this section, so these values look
# like the layer's initial weights and will probably differ between runs --
# confirm.
all_doc_embeddings = model.predict(embedded_docs)
first_doc_embeddings = all_doc_embeddings[0]
print(first_doc_embeddings)

[[-0.04056021  0.02638127 -0.00394828 -0.00419446  0.02237586  0.04525783
  -0.02064885  0.01385346 -0.04558868  0.02009482]
 [-0.04056021  0.02638127 -0.00394828 -0.00419446  0.02237586  0.04525783
  -0.02064885  0.01385346 -0.04558868  0.02009482]
 [-0.04056021  0.02638127 -0.00394828 -0.00419446  0.02237586  0.04525783
  -0.02064885  0.01385346 -0.04558868  0.02009482]
 [-0.04056021  0.02638127 -0.00394828 -0.00419446  0.02237586  0.04525783
  -0.02064885  0.01385346 -0.04558868  0.02009482]
 [-0.00058631  0.00024825  0.02992855  0.02244103 -0.00965991 -0.01010143
   0.02492208  0.0213404   0.02707118  0.00097662]
 [ 0.00496878 -0.03892033 -0.02431269 -0.03617909  0.00283279  0.03314129
   0.01415025 -0.02948108  0.03365351  0.04215961]
 [-0.00583934  0.04421267  0.00054066 -0.03793598  0.00558459  0.01656208
  -0.03719988  0.02738718 -0.01282429  0.03746258]
 [-0.02807672  0.02688465 -0.04748455 -0.00117822 -0.02232385 -0.01863257
   0.04037025 -0.02319509  0.00152504  0.02576596]]