# Word Embedding (One Hot)


In [1]:
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences

In [2]:
# Toy corpus: seven short sentences that the later cells encode and pad.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [3]:
# Echo the corpus so the raw sentences can be compared with their encodings.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Size of the integer index space: handed to one_hot as its range and to the
# Embedding layer as input_dim.
voc_size = 10_000

In [5]:
# Integer encoding with one_hot
# Despite its name, one_hot does not build 0/1 vectors here: it returns one
# integer index per word (each below voc_size), as this cell's output shows.
# The same word gets the same index in every sentence of a given run.
# NOTE(review): Keras documents one_hot as hashing-based, so the exact indices
# may change between kernel sessions and distinct words could collide — confirm.

one_hot_repr = [one_hot(sentence, voc_size) for sentence in sent]
one_hot_repr

[[5385, 9937, 9653, 7878],
 [5385, 9937, 9653, 654],
 [5385, 6741, 9653, 4365],
 [9360, 5392, 8282, 6598, 2759],
 [9360, 5392, 8282, 6598, 7207],
 [6946, 5385, 8619, 9653, 9911],
 [2270, 8501, 1982, 6598]]

In [6]:
# The integer encoding is only an intermediate step; an Embedding layer
# consumes it further down.

# The encoded sentences have different lengths (4 or 5 indices), so each one is
# left-padded with zeros to sent_length before being stacked into one array.
# Note: despite its name, embedded_docs holds padded indices, not embeddings.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding="pre",
)
print(embedded_docs)

[[   0    0    0    0 5385 9937 9653 7878]
 [   0    0    0    0 5385 9937 9653  654]
 [   0    0    0    0 5385 6741 9653 4365]
 [   0    0    0 9360 5392 8282 6598 2759]
 [   0    0    0 9360 5392 8282 6598 7207]
 [   0    0    0 6946 5385 8619 9653 9911]
 [   0    0    0    0 2270 8501 1982 6598]]


In [7]:
# Feature representation using Embedding

# Number of features per word: passed to the Embedding layer as output_dim.
dim = 10

In [8]:
# Creating the model
# A single Embedding layer maps each of the sent_length integer indices in a
# sample to a dim-sized vector. The input shape is declared with an explicit
# Input layer instead of passing input_shape to Embedding, which Keras warns
# about (see the `super().__init__(**kwargs)` warning the old form printed).

model = Sequential()
model.add(Input(shape=(sent_length,)))
model.add(Embedding(input_dim=voc_size, output_dim=dim))
# compile() is kept from the original cell; the visible cells only call
# predict(), so the optimizer and loss are never exercised here.
model.compile(optimizer="adam", loss="mse")

  super().__init__(**kwargs)


In [9]:
# Print a summary of the model's layers.
model.summary()

In [10]:
# Run the whole padded index matrix through the Embedding layer. No training
# has happened at this point, so the vectors come from the layer's initial
# weights. Every padding index 0 maps to the same vector, which is why the
# leading rows of each sentence repeat in the output.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step


array([[[-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-4.1824449e-02,  3.6909867e-02,  2.3588207e-02, -1.3798676e-02,
         -3.3833206e-02,  1.2026288e-02, -4.3521680e-02, -4.5222856e-02,
         -4.1198850e-02,  3.0544985e-02],
        [-2.1137405e-02, -1.6721379e-02, -1.3547242e-02, -2.

In [11]:
# First padded sentence ("the glass of milk"): four padding zeros followed by
# its four word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 5385, 9937, 9653, 7878])

In [12]:
# Embed only the first sentence. The slice [:1] (rather than [0]) keeps the
# leading batch axis, so predict() still receives a 2-D batch of one sample.
model.predict(embedded_docs[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step


array([[[-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-2.6397252e-02,  8.8490546e-05,  3.9858606e-02,  9.7003803e-03,
          1.5929807e-02,  3.6606345e-02, -3.1833410e-02, -2.3370994e-02,
         -4.2030822e-02,  3.4446716e-03],
        [-4.1824449e-02,  3.6909867e-02,  2.3588207e-02, -1.3798676e-02,
         -3.3833206e-02,  1.2026288e-02, -4.3521680e-02, -4.5222856e-02,
         -4.1198850e-02,  3.0544985e-02],
        [-2.1137405e-02, -1.6721379e-02, -1.3547242e-02, -2.