In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the following cells encode as
# integer indices, pad to a common length, and feed to an Embedding layer.
sent = [
    "The glass of milk",
    "The glass of juice",
    "The cup of tea",
    "I am a good boy",
    "I am a good developer",
    "Understand the meaning of words",
    "your videos are good",
]

In [3]:
# Vocabulary size: passed to `one_hot` as the index range and to the
# Embedding layer as `input_dim` (number of rows in the lookup table).
voc_size = 10_000

In [4]:
# Integer-encode each sentence with `one_hot`.
# Despite the name, the result is a list of integer word indices per sentence
# (not one-hot vectors); identical words receive identical indices.
one_hot_repr = [one_hot(sentence, n=voc_size) for sentence in sent]
one_hot_repr

[[5568, 8551, 9028, 5875],
 [5568, 8551, 9028, 5824],
 [5568, 2876, 9028, 3585],
 [5726, 1810, 1010, 4889, 6011],
 [5726, 1810, 1010, 4889, 8917],
 [3091, 5568, 5817, 9028, 8511],
 [7238, 4970, 9986, 4889]]

In [5]:
# Word embedding representation

from tensorflow.keras.layers import Embedding

from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [6]:
# Every sequence is brought to the same length by prepending zeros
# (padding="pre"), so the batch forms a rectangular integer array.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding="pre",
)
print(embedded_docs)

[[   0    0    0    0 5568 8551 9028 5875]
 [   0    0    0    0 5568 8551 9028 5824]
 [   0    0    0    0 5568 2876 9028 3585]
 [   0    0    0 5726 1810 1010 4889 6011]
 [   0    0    0 5726 1810 1010 4889 8917]
 [   0    0    0 3091 5568 5817 9028 8511]
 [   0    0    0    0 7238 4970 9986 4889]]


In [7]:
## Feature representation
# Embedding dimension: the number of features in each word vector.
# Passed to the Embedding layer as `output_dim`.
dim = 10

In [8]:
# Embedding Layer: Internal Working Explained
#
# The Embedding layer in Keras is used to represent each word (or token) as a dense vector.
# Instead of using one-hot vectors (which are sparse and high-dimensional),
# this layer maps each integer (word index) to a fixed-size dense vector.
#
# Internally, this is done using a trainable weight matrix of shape (vocab_size, embedding_dim).
# Each row in this matrix corresponds to the embedding of a word in the vocabulary.

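# Example:
# If voc_size = 10000 (10k unique words), and dim = 100 (each word maps to a 100-d vector),
# then the embedding matrix is of shape (10000, 100).
# (This notebook itself uses voc_size = 10000 and dim = 10, so its embedding matrix
# has shape (10000, 10).)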

# When you input a sequence like [4, 56, 172, 9], the embedding layer:
# - Looks up the corresponding rows for indices 4, 56, 172, and 9 in the embedding matrix
# - Returns a matrix of shape (sequence_length, embedding_dim) — in this case, (4, 100)

# During training, the embedding matrix is updated using backpropagation to capture
# semantic meanings — words used in similar contexts end up with similar vectors.

# Summary of the process:
# 1. Input: sequence of integers (word indices) -> shape: (batch_size, sequence_length)
# 2. Output: sequence of vectors -> shape: (batch_size, sequence_length, embedding_dim)
# 3. The embedding matrix is initialized randomly (or pre-trained) and is trainable

In [9]:
# Create a Sequential model — a linear stack of layers
model = Sequential()

# Add an Embedding layer to the model
# -----------------------------------
# The Embedding layer converts integer-encoded words into dense vector representations.
# Parameters:
# - input_dim (voc_size): size of the vocabulary, i.e. number of rows in the embedding matrix
# - output_dim (dim): dimension of the dense embedding vectors (e.g., 100, 128, 300, etc.)
#
# NOTE: the `input_length` argument is deprecated in Keras 3 (it is ignored with a
# warning), so it is no longer passed here. The sequence length is supplied through
# `model.build(...)` below instead.
model.add(Embedding(input_dim=voc_size, output_dim=dim))

# Build the model explicitly so the embedding weights are created right away and
# `model.summary()` can report output shapes and parameter counts before any data
# has been passed through the model.
# Input shape:  (batch_size, sent_length)
# Output shape: (batch_size, sent_length, dim) — each word index is mapped to a vector
model.build(input_shape=(None, sent_length))

# Compile the model
# The compile step configures the learning process
# - "adam" is the optimizer used for updating weights — it's efficient and works well for most problems
# - "mse" (Mean Squared Error) is used as the loss function — this is typically used for regression tasks.
# In the cells shown here the model is only used for `predict`, so the optimizer and
# loss do not affect the displayed embeddings.
model.compile(optimizer="adam", loss="mse")



In [10]:
# Display the layer-by-layer overview of the model defined above.
model.summary()

In [11]:
# Run the embedding lookup on the whole padded batch. No fit() has been called in
# the cells shown, so these are the initial embedding weights.
# Each token position yields one `dim`-sized vector; the padding index 0 always maps
# to the same vector, which is why the leading rows of the output repeat.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step


array([[[ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 8.6981878e-03, -2.5889670e-02,  4.2415146e-02,  5.3854808e-03,
         -2.3356711e-02,  4.2219210e-02,  4.9663376e-02, -1.1169504e-02,
         -1.1303127e-02,  3.4552779e-02],
        [-4.3614533e-02, -1.2159158e-02,  3.6267292e-02, -4.

In [12]:
# First padded sequence: leading zeros are padding, followed by the word indices
# of the first sentence ("The glass of milk").
embedded_docs[0]

array([   0,    0,    0,    0, 5568, 8551, 9028, 5875])

In [13]:
import numpy as np

# Embed only the first sentence: add a leading batch axis so the input has
# shape (1, sent_length), as `predict` expects a batch of sequences.
model.predict(np.expand_dims(embedded_docs[0], axis=0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step


array([[[ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 2.0660758e-03, -5.0330162e-03, -1.6095266e-03,  4.6050299e-02,
         -3.4905992e-02,  2.8494272e-02,  1.9751489e-05, -1.1864983e-02,
         -3.4263022e-03, -1.9236922e-02],
        [ 8.6981878e-03, -2.5889670e-02,  4.2415146e-02,  5.3854808e-03,
         -2.3356711e-02,  4.2219210e-02,  4.9663376e-02, -1.1169504e-02,
         -1.1303127e-02,  3.4552779e-02],
        [-4.3614533e-02, -1.2159158e-02,  3.6267292e-02, -4.