In [None]:
from tensorflow.keras.preprocessing.text import one_hot

# one_hot (imported above) turns each word of a text into an integer index -- a quick, hash-based form of tokenization


In [3]:
# Toy corpus: seven short sentences used by the later cells.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "I am a good boy",
    "I am a good developer",
    "understand the meaning of words",
    "your videos are good",
]

In [4]:
# Echo the list to confirm its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [5]:
# Vocabulary size: the bound passed to one_hot() when hashing words to indices.
voc_size = 10_000

In [6]:
# Token representation: encode every sentence as a list of integer word
# indices (one index per word, each bounded by voc_size).
one_hot_repr = list(map(lambda sentence: one_hot(sentence, voc_size), sent))
one_hot_repr

[[4804, 4509, 7047, 2355],
 [4804, 4509, 7047, 1999],
 [4804, 5425, 7047, 97],
 [5293, 344, 5715, 5135, 6895],
 [5293, 344, 5715, 5135, 1840],
 [695, 4804, 3224, 7047, 4794],
 [3635, 4332, 5690, 5135]]

In [7]:
# Encode the first two sentences on their own to show that the mapping is
# per word: shared words receive the same index in both sentences.
# voc_size is reused (instead of a second hard-coded 10000) so these indices
# always match the ones stored in one_hot_repr.
print(one_hot(sent[0], voc_size))
print(one_hot(sent[1], voc_size))

[4804, 4509, 7047, 2355]
[4804, 4509, 7047, 1999]


In [8]:
## word Embedding Representation

from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [9]:
import numpy as np

In [10]:
# Bring every index list to the same length by left-padding ('pre') with zeros,
# giving a rectangular integer matrix with sent_length columns.
sent_length = 8
embedded_docs = pad_sequences(
    one_hot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 4804 4509 7047 2355]
 [   0    0    0    0 4804 4509 7047 1999]
 [   0    0    0    0 4804 5425 7047   97]
 [   0    0    0 5293  344 5715 5135 6895]
 [   0    0    0 5293  344 5715 5135 1840]
 [   0    0    0  695 4804 3224 7047 4794]
 [   0    0    0    0 3635 4332 5690 5135]]


In [11]:
# Feature representation: length of the dense vector learned for each word.
dim = 10

In [17]:
# Single-layer model: the Embedding layer maps each word index to a dense
# vector of length `dim`, so a (batch, sent_length) index matrix becomes a
# (batch, sent_length, dim) float tensor.
model = Sequential()
# `input_length` is deprecated in Keras 3 (passing it only raises a warning),
# so it is omitted here and the sequence length is declared via build() below.
model.add(Embedding(voc_size, dim))
# Building up front creates the embedding weights, so model.summary() can
# report output shapes and parameter counts before the first predict() call.
model.build(input_shape=(None, sent_length))

# No training happens in the cells shown here; compile() just finalizes the
# model with an optimizer and a loss.
model.compile('adam', 'mse')

## Embedding layer
The `Embedding` layer in TensorFlow/Keras is used to convert integer-encoded words into dense vector representations of fixed size. It is commonly used in natural language processing tasks to represent words in a continuous vector space.

**Key Features:**
1. **Input**: The input to the `Embedding` layer is a 2D tensor of integers, where each integer represents a word index.
2. **Output**: The output is a 3D tensor where each word index is mapped to a dense vector of fixed size.

**Syntax:**

tf.keras.layers.Embedding(input_dim, output_dim, input_length=None, ...)

- **`input_dim`**: Size of the vocabulary (maximum integer index + 1).
- **`output_dim`**: Dimension of the dense embedding vectors.
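- **`input_length`**: Length of input sequences (optional). Useful when the input has a fixed length. Note that Keras 3 deprecates this argument and ignores it (with a warning); there, declare the sequence length with an `Input(shape=(length,))` layer or `model.build((None, length))` instead.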

In [18]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

In [19]:
# Look up the embedding vector of every token: the (7, 8) index matrix becomes
# a (7, 8, 10) float array. All padding positions (index 0) map to the same
# vector, which is why the leading rows of each sentence repeat in the output.
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step


array([[[ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [-0.01866769, -0.02851596,  0.0208424 ,  0.03971031,
         -0.04042069,  0.03928827, -0.03697126, -0.02125756,
         -0.00044016, -0.01201725],
        [-0.02479745, -0.01638663,  0.01577786, -0.04962279,
         -0.03428215, -0.04031745,  0.02567836,  0.01237505,
          0.04936996, -0.04438325],
        [-0.02506733,  0.037515  ,  0.03388115, -0.0

In [15]:
# First padded sentence: four padding zeros followed by its four word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 4804, 4509, 7047, 2355], dtype=int32)

In [22]:
# predict() expects a batch axis, so the single sentence is reshaped from
# (sent_length,) to (1, sent_length). Using sent_length instead of a literal 8
# keeps this cell in sync with the padding length chosen earlier.
model.predict(embedded_docs[0].reshape(1, sent_length))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step


array([[[ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [ 0.01538545,  0.01764551, -0.0080392 ,  0.04928317,
         -0.01194482, -0.01810577, -0.00638794,  0.04928017,
         -0.03895007, -0.02424481],
        [-0.01866769, -0.02851596,  0.0208424 ,  0.03971031,
         -0.04042069,  0.03928827, -0.03697126, -0.02125756,
         -0.00044016, -0.01201725],
        [-0.02479745, -0.01638663,  0.01577786, -0.04962279,
         -0.03428215, -0.04031745,  0.02567836,  0.01237505,
          0.04936996, -0.04438325],
        [-0.02506733,  0.037515  ,  0.03388115, -0.0