-----------------------------
#### Embedding layer in Keras
------------------------------

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Toy corpus: each entry pairs a sentence with its binary label
labelled_sentences = [
    ("I love machine learning", 1),
    ("Deep learning is fascinating", 1),
    ("Natural language processing is a challenge", 0),
    ("I enjoy learning new things", 0),
    ("Machine learning models are powerful", 1),
]

# Split the pairs into the two parallel lists used by the rest of the notebook
sentences = [text for text, _ in labelled_sentences]
labels = [label for _, label in labelled_sentences]

#### Tokenization and integer encoding

In [4]:
# Initialize the Tokenizer with its default settings
tokenizer = Tokenizer()

# Build the word -> integer id vocabulary from the example sentences
tokenizer.fit_on_texts(sentences)

In [5]:
# Word index mapping: lower-cased word -> integer id, with ids starting at 1
# (in the output below, 'learning' appears in four sentences and gets id 1)
word_index = tokenizer.word_index
print(word_index)

{'learning': 1, 'i': 2, 'machine': 3, 'is': 4, 'love': 5, 'deep': 6, 'fascinating': 7, 'natural': 8, 'language': 9, 'processing': 10, 'a': 11, 'challenge': 12, 'enjoy': 13, 'new': 14, 'things': 15, 'models': 16, 'are': 17, 'powerful': 18}


In [6]:
# Convert sentences to integer sequences: each word is replaced by its id
# from word_index, so the sequences still differ in length at this point
sequences = tokenizer.texts_to_sequences(sentences)

In [7]:
# Show the encoded sentences (one list of word ids per sentence)
sequences

[[2, 5, 3, 1],
 [6, 1, 4, 7],
 [8, 9, 10, 4, 11, 12],
 [2, 13, 1, 14, 15],
 [3, 1, 16, 17, 18]]

#### Padding
- Since the sentences have different lengths, we pad them to ensure they all have the same length.

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to the same length
# The longest encoded sentence has 6 tokens, so max_length = 7 leaves at least
# one trailing 0 in every row; padding='post' appends the zeros at the end.
max_length = 7  # Maximum length of the sentences after padding
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Display the padded 5 x 7 integer array
padded_sequences

array([[ 2,  5,  3,  1,  0,  0,  0],
       [ 6,  1,  4,  7,  0,  0,  0],
       [ 8,  9, 10,  4, 11, 12,  0],
       [ 2, 13,  1, 14, 15,  0,  0],
       [ 3,  1, 16, 17, 18,  0,  0]])

#### Model Construction
- We construct a simple neural network model: an Embedding layer, a Flatten layer, and a single-unit sigmoid Dense layer for binary classification.

In [9]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [10]:
# Number of distinct words in the tokenizer vocabulary
len(word_index)

18

In [11]:
# Define vocabulary size and embedding size
vocab_size    = len(word_index) + 1  # 18 words + 1: word ids start at 1 and id 0 is the padding value
embedding_dim = 8                    # Dimension of the dense embedding

In [13]:
# Create the model: Embedding -> Flatten -> single sigmoid unit
model = Sequential()

# Map every word id of a padded sequence to a trainable vector of size embedding_dim
model.add(Embedding(input_dim   = vocab_size,     # 19 (18 words + padding id 0)
                    output_dim  = embedding_dim,  # 8
                    input_length= max_length))    # 7
# NOTE(review): `input_length` is reported as deprecated in Keras 3 — confirm against the installed version

# Flatten the per-token vectors into one vector per sentence for the Dense layer
model.add(Flatten())
# A single sigmoid unit outputs a value in (0, 1) for the binary label
model.add(Dense(1, activation='sigmoid'))

In [14]:
# Compile the model
# binary_crossentropy matches the single sigmoid output and the 0/1 labels
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])