#  Chapter 16: Natural Language Processing with RNNs and Attention

In [2]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1" 

import tensorflow as tf
import numpy as np
from pathlib import Path
import tf_keras

In [None]:

# Download the Shakespeare corpus (cached by Keras after the first call) and
# read it into memory as a single string.
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Explicit encoding: without it, open() falls back to the platform default,
# which is not UTF-8 everywhere.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [None]:
# Peek at the first 80 characters of the corpus.
print(shakespeare_text[:80])

In [None]:
# Distinct characters of the lower-cased corpus, sorted, shown together with
# how many there are.
all_chars = "".join(sorted({ch for ch in shakespeare_text.lower()}))
print(all_chars, len(all_chars), sep="\n")

## Generating Shakespearean Text Using a Character RNN

### Preparing Dataset for a char level rnn model

#### Text Vectorization

In [None]:
# Character-level tokenizer: lower-cases the text and splits it into characters.
text_vec_layer = tf.keras.layers.TextVectorization(split='character', standardize='lower')
text_vec_layer.adapt([shakespeare_text])  # build the vocabulary from the corpus
encoded = text_vec_layer([shakespeare_text])[0]  # [0]: the single document in the batch
encoded

In [None]:
# Drop token IDs 0 (padding) and 1 (unknown) by shifting every ID down by 2.
# The IDs are recomputed from the vectorizer rather than shifted in place
# (`encoded -= 2`), so re-running this cell cannot apply the shift twice.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2  # vocabulary without the 2 dropped IDs
dataset_size = len(encoded)
print("n_tokens:", n_tokens)
print("dataset_size:", dataset_size)

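This is a sequence-to-sequence (seq2seq) setup: for every input window, the target is the same window shifted one character ahead.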

In [None]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token sequence into batched (input, target) windows.

    Every window holds `length + 1` consecutive tokens and consecutive windows
    are shifted by one token. After batching, the input is each window without
    its last token and the target is the window without its first token.

    Args:
        sequence: 1-D sequence of token IDs.
        length: number of tokens in each input (and target) window.
        shuffle: if True, shuffle the windows with a 100,000-element buffer.
        seed: seed passed to the shuffle.
        batch_size: number of windows per batch.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches, prefetched by 1.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True)
    # window() yields nested datasets; batching each one flattens it to a tensor.
    windows = windows.flat_map(lambda w: w.batch(window_size))
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)
    batches = windows.batch(batch_size)
    return batches.map(lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

In [None]:
# Sanity check on a tiny input: there's just one sample in this dataset. The
# input represents "to b" and the output represents "o be".
list(to_dataset(text_vec_layer(["To be"])[0], length=4))

In [None]:
# Window length shared by all three splits.
length = 100
tf.random.set_seed(42)

# First 1,000,000 tokens for training (shuffled), the next 60,000 for
# validation, and the remainder for testing.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)


### Building and Training the Char-RNN Model


In [None]:
# Char-RNN: embedding -> GRU -> softmax over the n_tokens characters at every
# time step (return_sequences=True).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation='softmax'),
])


model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
              metrics=['accuracy'])

# Only keep the checkpoint with the best validation accuracy.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
 "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)

In [None]:
# Show the layers and parameter counts of the char-RNN.
model.summary()

In [None]:
# Train for a single epoch; model_ckpt saves the model when val_accuracy improves.
history = model.fit(train_set, validation_data=valid_set, epochs=1,
 callbacks=[model_ckpt])


#### shakespeare model

In [None]:
# Wrap the char-RNN so it accepts raw strings: vectorize the text, shift the
# IDs by -2 (the same shift applied to the training data), then run the model.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X:X-2),
    model
])

In [None]:
# Alternatively, download a pretrained model and load it; this replaces the
# `shakespeare_model` variable.
url = "https://github.com/ageron/data/raw/main/shakespeare_model.tgz"
path = tf.keras.utils.get_file("shakespeare_model.tgz", url, extract=True)
# The loaded directory is a sibling of the downloaded archive.
model_path = Path(path).with_name("shakespeare_model")
shakespeare_model = tf.keras.models.load_model(model_path)

In [None]:
# Display the path the pretrained model was loaded from.
model_path

In [None]:
# Show the layers of the string-in, probabilities-out model.
shakespeare_model.summary()

predicting next character:

In [None]:
# Probabilities for every time step of the single input string ([0] selects it).
y_propas = shakespeare_model.predict(['To be or not to b'])[0]
# The last time step holds the distribution over the next character.
print( y_propas[-1].shape,'\n\n', y_propas[-1],'\n')
predicted_char = tf.argmax(y_propas[-1])
print(predicted_char+2) # add 2 to undo the -2 shift and get the vectorizer's vocabulary index

print("predicted character is: ", text_vec_layer.get_vocabulary()[predicted_char+2])

### Generating 'Fake' Shakespearean Text

Greedy decoding predicts the most likely next character, appends it to the current text, then uses the extended text to predict the following character, and so on.

In practice, this approach often leads to the same words being repeated over and over.

Instead, we can sample the next character at random, with each character drawn according to the probability the model estimated for it.


In [None]:
tf.random.set_seed(412)

# tf.random.categorical expects log-probabilities (logits), so take the log of
# a toy 3-class distribution.
log_propas = tf.math.log([[.5,.4,.1]]) # simulate the logits
print("> logits: ", log_propas)

# Draw 8 class indices from that distribution.
print("> Sampling results: ", tf.random.categorical(log_propas, num_samples=8))

In [None]:
# Draw one sample and unwrap it from the (1, 1) result to a scalar.
tf.random.categorical(log_propas, num_samples=1).numpy()[0,0]

We can control the diversity of the generated text using the `temperature`:

> high values favor creativity (more diverse output)
> 
> low values favor precision (more predictable output)

In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from `shakespeare_model`.

    The probabilities predicted for the last time step are converted to
    log-probabilities, divided by `temperature`, and one index is drawn with
    `tf.random.categorical`.

    Args:
        text: the text generated so far.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, looked up in `text_vec_layer`'s vocabulary.
    """
    last_step_probas = shakespeare_model.predict([text], verbose=0)[0, -1:]
    scaled_logits = tf.math.log(last_step_probas) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    # +2 undoes the -2 shift between model IDs and vocabulary indices.
    vocab_index = sampled.numpy()[0, 0] + 2
    return text_vec_layer.get_vocabulary()[vocab_index]

def generate(text, n_char=50, temperature=1):
    """Extend `text` by `n_char` characters, sampling one character at a time.

    Args:
        text: the seed text.
        n_char: number of characters to append.
        temperature: forwarded to `next_char`.

    Returns:
        The seed text followed by the sampled characters.
    """
    for _step in range(n_char):
        # Each new character is predicted from everything generated so far.
        text = text + next_char(text, temperature)
    return text


In [None]:
tf.random.set_seed(42)
# Same seed text at a very low, a neutral and a very high temperature.
for temperature in (0.01, 1, 199):
    print(generate('to be or not to b', temperature=temperature), end='\n'+'='*50+'\n')

### Sampling from the top k characters

In [None]:
def next_char(text, model, text_vec_layer, k=5, temperature=1.0):
    """Sample the next character among the `k` most likely candidates.

    Args:
        text: the text generated so far.
        model: model mapping a batch of strings to per-step probabilities.
        text_vec_layer: vectorizer whose vocabulary maps indices to characters.
        k: number of highest-scoring candidates to sample from.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character.
    """
    last_step_probas = model.predict([text], verbose=0)[0, -1:]
    scaled_logits = tf.math.log(last_step_probas) / temperature
    # top_k returns both the k best scores and their indices, each of shape (1, k).
    best = tf.math.top_k(scaled_logits, k=k)
    choice = tf.random.categorical(best.values, num_samples=1)[0, 0]
    # Map the position within the top-k back to a vocabulary index
    # (+2 undoes the -2 shift between model IDs and vocabulary indices).
    vocab_index = best.indices[0].numpy()[int(choice)] + 2
    return text_vec_layer.get_vocabulary()[vocab_index]

def generate(text, model, text_vec_layer, n_chars=50, k=5, temperature=1.0):
    """Extend `text` by `n_chars` characters using top-k sampling.

    Args:
        text: the seed text.
        model, text_vec_layer, k, temperature: forwarded to `next_char`.
        n_chars: number of characters to append.

    Returns:
        The seed text followed by the sampled characters.
    """
    result = text
    for _step in range(n_chars):
        result = result + next_char(result, model, text_vec_layer, k, temperature)
    return result

In [None]:
tf.random.set_seed(42)
# Top-5 sampling from the same seed text at a moderate, a near-zero and a very
# high temperature.
for temperature in (0.7, 0.001, 60):
    generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer,
                              n_chars=100, k=5, temperature=temperature)
    print(generated_text)
    print('-'*60,'\n\n')

### Nucleus Sampling and Beam Search Generation

In [None]:
def generate_beam_search(text, max_length=50, beam_width=3, temperature=1.0):
    """Generate text with beam search over `shakespeare_model`'s predictions.

    At every step each beam is extended with its `beam_width` most likely next
    characters, and only the `beam_width` lowest-scoring candidates are kept.
    A beam's score is the sum of its negated, temperature-scaled
    log-probabilities, so lower is better.

    Note: every score is divided by the same `temperature`, so for any positive
    temperature the ranking of candidates (and therefore the result) is the
    same as with `temperature=1.0`.

    Args:
        text: the seed text.
        max_length: number of characters to append.
        beam_width: number of beams kept after each step.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The text of the best-scoring beam.
    """
    # The vocabulary does not change during generation: look it up once.
    vocabulary = text_vec_layer.get_vocabulary()
    beams = [(0.0, text)]

    for _ in range(max_length):
        candidates = []
        for score, beam_text in beams:
            y_probs = shakespeare_model.predict([beam_text], verbose=0)[0, -1:]
            logits = tf.math.log(y_probs) / temperature
            top_k_logits, top_k_indices = tf.math.top_k(logits, k=beam_width)

            for logit, token_idx in zip(top_k_logits[0].numpy(),
                                        top_k_indices[0].numpy()):
                # +2 undoes the -2 shift between model IDs and vocabulary indices.
                char = vocabulary[token_idx + 2]
                candidates.append((score - float(logit), beam_text + char))

        # Keep only the best candidates after each generation step.
        beams = sorted(candidates, key=lambda x: x[0])[:beam_width]

    return beams[0][1]

def generate_nucleus(text, max_length=50, p=0.9, temperature=1.0):
    """Generate text with nucleus (top-p) sampling from `shakespeare_model`.

    At every step the characters are sorted by probability and the next
    character is sampled from the smallest set of top characters whose
    cumulative probability reaches `p`.

    Args:
        text: the seed text.
        max_length: number of characters to append.
        p: cumulative-probability threshold of the nucleus; must be positive.
        temperature: divisor applied to the log-probabilities before the
            softmax that produces the sampling distribution.

    Returns:
        The seed text followed by the sampled characters.

    Raises:
        ValueError: if `p` is not positive (the nucleus would be empty).
    """
    if p <= 0:
        raise ValueError(f"p must be positive, got {p}")

    # The vocabulary does not change during generation: look it up once.
    vocabulary = text_vec_layer.get_vocabulary()
    result = text

    for _ in range(max_length):
        y_probs = shakespeare_model.predict([result], verbose=0)[0, -1:]
        logits = tf.math.log(y_probs) / temperature
        probs = tf.nn.softmax(logits, axis=-1)[0]
        sorted_indices = tf.argsort(probs, direction='DESCENDING')
        sorted_probs = tf.gather(probs, sorted_indices)
        # Probability mass of the tokens ranked strictly before each token. A
        # token belongs to the nucleus while that mass is still below p. This
        # always keeps the top token (mass 0) and includes the token that
        # crosses the threshold. Masking on `cumsum <= p` instead leaves an
        # empty nucleus (0/0 -> NaN) whenever the top probability exceeds p.
        mass_before = tf.cumsum(sorted_probs, exclusive=True)
        nucleus_mask = mass_before < p
        filtered_probs = sorted_probs * tf.cast(nucleus_mask, tf.float32)
        filtered_probs = filtered_probs / tf.reduce_sum(filtered_probs)
        sample_idx = tf.random.categorical(tf.math.log(filtered_probs[None, :]), num_samples=1)[0, 0]
        char_idx = sorted_indices[sample_idx]
        # +2 undoes the -2 shift between model IDs and vocabulary indices.
        result += vocabulary[char_idx.numpy() + 2]

    return result

In [None]:
# Compare both decoding strategies on the same seed text, each with two
# parameter settings.
print("-------------------Beam Search (conservative)-------------------")
print(generate_beam_search("to be or not to b", max_length=50, beam_width=3, temperature=0.7))
print("="*64)

print("\n---------------------Beam Search (standard)---------------------")
print(generate_beam_search("to be or not to b", max_length=50, beam_width=5, temperature=1.0))
print("="*64)

print("\n-------------------Nucleus Sampling (focused)-------------------")
print(generate_nucleus("to be or not to b", max_length=50, p=0.9, temperature=0.7))
print("="*64)

print("\n-------------------Nucleus Sampling (creative)------------------")
print(generate_nucleus("to be or not to b", max_length=50, p=0.95, temperature=1.3))

###  Stateful RNN

Preparing the dataset for a stateful RNN: it must use sequential, non-overlapping windows, rather than the shuffled, overlapping windows used for a stateless RNN.

In [None]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build consecutive, non-overlapping (input, target) windows, one per batch.

    Windows hold `length + 1` tokens and are shifted by `length`, so each
    window starts on the last token of the previous one. Batches contain a
    single window.

    Args:
        sequence: 1-D sequence of token IDs.
        length: number of tokens in each input (and target) window.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches of size 1,
        prefetched by 1.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    single_window_batches = windows.flat_map(
        lambda w: w.batch(window_size)).batch(1)
    return single_window_batches.map(
        lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

# Same train/validation/test token ranges as the stateless datasets.
stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000],length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

### Using Batching with a Stateful RNN

In [None]:
def to_non_overlapping_windows(sequence, length):
    """Split `sequence` into consecutive windows of `length + 1` tokens.

    Windows are shifted by `length`, so each window starts on the last token
    of the previous one; an incomplete final window is dropped.

    Returns:
        A `tf.data.Dataset` with one tensor of `length + 1` tokens per window.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    return windows.flat_map(lambda w: w.batch(window_size))

def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):
    """Build batched (input, target) windows suited to a stateful RNN.

    The sequence is split into `batch_size` contiguous parts. Row `i` of every
    batch comes from part `i`, and successive batches walk through each part
    in order, so row `i` of one batch continues row `i` of the previous batch.

    Args:
        sequence: 1-D sequence of token IDs.
        length: number of tokens in each input (and target) window.
        batch_size: number of parts, i.e. rows per batch.

    Returns:
        A `tf.data.Dataset` of `(inputs, targets)` batches, prefetched by 1.
    """
    per_row_windows = tuple(
        to_non_overlapping_windows(chunk, length)
        for chunk in np.array_split(sequence, batch_size))
    # zip() yields one window per part; stacking them forms a batch.
    stacked = tf.data.Dataset.zip(per_row_windows).map(
        lambda *rows: tf.stack(rows))
    return stacked.map(lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

# Rebuild the stateful datasets with the batched helper (default batch_size=32);
# this overwrites the single-window-per-batch versions.
stateful_train_set = to_batched_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_batched_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000],length)
stateful_test_set = to_batched_dataset_for_stateful_rnn(encoded[1_060_000:], length)

In [None]:
# Illustrate the batching on the integers 0..49: print the first two batches
# of inputs and targets, then stop.
for idx, (seq, tar) in enumerate(to_batched_dataset_for_stateful_rnn(tf.range(50), length=3, batch_size=4)):
    print('Sequence: \n', seq, '\nTarget: \n', tar, '\n\n')
    if idx>0: break

#### Building the stateful model

In [None]:
# Stateful char-RNN. The batch dimension is fixed at 32 in batch_input_shape,
# which matches the default batch_size of to_batched_dataset_for_stateful_rnn.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,
                              batch_input_shape=[32, None]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

class ResetStatesCallback(tf.keras.callbacks.Callback):
    """Keras callback that resets the model's states when each epoch begins."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of `self.model`.

        `logs` defaults to None, like the base `Callback.on_epoch_begin`, so
        the hook can also be invoked with only the epoch index.
        """
        self.model.reset_states()

# Only keep the checkpoint with the best validation accuracy.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_stateful_shakespeare_model.keras",
    monitor="val_accuracy",
    save_best_only=True)

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [None]:
# Train for 20 epochs, resetting the recurrent states at the start of each epoch.
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=20, callbacks=[ResetStatesCallback(), model_ckpt])

To use the model with different batch sizes, we need to create a stateless copy:


In [None]:
# Same architecture as the stateful model, but without stateful=True and
# without a fixed batch size.
stateless_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

# Build the model so its weights exist, then copy them from the stateful model.
stateless_model.build(tf.TensorShape([None, None]))
stateless_model.set_weights(model.get_weights())

# String-in wrapper around the stateless copy; replaces `shakespeare_model`.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
    stateless_model
])


In [None]:
# Generate 100 characters with top-5 sampling at a near-zero temperature.
generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer, n_chars=100, k=5, temperature=0.001)
print(generated_text)

## Sentiment Analysis

let's download the `imdb` dataset from tensorflow datasets

In [None]:
import tensorflow_datasets as tfds

# 90% of the official training split for training, the remaining 10% for
# validation, and the official test split for testing.
raw_train_set, raw_valid_set, raw_test_set = tfds.load('imdb_reviews', 
                                       split=['train[:90%]', 'train[90%:]', 'test'],
                                       as_supervised=True)
tf.random.set_seed(42)
# Note: these reuse the names train_set / valid_set / test_set of the
# Shakespeare datasets.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)


In [None]:
# Show the first 100 characters and the label of four training reviews.
for review, label in raw_train_set.take(4):
    print(">> ",review.numpy().decode('utf-8')[:100])
    print("Label: ", label.numpy())

#### sentiment analysis model

In [None]:
# Word-level vectorizer limited to vocab_size tokens, adapted on the training
# reviews only (labels are dropped). Note: this replaces the character-level
# `text_vec_layer` used in the Shakespeare section.
vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda text, _label: text))

embed_size = 128
tf.random.set_seed(42)
# Strings -> token IDs -> embeddings -> GRU -> probability of a positive review.
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Show the token IDs the vectorizer produces for the first training review.
for review, label in raw_train_set.take(1):
    print(text_vec_layer(review.numpy().decode('utf-8')))

Note that token 0 is reserved for padding and token 1 for unknown words.

In [None]:
# Show the layers and parameter counts of the sentiment model.
model.summary()

In [None]:
# Baseline run: no masking of the padding tokens.
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
history1 = model.fit(train_set, validation_data=valid_set, epochs=5)

We notice that the model performs very poorly. This is because the sequences fed to the model contain many padding tokens, which make the RNN forget what it has learned from the actual words.

<details>
<summary><h3>RNNs and Padding Issues Illustration (click for details)</h3></summary>

Let’s break this down step by step with a simple RNN example. We’ll demonstrate how zeros (padding tokens) lead to forgetting or losing information in the sequence `"What a goal, wow."`.

### Initial Setup
- **Input Sequence**: `"What a goal, wow."`
- After padding: `["What", "a", "goal,", "wow.", 0, 0, 0, ..., 0]`
- Assume each token is represented as an embedding vector for the RNN:
  ```plaintext
  ["What" → [1, 0.5],
   "a" → [0.2, 0.1],
   "goal," → [0.9, 0.7],
   "wow." → [1.2, 0.8],
   0 → [0, 0],  # Padding token mapped to [0, 0]
   0 → [0, 0], ..., 0 → [0, 0]]
  ```

### RNN Computation
For simplicity, assume:
- Hidden state size = 2
- Initial hidden state: `h_0 = [0, 0]`
- Weight matrices: `W_x`, `W_h`, and bias `b` (omitted explicit values for clarity)

The RNN computes at each timestep:
$$
h_t = \tanh(W_x \cdot x_t + W_h \cdot h_{t-1} + b)
$$

#### Step-by-Step
1. **First Token: `"What"`**
   - \( x_1 = [1, 0.5] \)
   - \( h_1 = \tanh(W_x \cdot [1, 0.5] + W_h \cdot [0, 0] + b) \)
   - Result: \( h_1 = [0.8, 0.6] \) (example value)

2. **Second Token: `"a"`**
   - \( x_2 = [0.2, 0.1] \)
   - \( h_2 = \tanh(W_x \cdot [0.2, 0.1] + W_h \cdot [0.8, 0.6] + b) \)
   - Result: \( h_2 = [0.7, 0.5] \)

3. **After `"goal,"` and `"wow."`**
   - Gradually builds up meaningful context:
     - \( h_3 = [0.9, 0.7] \), \( h_4 = [1.0, 0.8] \)

4. **Padding Tokens: `0`**
   - \( x_5 = [0, 0] \), \( x_6 = [0, 0] \), etc.
   - For these, \( h_t = \tanh(W_x \cdot [0, 0] + W_h \cdot h_{t-1} + b) \).
   - Since \( x_t = [0, 0] \), only \( W_h \cdot h_{t-1} + b \) contributes. Over multiple padding steps, the hidden state \( h_t \) drifts away from the meaningful context; in this example it decays:
     - \( h_5 \approx [0.6, 0.4] \)
     - \( h_6 \approx [0.3, 0.2] \)
     - Eventually, \( h_t \approx [0, 0] \).

### Key Observations
- **Information Loss**: The meaningful context \( h_4 = [1.0, 0.8] \) (derived from `"What a goal, wow."`) decays to near-zero as padding dominates.
- **Learning Challenges**: During training, the RNN might learn to ignore later timesteps entirely, assuming they don’t contain useful information.

</details>

We can use a mask to ignore zeros during computation and training. This helps the RNN focus only on the meaningful parts of the sequence.
This is done by setting `mask_zero` equal to true in the embedding layer, and it propagates the mask downstream to all layers that accept it.

In [None]:
# Same architecture as before, except that mask_zero=True makes the Embedding
# layer emit a mask for token ID 0 (padding).
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim = vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')

])

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
history2 = model.fit(train_set, validation_data=valid_set, epochs=5)

The model is now able to learn and performs well!

<details>
<summary><h3>Why is masking needed in internal layers, and how does padding affect the input layer?</h3></summary>

### Question:
**Why do we need masking in the internal layers (not just the first one), and how does padding zeros only in the input layer affect the sequence processing?**

### Answer:
1. Propagation of Padding Effect
Even though the first layer (e.g., an embedding or RNN) processes the padded input and replaces explicit zeros with meaningful values (e.g., the previous timestep's hidden state), those padding steps still represent "invalid" parts of the sequence.
Without a mask, internal layers may treat these invalid timesteps as meaningful, which can corrupt the learned representations.
For example:

Suppose the input sequence is [word1, word2, 0, 0], and the RNN replaces the padding steps with the hidden state of word2.
If an internal RNN or dense layer operates on these outputs without masking, it may treat the repeated values from word2 as meaningful information, skewing the results.

</details>

In [None]:
# Functional-API version that builds the padding mask by hand and passes it
# explicitly to the GRU instead of relying on mask_zero.
tf.random.set_seed(42)
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
mask = tf.math.not_equal(token_ids,0)  # True wherever the token is not padding (ID 0)
Z = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim=128)(token_ids)
Z = tf.keras.layers.GRU(128, dropout=.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history3 = model.fit(train_set, validation_data=valid_set, epochs=5)

#### Last approach using ragged tensors

In [None]:
# Vectorizer created with ragged=True, adapted on the training reviews only.
text_vec_layer_ragged = tf.keras.layers.TextVectorization(max_tokens=vocab_size, ragged=True)
text_vec_layer_ragged.adapt(train_set.map(lambda review, label: review))
text_vec_layer_ragged(["Great movie!", "This is DiCaprio's best role."])

In [None]:
# Same two sentences through the non-ragged vectorizer, for comparison.
text_vec_layer(["Great movie!", "This is DiCaprio's best role."])

In [None]:
embed_size = 128
tf.random.set_seed(42)
# Same architecture again, fed by the ragged vectorizer; the Embedding layer
# does not set mask_zero here.
model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history4 = model.fit(train_set, validation_data=valid_set, epochs=5)

#### Using TensorBoard for Embedding Visualization

In [None]:
embed_size = 128
tf.random.set_seed(42)
# Fresh copy of the ragged-input model, to be trained with a TensorBoard callback.
model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

model.summary()

In [None]:

vocab = text_vec_layer_ragged.get_vocabulary()

# Write one vocabulary entry per line, in vocabulary order, to serve as the
# embedding metadata. The encoding is explicit so that non-ASCII words are
# written the same way on every platform.
metadata_file = "metadata.tsv"
with open(metadata_file, 'w', encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in vocab)

# Log the embeddings once per epoch.
# NOTE(review): TensorBoard may resolve `embeddings_metadata` relative to the
# log directory, and `embeddings_layer_names` may be ignored by recent
# versions of the callback; confirm against the installed Keras version.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="./logs",
    embeddings_freq=1,
    embeddings_layer_names=['embedding'],
    embeddings_metadata=metadata_file,
    update_freq='epoch'
)


history5 = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=5,
    callbacks=[tensorboard_callback]
)

##  Reusing Pretrained Embeddings and Language Models


In [None]:
import os
import tensorflow_hub as hub

# Directory for downloaded TF Hub modules.
os.environ['TFHUB_CACHE_DIR'] = "my_tfhub_cache"
tf.random.set_seed(42)

# Pretrained Universal Sentence Encoder (loaded with trainable=True) followed
# by a small dense classifier.
model = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                   trainable=True, dtype=tf.string, input_shape=[]),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [None]:
# Train on the raw-string IMDB batches for 10 epochs.
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history_with_pretrained = model.fit(train_set,
                                    validation_data=valid_set, epochs=10)

## An Encoder–Decoder Network for Neural Machine Translation

In [41]:
# Download and extract the English-Spanish sentence pairs, then read the whole
# file into memory.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file('spa-eng.zip', origin=url, cache_dir='datasets', extract=True)
# The file contains accented Spanish text. Read it as UTF-8 explicitly rather
# than with the platform's default encoding.
text = (Path(path).with_name("spa-eng")/'spa.txt').read_text(encoding="utf-8")

In [42]:
import numpy as np

# Remove the inverted punctuation marks used at the start of Spanish
# exclamations and questions.
text = text.replace("¡", "").replace("¿", "")
# Each line holds tab-separated fields (English sentence, Spanish sentence).
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)  # extra code – ensures reproducibility on CPU
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 lists

In [43]:
# Show the first three sentence pairs after shuffling.
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

How boring! => Qué aburrimiento!
I love sports. => Adoro el deporte.
Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?


#### Tokenization

In [44]:
vocab_size = 1000
max_length = 50  # every vectorized sentence is padded or truncated to 50 tokens

text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)

text_vec_layer_en.adapt(sentences_en)
# Wrap the Spanish sentences in start/end markers so that both markers are
# part of the Spanish vocabulary.
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [45]:
# Show the first 10 entries of each vocabulary.
print("English Vocab samples: ", text_vec_layer_en.get_vocabulary()[:10])
print("Spanish Vocab samples: ", text_vec_layer_es.get_vocabulary()[:10])


English Vocab samples:  ['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']
Spanish Vocab samples:  ['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']


#### Preparing Dataset 

In [46]:
# Encoder inputs: raw English sentences (first 100,000 pairs for training,
# the rest for validation).
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])

# Decoder inputs: the Spanish sentence prefixed with the start marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])

# Decoder targets: token IDs of the Spanish sentence followed by the end
# marker, i.e. the decoder input shifted one word ahead.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])

#### Model Construction

In [47]:
tf.random.set_seed(42)

# Input layers: one raw string per example for the encoder and for the decoder.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

# Vectorize the strings into token IDs.
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Separate embedding layers for each language; mask_zero=True masks padding (ID 0).
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

#### Encoder Decoder

In [48]:
# Encoder: return_state=True makes the LSTM return its final states in
# addition to its output; the states are collected into `encoder_state`.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# Decoder: starts from the encoder's final states and outputs one vector per
# time step.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

<details>
    <summary><h3>What is <code>return_state</code> and what is it used for?</h3></summary>


### Answer:

#### It is the link between the encoder and decoder.

1. **Capturing Context**: The internal states of the LSTM (hidden state `h` and cell state `c`) capture the context and information from the input sequence. These states are essentially a summary of the input sequence.
2. **Passing Information**: In the encoder-decoder architecture, the encoder processes the input sequence and generates the final states. These states are then passed to the decoder as initial states. This allows the decoder to start generating the output sequence with the context of the input sequence.

### How It Works:
- **Encoder**: When `return_state=True`, the LSTM layer returns three outputs: its regular output (only the last time step's output, unless `return_sequences=True` is also set), the hidden state `h`, and the cell state `c`.
- **Decoder**: The decoder LSTM uses the hidden state `h` and cell state `c` from the encoder as its initial states. This helps the decoder generate the target sequence with the context of the input sequence.

### Example:
For example, an encoder LSTM can be defined as follows:
```python
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
```
Here, `return_state=True` ensures that the LSTM layer returns the hidden state `state_h` and cell state `state_c` along with the output sequence `encoder_outputs`. These states are then passed to the decoder:
```python
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
```
The `initial_state=encoder_states` parameter initializes the decoder LSTM with the states from the encoder, allowing the decoder to generate the output sequence with the context of the input sequence.

I hope this clarifies the use of `return_state`! Feel free to ask more questions or dive deeper into any specific aspect.

</details>

In [49]:
# Output layer (dense + softmax): one probability per Spanish vocabulary entry
# at every decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [50]:
# Two-input model: English sentence and start-prefixed Spanish sentence in,
# next-word probabilities out.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history_enc_dec = model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Translation

In [52]:
def translate(sentence_en, verbose=True):
    """Greedily translate an English sentence into Spanish, one word at a time.

    At step `i` the model is given the English sentence and the translation
    produced so far (prefixed with "startofseq"). The most probable word at
    decoder position `i` is appended to the translation. Decoding stops at
    "endofseq", after `max_length` words, or if the predicted ID falls outside
    the Spanish vocabulary.

    Args:
        sentence_en: the English sentence to translate.
        verbose: if True (default), print a trace of every decoding step.

    Returns:
        The translated sentence, without the start/end markers.
    """
    def _trace(message):
        # Step-by-step trace; silenced with verbose=False.
        if verbose:
            print(message)

    # Neither the encoder input nor the vocabulary changes between steps, so
    # both are built once instead of on every iteration.
    X = np.array([sentence_en])  # encoder input
    vocabulary = text_vec_layer_es.get_vocabulary()

    translation = ""
    for word_idx in range(max_length):
        step = word_idx + 1
        _trace(f"Step {step}: Encoder input X = {X}")

        # Decoder input: start marker followed by the words predicted so far.
        X_dec = np.array(["startofseq " + translation])
        _trace(f"Step {step}: Decoder input X_dec = {X_dec}")

        # Probabilities of the word at the current decoder position.
        y_proba = model.predict((X, X_dec))[0, word_idx]
        _trace(f"Step {step}: Model Outputs = {y_proba.shape}")
        predicted_word_id = np.argmax(y_proba)
        _trace(f"Step {step}: Predicted word ID = {predicted_word_id}")

        # The output layer may be wider than the adapted vocabulary.
        if predicted_word_id >= len(vocabulary):
            _trace(f"Step {step}: Predicted word ID {predicted_word_id} is out of vocabulary range")
            break
        predicted_word = vocabulary[predicted_word_id]
        _trace(f"Step {step}: Predicted word = {predicted_word}")

        if predicted_word == "endofseq":
            _trace(f"Step {step}: End-of-sequence token encountered")
            break

        translation += " " + predicted_word
        _trace(f"Step {step}: Updated translation = {translation}")

    return translation.strip()

In [53]:
# Translate a short sentence.
translate("I like soccer")

Step 1: Encoder input X = ['I like soccer']
Step 1: Decoder input X_dec = ['startofseq ']
Step 1: Model Outputs = (1000,)
Step 1: Predicted word ID = 14
Step 1: Predicted word = me
Step 1: Updated translation =  me
Step 2: Encoder input X = ['I like soccer']
Step 2: Decoder input X_dec = ['startofseq  me']
Step 2: Model Outputs = (1000,)
Step 2: Predicted word ID = 61
Step 2: Predicted word = gusta
Step 2: Updated translation =  me gusta
Step 3: Encoder input X = ['I like soccer']
Step 3: Decoder input X_dec = ['startofseq  me gusta']
Step 3: Model Outputs = (1000,)
Step 3: Predicted word ID = 10
Step 3: Predicted word = el
Step 3: Updated translation =  me gusta el
Step 4: Encoder input X = ['I like soccer']
Step 4: Decoder input X_dec = ['startofseq  me gusta el']
Step 4: Model Outputs = (1000,)
Step 4: Predicted word ID = 663
Step 4: Predicted word = fútbol
Step 4: Updated translation =  me gusta el fútbol
Step 5: Encoder input X = ['I like soccer']
Step 5: Decoder input X_dec = ['s

'me gusta el fútbol'

In [54]:
# Translate a longer sentence.
translate("I like soccer and also going to the beach")

Step 1: Encoder input X = ['I like soccer and also going to the beach']
Step 1: Decoder input X_dec = ['startofseq ']
Step 1: Model Outputs = (1000,)
Step 1: Predicted word ID = 14
Step 1: Predicted word = me
Step 1: Updated translation =  me
Step 2: Encoder input X = ['I like soccer and also going to the beach']
Step 2: Decoder input X_dec = ['startofseq  me']
Step 2: Model Outputs = (1000,)
Step 2: Predicted word ID = 61
Step 2: Predicted word = gusta
Step 2: Updated translation =  me gusta
Step 3: Encoder input X = ['I like soccer and also going to the beach']
Step 3: Decoder input X_dec = ['startofseq  me gusta']
Step 3: Model Outputs = (1000,)
Step 3: Predicted word ID = 31
Step 3: Predicted word = y
Step 3: Updated translation =  me gusta y
Step 4: Encoder input X = ['I like soccer and also going to the beach']
Step 4: Decoder input X_dec = ['startofseq  me gusta y']
Step 4: Model Outputs = (1000,)
Step 4: Predicted word ID = 6
Step 4: Predicted word = a
Step 4: Updated translati

'me gusta y a la escuela lo que [UNK] en mí'

The translation says “I like and go to school what [unk] in me”. So how can you
 improve it? One way is to increase the training set size and add more LSTM layers in
 both the encoder and the decoder. But this will only get you so far, so let’s look at
 more sophisticated techniques, starting with bidirectional recurrent layers.

## Bidirectional RNNs

 At each time step, a regular recurrent layer only looks at past and present inputs
 before generating its output. In other words, it is causal, meaning it cannot look
 into the future. This type of RNN makes sense when forecasting time series, or
 in the decoder of a sequence-to-sequence (seq2seq) model. But for tasks like text
 classification, or in the encoder of a seq2seq model, it is often preferable to look
 ahead at the next words before encoding a given word.

To create a bidirectional recurrent layer, just wrap a regular recurrent layer in a Bidirectional layer:

In [55]:
# Bidirectional encoder: the wrapped LSTM has 256 units, so concatenating the
# two directions' states gives 512 values, matching the 512-unit decoder below.
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True))


# we must concat the states from the encoder:
# (this reuses the `encoder_embeddings` / `decoder_embeddings` tensors built earlier)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)

# same decoder and training:
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)
# Replaces `model`, which translate() reads as a global.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7e575960bcd0>

In [56]:
# Same sentence as before, now translated by the bidirectional-encoder model.
translate("I like soccer and also going to the beach")

Step 1: Encoder input X = ['I like soccer and also going to the beach']
Step 1: Decoder input X_dec = ['startofseq ']
Step 1: Model Outputs = (1000,)
Step 1: Predicted word ID = 14
Step 1: Predicted word = me
Step 1: Updated translation =  me
Step 2: Encoder input X = ['I like soccer and also going to the beach']
Step 2: Decoder input X_dec = ['startofseq  me']
Step 2: Model Outputs = (1000,)
Step 2: Predicted word ID = 61
Step 2: Predicted word = gusta
Step 2: Updated translation =  me gusta
Step 3: Encoder input X = ['I like soccer and also going to the beach']
Step 3: Decoder input X_dec = ['startofseq  me gusta']
Step 3: Model Outputs = (1000,)
Step 3: Predicted word ID = 52
Step 3: Predicted word = hacer
Step 3: Updated translation =  me gusta hacer
Step 4: Encoder input X = ['I like soccer and also going to the beach']
Step 4: Decoder input X_dec = ['startofseq  me gusta hacer']
Step 4: Model Outputs = (1000,)
Step 4: Predicted word ID = 1
Step 4: Predicted word = [UNK]
Step 4: U

'me gusta hacer [UNK] a la playa por la playa'

## Beam Search

<font size=3>We can boost our encoder–decoder model’s performance without any extra training, simply by using it more wisely.</font>

In [57]:
def beam_search(sentence_en, beam_width, verbose=False):
    """Translate `sentence_en` using beam search over the encoder–decoder model.

    At every decoding step the `beam_width` most likely partial translations
    (ranked by summed log-probability) are kept and each unfinished one is
    extended by every word of the target vocabulary.

    Relies on the notebook globals `model`, `text_vec_layer_es` and
    `max_length`.

    Args:
        sentence_en: source sentence, passed to the model as the encoder input.
        beam_width: number of partial translations kept at each step (>= 1).
        verbose: if True, print the beam after every step.

    Returns:
        The highest-scoring translation with the "endofseq" marker removed.
        If the beam has not fully finished after `max_length` steps, the best
        translation found so far is returned (instead of None).

    Raises:
        ValueError: if `beam_width` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    # Fetch the vocabulary once: get_vocabulary() rebuilds the whole list on
    # every call, and it was previously invoked for every word of every step.
    vocabulary = text_vec_layer_es.get_vocabulary()

    X = np.array([sentence_en])  # encoder input (same for every step)
    X_dec = np.array(["startofseq"])  # decoder input
    y_proba = model.predict((X, X_dec))[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X_dec = np.array(["startofseq " + translation])  # decoder input
            y_proba = model.predict((X, X_dec))[0, idx]  # last token's proba
            for word_id, word_proba in enumerate(y_proba):
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {vocabulary[word_id]}"))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break  # every translation in the beam is complete

    # Reached either because the whole beam finished or because max_length
    # was hit; in both cases return the best-scoring translation.
    if not top_translations:
        return ""
    return top_translations[0][1].replace("endofseq", "").strip()

In [58]:
# Sample sentence, also reused by the beam-search cells further down.
sentence_en = "I love cats and dogs"
# Baseline output from `translate` (defined outside this section), to compare
# against the beam-search results.
translate(sentence_en)

Step 1: Encoder input X = ['I love cats and dogs']
Step 1: Decoder input X_dec = ['startofseq ']
Step 1: Model Outputs = (1000,)
Step 1: Predicted word ID = 14
Step 1: Predicted word = me
Step 1: Updated translation =  me
Step 2: Encoder input X = ['I love cats and dogs']
Step 2: Decoder input X_dec = ['startofseq  me']
Step 2: Model Outputs = (1000,)
Step 2: Predicted word ID = 1
Step 2: Predicted word = [UNK]
Step 2: Updated translation =  me [UNK]
Step 3: Encoder input X = ['I love cats and dogs']
Step 3: Decoder input X_dec = ['startofseq  me [UNK]']
Step 3: Model Outputs = (1000,)
Step 3: Predicted word ID = 21
Step 3: Predicted word = los
Step 3: Updated translation =  me [UNK] los
Step 4: Encoder input X = ['I love cats and dogs']
Step 4: Decoder input X_dec = ['startofseq  me [UNK] los']
Step 4: Model Outputs = (1000,)
Step 4: Predicted word ID = 556
Step 4: Predicted word = perros
Step 4: Updated translation =  me [UNK] los perros
Step 5: Encoder input X = ['I love cats and do

'me [UNK] los perros y no los gatos'

In [59]:
# Keep the 3 best partial translations at each step; verbose=True prints the
# beam after every step.
beam_search(sentence_en, beam_width=3, verbose=True)

Top first words: [(-0.07913118, 'me'), (-2.8435392, 'yo'), (-4.2571306, 'odio')]
Top translations so far: [(-0.40813845, 'me [UNK]'), (-2.2316828, 'me encanta'), (-2.4425611, 'me odio')]
Top translations so far: [(-1.0272044, 'me [UNK] los'), (-2.302462, 'me [UNK] y'), (-2.4974494, 'me [UNK] a')]
Top translations so far: [(-1.0715384, 'me [UNK] los perros'), (-2.5088043, 'me [UNK] a los'), (-2.734133, 'me [UNK] y a')]
Top translations so far: [(-1.2074786, 'me [UNK] los perros y'), (-2.5113072, 'me [UNK] a los perros'), (-2.9482303, 'me [UNK] y a los')]
Top translations so far: [(-2.0225499, 'me [UNK] los perros y no'), (-2.2251234, 'me [UNK] los perros y los'), (-2.959288, 'me [UNK] y a los perros')]
Top translations so far: [(-2.333189, 'me [UNK] los perros y no los'), (-2.415937, 'me [UNK] los perros y los gatos'), (-3.593313, 'me [UNK] y a los perros [UNK]')]
Top translations so far: [(-2.415949, 'me [UNK] los perros y los gatos endofseq'), (-3.2233038, 'me [UNK] los perros y no lo

'me [UNK] los perros y los gatos'

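Penalize the `[UNK]` token to discourage the model from using it: the redefined `beam_search` below never starts a translation with `[UNK]` and divides its probability by 5 at every later step.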

In [68]:
def beam_search(sentence_en, beam_width, verbose=False, unk_penalty=5):
    """Beam-search translation of `sentence_en` that discourages "[UNK]".

    "[UNK]" is never allowed as the first word, and at every later step its
    probability is divided by `unk_penalty` before candidates are ranked.

    Relies on the notebook globals `model`, `text_vec_layer_es` and
    `max_length`.

    Args:
        sentence_en: source sentence, passed to the model as the encoder input.
        beam_width: number of partial translations kept at each step (>= 1).
        verbose: if True, print the beam after every step.
        unk_penalty: divisor applied to the probability of "[UNK]" after the
            first step. Defaults to 5 (the previously hard-coded value); 1
            disables the penalty.

    Returns:
        The highest-scoring translation with the "endofseq" marker removed.
        If the beam has not fully finished after `max_length` steps, the best
        translation found so far is returned (instead of None).

    Raises:
        ValueError: if `beam_width` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    # Fetch the vocabulary once: get_vocabulary() rebuilds the whole list on
    # every call, and it was previously invoked for every word of every step.
    vocabulary = text_vec_layer_es.get_vocabulary()

    X = np.array([sentence_en])  # encoder input (same for every step)
    X_dec = np.array(["startofseq"])  # decoder input
    y_proba = model.predict((X, X_dec))[0, 0]  # first token's probas
    # Ask for one spare candidate so that dropping "[UNK]" cannot shrink the
    # initial beam. Filtering after an exact top-`beam_width` could leave it
    # empty (beam_width=1 with "[UNK]" on top), which used to end in an
    # IndexError further down.
    top_k = tf.math.top_k(y_proba, k=min(beam_width + 1, len(y_proba)))
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
        if vocabulary[word_id] != "[UNK]"
    ][:beam_width]

    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X_dec = np.array(["startofseq " + translation])  # decoder input
            y_proba = model.predict((X, X_dec))[0, idx]  # last token's proba
            for word_id, word_proba in enumerate(y_proba):
                word = vocabulary[word_id]
                if word == "[UNK]":
                    word_proba = word_proba / unk_penalty  # penalize [UNK] token
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break  # every translation in the beam is complete

    # Reached either because the whole beam finished or because max_length
    # was hit; in both cases return the best-scoring translation.
    if not top_translations:
        return ""
    return top_translations[0][1].replace("endofseq", "").strip()


In [69]:
# Re-run the same sentence with the [UNK]-penalizing beam_search defined in
# the previous cell, which replaces the earlier definition.
beam_search(sentence_en, beam_width=3)



'me odio los perros y los gatos'