#  Chapter 16: Natural Language Processing with RNNs and Attention

In [None]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1" 

import tensorflow as tf
import numpy as np
from pathlib import Path
import tf_keras

In [None]:

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [None]:
print(shakespeare_text[:80])

In [None]:
all_chars  = "".join(sorted(set(shakespeare_text.lower())))
print(all_chars)
print(len(all_chars))

## Generating Shakespearean Text Using a Character RNN

### Preparing Dataset for a char level rnn model

#### Text Vectorization

In [None]:
# Character-level vectorizer: the text is lower-cased and split into single
# characters, each of which becomes one token id.
text_vec_layer = tf.keras.layers.TextVectorization(split='character', standardize='lower')
text_vec_layer.adapt([shakespeare_text])
# The layer is called on a batch of one text; [0] extracts that single
# encoded sequence from the batch.
encoded = text_vec_layer([shakespeare_text])[0]
encoded

In [None]:
# Re-derive `encoded` from the vectorizer output instead of shifting it in
# place, so re-running this cell cannot subtract the offset a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2  # drop 0 for padding and 1 for unknown tokens
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)
print("n_tokens:", n_tokens)
print("dataset_size:", dataset_size)

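This is a seq2seq model: each input window is paired with a target window of the same length, shifted one character ahead.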

In [None]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Build a dataset of (input, target) window batches from a token sequence.

    Args:
        sequence: sequence of token ids to slice into windows.
        length: number of time steps in each input (and each target) window.
        shuffle: when true, shuffle the windows before batching.
        seed: seed passed to the shuffle step.
        batch_size: number of windows per batch.

    Returns:
        A prefetched dataset whose elements are (inputs, targets) pairs, where
        the targets are the inputs shifted one step ahead.
    """
    window_size = length + 1  # one extra token so the target can be shifted

    def split_inputs_and_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)
    return windows.batch(batch_size).map(split_inputs_and_targets).prefetch(1)

In [None]:
# There's just one sample in this dataset: the input represents "to b" and the
# output represents "o be"
list(to_dataset(text_vec_layer(["To be"])[0], length=4))

In [None]:
length = 100
tf.random.set_seed(42)

# Pass `length` rather than repeating the literal, so the window size is
# defined in exactly one place (it is reused later for the stateful datasets).
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)


### Building and Training the Char-RNN Model


In [None]:
# Char-RNN: token ids -> 16-dimensional embeddings -> GRU that emits an output
# at every time step (return_sequences=True) -> softmax over the n_tokens
# possible next characters.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation='softmax'),
])


# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
              metrics=['accuracy'])

# Checkpoint callback monitoring validation accuracy (save_best_only=True).
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
 "my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)

In [None]:
model.summary()

In [None]:
history = model.fit(train_set, validation_data=valid_set, epochs=1,
 callbacks=[model_ckpt])


#### shakespeare model

In [None]:
# Wrap the char-RNN so it accepts raw strings: vectorize the text, shift the
# ids by -2 (the same offset applied to `encoded`), then run the model.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X:X-2),
    model
])

In [None]:
# Alternatively, we can use the pretrained model.
# NOTE: this rebinds `shakespeare_model`, replacing the wrapper built in the
# previous cell.
url = "https://github.com/ageron/data/raw/main/shakespeare_model.tgz"
path = tf.keras.utils.get_file("shakespeare_model.tgz", url, extract=True)
# The model directory is looked up next to the path returned by get_file.
model_path = Path(path).with_name("shakespeare_model")
shakespeare_model = tf.keras.models.load_model(model_path)

In [None]:
model_path

In [None]:
shakespeare_model.summary()

predicting next character:

In [None]:
y_propas = shakespeare_model.predict(['To be or not to b'])[0]
print( y_propas[-1].shape,'\n\n', y_propas[-1],'\n')
predicted_char = tf.argmax(y_propas[-1])
print(predicted_char+2) # predicted character index + 2 to map the original char again

print("predicted character is: ", text_vec_layer.get_vocabulary()[predicted_char+2])

### Generating Shakespearean 'FAKE' text

Greedy decoding predicts the next character, appends it to the current text, then uses the extended text to predict the following character, and so on.

This approach often leads to repeated words.

Instead, we can use random sampling, drawing the next character according to the probabilities estimated by the model.


In [None]:
tf.random.set_seed(412)

log_propas = tf.math.log([[.5,.4,.1]]) # simulate the logits
print("> logits: ", log_propas)

print("> Sampling results: ", tf.random.categorical(log_propas, num_samples=8))

In [None]:
tf.random.categorical(log_propas, num_samples=1).numpy()[0,0]

We can control the diversity of the generated text using `temperature`:

> high values favor creativity
> 
> low values favor precision

In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text`.

    The model's probabilities for the last time step are converted to logits,
    divided by `temperature`, and one token is drawn at random from them.

    Args:
        text: the text generated so far.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.
    """
    last_step_probs = shakespeare_model.predict([text], verbose=0)[0, -1:]
    scaled_logits = tf.math.log(last_step_probs) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    # +2 undoes the id shift applied before the char-RNN.
    token_id = sampled.numpy()[0, 0] + 2
    vocabulary = text_vec_layer.get_vocabulary()
    return vocabulary[token_id]

def generate(text, n_char=50, temperature=1):
    """Extend `text` by `n_char` characters sampled one at a time.

    Args:
        text: the seed text.
        n_char: number of characters to append.
        temperature: forwarded to `next_char`.

    Returns:
        The seed text followed by the generated characters.
    """
    extended = text
    for _step in range(n_char):
        extended = extended + next_char(extended, temperature)
    return extended


In [None]:
tf.random.set_seed(42)
print(generate('to be or not to b', temperature=0.01),end= '\n'+'='*50+'\n')
print(generate('to be or not to b', temperature=1),end= '\n'+'='*50+'\n')
print(generate('to be or not to b', temperature=199),end= '\n'+'='*50+'\n')

### Sampling from top k characters

In [None]:
def next_char(text, model, text_vec_layer, k=5, temperature=1.0):
    """Sample the next character from the `k` most likely candidates.

    Args:
        text: the text generated so far.
        model: model mapping a batch of strings to per-step probabilities.
        text_vec_layer: vectorizer whose vocabulary decodes the sampled id.
        k: number of top candidates to sample from.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character.
    """
    last_step_probs = model.predict([text], verbose=0)[0, -1:]
    scaled_logits = tf.math.log(last_step_probs) / temperature
    top_k = tf.math.top_k(scaled_logits, k=k)
    # Sample a position among the k candidates using their scaled logits.
    candidate_logits = tf.reshape(top_k.values, (1, -1))
    choice = tf.random.categorical(candidate_logits, num_samples=1)[0, 0]
    candidate_ids = top_k.indices[0].numpy()
    vocabulary = text_vec_layer.get_vocabulary()
    # +2 undoes the id shift applied before the char-RNN.
    return vocabulary[candidate_ids[choice] + 2]

def generate(text, model, text_vec_layer, n_chars=50, k=5, temperature=1.0):
    """Extend `text` by `n_chars` characters using top-k sampling.

    Args:
        text: the seed text.
        model: forwarded to `next_char`.
        text_vec_layer: forwarded to `next_char`.
        n_chars: number of characters to append.
        k: number of top candidates considered at each step.
        temperature: forwarded to `next_char`.

    Returns:
        The seed text followed by the generated characters.
    """
    output = text
    for _step in range(n_chars):
        output = output + next_char(output, model, text_vec_layer, k, temperature)
    return output

In [None]:
tf.random.set_seed(42)
generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer, n_chars=100, k=5, temperature=0.7)
print(generated_text)
print('-'*60,'\n\n')

generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer, n_chars=100, k=5, temperature=0.001)
print(generated_text)
print('-'*60,'\n\n')

generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer, n_chars=100, k=5, temperature=60)
print(generated_text)
print('-'*60,'\n\n')

### Nucleus Sampling and Beam Search Generation

In [None]:
def generate_beam_search(text, max_length=50, beam_width=3, temperature=1.0):
    """Generate text with beam search over the character model.

    At every step each beam is expanded with its `beam_width` most likely next
    characters, and only the `beam_width` best-scoring candidates are kept.

    Args:
        text: the seed text.
        max_length: number of characters to generate.
        beam_width: number of beams kept after each step.
        temperature: divisor applied to the log-probabilities before scoring.

    Returns:
        The text of the best-scoring beam (the seed text itself when
        `max_length` is not positive).
    """
    if max_length <= 0:
        return text  # nothing to generate

    vocabulary = text_vec_layer.get_vocabulary()  # loop-invariant lookup table
    beams = [(0.0, text)]  # (accumulated negative log-likelihood, text)

    for _ in range(max_length):
        candidates = []
        for score, beam_text in beams:
            y_probs = shakespeare_model.predict([beam_text], verbose=0)[0, -1:]
            # Renormalise after temperature scaling so every step adds a true
            # log-probability; otherwise scores from different beams are not
            # comparable when temperature != 1.
            log_probs = tf.nn.log_softmax(tf.math.log(y_probs) / temperature, axis=-1)
            # top_k fails if k exceeds the number of classes.
            k = min(beam_width, int(log_probs.shape[-1]))
            top_k_log_probs, top_k_indices = tf.math.top_k(log_probs, k=k)

            for log_prob, token_idx in zip(top_k_log_probs[0].numpy(),
                                           top_k_indices[0].numpy()):
                # +2 undoes the id shift applied before the char-RNN.
                new_text = beam_text + vocabulary[token_idx + 2]
                candidates.append((score - float(log_prob), new_text))

        # Keep only the best candidates after each generation step.
        beams = sorted(candidates, key=lambda x: x[0])[:beam_width]

    return beams[0][1]

def generate_nucleus(text, max_length=50, p=0.9, temperature=1.0):
    """Generate text with nucleus (top-p) sampling.

    At every step the candidates are sorted by probability and sampling is
    restricted to the smallest prefix whose cumulative probability reaches
    `p`. The most likely token is always part of that prefix.

    Args:
        text: the seed text.
        max_length: number of characters to generate.
        p: cumulative probability mass of the nucleus.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The seed text followed by the generated characters (the seed text
        itself when `max_length` is not positive).
    """
    if max_length <= 0:
        return text  # nothing to generate

    vocabulary = text_vec_layer.get_vocabulary()  # loop-invariant lookup table
    result = text

    for _ in range(max_length):
        y_probs = shakespeare_model.predict([result], verbose=0)[0, -1:]
        logits = tf.math.log(y_probs) / temperature
        probs = tf.nn.softmax(logits, axis=-1)[0]
        sorted_indices = tf.argsort(probs, direction='DESCENDING')
        sorted_probs = tf.gather(probs, sorted_indices)
        cumulative_probs = tf.cumsum(sorted_probs)
        # Keep a token when the mass *before* it is still below p. This
        # includes the token that crosses the threshold and guarantees a
        # non-empty nucleus; `cumulative_probs <= p` is empty whenever the top
        # token alone exceeds p, which made the renormalisation 0/0 = NaN.
        nucleus_mask = (cumulative_probs - sorted_probs) < p
        filtered_probs = sorted_probs * tf.cast(nucleus_mask, sorted_probs.dtype)
        filtered_probs = filtered_probs / tf.reduce_sum(filtered_probs)
        # Excluded tokens have probability 0, i.e. logit -inf: never sampled.
        sample_idx = tf.random.categorical(tf.math.log(filtered_probs[None, :]), num_samples=1)[0, 0]
        char_idx = sorted_indices[sample_idx]
        # +2 undoes the id shift applied before the char-RNN.
        result += vocabulary[char_idx.numpy() + 2]

    return result

In [None]:
print("-------------------Beam Search (conservative)-------------------")
print(generate_beam_search("to be or not to b", max_length=50, beam_width=3, temperature=0.7))
print("="*64)

print("\n---------------------Beam Search (standard)---------------------")
print(generate_beam_search("to be or not to b", max_length=50, beam_width=5, temperature=1.0))
print("="*64)

print("\n-------------------Nucleus Sampling (focused)-------------------")
print(generate_nucleus("to be or not to b", max_length=50, p=0.9, temperature=0.7))
print("="*64)

print("\n-------------------Nucleus Sampling (creative)------------------")
print(generate_nucleus("to be or not to b", max_length=50, p=0.95, temperature=1.3))

###  Stateful RNN

Preparing the dataset for a stateful RNN: it must use sequential, non-overlapping windows, rather than the shuffled, overlapping windows used for a stateless RNN.

In [None]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build consecutive, non-overlapping (input, target) windows, one per batch.

    Args:
        sequence: sequence of token ids.
        length: number of time steps in each input (and each target) window.

    Returns:
        A prefetched dataset of (inputs, targets) pairs with batch size 1,
        where the targets are the inputs shifted one step ahead.
    """
    window_size = length + 1  # one extra token so the target can be shifted
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    single_window_batches = windows.flat_map(
        lambda chunk: chunk.batch(window_size)).batch(1)
    return single_window_batches.map(
        lambda batch: (batch[:, :-1], batch[:, 1:])).prefetch(1)

stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000],length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

### Using Batching with Stateful RNN

In [None]:
def to_non_overlapping_windows(sequence, length):
    """Slice `sequence` into consecutive windows of `length + 1` tokens.

    Successive windows start `length` tokens apart, and a trailing incomplete
    window is dropped.

    Args:
        sequence: sequence of token ids.
        length: stride between windows; each window holds `length + 1` tokens.

    Returns:
        A dataset yielding one window tensor per element.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    return windows.flat_map(lambda chunk: chunk.batch(window_size))

def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):
    """Build batched (input, target) windows for a stateful RNN.

    The sequence is split into `batch_size` parts; each part is windowed
    independently and the n-th windows of all parts are stacked into the n-th
    batch, so a given batch row always continues the same part.

    Args:
        sequence: sequence of token ids.
        length: number of time steps in each input (and each target) window.
        batch_size: number of parts, i.e. rows per batch.

    Returns:
        A prefetched dataset of (inputs, targets) pairs, where the targets are
        the inputs shifted one step ahead.
    """
    per_row_datasets = []
    for chunk in np.array_split(sequence, batch_size):
        per_row_datasets.append(to_non_overlapping_windows(chunk, length))

    def to_inputs_and_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    stacked = tf.data.Dataset.zip(tuple(per_row_datasets)).map(
        lambda *rows: tf.stack(rows))
    return stacked.map(to_inputs_and_targets).prefetch(1)

stateful_train_set = to_batched_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_batched_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000],length)
stateful_test_set = to_batched_dataset_for_stateful_rnn(encoded[1_060_000:], length)

In [None]:
for idx, (seq, tar) in enumerate(to_batched_dataset_for_stateful_rnn(tf.range(50), length=3, batch_size=4)):
    print('Sequence: \n', seq, '\nTarget: \n', tar, '\n\n')
    if idx>0: break

#### Building the stateful model

In [None]:
# Stateful char-RNN. A stateful layer needs a fixed batch size, given here via
# batch_input_shape; 32 matches the default batch_size of
# to_batched_dataset_for_stateful_rnn.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,
                              batch_input_shape=[32, None]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

class ResetStatesCallback(tf.keras.callbacks.Callback):
    """Reset the model's recurrent states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature, so the
        # hook can also be called without it.
        self.model.reset_states()

model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_stateful_shakespeare_model.keras",
    monitor="val_accuracy",
    save_best_only=True)

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [None]:
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=20, callbacks=[ResetStatesCallback(), model_ckpt])

To use the model with different batch sizes, we need to create a stateless copy:


In [None]:
# Same architecture as the stateful model, but without stateful=True and
# without a fixed batch size, so it can be called on batches of any size.
stateless_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

# Build the layers first so their weights exist, then copy the stateful
# model's current weights into them.
stateless_model.build(tf.TensorShape([None, None]))
stateless_model.set_weights(model.get_weights())

# NOTE: this rebinds `shakespeare_model` to a wrapper around the stateless copy.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
    stateless_model
])


In [None]:
generated_text = generate("To be or not to b", shakespeare_model, text_vec_layer, n_chars=100, k=5, temperature=0.001)
print(generated_text)

## Sentiment Analysis

let's download the `imdb` dataset from tensorflow datasets

In [None]:
import tensorflow_datasets as tfds

raw_train_set, raw_valid_set, raw_test_set = tfds.load('imdb_reviews', 
                                       split=['train[:90%]', 'train[90%:]', 'test'],
                                       as_supervised=True)
tf.random.set_seed(42)
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)


In [16]:
for review, label in raw_train_set.take(4):
    print(">> ",review.numpy().decode('utf-8')[:100])
    print("Label: ", label.numpy())

>>  This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. 
Label:  0
>>  I have been known to fall asleep during films, but this is usually due to a combination of things in
Label:  0
>>  Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brenn
Label:  0
>>  This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with i
Label:  1


#### sentiment analysis model

In [35]:
vocab_size = 1000
# NOTE: this rebinds `text_vec_layer`, which earlier cells used for the
# character-level vectorizer; from here on it is the word-level IMDb one.
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda review, label: review))

embed_size = 128
tf.random.set_seed(42)
# Use `embed_size` for the embedding width instead of repeating the literal,
# consistent with the later models in this section.
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [36]:
for review, label in raw_train_set.take(1):
    print(text_vec_layer(review.numpy().decode('utf-8')))

tf.Tensor(
[ 11  14  34 410 383  18  90  28   1   8  33   1   1  41 491   1 192  24
  86 152  19  11 218 315  28  65 241 217   8 487  54  65  86 113  95  22
   1  11  93 644 729  11  18   7  34 396   1 171   1 404   2  88   1 137
  67 144  52   2   1   1  67 245  65   1  16   1   1   1   1   1   1   3
  40   1   1  17   1  14 158  19   4   1 874   1   8   4  18  12  14   1
   5  98 146   1  10 237 688  12  48  24  93  39  11   1 152  39   1   1
  50 403  10  95   1 863 140   9], shape=(116,), dtype=int64)


Note that token 0 is reserved for padding and token 1 for unknown words.

In [37]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_5 (Embedding)     (None, None, 128)         128000    
                                                                 
 gru_4 (GRU)                 (None, 128)               99072     
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 227201 (887.50 KB)
Trainable params: 227201 (887.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
history1 = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


We notice that the model performance is very poor. This is because there are many padding tokens in the sequences fed to the model, which makes the RNN forget what it learned.

<details>
<summary><h3>RNNs and Padding Issues Illustration (click for details)</h3></summary>

Let’s break this down step by step with a simple RNN example. We’ll demonstrate how zeros (padding tokens) lead to forgetting or losing information in the sequence `"What a goal, wow."`.

### Initial Setup
- **Input Sequence**: `"What a goal, wow."`
- After padding: `["What", "a", "goal,", "wow.", 0, 0, 0, ..., 0]`
- Assume each token is represented as an embedding vector for the RNN:
  ```plaintext
  ["What" → [1, 0.5],
   "a" → [0.2, 0.1],
   "goal," → [0.9, 0.7],
   "wow." → [1.2, 0.8],
   0 → [0, 0],  # Padding token mapped to [0, 0]
   0 → [0, 0], ..., 0 → [0, 0]]
  ```

### RNN Computation
For simplicity, assume:
- Hidden state size = 2
- Initial hidden state: `h_0 = [0, 0]`
- Weight matrices: `W_x`, `W_h`, and bias `b` (omitted explicit values for clarity)

The RNN computes at each timestep:
$$
h_t = \tanh(W_x \cdot x_t + W_h \cdot h_{t-1} + b)
$$

#### Step-by-Step
1. **First Token: `"What"`**
   - $x_1 = [1, 0.5]$
   - $h_1 = \tanh(W_x \cdot [1, 0.5] + W_h \cdot [0, 0] + b)$
   - Result: $h_1 = [0.8, 0.6]$ (example value)

2. **Second Token: `"a"`**
   - $x_2 = [0.2, 0.1]$
   - $h_2 = \tanh(W_x \cdot [0.2, 0.1] + W_h \cdot [0.8, 0.6] + b)$
   - Result: $h_2 = [0.7, 0.5]$

3. **After `"goal,"` and `"wow."`**
   - Gradually builds up meaningful context:
     - $h_3 = [0.9, 0.7]$, $h_4 = [1.0, 0.8]$

4. **Padding Tokens: `0`**
   - $x_5 = [0, 0]$, $x_6 = [0, 0]$, etc.
   - For these, $h_t = \tanh(W_x \cdot [0, 0] + W_h \cdot h_{t-1} + b)$.
   - Since $x_t = [0, 0]$, only $W_h \cdot h_{t-1} + b$ contributes. However, over multiple padding steps, the hidden state $h_t$ starts to decay:
     - $h_5 \approx [0.6, 0.4]$
     - $h_6 \approx [0.3, 0.2]$
     - Eventually, $h_t \approx [0, 0]$.

### Key Observations
- **Information Loss**: The meaningful context $h_4 = [1.0, 0.8]$ (derived from `"What a goal, wow."`) decays to near-zero as padding dominates.
- **Learning Challenges**: During training, the RNN might learn to ignore later timesteps entirely, assuming they don’t contain useful information.

</details>

We can use a mask to ignore zeros during computation and training. This helps the RNN focus only on the meaningful parts of the sequence.
This is done by setting `mask_zero` equal to true in the embedding layer, and it propagates the mask downstream to all layers that accept it.

In [39]:
# Same architecture as the previous sentiment model, except that the Embedding
# layer is created with mask_zero=True.
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(input_dim = vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation='sigmoid')

])

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
history2 = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


The model is now able to learn and performs well!

<details>
<summary><h3>Why is masking needed in internal layers, and how does padding affect the input layer?</h3></summary>

### Question:
**Why do we need masking in the internal layers (not just the first one), and how does padding zeros only in the input layer affect the sequence processing?**

### Answer:
1. Propagation of Padding Effect
Even though the first layer (e.g., an embedding or RNN) processes the padded input and replaces explicit zeros with meaningful values (e.g., the previous timestep's hidden state), those padding steps still represent "invalid" parts of the sequence.
Without a mask, internal layers may treat these invalid timesteps as meaningful, which can corrupt the learned representations.
For example:

Suppose the input sequence is [word1, word2, 0, 0], and the RNN replaces the padding steps with the hidden state of word2.
If an internal RNN or dense layer operates on these outputs without masking, it may treat the repeated values from word2 as meaningful information, skewing the results.

</details>

In [43]:
# Functional-API variant that builds the padding mask by hand instead of
# relying on mask_zero=True in the Embedding layer.
tf.random.set_seed(42)
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
token_ids = text_vec_layer(inputs)
# True wherever the token id is not 0.
mask = tf.math.not_equal(token_ids,0)
Z = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim=128)(token_ids)
# The mask is passed explicitly to the GRU call.
Z = tf.keras.layers.GRU(128, dropout=.2)(Z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(Z)
model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history3 = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Last approach using ragged tensors

In [45]:
text_vec_layer_ragged = tf.keras.layers.TextVectorization(max_tokens=vocab_size, ragged=True)
text_vec_layer_ragged.adapt(train_set.map(lambda review, label: review))
text_vec_layer_ragged(["Great movie!", "This is DiCaprio's best role."])

<tf.RaggedTensor [[86, 18], [11, 7, 1, 116, 217]]>

In [46]:
text_vec_layer(["Great movie!", "This is DiCaprio's best role."])

<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[ 86,  18,   0,   0,   0],
       [ 11,   7,   1, 116, 217]])>

In [47]:
# Sentiment model fed by the ragged vectorizer; the Embedding layer is created
# without mask_zero.
embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history4 = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Using TensorBoard for Embedding Visualization

In [48]:
embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vec_layer_ragged,
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

# ----------------- TensorBoard callback with per-word metadata ------------
vocab = text_vec_layer_ragged.get_vocabulary()

log_dir = Path("./logs")
log_dir.mkdir(parents=True, exist_ok=True)

# Write one label per vocabulary entry. The file lives inside the log
# directory because the embedding projector resolves a relative metadata path
# against the directory that holds its config, not the working directory.
metadata_file = "metadata.tsv"
with open(log_dir / metadata_file, 'w', encoding="utf-8") as f:
    for word in vocab:
        # The padding token is the empty string; write a visible placeholder
        # so that row still gets a label instead of a blank line.
        f.write(f"{word if word else '<PAD>'}\n")

# `embeddings_layer_names` is a TF1-only argument: the TF2 callback ignores it
# (with a warning) and visualizes every Embedding layer, so it is not passed.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=str(log_dir),
    embeddings_freq=1,
    embeddings_metadata=metadata_file,
    update_freq='epoch'
)

# ---------------------------------------------------------------------------

history5 = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=5,
    callbacks=[tensorboard_callback]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from tensorboard import program
import webbrowser
from pathlib import Path

# Setup the TensorBoard notebook extension
%load_ext tensorboard

# Launch TensorBoard
# In Kaggle, we need to use a specific port
%tensorboard --logdir ./logs --port 6006

In [None]:
import tensorflow as tf
import tensorboard as tb

from tensorboard.plugins import projector

# Get the embedding layer
embedding_layer = model.layers[1]  # Adjust index based on your model structure
weights = embedding_layer.get_weights()[0]

embedding_log_dir = Path("./logs/embedding")
embedding_log_dir.mkdir(parents=True, exist_ok=True)

# Create a summary writer
writer = tf.summary.create_file_writer(str(embedding_log_dir))

# Log the vocabulary as a text summary
with writer.as_default():
    tf.summary.text("vocab", tf.constant(vocab), step=0)

# `tf.summary.embedding` does not exist in TF2. The embedding projector reads
# a checkpoint plus a projector config instead, so export those.
with open(embedding_log_dir / "metadata.tsv", "w", encoding="utf-8") as f:
    for word in vocab:
        # The padding token is the empty string; write a visible placeholder
        # so that row still gets a label instead of a blank line.
        f.write(f"{word if word else '<PAD>'}\n")

checkpoint = tf.train.Checkpoint(embedding=tf.Variable(weights))
checkpoint.save(str(embedding_log_dir / "embedding.ckpt"))

projector_config = projector.ProjectorConfig()
embedding_config = projector_config.embeddings.add()
# Name under which `tf.train.Checkpoint(embedding=...)` stores the variable.
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Relative to the directory that receives the projector config.
embedding_config.metadata_path = "metadata.tsv"
projector.visualize_embeddings(str(embedding_log_dir), projector_config)

In [None]:
import tensorboard as tb

from tensorboard.plugins import projector

# Get the embedding layer weights. Look the layer up by type: Keras
# auto-numbers layer names (e.g. "embedding_5"), so the literal name
# 'embedding' is not guaranteed to exist.
embedding_layer = next(
    layer for layer in model.layers
    if isinstance(layer, tf.keras.layers.Embedding))
weights = embedding_layer.get_weights()[0]

# `log_dir` was never defined in this notebook; use the same directory that
# TensorBoard is launched on.
log_dir = Path("./logs")
embedding_log_dir = log_dir / "embedding"
embedding_log_dir.mkdir(parents=True, exist_ok=True)

# Create a summary writer for embeddings
writer = tf.summary.create_file_writer(str(embedding_log_dir))

# Log the vocabulary as a text summary
with writer.as_default():
    tf.summary.text("vocab", tf.constant(vocab), step=0)

# `tf.summary.embedding` does not exist in TF2. The embedding projector reads
# a checkpoint plus a projector config instead, so export those.
with open(embedding_log_dir / "metadata.tsv", "w", encoding="utf-8") as f:
    for word in vocab:
        # The padding token is the empty string; write a visible placeholder
        # so that row still gets a label instead of a blank line.
        f.write(f"{word if word else '<PAD>'}\n")

checkpoint = tf.train.Checkpoint(embedding=tf.Variable(weights))
checkpoint.save(str(embedding_log_dir / "embedding.ckpt"))

projector_config = projector.ProjectorConfig()
embedding_config = projector_config.embeddings.add()
# Name under which `tf.train.Checkpoint(embedding=...)` stores the variable.
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Relative to the directory that receives the projector config.
embedding_config.metadata_path = "metadata.tsv"
projector.visualize_embeddings(str(embedding_log_dir), projector_config)

# 9. Helper function to check nearest neighbors in embedding space
def find_nearest_words(word, n=5, words=None, embeddings=None):
    """Find the n nearest neighbors of a word in the embedding space.

    Args:
        word: the query word.
        n: maximum number of neighbors to return.
        words: optional list of vocabulary entries; defaults to the
            notebook-level `vocab`.
        embeddings: optional array-like with one embedding row per entry of
            `words`; defaults to the notebook-level `weights`.

    Returns:
        A list of (neighbor, Euclidean distance) tuples ordered from nearest to
        farthest, or the string "Word not in vocabulary" when the query word
        is unknown.
    """
    if words is None:
        words = vocab
    if embeddings is None:
        embeddings = weights
    words = list(words)
    embeddings = np.asarray(embeddings)

    if word not in words:
        return "Word not in vocabulary"

    word_idx = words.index(word)

    # Euclidean distance from the query embedding to every row.
    distances = np.linalg.norm(embeddings - embeddings[word_idx], axis=1)

    # Drop the query word by index rather than assuming it sorts first: with
    # tied distances (e.g. duplicate rows) another word can take position 0,
    # which would return the query itself as a neighbor.
    order = np.argsort(distances, kind="stable")
    nearest_indices = [idx for idx in order if idx != word_idx][:max(n, 0)]

    return [(words[idx], distances[idx]) for idx in nearest_indices]

# Example usage:
print(find_nearest_words("good", n=5))

  pid, fd = os.forkpty()


2025-01-02 20:52:03.096473: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-02 20:52:03.118680: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-02 20:52:03.125158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1735851125.706429    8117 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735851125.771418    8117 cuda_executor.cc:1015] succ