## Part 2: Seq2Seq with Attention

1. Importing Necessary Libraries

In [None]:
import tensorflow as tf
tf.config.optimizer.set_jit(True)

from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

2025-06-19 09:13:53.805390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750324434.178719      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750324434.286729      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
import os
import random
import json
import pickle
import numpy as np

import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, AdditiveAttention, TimeDistributed, Concatenate, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from nltk.translate.bleu_score import corpus_bleu

2. Configuration

In [None]:
with open('config.json', 'r') as f:
    config = json.load(f)

eng_vocab_size = config['eng_vocab_size']
fra_vocab_size = config['fra_vocab_size']

EMBEDDING_DIM = 100
HIDDEN_UNITS = 256
STACKED_LAYERS = 2
BATCH_SIZE = 256
EPOCHS = 30
LEARNING_RATE = 0.001

3. Load Preprocessed Data & Embeddings

In [None]:
data = np.load('data.npz', allow_pickle=True)
encoder_input_data = np.array(data['encoder_input_data'], dtype=np.int32)
decoder_input_data = np.array(data['decoder_input_data'], dtype=np.int32)
decoder_target_data = np.array(data['decoder_target_data'], dtype=np.int32)

if decoder_target_data.ndim == 3 and decoder_target_data.shape[-1] == 1:
    decoder_target_data = np.squeeze(decoder_target_data, axis=-1)

with open('en_embedding_matrix.pkl', 'rb') as f:
    en_embedding_matrix = pickle.load(f)

val_size = int(0.1 * len(encoder_input_data))

def create_bucketed_dataset(enc, dec, tgt, batch_size):
    def gen():
        for x, y, z in zip(enc, dec, tgt):
            yield {"encoder_input": x, "decoder_input": y, "decoder_target": z}

    output_signature = {
        "encoder_input": tf.TensorSpec(shape=(None,), dtype=tf.int32),
        "decoder_input": tf.TensorSpec(shape=(None,), dtype=tf.int32),
        "decoder_target": tf.TensorSpec(shape=(None,), dtype=tf.int32),
    }

    dataset = tf.data.Dataset.from_generator(gen, output_signature=output_signature)

    def map_fn(sample):
        return (sample["encoder_input"], sample["decoder_input"]), sample["decoder_target"]

    def bucket_by_seq_len(example):
        enc_len = tf.shape(example["encoder_input"])[0]
        return tf.cast(enc_len // 10, tf.int64)  # bucket keys

    dataset = dataset.apply(
        tf.data.experimental.group_by_window(
            key_func=bucket_by_seq_len,
            reduce_func=lambda key, ds: ds.map(map_fn).padded_batch(
                batch_size,
                padded_shapes=(([None], [None]), [None]),
                padding_values=((0, 0), 0),
                drop_remainder=True
            ),
            window_size=batch_size
        )
    )

    return dataset.prefetch(tf.data.AUTOTUNE)


train_dataset = create_bucketed_dataset(
    encoder_input_data[:-val_size],
    decoder_input_data[:-val_size],
    decoder_target_data[:-val_size],
    batch_size=48
)

val_dataset = create_bucketed_dataset(
    encoder_input_data[-val_size:],
    decoder_input_data[-val_size:],
    decoder_target_data[-val_size:],
    batch_size=48
)


4. Build Seq2Seq Model with Additive Attention

In [None]:
encoder_inputs = Input(shape=(None,), name="encoder_input")
encoder_embedding = Embedding(
    input_dim=eng_vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[en_embedding_matrix],
    trainable=True,
    name="encoder_embedding"
)(encoder_inputs)

encoder_lstm1 = Bidirectional(LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True))
encoder_output1, fh1, fc1, bh1, bc1 = encoder_lstm1(encoder_embedding)

encoder_states_h = Concatenate()([fh1, bh1])
encoder_states_c = Concatenate()([fc1, bc1])
encoder_states = [encoder_states_h, encoder_states_c]

decoder_inputs = Input(shape=(None,), name="decoder_input")
decoder_embedding = Embedding(fra_vocab_size, EMBEDDING_DIM, trainable=True, name="decoder_embedding")(decoder_inputs)

decoder_lstm = LSTM(HIDDEN_UNITS * 2, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

attention = AdditiveAttention(name="attention_layer")
context_vector = attention([decoder_outputs, encoder_output1])

decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, context_vector])

decoder_dense = TimeDistributed(Dense(fra_vocab_size, activation="softmax", dtype="float32"), name="output_dense")
decoder_outputs = decoder_dense(decoder_combined_context)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

def masked_sparse_accuracy(y_true, y_pred):
    y_pred_labels = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    correct = tf.cast(tf.equal(y_true, y_pred_labels), tf.float32) * mask
    return tf.reduce_sum(correct) / tf.reduce_sum(mask)

model.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss="sparse_categorical_crossentropy",
    metrics=[masked_sparse_accuracy]
)

model.summary()

5. Training with Early Stopping

In [21]:
checkpoint = ModelCheckpoint(
    "best_model_part2_attention.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

steps_per_epoch = len(encoder_input_data[:-val_size]) // BATCH_SIZE
validation_steps = len(encoder_input_data[-val_size:]) // BATCH_SIZE

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[checkpoint, early_stop],
    verbose=1,
)

Epoch 1/30


I0000 00:00:1750324754.883149     113 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490ms/step - loss: 1.3788 - masked_sparse_accuracy: 0.1405
Epoch 1: val_loss improved from inf to 0.67216, saving model to best_model_part2_attention.h5
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 518ms/step - loss: 1.3781 - masked_sparse_accuracy: 0.1405 - val_loss: 0.6722 - val_masked_sparse_accuracy: 0.2863
Epoch 2/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 496ms/step - loss: 0.6256 - masked_sparse_accuracy: 0.3196
Epoch 2: val_loss improved from 0.67216 to 0.53328, saving model to best_model_part2_attention.h5
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 524ms/step - loss: 0.6256 - masked_sparse_accuracy: 0.3197 - val_loss: 0.5333 - val_masked_sparse_accuracy: 0.3970
Epoch 3/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step - loss: 0.5090 - masked_sparse_accuracy: 0.4119
Epoch 3: val_loss improved from 0.53328 to 0.




Epoch 6: val_loss improved from 0.35804 to 0.34820, saving model to best_model_part2_attention.h5
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 194ms/step - loss: 0.3497 - masked_sparse_accuracy: 0.5509 - val_loss: 0.3482 - val_masked_sparse_accuracy: 0.5559
Epoch 7/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step - loss: 0.3335 - masked_sparse_accuracy: 0.5630
Epoch 7: val_loss improved from 0.34820 to 0.32760, saving model to best_model_part2_attention.h5
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 527ms/step - loss: 0.3335 - masked_sparse_accuracy: 0.5630 - val_loss: 0.3276 - val_masked_sparse_accuracy: 0.5767
Epoch 8/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500ms/step - loss: 0.2973 - masked_sparse_accuracy: 0.5936
Epoch 8: val_loss improved from 0.32760 to 0.30899, saving model to best_model_part2_attention.h5
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m

6. Evaluation

In [None]:
start_token = '<sos>'
end_token = '<eos>'

with open('fra_tokenizer.pkl', 'rb') as f:
    fra_tokenizer = pickle.load(f)

def tokens_to_sentence(token_ids, tokenizer, end_token):
    words = []
    for t in token_ids:
        if t == end_token:
            break
        word = tokenizer.index_word.get(t, '') 
        if word == '':
            continue
        words.append(word)
    return words

def beam_search_decode(
    model,
    encoder_input_seq,
    start_token,
    end_token,
    fra_vocab_size,
    beam_width=3,
    max_decoder_seq_length=50,
):
    start_seq = [start_token]
    beam = [(start_seq, 0.0)]
    for _ in range(max_decoder_seq_length):
        all_candidates = []
        for seq, score in beam:
            if seq[-1] == end_token:
                all_candidates.append((seq, score))
                continue
            enc_input = encoder_input_seq
            dec_input = np.array(seq)[np.newaxis, :]
            preds = model.predict([enc_input, dec_input], verbose=0)
            next_token_logits = preds[0, -1, :]
            log_probs = np.log(next_token_logits + 1e-9)
            top_tokens = np.argsort(log_probs)[-beam_width:]
            for t in top_tokens:
                candidate_seq = seq + [int(t)]
                candidate_score = score + log_probs[t]
                all_candidates.append((candidate_seq, candidate_score))
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        beam = ordered[:beam_width]
        if all(seq[-1] == end_token for seq, _ in beam):
            break
    best_sequence = beam[0][0]
    return best_sequence


def calculate_bleu_score(
    model,
    encoder_input_data,
    decoder_target_data,
    fra_tokenizer,
    start_token_id,
    end_token_id,
    beam_width=3,
    max_decoder_seq_length=50,
    num_samples=100,
):
    references = []
    hypotheses = []

    for i in range(num_samples):
        enc_seq = encoder_input_data[i : i + 1]
        target_seq = decoder_target_data[i]
        pred_seq = beam_search_decode(
            model,
            enc_seq,
            start_token=start_token_id,
            end_token=end_token_id,
            fra_vocab_size=len(fra_tokenizer.word_index) + 1,
            beam_width=beam_width,
            max_decoder_seq_length=max_decoder_seq_length,
        )
        pred_sentence = tokens_to_sentence(pred_seq[1:], fra_tokenizer, end_token_id)
        ref_sentence = tokens_to_sentence(target_seq, fra_tokenizer, end_token_id)
        references.append([ref_sentence])
        hypotheses.append(pred_sentence)

        if i % 10 == 0:
            print(f"Sample {i} | Pred: {' '.join(pred_sentence)} | Ref: {' '.join(ref_sentence)}")
    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score

bleu = calculate_bleu_score(
    model,
    encoder_input_data[-100:],
    decoder_target_data[-100:],
    fra_tokenizer,
    start_token_id=fra_tokenizer.word_index.get(start_token),
    end_token_id=fra_tokenizer.word_index.get(end_token),
    max_decoder_seq_length=50,
    num_samples=100
)

print(f"BLEU score: {bleu:.4f}")


Sample 0 | Pred: tom est en faveur. | Ref: tom est en faveur.
Sample 10 | Pred: il ai a seul. raison de faim. notre fenêtre, | Ref: il ai a seul. raison de faim. connais fenêtre,
Sample 20 | Pred: je pense que tu cet demain. pensait chose | Ref: je pense que vous faites demain. pensait chose
Sample 30 | Pred: je tenu que vous espion pas plus part | Ref: je tenu que tu grappe. pas plus part
Sample 40 | Pred: les veux-tu fit des manger crier | Ref: les veux-tu fit des manger « ça
Sample 50 | Pred: je ne depuis vraiment pas que tu le salée. | Ref: mourir. vraiment que vous l'as ce embrassés
Sample 60 | Pred: l'accusèrent pu étonnant | Ref: éponge pu tout
Sample 70 | Pred: ton français est cabane | Ref: ton français est cabane
Sample 80 | Pred: boston. rencontre votre prie. tout | Ref: pins. êtes-vous vous crois
Sample 90 | Pred: il ne n'arrive pas de ma l'arbre | Ref: il ne n'arrive pas ma l'arbre
BLEU score: 0.3712
