## Part 1: Seq2Seq without Attention

1. Importing Necessary Libraries

In [1]:
import tensorflow as tf
tf.config.optimizer.set_jit(True)

from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [2]:
import os
import random
import json
import pickle
import numpy as np
from tqdm import tqdm
import nltk
nltk.download('punkt')

import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, TimeDistributed, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


2. Configuration

In [3]:
with open('config.json', 'r') as f:
    config = json.load(f)

eng_vocab_size = config['eng_vocab_size']
fra_vocab_size = config['fra_vocab_size']

EMBEDDING_DIM = 100
HIDDEN_UNITS = 256
STACKED_LAYERS = 2
BATCH_SIZE = 256
EPOCHS = 30
LEARNING_RATE = 0.001

3. Load Preprocessed Data & Embeddings

In [4]:
data = np.load('data.npz', allow_pickle=True)
encoder_input_data = np.array(data['encoder_input_data'], dtype=np.int32)
decoder_input_data = np.array(data['decoder_input_data'], dtype=np.int32)
decoder_target_data = np.array(data['decoder_target_data'], dtype=np.int32)

if decoder_target_data.ndim == 3 and decoder_target_data.shape[-1] == 1:
    decoder_target_data = np.squeeze(decoder_target_data, axis=-1)

with open('en_embedding_matrix.pkl', 'rb') as f:
    en_embedding_matrix = pickle.load(f)

val_size = int(0.1 * len(encoder_input_data))

def create_bucketed_dataset(enc, dec, tgt, batch_size):
    def gen():
        for x, y, z in zip(enc, dec, tgt):
            yield {"encoder_input": x, "decoder_input": y, "decoder_target": z}

    output_signature = {
        "encoder_input": tf.TensorSpec(shape=(None,), dtype=tf.int32),
        "decoder_input": tf.TensorSpec(shape=(None,), dtype=tf.int32),
        "decoder_target": tf.TensorSpec(shape=(None,), dtype=tf.int32),
    }

    dataset = tf.data.Dataset.from_generator(gen, output_signature=output_signature)

    def map_fn(sample):
        return (sample["encoder_input"], sample["decoder_input"]), sample["decoder_target"]

    def bucket_by_seq_len(example):
        enc_len = tf.shape(example["encoder_input"])[0]
        return tf.cast(enc_len // 10, tf.int64)  # bucket keys

    dataset = dataset.apply(
        tf.data.experimental.group_by_window(
            key_func=bucket_by_seq_len,
            reduce_func=lambda key, ds: ds.map(map_fn).padded_batch(
                batch_size,
                padded_shapes=(([None], [None]), [None]),
                padding_values=((0, 0), 0),
                drop_remainder=True
            ),
            window_size=batch_size
        )
    )

    return dataset.prefetch(tf.data.AUTOTUNE)


train_dataset = create_bucketed_dataset(
    encoder_input_data[:-val_size],
    decoder_input_data[:-val_size],
    decoder_target_data[:-val_size],
    batch_size=48
)

val_dataset = create_bucketed_dataset(
    encoder_input_data[-val_size:],
    decoder_input_data[-val_size:],
    decoder_target_data[-val_size:],
    batch_size=48
)


Instructions for updating:
Use `tf.data.Dataset.group_by_window(...)`.


4. Build Seq2Seq Model

In [5]:
enc_in = Input(shape=(None,), name='encoder_inputs')
x = Embedding(eng_vocab_size, EMBEDDING_DIM, weights=[en_embedding_matrix], trainable=True)(enc_in)

for i in range(STACKED_LAYERS - 1):
    x = Bidirectional(LSTM(HIDDEN_UNITS, return_sequences=True))(x)

encoder_output, fh, fc, bh, bc = Bidirectional(
    LSTM(HIDDEN_UNITS, return_state=True)
)(x)

state_h = Concatenate()([fh, bh])
state_c = Concatenate()([fc, bc])

dec_in = Input(shape=(None,), name='decoder_inputs')
y = Embedding(fra_vocab_size, EMBEDDING_DIM, trainable=True)(dec_in)

decoder_lstm = LSTM(HIDDEN_UNITS * 2, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(y, initial_state=[state_h, state_c])
decoder_dense = TimeDistributed(Dense(fra_vocab_size, activation='softmax', dtype='float32'))
final_output = decoder_dense(decoder_output)

model = Model([enc_in, dec_in], final_output)

def masked_sparse_accuracy(y_true, y_pred):
    y_pred_labels = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    correct = tf.cast(tf.equal(y_true, y_pred_labels), tf.float32) * mask
    return tf.reduce_sum(correct) / tf.reduce_sum(mask)

model.compile(
    optimizer=Adam(LEARNING_RATE),
    loss='sparse_categorical_crossentropy',
    metrics=[masked_sparse_accuracy]
)

model.summary()

In [6]:
for (inp, out) in train_dataset.take(1):
    print("Encoder Input:", inp[0].shape)
    print("Decoder Input:", inp[1].shape)
    print("Target:", out.shape)


Encoder Input: (48, 55)
Decoder Input: (48, 57)
Target: (48, 57)


4. Training with Early Stopping

In [7]:
checkpoint = ModelCheckpoint(
    filepath='lstm_no_attention.h5',
    save_best_only=True,
    monitor='val_loss',
)

earlystop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.001,
    restore_best_weights=True
)

steps_per_epoch = len(encoder_input_data[:-val_size]) // BATCH_SIZE
validation_steps = len(encoder_input_data[-val_size:]) // BATCH_SIZE

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[checkpoint, earlystop],
    verbose=1,
)

Epoch 1/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 1.8920 - masked_sparse_accuracy: 0.0914



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 164ms/step - loss: 1.8909 - masked_sparse_accuracy: 0.0915 - val_loss: 0.9090 - val_masked_sparse_accuracy: 0.1985
Epoch 2/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.8693 - masked_sparse_accuracy: 0.2149



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 159ms/step - loss: 0.8692 - masked_sparse_accuracy: 0.2149 - val_loss: 0.7010 - val_masked_sparse_accuracy: 0.2683
Epoch 3/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - loss: 0.6658 - masked_sparse_accuracy: 0.2888



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 199ms/step - loss: 0.6658 - masked_sparse_accuracy: 0.2889 - val_loss: 0.5772 - val_masked_sparse_accuracy: 0.3567
Epoch 4/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.5494 - masked_sparse_accuracy: 0.3726



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 160ms/step - loss: 0.5493 - masked_sparse_accuracy: 0.3726 - val_loss: 0.4997 - val_masked_sparse_accuracy: 0.4139
Epoch 5/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - loss: 0.4815 - masked_sparse_accuracy: 0.4256



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 202ms/step - loss: 0.4815 - masked_sparse_accuracy: 0.4256 - val_loss: 0.4471 - val_masked_sparse_accuracy: 0.4557
Epoch 6/30
[1m235/703[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:07[0m 144ms/step - loss: 0.4382 - masked_sparse_accuracy: 0.4552



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 64ms/step - loss: 0.4363 - masked_sparse_accuracy: 0.4574 - val_loss: 0.4331 - val_masked_sparse_accuracy: 0.4665
Epoch 7/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.4211 - masked_sparse_accuracy: 0.4752



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 202ms/step - loss: 0.4211 - masked_sparse_accuracy: 0.4752 - val_loss: 0.3991 - val_masked_sparse_accuracy: 0.4974
Epoch 8/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.3862 - masked_sparse_accuracy: 0.5086



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 158ms/step - loss: 0.3862 - masked_sparse_accuracy: 0.5086 - val_loss: 0.3750 - val_masked_sparse_accuracy: 0.5218
Epoch 9/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.3546 - masked_sparse_accuracy: 0.5338



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 203ms/step - loss: 0.3546 - masked_sparse_accuracy: 0.5339 - val_loss: 0.3516 - val_masked_sparse_accuracy: 0.5417
Epoch 10/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.3217 - masked_sparse_accuracy: 0.5590



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 202ms/step - loss: 0.3217 - masked_sparse_accuracy: 0.5591 - val_loss: 0.3341 - val_masked_sparse_accuracy: 0.5585
Epoch 11/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - loss: 0.2981 - masked_sparse_accuracy: 0.5783



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 158ms/step - loss: 0.2981 - masked_sparse_accuracy: 0.5783 - val_loss: 0.3202 - val_masked_sparse_accuracy: 0.5705
Epoch 12/30
[1m235/703[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:06[0m 143ms/step - loss: 0.2787 - masked_sparse_accuracy: 0.5923



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - loss: 0.2780 - masked_sparse_accuracy: 0.5926 - val_loss: 0.3170 - val_masked_sparse_accuracy: 0.5738
Epoch 13/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.2721 - masked_sparse_accuracy: 0.5986



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 158ms/step - loss: 0.2721 - masked_sparse_accuracy: 0.5986 - val_loss: 0.3083 - val_masked_sparse_accuracy: 0.5845
Epoch 14/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.2549 - masked_sparse_accuracy: 0.6171



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 155ms/step - loss: 0.2549 - masked_sparse_accuracy: 0.6171 - val_loss: 0.3039 - val_masked_sparse_accuracy: 0.5901
Epoch 15/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.2382 - masked_sparse_accuracy: 0.6320



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 205ms/step - loss: 0.2381 - masked_sparse_accuracy: 0.6320 - val_loss: 0.2951 - val_masked_sparse_accuracy: 0.5983
Epoch 16/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.2211 - masked_sparse_accuracy: 0.6484



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 206ms/step - loss: 0.2211 - masked_sparse_accuracy: 0.6484 - val_loss: 0.2889 - val_masked_sparse_accuracy: 0.6054
Epoch 17/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.2086 - masked_sparse_accuracy: 0.6610



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 158ms/step - loss: 0.2086 - masked_sparse_accuracy: 0.6610 - val_loss: 0.2840 - val_masked_sparse_accuracy: 0.6107
Epoch 18/30
[1m235/703[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:05[0m 141ms/step - loss: 0.1970 - masked_sparse_accuracy: 0.6680



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 116ms/step - loss: 0.1964 - masked_sparse_accuracy: 0.6682 - val_loss: 0.2829 - val_masked_sparse_accuracy: 0.6137
Epoch 19/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.1925 - masked_sparse_accuracy: 0.6767



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 202ms/step - loss: 0.1925 - masked_sparse_accuracy: 0.6767 - val_loss: 0.2805 - val_masked_sparse_accuracy: 0.6144
Epoch 20/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 192ms/step - loss: 0.1827 - masked_sparse_accuracy: 0.6887 - val_loss: 0.2833 - val_masked_sparse_accuracy: 0.6164
Epoch 21/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - loss: 0.1708 - masked_sparse_accuracy: 0.7017



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 155ms/step - loss: 0.1708 - masked_sparse_accuracy: 0.7017 - val_loss: 0.2794 - val_masked_sparse_accuracy: 0.6202
Epoch 22/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.1610 - masked_sparse_accuracy: 0.7146



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 205ms/step - loss: 0.1610 - masked_sparse_accuracy: 0.7146 - val_loss: 0.2756 - val_masked_sparse_accuracy: 0.6237
Epoch 23/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.1540 - masked_sparse_accuracy: 0.7266



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 157ms/step - loss: 0.1540 - masked_sparse_accuracy: 0.7266 - val_loss: 0.2739 - val_masked_sparse_accuracy: 0.6296
Epoch 24/30
[1m235/703[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:06[0m 143ms/step - loss: 0.1469 - masked_sparse_accuracy: 0.7342



[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 116ms/step - loss: 0.1463 - masked_sparse_accuracy: 0.7343 - val_loss: 0.2732 - val_masked_sparse_accuracy: 0.6304
Epoch 25/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 148ms/step - loss: 0.1439 - masked_sparse_accuracy: 0.7404 - val_loss: 0.2737 - val_masked_sparse_accuracy: 0.6301
Epoch 26/30
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 148ms/step - loss: 0.1385 - masked_sparse_accuracy: 0.7504 - val_loss: 0.2786 - val_masked_sparse_accuracy: 0.6312


In [10]:
start_token = '<sos>'
end_token = '<eos>'

with open('fra_tokenizer.pkl', 'rb') as f:
    fra_tokenizer = pickle.load(f)

from nltk.translate.bleu_score import corpus_bleu

def tokens_to_sentence(token_ids, tokenizer, end_token):
    words = []
    for t in token_ids:
        if t == end_token:
            break
        word = tokenizer.index_word.get(t, '')  # or use your vocab mapping
        if word == '':
            continue
        words.append(word)
    return words

import numpy as np
import tensorflow as tf

def beam_search_decode(
    model,
    encoder_input_seq,
    start_token,
    end_token,
    fra_vocab_size,
    beam_width=3,
    max_decoder_seq_length=50,
):
    start_seq = [start_token]
    beam = [(start_seq, 0.0)]
    for _ in range(max_decoder_seq_length):
        all_candidates = []
        for seq, score in beam:
            if seq[-1] == end_token:
                all_candidates.append((seq, score))
                continue
            enc_input = encoder_input_seq
            dec_input = np.array(seq)[np.newaxis, :]
            preds = model.predict([enc_input, dec_input], verbose=0)
            next_token_logits = preds[0, -1, :]
            log_probs = np.log(next_token_logits + 1e-9)
            top_tokens = np.argsort(log_probs)[-beam_width:]
            for t in top_tokens:
                candidate_seq = seq + [int(t)]
                candidate_score = score + log_probs[t]
                all_candidates.append((candidate_seq, candidate_score))
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        beam = ordered[:beam_width]
        if all(seq[-1] == end_token for seq, _ in beam):
            break
    best_sequence = beam[0][0]
    return best_sequence


def calculate_bleu_score(
    model,
    encoder_input_data,
    decoder_target_data,
    fra_tokenizer,
    start_token_id,
    end_token_id,
    beam_width=3,
    max_decoder_seq_length=50,
    num_samples=100,
):
    references = []
    hypotheses = []

    for i in range(num_samples):
        enc_seq = encoder_input_data[i : i + 1]
        target_seq = decoder_target_data[i]
        pred_seq = beam_search_decode(
            model,
            enc_seq,
            start_token=start_token_id,
            end_token=end_token_id,
            fra_vocab_size=len(fra_tokenizer.word_index) + 1,
            beam_width=beam_width,
            max_decoder_seq_length=max_decoder_seq_length,
        )
        pred_sentence = tokens_to_sentence(pred_seq[1:], fra_tokenizer, end_token_id)
        ref_sentence = tokens_to_sentence(target_seq, fra_tokenizer, end_token_id)
        references.append([ref_sentence])
        hypotheses.append(pred_sentence)

        if i % 10 == 0:
            print(f"Sample {i} | Pred: {' '.join(pred_sentence)} | Ref: {' '.join(ref_sentence)}")
    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score

bleu = calculate_bleu_score(
    model,
    encoder_input_data[-100:],
    decoder_target_data[-100:],
    fra_tokenizer,
    start_token_id=fra_tokenizer.word_index.get(start_token),
    end_token_id=fra_tokenizer.word_index.get(end_token),
    max_decoder_seq_length=50,
    num_samples=100
)

print(f"BLEU score: {bleu:.4f}")


  log_probs = np.log(next_token_logits + 1e-9)


Sample 0 | Pred: tom est à j'attendrai | Ref: tom est en faveur.
Sample 10 | Pred: il ai a seul. raison de avec peur ? | Ref: il ai a seul. raison de faim. connais fenêtre,
Sample 20 | Pred: je pense que tu cet demain. pensait chose | Ref: je pense que vous faites demain. pensait chose
Sample 30 | Pred: je tenu que vous espion pas plus part | Ref: je tenu que tu grappe. pas plus part
Sample 40 | Pred: les veux-tu fit des gagné | Ref: les veux-tu fit des manger « ça
Sample 50 | Pred: je ne depuis vraiment pas que vous chanter. le projet de sans | Ref: mourir. vraiment que vous l'as ce embrassés
Sample 60 | Pred: type ? c'était tout | Ref: éponge pu tout
Sample 70 | Pred: votre français est cabane | Ref: ton français est cabane
Sample 80 | Pred: boston. petit. | Ref: pins. êtes-vous vous crois
Sample 90 | Pred: il ne n'arrive pas ma l'arbre | Ref: il ne n'arrive pas ma l'arbre
BLEU score: 0.2880
