# Seq2Seq Language Translator (English → French)
This notebook demonstrates building a Seq2Seq model with LSTM + Attention using Keras.
It covers data loading, preprocessing, model training, and inference (translation demo).

In [16]:
import numpy as np
import pandas as pd
import pickle, json, os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Attention
from tensorflow.keras.optimizers import Adam


## Load Data

In [17]:
# Load parallel English-French data
pairs = []
with open('data/sample_eng_fra.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) == 2:   # only keep lines with English + French
            en, fr = parts
            pairs.append((en.lower(), fr.lower()))

print("Loaded pairs:", len(pairs))
print("Sample:", pairs[:5])


Loaded pairs: 0
Sample: []


## Tokenization

In [18]:
# Prepare texts
eng_texts = [p[0] for p in pairs]
fra_texts = ['<sos> ' + p[1] + ' <eos>' for p in pairs]

# Tokenizers
eng_tok = Tokenizer(num_words=5000, oov_token='<oov>')
fra_tok = Tokenizer(num_words=5000, oov_token='<oov>')
eng_tok.fit_on_texts(eng_texts)
fra_tok.fit_on_texts(fra_texts)

# Convert texts to padded sequences
enc_seq = pad_sequences(eng_tok.texts_to_sequences(eng_texts), maxlen=20, padding='post')
dec_seq = pad_sequences(fra_tok.texts_to_sequences(fra_texts), maxlen=20, padding='post')

# Decoder input and target sequences
decoder_input = dec_seq[:, :-1]       # all tokens except last
decoder_target = dec_seq[:, 1:]       # all tokens except first (for teacher forcing)

# Ensure correct dtype
enc_seq = enc_seq.astype('int32')
decoder_input = decoder_input.astype('int32')
decoder_target = decoder_target.astype('int32')  # must NOT be bool

# Print shapes and vocab sizes
print("Encoder input shape:", enc_seq.shape)
print("Decoder input shape:", decoder_input.shape)
print("Decoder target shape:", decoder_target.shape)
print("English vocab size:", len(eng_tok.word_index)+1)
print("French vocab size:", len(fra_tok.word_index)+1)


Encoder input shape: (0, 20)
Decoder input shape: (0, 19)
Decoder target shape: (0, 19)
English vocab size: 2
French vocab size: 2


## Build Seq2Seq Model with Attention

In [21]:
embed_dim, latent_dim = 64, 128

# --- Encoder ---
encoder_inputs = Input(shape=(20,), name="encoder_inputs")
enc_emb = Embedding(input_dim=len(eng_tok.word_index)+1,
                    output_dim=embed_dim,
                    mask_zero=True,
                    name="encoder_embedding")(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="encoder_lstm")
enc_outs, state_h, state_c = encoder_lstm(enc_emb)

# --- Decoder ---
decoder_inputs = Input(shape=(19,), name="decoder_inputs")
dec_emb = Embedding(input_dim=len(fra_tok.word_index)+1,
                    output_dim=embed_dim,
                    mask_zero=True,
                    name="decoder_embedding")(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
dec_outs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# --- Attention ---
attn = Attention(name="attention_layer")([dec_outs, enc_outs])
concat = Concatenate(axis=-1, name="concat_layer")([dec_outs, attn])

# --- Output ---
outputs = TimeDistributed(Dense(len(fra_tok.word_index)+1, activation='softmax'), name="output_layer")(concat)

# --- Model ---
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer=Adam(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


## Train Model (few epochs for demo)

In [20]:
history = model.fit(
    [enc_seq, decoder_input],
    decoder_target,
    batch_size=2,      # small dataset, small batch
    epochs=50,         # train enough to see learning
    validation_split=0,  # no validation set
    verbose=1          # show progress bar
)


Epoch 1/50


TypeError: Exception encountered when calling BroadcastTo.call().

[1mFailed to convert elements of (None, 19, 128) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.[0m

Arguments received by BroadcastTo.call():
  • x=tf.Tensor(shape=(2, 19, 1), dtype=bool)

## Save Model & Tokenizers

In [None]:
os.makedirs('models', exist_ok=True)
model.save('models/seq2seq_model')
with open('models/eng_tokenizer.pkl','wb') as f: pickle.dump(eng_tok,f)
with open('models/fra_tokenizer.pkl','wb') as f: pickle.dump(fra_tok,f)
with open('models/meta.json','w',encoding='utf-8') as f: json.dump({'max_enc_len':20,'max_dec_len':20}, f)

## Inference (Greedy Decode)

In [None]:
inv = {v:k for k,v in fra_tok.word_index.items()}
def greedy_decode(sentence):
    seq = eng_tok.texts_to_sequences([sentence.lower()])
    enc_seq2 = pad_sequences(seq, maxlen=20, padding='post')
    dec_input = pad_sequences([[fra_tok.word_index.get('<sos>',1)]], maxlen=19, padding='post')
    preds = model.predict([enc_seq2, dec_input])
    pred_ids = np.argmax(preds[0], axis=-1)
    words=[]
    for idx in pred_ids:
        if idx==0: continue
        w = inv.get(idx,'<unk>')
        if w=='<eos>': break
        if w=='<sos>': continue
        words.append(w)
    return ' '.join(words)

for s in ['hello','how are you?','i love you']:
    print(s,'->',greedy_decode(s))