In [44]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [45]:

# Load the tab-separated sentence pairs (ANKI format: English \t French).
# The extra read_csv options guard against three ways this file can be misparsed:
#   * usecols       - if the export carries extra columns after the sentence pair,
#                     pandas would otherwise use the first column as the index and
#                     shift the remaining columns into 'eng'/'fra'.
#   * quoting=3     - csv.QUOTE_NONE: double quotes inside a sentence are literal
#                     text, not CSV field delimiters.
#   * keep_default_na=False - no field is converted to a float NaN, so every cell
#                     stays a string for the text preprocessing that follows.
file_path = "fra.txt"  # Update with your dataset path
data = pd.read_csv(
    file_path,
    delimiter='\t',
    header=None,
    names=['eng', 'fra'],
    usecols=[0, 1],
    quoting=3,
    keep_default_na=False,
)

In [46]:
# Preprocess text
# Per-character rewrites applied in a single pass: narrow and regular no-break
# spaces become plain spaces, and sentence punctuation gains a leading space so
# that it separates from the preceding word.
_CHAR_REWRITES = str.maketrans({
    "\u202f": " ",
    "\xa0": " ",
    "!": " !",
    "?": " ?",
    ",": " ,",
    ".": " .",
})


def preprocess_text(text):
    """Lower-case a sentence and normalise its spacing and punctuation.

    Args:
        text: The raw sentence as a string.

    Returns:
        The lower-cased sentence with no-break spaces replaced by ordinary
        spaces and a space inserted before each '!', '?', ',' and '.'.
    """
    return text.lower().translate(_CHAR_REWRITES)

# Normalise both columns in place; French sentences are additionally wrapped in
# the <start>/<end> markers used by the decoder.
data['eng'] = data['eng'].map(preprocess_text)
data['fra'] = "<start> " + data['fra'].map(preprocess_text) + " <end>"

In [47]:
# Tokenization
# The default Tokenizer filter removes '<', '>' and all sentence punctuation.
# That would store the '<start>'/'<end>' markers as plain 'start'/'end' (so the
# later word_index['<start>'] lookup fails) and throw away the '!', '?', ','
# and '.' tokens that preprocess_text deliberately split off. This filter is the
# default one minus exactly those six characters.
TOKEN_FILTERS = '"#$%&()*+-/:;=@[\\]^_`{|}~\t\n'

eng_tokenizer = Tokenizer(filters=TOKEN_FILTERS)
fra_tokenizer = Tokenizer(filters=TOKEN_FILTERS)
eng_tokenizer.fit_on_texts(data['eng'])
fra_tokenizer.fit_on_texts(data['fra'])

# Convert every sentence into its list of integer token ids.
eng_sequences = eng_tokenizer.texts_to_sequences(data['eng'])
fra_sequences = fra_tokenizer.texts_to_sequences(data['fra'])

In [48]:
# Padding: bring every sentence to the length of the longest one in its
# language, appending the pad value after the real tokens ('post').
max_eng_len = max(map(len, eng_sequences))
max_fra_len = max(map(len, fra_sequences))

eng_sequences = pad_sequences(eng_sequences, padding='post', maxlen=max_eng_len)
fra_sequences = pad_sequences(fra_sequences, padding='post', maxlen=max_fra_len)

In [49]:
# Vocabulary sizes: number of distinct words known to each tokenizer, plus one
# extra slot.
eng_vocab_size, fra_vocab_size = (
    len(tokenizer.word_index) + 1
    for tokenizer in (eng_tokenizer, fra_tokenizer)
)

In [50]:
# Define Seq2Seq Model
# Shared width of the network: used below as the embedding dimension and as the
# number of LSTM units for both the encoder and the decoder.
latent_dim = 256

In [51]:
# Encoder: embed the English token ids and run them through an LSTM. Only the
# final hidden and cell states are passed on; they seed the decoder.
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(
    input_dim=eng_vocab_size,
    output_dim=latent_dim,
    mask_zero=True,
)(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
encoder_outputs, *encoder_states = encoder_lstm(enc_emb)
state_h, state_c = encoder_states

In [52]:
# Decoder
# Variable-length input: training feeds max_fra_len - 1 tokens per sentence,
# while step-by-step decoding feeds a single token at a time, so the sequence
# length must not be fixed in the graph.
decoder_inputs = Input(shape=(None,))

# Keep the embedding layer bound to a name so that its trained weights can be
# reused when an inference decoder is assembled.
decoder_embedding = Embedding(fra_vocab_size, latent_dim, mask_zero=True)
dec_emb = decoder_embedding(decoder_inputs)

# The decoder LSTM starts from the encoder's final states and emits one output
# per time step; its own final states are not needed during training.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Project every time step onto a probability distribution over the French vocabulary.
decoder_dense = Dense(fra_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [53]:
# Assemble the training model (English ids + shifted French ids -> next-token
# probabilities) and configure its optimiser and loss.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')


In [54]:

# Teacher forcing: the decoder reads positions 0..T-2 of each padded French
# sequence and is trained to predict positions 1..T-1 (the same sequence
# shifted left by one token).
fra_matrix = np.array(fra_sequences)
decoder_input_data = fra_matrix[:, :-1]   # every column except the last
decoder_target_data = fra_matrix[:, 1:]   # every column except the first


In [None]:
# Train for one epoch, holding out the last 20% of the samples for validation.
model.fit(
    x=[eng_sequences, decoder_input_data],
    y=decoder_target_data,
    epochs=1,
    batch_size=10,
    validation_split=0.2,
)


  732/11635 [>.............................] - ETA: 1:43:10 - loss: 4.2478

In [None]:
# Inference models
# Encoder: maps an English sentence to the final LSTM states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder states are fed in explicitly so decoding can proceed one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The inference decoder must run on the *trained* layers. Creating new
# Embedding/LSTM layers here would give randomly initialised weights and
# therefore meaningless predictions. Recover the trained decoder embedding from
# the training model: it is the Embedding layer that was applied to
# `decoder_inputs`.
trained_decoder_embedding = next(
    (
        layer for layer in model.layers
        if isinstance(layer, Embedding) and layer.input.name == decoder_inputs.name
    ),
    None,
)
if trained_decoder_embedding is None:
    raise RuntimeError("Could not find the trained decoder embedding layer in `model`.")

# Separate variable-length input, so single-token decoding steps do not depend
# on the sequence length the training input was declared with.
decoder_step_inputs = Input(shape=(None,))
dec_emb2 = trained_decoder_embedding(decoder_step_inputs)
decoder_lstm2 = decoder_lstm  # the trained LSTM itself; old name kept as an alias
decoder_outputs2, state_h2, state_c2 = decoder_lstm2(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inputs: [token ids, h, c]  ->  outputs: [token probabilities, new h, new c]
decoder_model = Model([decoder_step_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [None]:
# Function to predict a new sequence
def decode_sequence(input_seq):
    """Greedily translate one encoded English sentence into French text.

    The sentence is encoded once; the decoder is then run one token at a time,
    always feeding back the most probable token, until the end marker is
    produced or ``max_fra_len`` tokens have been generated.

    Args:
        input_seq: English token ids for a single sentence (one row), in the
            form expected by ``encoder_model.predict``.

    Returns:
        The predicted French words joined by single spaces, without the
        start/end markers.

    Raises:
        KeyError: If the French tokenizer's vocabulary contains no start marker.
    """
    word_index = fra_tokenizer.word_index
    # Depending on the tokenizer's character filter, the markers are stored
    # either verbatim or with the angle brackets stripped.
    start_token = '<start>' if '<start>' in word_index else 'start'
    end_token = '<end>' if '<end>' in word_index else 'end'

    # verbose=0: predict() is called once per generated token, so progress bars
    # would otherwise flood the cell output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index[start_token]

    decoded_words = []
    # Bound the loop by the number of generated tokens (not characters).
    while len(decoded_words) < max_fra_len:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # index_word is the tokenizer's id -> word mapping; index 0 has no entry.
        sampled_word = fra_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == end_token:
            break
        decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return ' '.join(decoded_words)
