In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [2]:
with open('C:\\Users\\User\\Downloads\\fra-eng\\fra.txt', 'r',encoding="utf-8") as file:
    # Read the entire file content into a string
    lines = file.read().split('\n')

In [3]:
# Create lists to store the French and English sentences
french_sentences = []
english_sentences = []
for line in lines:
    if '\t' in line:
        french, english, _ = line.split('\t')
        french_sentences.append(french)
        english_sentences.append(english)

french_sentences = french_sentences[0:1000]
english_sentences = english_sentences[0:1000]

In [4]:
# Tokenize the text (convert words to integer IDs)
french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

# Add special start and end tokens to the English tokenizer
start_token = '<start>'
end_token = '<end>'
english_tokenizer.word_index[start_token] = len(english_tokenizer.word_index) + 1
english_tokenizer.index_word[len(english_tokenizer.word_index)] = start_token
english_tokenizer.word_index[end_token] = len(english_tokenizer.word_index) + 1
english_tokenizer.index_word[len(english_tokenizer.word_index)] = end_token

# Pad sequences to a fixed length
max_sequence_length = max(len(seq) for seq in french_sequences + english_sequences)
french_sequences = pad_sequences(french_sequences, maxlen=max_sequence_length, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')

# Create one-hot encoding for English sequences
english_vocab_size = len(english_tokenizer.word_index) + 1
english_sequences_one_hot = tf.keras.utils.to_categorical(english_sequences, num_classes=english_vocab_size)

In [44]:
# Split the data into training and validation sets
X_train_french, X_val_french, X_train_english, X_val_english, y_train, y_val = train_test_split(
    french_sequences, english_sequences, english_sequences_one_hot, test_size=0.2, random_state=42)

In [45]:
# Define the Seq2Seq model
latent_dim = 256  # Adjust as needed

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [46]:
# Train the model
model.fit([X_train_french, X_train_english], y_train,
          validation_data=([X_val_french, X_val_english], y_val),
          batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x16bdaac7be0>

In [47]:
# Use the trained model for inference/translation
encoder_model = Model(encoder_inputs, encoder_states)

In [48]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [1]:
decoder_inputs

NameError: name 'decoder_inputs' is not defined

In [49]:
# Save the encoder model
encoder_model.save("encoder_model.h5")

# Save the decoder model
decoder_model.save("decoder_model.h5")

# Save the tokenizers
with open("french_tokenizer.pickle", "wb") as f:
    pickle.dump(french_tokenizer, f)

with open("english_tokenizer.pickle", "wb") as f:
    pickle.dump(english_tokenizer, f)

# Save vocabulary and special tokens information
vocab_info = {
    "french_tokenizer_word_index": french_tokenizer.word_index,
    "english_tokenizer_word_index": english_tokenizer.word_index,
    "start_token": start_token,
    "end_token": end_token,
}

with open("vocab_info.pickle", "wb") as f:
    pickle.dump(vocab_info, f)

# Save maximum sequence length
with open("max_sequence_length.txt", "w") as f:
    f.write(str(max_sequence_length))



In [2]:
# Load the trained model
encoder_model = load_model("encoder_model.h5")

# Load the trained model
decoder_model = load_model("decoder_model.h5")

# Load the saved tokenizers
with open("french_tokenizer.pickle", "rb") as f:
    french_tokenizer = pickle.load(f)

with open("english_tokenizer.pickle", "rb") as f:
    english_tokenizer = pickle.load(f)

# Load vocabulary and special tokens information
with open("vocab_info.pickle", "rb") as f:
    vocab_info = pickle.load(f)

start_token = vocab_info["start_token"]
end_token = vocab_info["end_token"]

# Load maximum sequence length
with open("max_sequence_length.txt", "r") as f:
    max_sequence_length = int(f.read())



In [3]:
def translate(input_text):
    input_seq = french_tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_sequence_length, padding='post')
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = english_tokenizer.word_index[start_token]  # Start token
    
    stop_condition = False
    translation = []
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        
        # Handle OOV tokens
        if sampled_token_index == 0:
            sampled_word = end_token  # Replace with your choice of fallback token
        else:
            sampled_word = english_tokenizer.index_word.get(sampled_token_index, end_token)
        
        translation.append(sampled_word)
        
        if sampled_word == end_token or len(translation) > max_sequence_length:
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]
    
    return ' '.join(translation)


In [4]:
# Example usage:
input_text = "Go."  # Replace with your own French text
translation = translate(input_text)
print("French Input:", input_text)
print("English Translation:", translation)

French Input: Go.
English Translation: je je <end>
