In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [2]:
with open('C:\\Users\\User\\Downloads\\fra-eng\\fra.txt', 'r',encoding="utf-8") as file:
    # Read the entire file content into a string
    lines = file.read().split('\n')

In [5]:
# Create lists to store the French and English sentences
french_sentences = []
english_sentences = []
for line in lines:
    if '\t' in line:
        french, english, _ = line.split('\t')
        french_sentences.append(french)
        english_sentences.append(english)

french_sentences = french_sentences[0:1000]
english_sentences = english_sentences[0:1000]

In [6]:
# Tokenize the French and English sentences
french_tokenizer = Tokenizer(filters='')
french_tokenizer.fit_on_texts(french_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

english_tokenizer = Tokenizer(filters='')
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

In [7]:
# Pad sequences to a fixed length
max_sequence_length = 12  # You can adjust this based on your dataset
french_sequences = pad_sequences(french_sequences, maxlen=max_sequence_length, padding='post')
english_sequences = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')

In [8]:
# Create one-hot encoding for English sequences
english_vocab_size = len(english_tokenizer.word_index) + 1
english_sequences_one_hot = tf.keras.utils.to_categorical(english_sequences, num_classes=english_vocab_size)

In [9]:
# Split the data into training and validation sets
X_train_french, X_val_french, X_train_english, X_val_english, y_train, y_val = train_test_split(
    french_sequences, english_sequences, english_sequences_one_hot, test_size=0.2, random_state=42)

In [12]:
# Define the Seq2Seq model
latent_dim = 256  # Adjust as needed

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [13]:
# Train the model
model.fit([X_train_french, X_train_english], y_train,
          validation_data=([X_val_french, X_val_english], y_val),
          batch_size=64, epochs=1)
# Save the model
model.save('seq2seq_translation_model.h5')



  saving_api.save_model(


In [14]:
# Save tokenizers, vocab info, and max sequence length for inference
with open("french_tokenizer.pickle", "wb") as f:
    pickle.dump(french_tokenizer, f)

with open("english_tokenizer.pickle", "wb") as f:
    pickle.dump(english_tokenizer, f)

vocab_info = {"start_token": '<start>', "end_token": '<end>'}
with open("vocab_info.pickle", "wb") as f:
    pickle.dump(vocab_info, f)

with open("max_sequence_length.txt", "w") as f:
    f.write(str(max_sequence_length))

In [17]:
# Inference
model = load_model('seq2seq_translation_model.h5')

# Load saved components
with open("french_tokenizer.pickle", "rb") as f:
    french_tokenizer = pickle.load(f)

with open("english_tokenizer.pickle", "rb") as f:
    english_tokenizer = pickle.load(f)

with open("vocab_info.pickle", "rb") as f:
    vocab_info = pickle.load(f)

max_sequence_length = int(open("max_sequence_length.txt", "r").read())

In [22]:
# Manually add the start and end tokens to the English tokenizer's word index
english_tokenizer.word_index[start_token] = len(english_tokenizer.word_index) + 1
english_tokenizer.word_index[end_token] = len(english_tokenizer.word_index) + 1

# Inverse mapping for the word index
reverse_word_index = {v: k for k, v in english_tokenizer.word_index.items()}

In [23]:
# Example input text
input_text = "Bonjour, comment ça va ?"

# Tokenize and preprocess the input text
input_seq = french_tokenizer.texts_to_sequences([input_text])
input_seq = pad_sequences(input_seq, maxlen=max_sequence_length, padding='post')

In [24]:
# Use the encoder model to encode the input sequence
encoder_input = input_seq
encoder_model = Model(inputs=model.input[0], outputs=model.layers[4].output)  # Encoder is the fifth layer in the model
encoder_states = encoder_model.predict(encoder_input)



In [31]:
# Initialize the decoder input with the start token
target_seq = np.zeros((1, 1, 1))  # Reshape target_seq to (1, 1, 1)
start_token_index = english_tokenizer.word_index[start_token]
target_seq[0, 0, 0] = start_token_index 

In [32]:
# Create a new model for prediction
decoder_input_h = Input(shape=(latent_dim,))
decoder_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_input_h, decoder_input_c]

decoder_lstm = model.layers[5]  # Decoder LSTM layer
decoder_dense = model.layers[6]  # Dense layer

In [30]:
# Connect the decoder LSTM layer using the initial states and target_seq
decoder_lstm_outputs, state_h, state_c = decoder_lstm(target_seq, initial_state=decoder_states_inputs)

# Reconnect the decoder dense layer
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# Create a new prediction model
prediction_model = Model(
    [target_seq] + decoder_states_inputs,
    [decoder_outputs] + [state_h, state_c]
)

ValueError: Exception encountered when calling layer "lstm_3" (type LSTM).

Shape (1, 1) must have rank at least 3

Call arguments received by layer "lstm_3" (type LSTM):
  • inputs=['tf.Tensor(shape=(1, 1), dtype=float32)', 'tf.Tensor(shape=(None, 256), dtype=float32)', 'tf.Tensor(shape=(None, 256), dtype=float32)']
  • mask=None
  • training=None
  • initial_state=None