In [2]:
!pip install tensorflow



Prepare Data

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample dataset
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I am fine", "je vais bien"),
    ("what is your name", "comment tu t'appelles"),
    ("my name is", "je m'appelle"),
    ("thank you", "merci"),
    ("goodbye", "au revoir")
]

# Separate input and target sentences
input_texts, target_texts = zip(*data)
target_texts = ["\t" + text + "\n" for text in target_texts]  # Adding start (\t) and end (\n) tokens


Tokenize the Sentences

In [4]:
# Tokenize English and French
input_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

# Convert to sequences
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Vocabulary sizes
num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1

# Maximum sequence lengths
max_encoder_seq_length = max([len(seq) for seq in input_sequences])
max_decoder_seq_length = max([len(seq) for seq in target_sequences])

# Pad sequences
encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

# Prepare decoder output data shifted by one position
decoder_target_data = np.zeros((len(data), max_decoder_seq_length, num_decoder_tokens), dtype="float32")
for i, seq in enumerate(target_sequences):
    for t, word_id in enumerate(seq[1:]):  # Shifted by one
        decoder_target_data[i, t, word_id] = 1.0

Build the Seq2Seq Model

In [6]:
# Define the encoder
from tensorflow.keras.layers import Embedding

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=256)  # Use Embedding layer here
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=256)  # Use Embedding layer here
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

Train the Model

In [7]:
# Train the model
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64,
          epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 1.0048
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - loss: 0.9962
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 0.9887
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 0.9812
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 0.9734
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - loss: 0.9652
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.9561
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - loss: 0.9460
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 0.9345
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.9212
Epoch 1

<keras.src.callbacks.history.History at 0x79734997d6f0>

Inference Models

In [8]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding(decoder_inputs), initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

Define the Translation Function

In [9]:
# Reverse dictionaries for mapping indices back to words
reverse_input_word_index = {idx: word for word, idx in input_tokenizer.word_index.items()}
reverse_target_word_index = {idx: word for word, idx in target_tokenizer.word_index.items()}

def translate_sentence(input_seq):
    # Encode the input sentence to get initial states
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence with start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['\t']  # Start token

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, "")
        decoded_sentence += " " + sampled_word

        # Exit condition: end of sentence or max length
        if sampled_word == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

Test Translation with New Sentences

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Adding start and end tokens to each target sentence
target_texts = ["\t" + text + "\n" for text in target_texts]

# Initialize the tokenizer and fit on target texts with start and end tokens
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_texts)

# Ensure start and end tokens are in the vocabulary
target_word_index = target_tokenizer.word_index
if '\t' not in target_word_index:
    target_word_index['\t'] = len(target_word_index) + 1
if '\n' not in target_word_index:
    target_word_index['\n'] = len(target_word_index) + 1

# Define the max sequence length for encoder and decoder inputs
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

# Function to handle translation with both encoder and decoder inputs
def translate_sentence(sequence):
    try:
        # Prepare encoder input by padding the input sequence
        encoder_input = pad_sequences(
            input_tokenizer.texts_to_sequences([sequence]),
            maxlen=max_encoder_seq_length
        )

        # Prepare initial decoder input with the start token
        start_token_index = target_word_index['\t']
        decoder_input = np.zeros((1, max_decoder_seq_length))
        decoder_input[0, 0] = start_token_index

        # Make predictions (assuming `model` is a trained encoder-decoder model)
        prediction = model.predict([encoder_input, decoder_input])

        # Decode the prediction (assuming `decode_prediction` maps indices to words)
        translation = decode_prediction(prediction)
        return translation

    except Exception as e:
        print(f"Error during translation: {e}")
        return "Translation failed"

# Example usage with input text
test_sentence = "hello"
test_sequence = pad_sequences(
    input_tokenizer.texts_to_sequences([test_sentence]),
    maxlen=max_encoder_seq_length
)

# Make sure test_sequence has valid indices
if any(index >= len(target_word_index) for index in test_sequence.flatten()):
    print("Test sequence contains out-of-vocabulary tokens.")
else:
    # Perform translation
    print(f"Translation: {translate_sentence(test_sentence)}")

Error during translation: Graph execution error:

Detected at node functional_1/embedding_1_2/GatherV2 defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr