English to French

In [1]:
import numpy as np
from  tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
english_sentences = ['hello','how are you','good morning','good night','thank you']
french_sentences = ['bonjour','comment ça va','bonjour','bonne nuit','merci']


In [3]:
# add START & END tokens to french sentence

french_sentences  = ['starttoken ' + sentence + ' endtoken' for sentence in french_sentences]

#Hyperpaameter
batch_size = 2
epochs = 100
latent_dim = 256


In [4]:
french_sentences

['starttoken bonjour endtoken',
 'starttoken comment ça va endtoken',
 'starttoken bonjour endtoken',
 'starttoken bonne nuit endtoken',
 'starttoken merci endtoken']

In [5]:
#Initilaize tokenizer
eng_tokenizer = Tokenizer(char_level = False)
fra_tokenizer = Tokenizer(char_level=False)

#Fit tokenizer
eng_tokenizer.fit_on_texts(english_sentences)
fra_tokenizer.fit_on_texts(french_sentences)

In [6]:
eng_tokenizer.word_index


{'you': 1,
 'good': 2,
 'hello': 3,
 'how': 4,
 'are': 5,
 'morning': 6,
 'night': 7,
 'thank': 8}

In [7]:
fra_tokenizer.word_index

{'starttoken': 1,
 'endtoken': 2,
 'bonjour': 3,
 'comment': 4,
 'ça': 5,
 'va': 6,
 'bonne': 7,
 'nuit': 8,
 'merci': 9}

In [8]:
#Convert sentences into sequences of intgers
encoder_input_data = eng_tokenizer.texts_to_sequences(english_sentences)
decoder_input_data = fra_tokenizer.texts_to_sequences(french_sentences)

In [9]:
#pad sequence to ensure uniform length
max_encoder_seq_length =  max([len(seq) for seq in encoder_input_data])
max_decoder_seq_length =  max([len(seq) for seq in decoder_input_data])

encoder_input_data = pad_sequences(encoder_input_data,maxlen=max_encoder_seq_length,padding='post')
decoder_input_data = pad_sequences(decoder_input_data,maxlen=max_decoder_seq_length,padding='post')


In [10]:
max_encoder_seq_length,max_decoder_seq_length

(3, 5)

In [11]:
encoder_input_data

array([[3, 0, 0],
       [4, 5, 1],
       [2, 6, 0],
       [2, 7, 0],
       [8, 1, 0]], dtype=int32)

In [12]:
decoder_input_data

array([[1, 3, 2, 0, 0],
       [1, 4, 5, 6, 2],
       [1, 3, 2, 0, 0],
       [1, 7, 8, 2, 0],
       [1, 9, 2, 0, 0]], dtype=int32)

In [13]:
#Prepare the target data (decoder output, shifted by one time step)
decoder_output_data = np.zeros_like(decoder_input_data)
decoder_output_data[:,:-1] = decoder_input_data[:,1:]

In [14]:
decoder_output_data

array([[3, 2, 0, 0, 0],
       [4, 5, 6, 2, 0],
       [3, 2, 0, 0, 0],
       [7, 8, 2, 0, 0],
       [9, 2, 0, 0, 0]], dtype=int32)

In [15]:
decoder_input_data

array([[1, 3, 2, 0, 0],
       [1, 4, 5, 6, 2],
       [1, 3, 2, 0, 0],
       [1, 7, 8, 2, 0],
       [1, 9, 2, 0, 0]], dtype=int32)

In [16]:
#Define vocabulary size(total unique tokens)
num_encoder_tokens = len(eng_tokenizer.word_index) + 1
num_decoder_tokens = len(fra_tokenizer.word_index) + 1

In [17]:
num_encoder_tokens ,num_decoder_tokens

(9, 10)

In [18]:
# Define input sequence and output sequence for Seq2Seq model
encoder_inputs = Input(shape=(None,))
decoder_inputs = Input(shape=(None,))

In [19]:
#Encoder
encoder_embedding = Embedding(input_dim = num_encoder_tokens, output_dim=latent_dim,input_length=max_encoder_seq_length)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)




In [20]:
#Decoder
decoder_embedding = Embedding(input_dim = num_decoder_tokens, output_dim=latent_dim,input_length=max_decoder_seq_length)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

In [21]:
deocder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = deocder_dense(decoder_outputs)

In [22]:
#Functional the model
model = Model([encoder_inputs, decoder_inputs],decoder_outputs)

In [23]:
model.summary()

In [24]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [25]:

  model.fit([encoder_input_data,decoder_input_data],
          np.expand_dims(decoder_output_data,-1),
          batch_size=batch_size,epochs=epochs)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.2800 - loss: 2.2873
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.4650 - loss: 2.1541
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5025 - loss: 1.9130
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.4650 - loss: 1.5554
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5025 - loss: 1.2433
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5025 - loss: 1.2055
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.5025 - loss: 1.0673
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.6775 - loss: 1.1070
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7cc626dffe80>

In [26]:
#Create inference models for the encoder and decoder for prediction
#Encoder model

encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

In [27]:
#Decoder model(for inference)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_lstm_inf = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_inf(decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_output_inf = deocder_dense(decoder_outputs_inf)

decoder_model = Model([decoder_inputs, decoder_state_input_h, decoder_state_input_c], [decoder_output_inf, state_h_inf, state_c_inf])


In [28]:
def decode_sequence(input_seq):
  # Get the encoder states
  states_value = encoder_model.predict(input_seq)

  # Generate an initial target sequence (the start token)
  target_seq = np.zeros((1,1))
  target_seq[0,0] = fra_tokenizer.word_index['starttoken']  # start token index

  # sample output tokens
  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
    # Correct the prediction input to match expected inputs
    # Unpack the states_value list into separate arguments
    output_tokens, h, c = decoder_model.predict([target_seq, states_value[1], states_value[2]])

    # Sample the next token
    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_token = fra_tokenizer.index_word[sampled_token_index]

    decoded_sentence += ' ' + sampled_token

    # Stop if we hit the end token or reach max length
    if sampled_token == 'endtoken' or len(decoded_sentence) > max_decoder_seq_length:
      stop_condition = True

    # update the target sequence
    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index

    # update the states
    states_value = [h,c]

  return decoded_sentence

In [29]:
#Test the decoder with a sample sentence
input_seq = encoder_input_data[2:3]
input_seq

array([[2, 6, 0]], dtype=int32)

In [30]:
decoded_sentence = decode_sequence(input_seq)
print(decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
 bonjour
