# Build a seq2seq model for machine translation.


Change LSTM model to Bidirectional LSTM Model and Translate English to Spanish



## 0. You will do the following:

1. Read and run the code. Please make sure you have installed keras or tensorflow.Running the script on colab will speed up the training process and also prevent package loading issue. 
2. Complete the code in Section 1.1, you may fill in your data directory.
3. Directly modify the code in Section 3. Change the current LSTM layer to a Bidirectional LSTM Model.
4. Training your model and translate English to Spanish in Section 4.2. You could try translating other languages.
5. Complete the code in Section 5.

### Hint: 

To implement ```Bi-LSTM```, you will need the following code to build the encoder **in Section 3**. Do NOT use Bi-LSTM for the decoder. But there are other codes **you need to modify** to make it work.

In [4]:
# from keras.layers import Bidirectional, Concatenate

# encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
#                                   dropout=0.5, name='encoder_lstm'))
# _, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

# state_h = Concatenate()([forward_h, backward_h])
# state_c = Concatenate()([forward_c, backward_c])

## 1. Data preparation 

1. Download spanish-english data from http://www.manythings.org/anki/
2. You may try to use other languages.
3. Unzip the .ZIP file.
4. Put the .TXT file (e.g., "deu.txt") in the directory "./Data/".
5. Fill in your data directory in section 1.1.

### 1.1. Load and clean text


In [7]:
import re
import string
from unicodedata import normalize
import numpy

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

In [9]:
# e.g., filename = 'Data/deu.txt'
filename = 'spa.txt'

# e.g., n_train = 20000
n_train = 20000

In [10]:
# load dataset
doc = load_doc(filename)

# split into Language1-Language2 pairs
pairs = to_pairs(doc)

# clean sentences
clean_pairs = clean_data(pairs)[:n_train, :]

In [11]:
for i in range(3000, 3010):
    print('[' + clean_pairs[i, 0] + '] => [' + clean_pairs[i, 1] + ']')

[we are here] => [estamos aqui]
[we ate eggs] => [hemos comido huevos]
[we ate eggs] => [comimos huevos]
[we broke up] => [nos separamos]
[we broke up] => [lo dejamos]
[we broke up] => [rompimos]
[we can help] => [podemos ayudar]
[we can help] => [nosotros podemos ayudar]
[we can meet] => [podemos encontrarnos]
[we can meet] => [podemos vernos]


In [12]:
input_texts = clean_pairs[:, 0]
target_texts = numpy.array(['\t' + text + '\n' for text in clean_pairs[:, 1]])

print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(target_texts.shape))

Length of input_texts:  (20000,)
Length of target_texts: (20000,)


In [13]:
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 18
max length of target sentences: 48


**Remark:** To this end, you have two lists of sentences: input_texts and target_texts

## 2. Text processing

### 2.1. Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index


encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, 
                                                       target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (20000, 18)
shape of input_token_index: 27
shape of decoder_input_seq: (20000, 48)
shape of target_token_index: 29


In [17]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


**Remark:** To this end, the input language and target language texts are converted to 2 matrices. 

- Their number of rows are both n_train.
- Their number of columns are respective max_encoder_seq_length and max_decoder_seq_length.

The followings print a sentence and its representation as a sequence.

In [20]:
target_texts[100]

'\tentendiste\n'

In [21]:
decoder_input_seq[100, :]

array([ 6,  2,  9,  8,  2,  9, 15, 11,  5,  8,  2,  7,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

## 2.2. One-hot encode

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.
- It is represented by a $n\times t \times v$ tensor ($t$ is the number of unique chars) after the one-hot encoding.

In [23]:
from tensorflow.keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq, 
                                    max_decoder_seq_length, 
                                    num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(20000, 18, 28)
(20000, 48, 30)
(20000, 48, 30)


## 3. Build the networks (for training) 

- In this section, we have already implemented the LSTM model for you. You can run the code and see what the code is doing.  

- **You need to change the existing LSTM model to a Bidirectional LSTM model. Just modify the network structrue and do not change the training cell in section 3.4.**

- Build encoder, decoder, and connect the two modules to get "model". 

- Fit the model on the bilingual data to train the parameters in the encoder and decoder.



### 3.1. Encoder network

- Input:  one-hot encode of the input language

- Return: 

    -- output (all the hidden states   $h_1, \cdots , h_t$) are always discarded
    
    -- the final hidden state  $h_t$
    
    -- the final conveyor belt $c_t$

In [26]:
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Concatenate, Dense
from tensorflow.keras.models import Model

latent_dim = 256

encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')

# set Bi-LSTM
encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, dropout=0.5, name='encoder_lstm'))
# When return_state=True with Bidirectional, the output is:
# [forward_outputs, forward_h, forward_c, backward_h, backward_c]
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

state_h = Concatenate()([forward_h, backward_h])  # final hidden state (size 2*latent_dim)
state_c = Concatenate()([forward_c, backward_c])  # final cell state (size 2*latent_dim)

encoder_model = Model(encoder_inputs, [state_h, state_c], name='encoder')

Print a summary and save the encoder network structure to "./encoder.pdf"

In [28]:
from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot, plot_model
from tensorflow.keras.utils import model_to_dot, plot_model

SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=encoder_model, show_shapes=False,
    to_file='encoder.pdf'
)

encoder_model.summary()

### 3.2. Decoder network

- Inputs:  

    -- one-hot encode of the target language
    
    -- The initial hidden state $h_t$ 
    
    -- The initial conveyor belt $c_t$ 

- Return: 

    -- output (all the hidden states) $h_1, \cdots , h_t$

    -- the final hidden state  $h_t$ (discarded in the training and used in the prediction)
    
    -- the final conveyor belt $c_t$ (discarded in the training and used in the prediction)

In [30]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# Decoder initial states (inputs to the decoder model)
decoder_input_h = Input(shape=(latent_dim * 2,), name='decoder_input_h')  # 512-dimensional
decoder_input_c = Input(shape=(latent_dim * 2,), name='decoder_input_c')
# Decoder input sequence (one-hot encoded Spanish characters)
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# Decoder LSTM (not bidirectional). It now has latent_dim*2 units to match encoder states.
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True, dropout=0.5, name='decoder_lstm')
decoder_outputs, dec_state_h, dec_state_c = decoder_lstm(decoder_input_x, 
                                                         initial_state=[decoder_input_h, decoder_input_c])

# Decoder output dense layer to map LSTM outputs to character probabilities
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Build the decoder model
decoder_model = Model([decoder_input_x, decoder_input_h, decoder_input_c],
                      [decoder_outputs, dec_state_h, dec_state_c],
                      name='decoder')

Print a summary and save the encoder network structure to "./decoder.pdf"

In [32]:
from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot, plot_model
from tensorflow.keras.utils import model_to_dot, plot_model

SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=decoder_model, show_shapes=False,
    to_file='decoder.pdf'
)

decoder_model.summary()

### 3.3. Connect the encoder and decoder

In [34]:
# Define inputs to the combined model (encoder input sequence and decoder input sequence)
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# Encoder outputs (final states) for given encoder input
encoder_states = encoder_model(encoder_input_x)  # [state_h, state_c]

# Decoder outputs for given decoder inputs and encoder initial states
decoder_outputs_train, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_states)
decoder_outputs_train = decoder_dense(decoder_outputs_train)  # apply dense to each time step output

# Build the training model
model = Model([encoder_input_x, decoder_input_x], decoder_outputs_train, name='seq2seq_training')
model.summary()

In [35]:
from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot, plot_model
from tensorflow.keras.utils import model_to_dot, plot_model

SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=model, show_shapes=False,
    to_file='model_training.pdf'
)

model.summary()

### 3.4. Fit the model on the bilingual dataset

- encoder_input_data: one-hot encode of the input language

- decoder_input_data: one-hot encode of the input language

- decoder_target_data: labels (left shift of decoder_input_data)

- tune the hyper-parameters

- stop when the validation loss stop decreasing.

In [37]:
print('shape of encoder_input_data' + str(encoder_input_data.shape))
print('shape of decoder_input_data' + str(decoder_input_data.shape))
print('shape of decoder_target_data' + str(decoder_target_data.shape))

shape of encoder_input_data(20000, 18, 28)
shape of decoder_input_data(20000, 48, 30)
shape of decoder_target_data(20000, 48, 30)


In [38]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                       # labels (left shift of the target sequences)
          batch_size=64, epochs=50, validation_split=0.2)

model.save('seq2seq.h5')

Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 183ms/step - loss: 1.2799 - val_loss: 1.1163
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 200ms/step - loss: 0.9553 - val_loss: 0.9701
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 202ms/step - loss: 0.8678 - val_loss: 0.8992
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 197ms/step - loss: 0.8387 - val_loss: 0.8858
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 190ms/step - loss: 0.8282 - val_loss: 0.8583
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 212ms/step - loss: 0.8094 - val_loss: 0.8348
Epoch 7/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 198ms/step - loss: 0.8028 - val_loss: 0.8296
Epoch 8/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 186ms/step - loss: 0.7886 - val_loss: 0.8056
Epoch 9/50
[1m2



## 4. Make predictions

- In this section, you need to complete section 4.2 to translate English to the target language.


### 4.1. Translate English to XXX

1. Encoder read a sentence (source language) and output its final states, $h_t$ and $c_t$.
2. Take the [star] sign "\t" and the final state $h_t$ and $c_t$ as input and run the decoder.
3. Get the new states and predicted probability distribution.
4. sample a char from the predicted probability distribution
5. take the sampled char and the new states as input and repeat the process (stop if reach the [stop] sign "\n").

In [40]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [41]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        output_tokens[0, -1, 0] = 0

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


In [42]:
for seq_index in range(2100, 2120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('Spanish (true): ', target_texts[seq_index][1:-1])
    print('Spanish (pred): ', decoded_sentence[0:-1])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10

### 4.2. Translate an English sentence to the target language （20 points）

1. Tokenization
2. One-hot encode
3. Translate

In [44]:
input_sentence = 'I love you'

# import string
input_sentence_clean = input_sentence.lower()
input_sentence_clean = input_sentence_clean.translate(str.maketrans('', '', string.punctuation))

# Convert each character to the corresponding index (ignore unknown chars)
input_sequence = [input_token_index[char] for char in input_sentence_clean if char in input_token_index]

# Pad the sequence to the maximum encoder length
# from tensorflow.keras.preprocessing.sequence import pad_sequences
input_sequence_padded = pad_sequences([input_sequence], maxlen=max_encoder_seq_length, padding='post')

# 2. One-hot encode the padded sequence
input_x = onehot_encode(input_sequence_padded, max_encoder_seq_length, num_encoder_tokens)

# 3. Use the seq2seq model to decode the sequence to Spanish
translated_sentence = decode_sequence(input_x)

print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10

# 5. Evaluate the translation using BLEU score

- We have already translated from English to target language, but how can we evaluate the performance of our model quantitatively? 

- In this section, you need to re-train the model we built in secton 3 and then evaluate the bleu score on testing dataset.

Reference:

https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

https://en.wikipedia.org/wiki/BLEU

#### Hint:

- Randomly partition the dataset to training, validation, and test.

- Evaluate the BLEU score using the test set. Report the average.

- You may use packages to calculate bleu score, e.g., sentence_bleu() from nltk package.

### 5.1. Partition the dataset to training, validation, and test. Build new token index. (10 points)

1. You may try to load more data/lines from text file.
2. Convert text to sequences and build token index using training data.
3. One-hot encode your training and validation text sequences.

In [47]:
import numpy as np

# (Optional) Load more data for a larger dataset
# Here we reload the dataset and do not truncate to n_train straight away.
doc_full = load_doc(filename)
pairs_full = to_pairs(doc_full)
clean_pairs_full = clean_data(pairs_full)

# Optionally limit the total data for manageability (or use all data)
# e.g., use first 50,000 pairs
clean_pairs_full = clean_pairs_full[:50000]

# Shuffle and split into train/validation/test
np.random.shuffle(clean_pairs_full)
n_total = len(clean_pairs_full)
train_end = int(0.8 * n_total)    # 80% training
val_end   = int(0.9 * n_total)    # next 10% validation, 10% test
train_pairs = clean_pairs_full[:train_end]
val_pairs   = clean_pairs_full[train_end:val_end]
test_pairs  = clean_pairs_full[val_end:]

# Separate input and target texts for each set, adding start/end tokens for targets
train_input_texts = [pair[0] for pair in train_pairs]
train_target_texts = ['\t' + pair[1] + '\n' for pair in train_pairs]
val_input_texts = [pair[0] for pair in val_pairs]
val_target_texts = ['\t' + pair[1] + '\n' for pair in val_pairs]
test_input_texts = [pair[0] for pair in test_pairs]
test_target_texts = ['\t' + pair[1] + '\n' for pair in test_pairs]

# Recompute max sequence lengths based on training set
max_encoder_seq_length = max(len(text) for text in train_input_texts)
max_decoder_seq_length = max(len(text) for text in train_target_texts)
print("Train set size:", len(train_input_texts))
print("Validation set size:", len(val_input_texts))
print("Test set size:", len(test_input_texts))
print("Max encoder seq length (train):", max_encoder_seq_length)
print("Max decoder seq length (train):", max_decoder_seq_length)

# Build token indices using only the training set
encoder_input_seq_train, input_token_index = text2sequences(max_encoder_seq_length, train_input_texts)
decoder_input_seq_train, target_token_index = text2sequences(max_decoder_seq_length, train_target_texts)
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

# Convert validation and test texts to sequences using the training token indices (no new fitting)
# For characters not seen in training, we use index 0 (which corresponds to padding/OOV)
val_encoder_input_seq = [[ input_token_index.get(char, 0) for char in text] for text in val_input_texts]
val_decoder_input_seq = [[ target_token_index.get(char, 0) for char in text] for text in val_target_texts]
test_encoder_input_seq = [[ input_token_index.get(char, 0) for char in text] for text in test_input_texts]
test_decoder_input_seq = [[ target_token_index.get(char, 0) for char in text] for text in test_target_texts]

# Pad the validation and test sequences to the max lengths determined by training data
val_encoder_input_seq = pad_sequences(val_encoder_input_seq, maxlen=max_encoder_seq_length, padding='post')
val_decoder_input_seq = pad_sequences(val_decoder_input_seq, maxlen=max_decoder_seq_length, padding='post')
test_encoder_input_seq = pad_sequences(test_encoder_input_seq, maxlen=max_encoder_seq_length, padding='post')
test_decoder_input_seq = pad_sequences(test_decoder_input_seq, maxlen=max_decoder_seq_length, padding='post')

# One-hot encode training and validation sequences
encoder_input_data_train = onehot_encode(encoder_input_seq_train, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data_train = onehot_encode(decoder_input_seq_train, max_decoder_seq_length, num_decoder_tokens)
decoder_target_seq_train = np.zeros_like(decoder_input_seq_train)
decoder_target_seq_train[:, 0:-1] = decoder_input_seq_train[:, 1:]
decoder_target_data_train = onehot_encode(decoder_target_seq_train, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data_val = onehot_encode(val_encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data_val = onehot_encode(val_decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
decoder_target_seq_val = np.zeros_like(val_decoder_input_seq)
decoder_target_seq_val[:, 0:-1] = val_decoder_input_seq[:, 1:]
decoder_target_data_val = onehot_encode(decoder_target_seq_val, max_decoder_seq_length, num_decoder_tokens)


Train set size: 40000
Validation set size: 5000
Test set size: 5000
Max encoder seq length (train): 24
Max decoder seq length (train): 68


### 5.2 Retrain your previous Bidirectional LSTM model with training and validation data and tune the parameters (learning rate, optimizer, etc) based on validation score. 

1. Use the model structure in section 3 to train a new model with new training and validation datasets.
2. Based on validation BLEU score or loss to tune parameters.

In [None]:
# Reinitialize the model architecture (Bidirectional encoder + LSTM decoder)
latent_dim = 256  # using the same latent_dim as before
# Encoder (Bi-LSTM)
encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')
encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, dropout=0.5, name='encoder_lstm'))
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_model = Model(encoder_inputs, [state_h, state_c], name='encoder')

# Decoder
decoder_input_h = Input(shape=(latent_dim * 2,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim * 2,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True, dropout=0.5, name='decoder_lstm')
decoder_outputs, dec_h, dec_c = decoder_lstm(decoder_input_x, initial_state=[decoder_input_h, decoder_input_c])
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_input_x, decoder_input_h, decoder_input_c],
                      [decoder_outputs, dec_h, dec_c], name='decoder')

# Combined training model
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')
encoder_states = encoder_model(encoder_input_x)
decoder_outputs_train, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_states)
decoder_outputs_train = decoder_dense(decoder_outputs_train)
model_new = Model([encoder_input_x, decoder_input_x], decoder_outputs_train, name='seq2seq_training_new')

# Compile the model with a different optimizer (e.g., Adam) for experimentation
from tensorflow.keras.optimizers import Adam
model_new.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy')


# Train the model on the new training set, with validation data
history_new = model_new.fit([encoder_input_data_train, decoder_input_data_train],
                            decoder_target_data_train,
                            batch_size=64,
                            epochs=50,
                            validation_data=([encoder_input_data_val, decoder_input_data_val],
                                             decoder_target_data_val))


Epoch 1/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 194ms/step - loss: 1.0262 - val_loss: 0.6508
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 201ms/step - loss: 0.7296 - val_loss: 0.5916
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 200ms/step - loss: 0.6897 - val_loss: 0.5430
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 197ms/step - loss: 0.6587 - val_loss: 0.5039
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m589s[0m 944ms/step - loss: 0.6299 - val_loss: 0.4753
Epoch 6/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 938ms/step - loss: 0.6050 - val_loss: 0.4499
Epoch 7/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 202ms/step - loss: 0.5881 - val_loss: 0.4298
Epoch 8/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 203ms/step - loss: 0.5687 - val_loss: 0.4103
Epoch 9/

### 5.3 Evaluate the BLEU score using the test set.

1. Use trained model above to calculate the BLEU score with testing dataset.
2. A reasonable should be 0.1-0.3. The higher, the better.

In [None]:
!pip install -q tqdm
from tqdm import tqdm

def decode_sequence_new(input_seq):
    states_value = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.0

    decoded_sentence = ''
    stop_condition = False
    step_count = 0
    max_steps = max_decoder_seq_length + 10  # just in case model overshoots

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = [char for char, index in target_token_index.items() if index == sampled_token_index]
        sampled_char = sampled_char[0] if sampled_char else ''

        decoded_sentence += sampled_char

        # Exit if end token found OR max steps exceeded
        if sampled_char == '\n' or step_count > max_steps:
            stop_condition = True

        # Prepare next char
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [h, c]
        step_count += 1

    return decoded_sentence



references = []
predictions = []
test_sample_size = 2000  # adjust to 30–50 for speed

print(f"Evaluating BLEU on {test_sample_size} samples...")

for i in tqdm(range(test_sample_size), desc="Translating..."):
    input_text = test_input_texts[i]
    seq = [input_token_index.get(char, 0) for char in input_text]
    seq_padded = pad_sequences([seq], maxlen=max_encoder_seq_length, padding='post')
    input_onehot = onehot_encode(seq_padded, max_encoder_seq_length, num_encoder_tokens)

    decoded = decode_sequence_new(input_onehot).rstrip('\n')
    ref_sentence = test_target_texts[i][1:-1]

    references.append([ref_sentence.split()])
    predictions.append(decoded.split())

# BLEU Score
from nltk.translate.bleu_score import corpus_bleu
bleu_score = corpus_bleu(references, predictions)
print(f"\n✅ BLEU score on {test_sample_size} samples: {bleu_score:.3f}")
