Download the data

In [3]:
import tensorflow as tf
from pathlib import Path

url = "https://github.com/stevelukis/rnn-ind-eng/raw/main/ind-eng.zip"
path = tf.keras.utils.get_file("ind-eng.zip", origin=url, cache_dir=".",
                               extract=True)
text = (Path(path).parent / "ind.txt").read_text(encoding='utf-8')

The dataset is tab-formatted like this "english   indonesian   description". We only need the English (features) and Indonesian (target).

In [51]:
import numpy as np

triples = [line.split('\t') for line in text.splitlines()]
np.random.shuffle(triples)
sentences_en, sentences_id, _ = zip(*triples)

It is a small dataset

In [52]:
print(len(sentences_en))

9243


In [53]:
for i in range(3):
    print(sentences_en[i] + ' => ' + sentences_id[i])

It's a good question. => Ini pertanyaan yang bagus
Is it far from here? => Apakah jauh dari sini?
His bag was stolen yesterday. => Tasnya telah dicuri kemarin.


Vectorizing the text

In [54]:
vocab_size = 500
max_length = 50

text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)
text_vec_layer_id = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_id.adapt([f'startofseq {s} endofseq' for s in sentences_id])

In [55]:
X_train = tf.constant(sentences_en[:8500])
X_val = tf.constant(sentences_en[8500:])
X_train_dec = tf.constant([f'startofseq {s}' for s in sentences_id[:8500]])
X_val_dec = tf.constant([f'startofseq {s}' for s in sentences_id[8500:]])

y_train = text_vec_layer_id([f'{s} endofseq' for s in sentences_id[:8500]])
y_val = text_vec_layer_id([f'{s} endofseq' for s in sentences_id[8500:]])

Defining model structure using Functional API

In [56]:
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128

encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_id(decoder_inputs)

encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                    embed_size,
                                                    mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                    embed_size,
                                                    mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [57]:
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True)
)

encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),
                 tf.concat(encoder_state[1::2], axis=-1)]

In [58]:
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [59]:
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
y_proba = output_layer(decoder_outputs)

In [60]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[y_proba])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
              metrics=['accuracy'])
history = model.fit((X_train, X_train_dec), y_train, epochs=20,
                    validation_data=((X_val, X_val_dec), y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Use word-by-word prediction

In [61]:
def translate(sentence_en):
    translation = ''
    for word_idx in range(max_length):
        X = np.array([sentence_en])
        X_dec = np.array([f'startofseq ' + translation])
        y_proba = model.predict([X, X_dec])[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = text_vec_layer_id.get_vocabulary()[predicted_word_id]
        if predicted_word == 'endofseq':
            break
        translation += ' ' + predicted_word
    return translation.strip()

In [93]:
translate('I go to school today')



'aku pergi ke sekolah untuk [UNK] ini'

The `aku pergi ke sekolah` part is correct. The rest is nonsense...

Trying beam search.

In [None]:
def top_k_indices(k, arr):
    return np.argsort(arr, axis=0)[-k:][::-1]


vocab = text_vec_layer_id.get_vocabulary()


def beam_search_translate(sentence_en, k):
    X = np.array([sentence_en])
    X_dec = np.array(['startofseq'])

    y_proba = model.predict([X, X_dec])[0, 0]
    top_k_words_id = top_k_indices(k, y_proba)
    top_k_translations = [(vocab[word_id], y_proba[word_id]) for word_id in top_k_words_id]

    for i in range(1, max_length):
        top_branch_translations = []
        for translation, proba in top_k_translations:
            X_dec = np.array([f'startofseq {translation}'])

            y_proba = model.predict([X, X_dec])[0][i]
            top_k_words_id = top_k_indices(k, y_proba)
            top_branch_translations += [(translation + ' ' + vocab[word_id], y_proba[word_id] * proba)
                                        for word_id in top_k_words_id]

        top_k_translations = sorted(top_branch_translations, key=lambda x: x[1], reverse=True)[:k]

        for translation, proba in top_k_translations:
            if 'endofseq' in translation:
                return translation[:-len('endofseq')-1]

    return top_k_translations[0][0]

In [92]:
beam_search_translate('I go to school today', 3)



'aku datang ke sekolah besok'

Well, it is still wrong. It says `besok` which means `tomorrow` in English, although we say `today`. But at least it knows that we have time modifier at the end of the sentence.