<a href="https://colab.research.google.com/github/rashidkisejjere0784/English-to-luganda-with-transformers/blob/main/Luganda_to_English_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('Luganda.csv', encoding = "ISO-8859-1")
data.head()

Unnamed: 0,English,Luganda,Unnamed: 2,Unnamed: 3
0,All refugees were requested to register with t...,Abanoonyiboobubudamu bonna baasabiddwa beewand...,,
1,They called for a refugees' meeting yesterday.,Baayise olukungaana lw'abanoonyiboobubudamu eg...,,
2,Refugees had misunderstandings between thems...,Abanoonyiboobubudamu b'abadde n'obutakkaanya w...,,
3,We were urged to welcome refugees into our com...,Twakubirizibwa okwaniriza abanoonyiboobubudamu...,,
4,More development is achieved when we work toge...,Bwe tukolera awamu enkulaakulana enyingi efuni...,,


In [4]:
data = data[["English", "Luganda"]]
data = data.dropna()

In [5]:
data.iloc[0]['English']

'All refugees were requested to register with the chairman.'

In [22]:
text_pairs = []
for lug, eng in zip(data['Luganda'], data['English']):
    lug = "[start]" + lug + "[end]"
    text_pairs.append((eng, lug))

In [23]:
text_pairs[:10]

[('All refugees were requested to register with the chairman.',
  '[start]Abanoonyiboobubudamu bonna baasabiddwa beewandiise ewa ssentebe.[end]'),
 ("They called for a refugees' meeting yesterday.",
  "[start]Baayise olukungaana lw'abanoonyiboobubudamu eggulo.[end]"),
 ('Refugees had misunderstandings between   themselves.',
  "[start]Abanoonyiboobubudamu b'abadde n'obutakkaanya wakati waabwe.[end]"),
 ('We were urged to welcome refugees into our communities.',
  '[start]Twakubirizibwa okwaniriza abanoonyiboobubudamu mu bitundu byaffe.[end]'),
 ('More development is achieved when we work together.',
  '[start]Bwe tukolera awamu enkulaakulana enyingi efunibwa.[end]'),
 ('The border districts are insecure.',
  '[start]Disitulikiti eziriraanye ensalo si ntebenkevu.[end]'),
 ('Refugees have started practicing farming so as to earn a living.',
  '[start]Abanoonyiboobubudamu batandise okulima okusobola okwebeezaawo.[end]'),
 ('It is illegal to own a gun.',
  "[start]Kimenya mateeka okubeera 

In [24]:
random.shuffle(text_pairs)
num_val_samples = int(0.05 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

15021 total pairs
13519 training pairs
751 validation pairs
751 test pairs


In [9]:
len(set(" ".join(data['Luganda']).split()))

24693

In [25]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)
lug_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_lug_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
lug_vectorization.adapt(train_lug_texts)

In [26]:
def format_dataset(eng, lug):
    eng = eng_vectorization(eng)
    lug = lug_vectorization(lug)
    return ({"encoder_inputs": eng, "decoder_inputs": lug[:, :-1],}, lug[:, 1:])


def make_dataset(pairs):
    eng_texts, lug_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    lug_texts = list(lug_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, lug_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [27]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 20)
inputs["decoder_inputs"].shape: (64, 20)
targets.shape: (64, 20)


In [28]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

In [29]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

In [30]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [31]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [32]:
epochs = 1

transformer.summary()


Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_2 (Positi  (None, None, 256)   3845120     ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_1 (Transfo  (None, None, 256)   3155456     ['positional_embedding_

In [33]:
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [34]:
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)



<keras.callbacks.History at 0x7fa1257af040>

In [35]:
output = transformer.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [36]:
lug_vocab = lug_vectorization.get_vocabulary()
lug_index_lookup = dict(zip(range(len(lug_vocab)), lug_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = lug_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = lug_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence




In [38]:
input_sentence

'District leaders have discouraged gender based violence.'

In [37]:
translated

'[start] ba disitulikiti [UNK] ku [UNK] ku [UNK] ku [UNK]  [UNK]  [UNK]    [UNK]  [UNK] '

In [39]:
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence)
    print(translated)

People should work towards achieving rural development.
[start] balina okwenyigira mu kufuna okwenyigira ku mirimu [UNK]  [UNK] [UNK]      [UNK]  [UNK] 
Who is responsible for providing students with food at school?
[start] [UNK] abantu [UNK] ku [UNK] ku ssomero[end]  ssomero[end] zaabwe[end] ssomero[end]      ssomero[end]  bulijjo[end] 
The business community informed the council to involve people in the planning process.
[start] [UNK] ku ngeri [UNK] [UNK] nga [UNK] [UNK]  [UNK] [UNK]      [UNK]  [UNK] 
What causes well  s to dry up?
[start] otya [UNK] oluvannyuma kyalo ki[end]  mateeka[end] [UNK] lunaku[end] ki[end] lunaku[end]      mateeka[end]  ki[end] 
The ban should be lifted after the laboratory test results.
[start] alina [UNK] [UNK] [UNK] mu [UNK]  [UNK] [UNK] [UNK] [UNK]      [UNK]  [UNK] 
The government has failed to ensure proper water supply within the area.
[start] [UNK] abantu balina [UNK] mu kitundu[end]  kitundu[end] kitundu[end] kitundu[end]       kitundu[end]  [UNK] 