<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/spellcorrection_model_transformer_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the necessary packages

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Open the google drive folder that has the datasets

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load the datasets

In [3]:
train_data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/sampled-10000train-1000test/train-10000-luganda-double-tripple-errors.csv')
test_data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/sampled-10000train-1000test/test-1000-single-A-error.csv')

Ensure the data is only of string type

In [4]:
train_data['incorrect_word'] = train_data['incorrect_word'].astype(str)
train_data['correct_word'] = train_data['correct_word'].astype(str)
test_data['incorrect_word'] = test_data['incorrect_word'].astype(str)
test_data['correct_word'] = test_data['correct_word'].astype(str)


Tokenize

In [5]:
# Step 2: Tokenization using Tokenizer (character-level, case-sensitive)
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(train_data['incorrect_word'] + train_data['correct_word'])

train_seq_incorrect = tokenizer.texts_to_sequences(train_data['incorrect_word'])
train_seq_correct = tokenizer.texts_to_sequences(train_data['correct_word'])
test_seq_incorrect = tokenizer.texts_to_sequences(test_data['incorrect_word'])
test_seq_correct = tokenizer.texts_to_sequences(test_data['correct_word'])


Pad sequences to maximum length

In [6]:
# Padding sequences to the maximum length
max_length = max(len(seq) for seq in train_seq_incorrect + test_seq_incorrect + train_seq_correct + test_seq_correct)

train_input_ids = pad_sequences(train_seq_incorrect, maxlen=max_length, padding='post')
train_labels = pad_sequences(train_seq_correct, maxlen=max_length, padding='post')
test_input_ids = pad_sequences(test_seq_incorrect, maxlen=max_length, padding='post')
test_labels = pad_sequences(test_seq_correct, maxlen=max_length, padding='post')


Build the character-level transformer

In [15]:
# Step 3: Build the character-level transformer model
num_chars = len(tokenizer.word_index) + 1
embedding_dim = 64
num_heads = 4
dff = 128
num_encoder_layers = 2

def positional_encoding(max_position, d_model):
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model))
    angle_rads = np.arange(max_position)[:, np.newaxis] * angle_rates[np.newaxis, :]
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)
    return tf.constant(pos_encoding, dtype=tf.float32)

class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, dff, rate=0.1):
        super(EncoderBlock, self).__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(dff, activation='relu'), tf.keras.layers.Dense(embedding_dim)])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.mha(inputs, inputs, inputs)  # Self-attention
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # Residual connection and layer normalization

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # Residual connection and layer normalization

        return out2

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, embedding_dim, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, embedding_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding, embedding_dim)
        self.enc_layers = [EncoderBlock(embedding_dim, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        seq_length = tf.shape(inputs)[1]
        mask = tf.math.logical_not(tf.math.equal(inputs, 0))
        mask = tf.expand_dims(mask, -1)
        inputs = self.embedding(inputs)  # Adding embedding
        inputs *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        # Expanding the positional encoding along the batch dimension
        pos_encoding = tf.tile(tf.expand_dims(self.pos_encoding, axis=0), [tf.shape(inputs)[0], 1, 1])
        inputs += pos_encoding[:, :seq_length, :]
        inputs = self.dropout(inputs, training=training)
        for enc_layer in self.enc_layers:
            inputs = enc_layer(inputs, training)  # Remove 'mask' from the arguments

        return inputs



Build and compile the transformer-based spelling correction model

In [16]:
# Step 4: Build and compile the transformer-based spelling correction model
num_encoder_layers = 2
input_ids = Input(shape=(max_length,), dtype='int32')
encoder_output = Encoder(num_encoder_layers, embedding_dim, num_heads, dff, num_chars, max_length)(input_ids)
output = Dense(num_chars, activation='softmax')(encoder_output)

model = Model(inputs=input_ids, outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


Train the model

In [17]:
# Step 5: Train the model
model.fit(train_input_ids, train_labels, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x78c2f44cc640>

Evaluate the model

In [22]:
# Step 6: Evaluation
predictions = model.predict(test_input_ids)
decoded_predictions = tokenizer.sequences_to_texts(np.argmax(predictions, axis=-1))
decoded_predictions_no_spaces = [''.join(prediction.split()) for prediction in decoded_predictions]
# Calculate accuracy
correct_predictions = np.array(test_data['correct_word']) == np.array(decoded_predictions_no_spaces)
accuracy = np.mean(correct_predictions)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.14


Show the predicted corrections

In [23]:
# Show the predicted corrections alongside the original incorrect words and target correct words
for idx, (incorrect_words, target_words, predicted_word, is_correct) in enumerate(zip(test_data['incorrect_word'], test_data['correct_word'], decoded_predictions_no_spaces, correct_predictions)):
    print(f"Example {idx + 1}:")
    print(f"Incorrect Word(s): {incorrect_words}")
    print(f"Target Correct Word(s): {target_words}")
    print(f"Predicted Word(s): {predicted_word}")
    print(f"Is Correct: {is_correct}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Is Correct: False

Example 168:
Incorrect Word(s): apalituuka
Target Correct Word(s): akalituuka
Predicted Word(s): alalituuka
Is Correct: False

Example 169:
Incorrect Word(s): enayumbi
Target Correct Word(s): enayumba
Predicted Word(s): enayumbi
Is Correct: False

Example 170:
Incorrect Word(s): boonozna
Target Correct Word(s): boonoona
Predicted Word(s): boonoana
Is Correct: False

Example 171:
Incorrect Word(s): siyimiridde
Target Correct Word(s): kiyimiridde
Predicted Word(s): siyimiridde
Is Correct: False

Example 172:
Incorrect Word(s): tokuŋŋabayanga
Target Correct Word(s): tokuŋŋaaayanga
Predicted Word(s): tokuŋŋabayanga
Is Correct: False

Example 173:
Incorrect Word(s): aresusito
Target Correct Word(s): amesusito
Predicted Word(s): atesusito
Is Correct: False

Example 174:
Incorrect Word(s): adunoni
Target Correct Word(s): alunoni
Predicted Word(s): adunoni
Is Correct: False

Example 175:
Incorrect Word(s): badd