<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/spellcorrectmodel_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Import Packages



In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

Open your Google drive where your dataset folder is

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!nvidia-smi

Data Preprocessing

In [4]:
# Step 1: Data Preprocessing
train_data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/sampled-10000train-1000test/train-10000-luganda-double-tripple-errors.csv')
test_data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/sampled-10000train-1000test/test-1000-single-A-error.csv')


In [None]:
train_data[:10]

In [6]:
# Convert 'incorrect_word' and 'correct_word' columns to string type
train_data['incorrect_word'] = train_data['incorrect_word'].astype(str)
train_data['correct_word'] = train_data['correct_word'].astype(str)
test_data['incorrect_word'] = test_data['incorrect_word'].astype(str)
test_data['correct_word'] = test_data['correct_word'].astype(str)


In [None]:
test_data[:10]

In [8]:
# Tokenize words and create sequences
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(train_data['incorrect_word'] + train_data['correct_word'])
train_seq_incorrect = tokenizer.texts_to_sequences(train_data['incorrect_word'])
train_seq_correct = tokenizer.texts_to_sequences(train_data['correct_word'])
test_seq_incorrect = tokenizer.texts_to_sequences(test_data['incorrect_word'])
test_seq_correct = tokenizer.texts_to_sequences(test_data['correct_word'])

In [None]:
test_seq_incorrect[:10]

In [10]:
# Pad sequences to make them of equal length
max_sequence_length = max(max(len(seq) for seq in train_seq_incorrect),
                         max(len(seq) for seq in test_seq_incorrect))
train_seq_incorrect = pad_sequences(train_seq_incorrect, maxlen=max_sequence_length)
train_seq_correct = pad_sequences(train_seq_correct, maxlen=max_sequence_length)
test_seq_incorrect = pad_sequences(test_seq_incorrect, maxlen=max_sequence_length)
test_seq_correct = pad_sequences(test_seq_correct, maxlen=max_sequence_length)


In [None]:
test_seq_incorrect[:10]

Model architecture

In [12]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  # Adjust as needed
hidden_units = 256  # Adjust as needed

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

Training

In [13]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit([train_seq_incorrect, train_seq_incorrect], np.expand_dims(train_seq_correct, -1), epochs=200, batch_size=32)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x79cd27981ba0>

Evaluation

In [14]:
predictions = model.predict([test_seq_incorrect, test_seq_incorrect])
decoded_predictions = np.argmax(predictions, axis=-1)
accuracy = np.mean(np.array(test_seq_correct) == decoded_predictions)

print("Test Accuracy:", accuracy)


Test Accuracy: 0.91025


Print first 10 predictions

In [15]:
decoded_predictions[:10]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5, 15,  1,
        17,  4,  9,  1, 11,  8,  1, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  5,
         4,  3,  5,  5, 13,  1,  5,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  7,  2,
        14,  1,  9,  2, 11, 19, 12,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  7,
        13,  3, 14,  2, 13,  3,  8,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  8, 13,  1, 18,  2, 11,
         8,  2,  8, 12,  1,  7,  6,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  3, 10,  2, 17,  1, 10,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5, 15,  6,  7,
         4, 16,  1,  1, 16,  2, 14,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         3,  8,  2, 12,  4,  5, 25,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5, 19,  2, 17,
         3,  5, 19,  3, 

In [16]:
# Step 4: Evaluation and Predictions
predictions = model.predict([test_seq_incorrect, test_seq_incorrect])
decoded_predictions = []

# Convert numerical predictions to words
for prediction in predictions:
    # Convert the prediction to a list of character indices
    predicted_word_indices = [np.argmax(char_vector) for char_vector in prediction]
    # Convert the list of indices to a string of characters
    predicted_word = tokenizer.sequences_to_texts([predicted_word_indices])[0]
    decoded_predictions.append(predicted_word)

# Show the predicted corrections alongside the original incorrect words and target correct words
for idx, (incorrect_words, target_words, predicted_word) in enumerate(zip(test_data['incorrect_word'], test_data['correct_word'], decoded_predictions)):
    print(f"Example {idx + 1}:")
    print(f"Incorrect Word(s): {incorrect_words}")
    print(f"Target Correct Word(s): {target_words}")
    print(f"Predicted Word(s): {predicted_word}")
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Example 1:
Incorrect Word(s): nAatugambay
Target Correct Word(s): nAatugamba
Predicted Word(s): n A a t u g a m b a y

Example 2:
Incorrect Word(s): nguennyana
Target Correct Word(s): ngAennyana
Predicted Word(s): n n u e n n y a n a

Example 3:
Incorrect Word(s): ekiragimdwa
Target Correct Word(s): ekiragiddwa
Predicted Word(s): e k i r a g i m d w a

Example 4:
Incorrect Word(s): ekyeentebe
Target Correct Word(s): ekyAentebe
Predicted Word(s): e k y e r i y e b e

Example 5:
Incorrect Word(s): ebyazimbibwakox
Target Correct Word(s): ebyazimbibwako
Predicted Word(s): e b y a z i m b i b w a k o a

Example 6:
Incorrect Word(s): elitala
Target Correct Word(s): ekitala
Predicted Word(s): e l i t a l a

Example 7:
Incorrect Word(s): nAokusaaeira
Target Correct Word(s): nAokusaasira
Predicted Word(s): n A o k u s a a s i r a

Example 8:
Incorrect Word(s): ebiwknvu
Target Correct Word(s): ebiwonvu
Predicted Word(s): e b i w u 