<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/spellingCorrection_feedforward_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the data
data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/6000random-3error-types-luganda-tabbed.txt', sep='\t', header=None, names=['incorrect', 'correct'])
data = data.sample(frac=1)  # shuffle the data
data['incorrect'] = data['incorrect'].astype(str)  # convert float values to strings
data['correct'] = data['correct'].astype(str)  # convert float values to strings
vocab = set(''.join(data['incorrect'].tolist() + data['correct'].tolist()))  # get the vocabulary

In [7]:
print(len(data))

6001


In [8]:
# Prepare the data
char_to_idx = {char: idx for idx, char in enumerate(sorted(vocab))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
max_len = max([len(word) for word in data['incorrect']] + [len(word) for word in data['correct']])
num_samples = len(data)

In [9]:
X = np.zeros((num_samples, max_len, len(vocab)), dtype=np.float32)
y = np.zeros((num_samples, max_len, len(vocab)), dtype=np.float32)

In [10]:
for i, (incorrect, correct) in enumerate(zip(data['incorrect'], data['correct'])):
    for j, char in enumerate(incorrect):
        X[i, j, char_to_idx[char]] = 1
    for j, char in enumerate(correct):
        y[i, j, char_to_idx[char]] = 1

In [11]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)


In [12]:
# Build and train the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_len, len(vocab))),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(max_len * len(vocab), activation='softmax'),
    tf.keras.layers.Reshape((max_len, len(vocab)))
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f359c833340>

In [17]:
# Evaluate the model on the test dataset
test_data = pd.read_csv('/content/drive/MyDrive/research/spelling-correction/data/600-3error-test-set-luganda-tabbed.txt', sep='\t', header=None, names=['incorrect', 'correct'])
test_data = test_data.sample(frac=1)  # shuffle the data
test_data['incorrect'] = data['incorrect'].astype(str)  # convert float values to strings
test_data['correct'] = data['correct'].astype(str)  # convert float values to strings
vocab = set(''.join(data['incorrect'].tolist() + data['correct'].tolist()))  # get the vocabulary

X_test = np.zeros((len(test_data), max_len, len(vocab)), dtype=np.float32)
y_test = np.zeros((len(test_data), max_len, len(vocab)), dtype=np.float32)

for i, (incorrect, correct) in enumerate(zip(test_data['incorrect'], test_data['correct'])):
    for j, char in enumerate(incorrect):
        X_test[i, j, char_to_idx[char]] = 1
    for j, char in enumerate(correct):
        y_test[i, j, char_to_idx[char]] = 1

test_loss = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss}')

Test loss: 0.13181531429290771


In [18]:
# Evaluate the model on the test data and compute accuracy
total = len(X_test)
correct = 0
for i in range(total):
    x = X_test[i:i+1]
    y_true = y_test[i:i+1]
    y_pred = model.predict(x)
    y_pred = np.argmax(y_pred, axis=-1)
    y_true = np.argmax(y_true, axis=-1)
    if np.array_equal(y_pred, y_true):
        correct += 1

accuracy = correct / total
print(f'Test accuracy: {accuracy:.4f}')

Test accuracy: 0.0017
