<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/spellingCorrectionLevenshtein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Levenshtein
  Downloading Levenshtein-0.20.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.15.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.20.9 rapidfuzz-2.15.0


In [4]:
import Levenshtein # library for calculating edit distance
import numpy as np

# Load training data from a tab-separated file where each line contains an incorrect and correct word pair.
def load_data(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        data = [line.strip().split('\t') for line in lines]
        return data

# Calculate the edit distance between two words.
def edit_distance(word1, word2):
    return Levenshtein.distance(word1, word2)

# Train the spelling correction model on the training data.
def train_model(data):
    model = {}
    for pair in data:
        incorrect, correct = pair
        if incorrect not in model:
            model[incorrect] = {}
        distance = edit_distance(incorrect, correct)
        if distance not in model[incorrect]:
            model[incorrect][distance] = {}
        if correct not in model[incorrect][distance]:
            model[incorrect][distance][correct] = 1
        else:
            model[incorrect][distance][correct] += 1
    return model

# Predict the correct spelling of an input word based on the spelling correction model.
def predict(model, word):
    if word in model:
        return word
    else:
        candidates = {}
        for incorrect in model:
            distance = edit_distance(incorrect, word)
            if distance in model[incorrect]:
                for correct in model[incorrect][distance]:
                    count = model[incorrect][distance][correct]
                    if correct not in candidates:
                        candidates[correct] = count
                    else:
                        candidates[correct] += count
        if len(candidates) > 0:
            return max(candidates, key=candidates.get)
        else:
            return word

# Evaluate the accuracy of the spelling correction model on a test dataset.
def evaluate(model, data):
    correct = 0
    total = len(data)
    for pair in data:
        incorrect, correct_word = pair
        predicted = predict(model, incorrect)
        if predicted == correct_word:
            correct += 1
    accuracy = correct / total
    return accuracy

# Load the training and test datasets.
train_data = load_data('/content/drive/MyDrive/research/spelling-correction/data/5400random-3error-train-set-luganda-tabbed.txt')
test_data = load_data('/content/drive/MyDrive/research/spelling-correction/data/600-3error-test-set-luganda-tabbed.txt')

# Train the spelling correction model on the training dataset.
model = train_model(train_data)

# Evaluate the accuracy of the spelling correction model on the test dataset.
accuracy = evaluate(model, test_data)

# Print the accuracy of the spelling correction model.
print('Accuracy: {:.2f}%'.format(accuracy * 100))


Accuracy: 18.83%
