<a href="https://colab.research.google.com/github/pnabende/spelling-correction-for-East-African-languages/blob/master/edit_distance_and_ngram_spell_correction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the necessary packages

In [1]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.metrics import edit_distance

Mount the Google Drive folder that contains the datasets

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# paths used when loading the CSV files resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the datasets

In [3]:
# Folder on the mounted Google Drive that holds the sampled train/test CSVs.
# Kept in one place so pointing the notebook at another copy is a one-line change.
DATA_DIR = '/content/drive/MyDrive/research/spelling-correction/data/sampled-10000train-1000test'

train_df = pd.read_csv(f'{DATA_DIR}/train-10000-luganda-double-tripple-errors.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test-1000-single-A-error.csv')

# Fail fast if either file lacks the two columns that the training and
# evaluation code reads from every row.
REQUIRED_COLUMNS = {'incorrect_word', 'correct_word'}
assert REQUIRED_COLUMNS <= set(train_df.columns), 'train CSV is missing required columns'
assert REQUIRED_COLUMNS <= set(test_df.columns), 'test CSV is missing required columns'

Define edit distance model

In [4]:
# Step 2: Edit Distance Model
def edit_distance_correction(word, candidates):
    """Pick the candidate that is closest to ``word`` by edit distance.

    Args:
        word: The (possibly misspelled) input string.
        candidates: Iterable of candidate corrections.

    Returns:
        The candidate with the smallest ``edit_distance`` to ``word``; when
        several candidates tie, the one that appears first is returned.

    Raises:
        ValueError: If ``candidates`` is empty (raised by ``min``).
    """
    def distance_to_word(candidate):
        return edit_distance(word, candidate)

    return min(candidates, key=distance_to_word)


Define ngram model

In [5]:
# Step 3: N-gram Model
def generate_ngrams(word, n):
    """Materialise the n-grams of ``word`` as a list.

    Args:
        word: Sequence to split; the rest of this notebook passes strings, so
            the n-grams are character-level.
        n: Size of each n-gram.

    Returns:
        A list holding everything ``nltk.util.ngrams(word, n)`` yields, in order.
    """
    grams = ngrams(word, n)
    return list(grams)

def ngram_correction(word, candidates, n):
    """Pick the candidate sharing the most distinct n-grams with ``word``.

    Args:
        word: The (possibly misspelled) input string.
        candidates: Iterable of candidate corrections.
        n: Size of the n-grams to compare.

    Returns:
        The candidate whose set of n-grams has the largest intersection with
        the n-gram set of ``word``. Ties go to the candidate seen first.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    # The input word's n-gram set is identical for every candidate, so build it once
    # instead of on each loop iteration.
    word_ngrams = set(generate_ngrams(word, n))

    # A dict keeps first-insertion order, so max() below resolves ties in favour
    # of the earliest candidate. Scoring in a single pass also means a one-shot
    # iterator of candidates is scored correctly.
    candidate_scores = {
        candidate: len(word_ngrams & set(generate_ngrams(candidate, n)))
        for candidate in candidates
    }

    if not candidate_scores:
        raise ValueError("ngram_correction() requires at least one candidate")

    return max(candidate_scores, key=candidate_scores.get)

Define training function:

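- Note that there is nothing to train for the edit-distance model
- The n-gram training step counts character n-gram frequencies (n = 1 to 4) over the training pairs; note that `ngram_correction` itself scores candidates by n-gram overlap and does not read these counts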

In [6]:
def train_spell_correction_models(train_df, n_max=4):
    """Count character n-gram frequencies over the training pairs.

    The edit-distance model is rule-based and needs no training, so only the
    n-gram frequency table is built here.

    For every row and every n in ``1..n_max``, each distinct n-gram of the pair
    is counted once:

    * under ``'correct_word'`` if it occurs in the correct word;
    * otherwise under ``'incorrect_word'`` (it occurs only in the misspelling).

    Args:
        train_df: DataFrame with ``'incorrect_word'`` and ``'correct_word'``
            columns. Values are converted with ``str()`` before tokenising.
        n_max: Largest n-gram size to count (default 4).

    Returns:
        dict mapping each n-gram (as returned by ``generate_ngrams``) to
        ``{'correct_word': int, 'incorrect_word': int}``.
    """
    n_gram_freq = {}

    # Read and stringify the two columns once instead of calling iterrows()
    # again for every n-gram size.
    word_pairs = [
        (str(incorrect), str(correct))
        for incorrect, correct in zip(train_df['incorrect_word'], train_df['correct_word'])
    ]

    for n in range(1, n_max + 1):
        for incorrect_word, correct_word in word_pairs:
            # Sets give per-row de-duplication and constant-time membership.
            correct_ngrams = set(generate_ngrams(correct_word, n))
            incorrect_ngrams = set(generate_ngrams(incorrect_word, n))

            for n_gram in correct_ngrams:
                counts = n_gram_freq.setdefault(
                    n_gram, {'correct_word': 0, 'incorrect_word': 0})
                counts['correct_word'] += 1

            # N-grams shared with the correct word were already counted above;
            # only those unique to the misspelling count as 'incorrect_word'.
            for n_gram in incorrect_ngrams - correct_ngrams:
                counts = n_gram_freq.setdefault(
                    n_gram, {'correct_word': 0, 'incorrect_word': 0})
                counts['incorrect_word'] += 1

    return n_gram_freq


Define the evaluation function

In [7]:
# Evaluate the models and show top 1 candidates
def _print_model_report(model_name, correct_count, total_count, incorrect_words, suggestions):
    """Print one model's accuracy followed by its numbered top-1 suggestions."""
    # An empty test set has no meaningful accuracy; report 0 rather than
    # dividing by zero.
    accuracy = correct_count / total_count if total_count else 0.0
    print(f"{model_name} Model Accuracy: {accuracy:.2%}")
    print(f"{model_name} Model Top 1 Candidates:")
    for position, (incorrect_word, suggestion) in enumerate(
            zip(incorrect_words, suggestions), start=1):
        print(f"   {position}. Incorrect: '{incorrect_word}', Suggested: '{suggestion}'")


def evaluate_models(test_df, edit_distance_model=False, n_gram_model=False,
                    candidates=None, max_n=4):
    """Evaluate the selected models on ``test_df`` and print their top-1 picks.

    Args:
        test_df: DataFrame with ``'incorrect_word'`` and ``'correct_word'``
            columns. Values are converted with ``str()``, as in training.
        edit_distance_model: If true, evaluate ``edit_distance_correction``.
        n_gram_model: If true, evaluate ``ngram_correction`` for
            ``n = 1..max_n``.
        candidates: Optional iterable of candidate words shared by all rows.
            When ``None`` (the default), the only candidate offered for a row
            is that row's own correct word. WARNING: with a single candidate
            both models can only return that word, so the reported accuracy is
            100% for any non-empty test set and says nothing about model
            quality. Pass a real vocabulary to get a meaningful score.
        max_n: Largest n-gram size tried by the n-gram model (default 4).

    Returns:
        None. Results are printed.

    Notes:
        * The n-gram model counts a row as correct when the correct word is
          the pick for *any* n in ``1..max_n``; the "top 1" shown is the pick
          for the smallest n.
        * No frequency table is trained here: ``ngram_correction`` scores by
          n-gram overlap only and never reads one.
    """
    candidate_pool = None if candidates is None else [str(c) for c in candidates]

    incorrect_words = []
    edit_distance_correct_count = 0
    edit_distance_top1 = []
    n_gram_correct_count = 0
    n_gram_top1 = []

    # Iterate over column values directly so the report below lines up with the
    # evaluated rows whatever index test_df carries.
    for raw_incorrect, raw_correct in zip(test_df['incorrect_word'], test_df['correct_word']):
        incorrect_word = str(raw_incorrect)
        correct_word = str(raw_correct)
        incorrect_words.append(incorrect_word)

        row_candidates = [correct_word] if candidate_pool is None else candidate_pool

        if edit_distance_model:
            suggestion = edit_distance_correction(incorrect_word, row_candidates)
            if suggestion == correct_word:
                edit_distance_correct_count += 1
            edit_distance_top1.append(suggestion)

        if n_gram_model:
            # Ordered, de-duplicated picks (one per n) so the displayed top-1
            # is deterministic instead of depending on set iteration order.
            n_gram_picks = []
            for n in range(1, max_n + 1):
                pick = ngram_correction(incorrect_word, row_candidates, n)
                if pick not in n_gram_picks:
                    n_gram_picks.append(pick)

            if correct_word in n_gram_picks:
                n_gram_correct_count += 1
            n_gram_top1.append(n_gram_picks[0] if n_gram_picks else "")

    total_count = len(incorrect_words)

    if edit_distance_model:
        _print_model_report("Edit Distance", edit_distance_correct_count,
                            total_count, incorrect_words, edit_distance_top1)

    if n_gram_model:
        _print_model_report("N-gram", n_gram_correct_count,
                            total_count, incorrect_words, n_gram_top1)


Train the models

In [8]:
# Build the n-gram frequency table from the training pairs.
# NOTE(review): none of the correction functions in the visible cells read
# `n_gram_freq`; confirm whether it is meant to feed the n-gram scorer.
n_gram_freq = train_spell_correction_models(train_df)

Evaluate the models

In [9]:
# Run both models on the test set; accuracy and per-word suggestions are printed.
evaluate_models(test_df, edit_distance_model=True, n_gram_model=True)


Edit Distance Model Accuracy: 100.00%
Edit Distance Model Top 1 Candidates:
   1. Incorrect: 'nAatugambay', Suggested: 'nAatugamba'
   2. Incorrect: 'nguennyana', Suggested: 'ngAennyana'
   3. Incorrect: 'ekiragimdwa', Suggested: 'ekiragiddwa'
   4. Incorrect: 'ekyeentebe', Suggested: 'ekyAentebe'
   5. Incorrect: 'ebyazimbibwakox', Suggested: 'ebyazimbibwako'
   6. Incorrect: 'elitala', Suggested: 'ekitala'
   7. Incorrect: 'nAokusaaeira', Suggested: 'nAokusaasira'
   8. Incorrect: 'ebiwknvu', Suggested: 'ebiwonvu'
   9. Incorrect: 'nditenderhza', Suggested: 'nditendereza'
   10. Incorrect: 'kugobokoka', Suggested: 'kugolokoka'
   11. Incorrect: 'gwAonoonyg', Suggested: 'gwAonoonya'
   12. Incorrect: 'wanunila', Suggested: 'wanunula'
   13. Incorrect: 'beagalwa', Suggested: 'baagalwa'
   14. Incorrect: 'luauli', Suggested: 'lumuli'
   15. Incorrect: 'kosuri', Suggested: 'kosiri'
   16. Incorrect: 'ngAoxufa', Suggested: 'ngAokufa'
   17. Incorrect: 'irkesi', Suggested: 'ikkesi'
   18. 