<a href="https://colab.research.google.com/github/sketcher03/learning_app_v1/blob/test/Spelling_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [165]:
import pandas as pd

# Data Pre-processing

In [166]:
def convert_dat_to_csv(dat_file, output_csv):
    """
    Converts a .dat file (Birkbeck, Aspell, or similar) into a structured CSV format.
    Each correct word (marked with `$`) is associated with its misspellings.
    """
    correct_word = None
    data = []

    # Read the .dat file line by line
    with open(dat_file, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()
            if not word:
                continue  # Skip empty lines

            # If the word starts with "$", it's a correct word
            if word.startswith("$"):
                correct_word = word[1:]  # Remove "$" prefix
            elif correct_word:
                # Add misspelled word along with its correct counterpart
                data.append((correct_word, word))

    # Convert to DataFrame
    df = pd.DataFrame(data, columns=["Correct_Word", "Misspelled_Word"])

    # Save as CSV
    df.to_csv(output_csv, index=False)
    print(f"✅ Converted '{dat_file}' to '{output_csv}'")

In [167]:
convert_dat_to_csv("missp.dat", "birkbeck_errors.csv")
convert_dat_to_csv("aspell.dat", "aspell_errors.csv")
convert_dat_to_csv("wikipedia.dat", "wikipedia_errors.csv")

✅ Converted 'missp.dat' to 'birkbeck_errors.csv'
✅ Converted 'aspell.dat' to 'aspell_errors.csv'
✅ Converted 'wikipedia.dat' to 'wikipedia_errors.csv'


In [168]:
# Load the three datasets
df_birkbeck = pd.read_csv("birkbeck_errors.csv")
df_aspell = pd.read_csv("aspell_errors.csv")
df_wikipedia = pd.read_csv("wikipedia_errors.csv")

In [169]:
# Combine all three datasets into one
df_combined = pd.concat([df_birkbeck, df_aspell, df_wikipedia], ignore_index=True)

In [170]:
df_combined

Unnamed: 0,Correct_Word,Misspelled_Word
0,Albert,Ab
1,America,Ameraca
2,America,Amercia
3,American,Ameracan
4,April,Apirl
...,...,...
39114,years,eyars
39115,years,eyasr
39116,years,yeasr
39117,years,yeras


In [171]:
# Convert to lowercase and trim whitespace
df_combined["Correct_Word"] = df_combined["Correct_Word"].str.lower().str.strip()

df_combined

Unnamed: 0,Correct_Word,Misspelled_Word
0,albert,Ab
1,america,Ameraca
2,america,Amercia
3,american,Ameracan
4,april,Apirl
...,...,...
39114,years,eyars
39115,years,eyasr
39116,years,yeasr
39117,years,yeras


In [172]:
# Find duplicate entries (same Correct_Word and Misspelled_Word appearing more than once)
df_duplicates = df_combined[df_combined.duplicated(subset=["Correct_Word", "Misspelled_Word"], keep="first")]

df_duplicates

Unnamed: 0,Correct_Word,Misspelled_Word
21107,miss,mis
36140,absorption,absorbtion
36141,accidentally,accidently
36142,accommodate,accomodate
36145,acquaintance,aquantance
...,...,...
39088,witch,wich
39090,with,wiht
39095,within,withing
39110,written,writen


In [173]:
# Remove duplicates by keeping unique (Correct_Word, Misspelled_Word) pairs
df_combined.drop_duplicates(subset=["Correct_Word", "Misspelled_Word"], keep="last", inplace=True)

df_combined

Unnamed: 0,Correct_Word,Misspelled_Word
0,albert,Ab
1,america,Ameraca
2,america,Amercia
3,american,Ameracan
4,april,Apirl
...,...,...
39114,years,eyars
39115,years,eyasr
39116,years,yeasr
39117,years,yeras


In [174]:
# Remove words containing apostrophes, hyphens, or underscores
df_cleaned = df_combined[~df_combined["Correct_Word"].str.contains(r"[-_'’]", regex=True)]

df_cleaned

Unnamed: 0,Correct_Word,Misspelled_Word
0,albert,Ab
1,america,Ameraca
2,america,Amercia
3,american,Ameracan
4,april,Apirl
...,...,...
39114,years,eyars
39115,years,eyasr
39116,years,yeasr
39117,years,yeras


In [175]:
# Remove words with 1 or 2 letters
df_cleaned = df_cleaned[df_cleaned["Correct_Word"].str.len() > 2]

df_cleaned

Unnamed: 0,Correct_Word,Misspelled_Word
0,albert,Ab
1,america,Ameraca
2,america,Amercia
3,american,Ameracan
4,april,Apirl
...,...,...
39114,years,eyars
39115,years,eyasr
39116,years,yeasr
39117,years,yeras


In [176]:
# Sort the dataset alphabetically by Correct_Word
df_sorted = df_cleaned.sort_values(by=["Correct_Word"], ascending=True)

df_sorted

Unnamed: 0,Correct_Word,Misspelled_Word
36751,abandon,abondon
36752,abandoned,abondoned
36753,abandoning,abondoning
36754,abandons,abondons
212,abattoir,abbatoir
...,...,...
36130,youth,yuth
36131,zealous,zelous
36132,zenith,zeenith
36748,zionist,Sionist


In [177]:
# Save the cleaned dataset
df_sorted.to_csv("spelling_errors_cleaned.csv", index=False)

print("✅ Merged dataset saved as 'spelling_errors.csv'")

✅ Merged dataset saved as 'spelling_errors.csv'


# Feature Engineering

In [178]:
!pip install metaphone



In [179]:
import nltk
from nltk.metrics import edit_distance
from nltk.corpus import cmudict
from metaphone import doublemetaphone
import jellyfish  # For Jaccard Similarity
import string

In [180]:
def classify_cook_error(correct, misspelled):
    """
    Classifies spelling errors based on Cook’s (1999) classification.
    """

    if correct == misspelled:
        return "No Error"

    # Make sure both words are of the same length for comparison
    min_len = min(len(correct), len(misspelled))

    # If multiple changes occur
    if edit_distance(correct, misspelled) > 1:
        return "Multiple Errors"

    if len(correct) == len(misspelled):
        # Substitution Error: Same length but different letters
        differences = sum(1 for c, m in zip(correct, misspelled) if c != m)
        if differences == 1:
            return "Substitution Error"

        # Check for transpositions by comparing adjacent characters
        for i in range(min_len - 1):  # -1 to prevent index out of range
            # Check if current and next characters in misspelled word are swapped in correct word
            if (misspelled[i] == correct[i + 1] and
                misspelled[i + 1] == correct[i]):
                return "Transposition Error"

    # Omission Error: A letter is missing
    if len(correct) > len(misspelled):
        return "Omission Error"

    # Insertion Error: An extra letter is added
    if len(correct) < len(misspelled):
        return "Insertion Error"

    return "Unknown"

In [181]:
# Check for double letter mistakes (addition or omission)
def detect_double_letter_mistake(correct, misspelled):

    count1=0
    count2=0
    reset=0

    if correct == misspelled:
        return 0

    # Double Letter Omission
    for i in range(len(correct) - 1):
        if correct[i] == correct[i + 1]:  # If the correct word has a double letter
            count1+=1
            modified_misspelled = misspelled[:i] + correct[i] + misspelled[i:]  # Simulate missing double letter
            print(modified_misspelled)
            if modified_misspelled == correct:
                reset+=1
                return 1  # Double letter omitted


    for i in range(len(misspelled) - 1):
        if misspelled[i] == misspelled[i + 1]:  # If the misspelled word has a double letter
            count2+=1
            modified_misspelled = misspelled[:i] + misspelled[i+1:]  # Simulate extra double letter
            print(modified_misspelled)
            if modified_misspelled == correct:
                reset+=1
                return 2  # Double letter added

    if count1<=1 and count2<=1:
        return 3 # Double letter substitution
    elif count1>1 or count2>1:
        return 4 # Multiple double letter error
    else:
        return 0  # No double letter mistake detected

In [182]:
def common_prefix_length(correct_word, misspelled_word):
    # Initialize a variable to keep track of the length of the common prefix
    prefix_length = 0

    # Find the length of the shortest word to avoid index errors
    min_len = min(len(correct_word), len(misspelled_word))

    # Compare characters from the start (prefix)
    for i in range(min_len):
        if correct_word[i] == misspelled_word[i]:
            prefix_length += 1
        else:
            break  # Stop as soon as characters do not match

    return prefix_length


def common_suffix_length(correct_word, misspelled_word):
    # Initialize a variable to keep track of the length of the common suffix
    suffix_length = 0

    # Find the length of the shortest word to avoid index errors
    min_len = min(len(correct_word), len(misspelled_word))

    # Compare characters from the end (suffix)
    for i in range(1, min_len + 1):
        if correct_word[-i] == misspelled_word[-i]:
            suffix_length += 1
        else:
            break  # Stop as soon as characters do not match

    return suffix_length

In [183]:
def calculate_vowel_difference(correct_word, user_answer):
    vowels = 'aeiou'
    vowel_difference_count = 0

    # Ensure both words are of the same length (pad with spaces if necessary)
    length = max(len(correct_word), len(user_answer))

    if len(correct_word) != len(user_answer):
      return 100

    # Compare each character by its position
    for i in range(length):
        # If the index is out of bounds in one of the words, treat it as a mismatch
        correct_char = correct_word[i]
        user_char = user_answer[i]

        # Check if they are vowels
        correct_is_vowel = correct_char in vowels
        user_is_vowel = user_char in vowels

        if correct_is_vowel == user_is_vowel:
          if correct_char != user_char:
            vowel_difference_count += 1

    return vowel_difference_count


def calculate_consonant_difference(correct_word, user_answer):
    vowels = 'aeiou'
    consonant_difference_count = 0

    # Ensure both words are of the same length (pad with spaces if necessary)
    length = max(len(correct_word), len(user_answer))

    if len(correct_word) != len(user_answer):
      return 100

    # Compare each character by its position
    for i in range(length):
        # If the index is out of bounds in one of the words, treat it as a mismatch
        correct_char = correct_word[i]
        user_char = user_answer[i]

        # Check if they are consonants
        if correct_char.isalpha() and user_char.isalpha():
            correct_is_consonant = correct_char not in vowels
            user_is_consonant = user_char not in vowels

            if correct_is_consonant == user_is_consonant:
              if correct_char != user_char:
                consonant_difference_count += 1

    return consonant_difference_count

In [184]:
def extract_features(correct, misspelled, misspelled_cleaned):
    features = {}

    # 1️⃣ Edit Distance (Levenshtein)
    features["edit_distance"] = edit_distance(correct, misspelled_cleaned)

    # 2️⃣ Word Length Difference
    features["length_diff"] = abs(len(correct) - len(misspelled_cleaned))

    # 3️⃣ Phonetic Similarity (Metaphone)
    correct_phonetic = doublemetaphone(correct)[0]
    misspelled_phonetic = doublemetaphone(misspelled_cleaned)[0]
    features["phonetic_match"] = int(correct_phonetic == misspelled_phonetic)

    # Phonetic Edit Distance (Levenshtein)
    features["phonetic_edit_distance"] = edit_distance(correct_phonetic, misspelled_phonetic)

    # 4️⃣ Jaccard Similarity (Character Overlap)
    correct_set, misspelled_set = set(correct), set(misspelled_cleaned)
    features["jaccard_similarity"] = len(correct_set & misspelled_set) / len(correct_set | misspelled_set)

    # Common Prefix/Suffix Length
    features["common_prefix_len"] = common_prefix_length(correct, misspelled_cleaned)
    features["common_suffix_len"] = common_suffix_length(correct, misspelled_cleaned)

    # 7️⃣ Vowel Difference Count
    vowels = "aeiou"
    features["vowel_diff_count"] = calculate_vowel_difference(correct, misspelled_cleaned)

    # 8️⃣ Consonant Difference Count
    consonants = set(string.ascii_lowercase) - set(vowels)
    features["consonant_diff_count"] = calculate_consonant_difference(correct, misspelled_cleaned)

    # 9️⃣ Number of Insertions
    features["num_insertions"] = max(0, len(misspelled_cleaned) - len(correct))

    # 🔟 Number of Deletions
    features["num_deletions"] = max(0, len(correct) - len(misspelled_cleaned))

    # 🔟 Updated Double Letter Mistake
    features["double_letter_error"] = detect_double_letter_mistake(correct, misspelled_cleaned)

    # 🔟 Cook’s Classification Label
    features["cook_error_type"] = classify_cook_error(correct, misspelled_cleaned)

    return features

In [185]:
# Convert Correct_Word and Misspelled_Word to strings and fill missing values
df_sorted["Correct_Word"] = df_sorted["Correct_Word"].astype(str).fillna("")
df_sorted["Misspelled_Word"] = df_sorted["Misspelled_Word"].astype(str).fillna("")

# Create a cleaned version of words without special characters
df_sorted["Misspelled_Cleaned"] = df_sorted["Misspelled_Word"].str.replace(r"[-_'’]", "", regex=True)

df_sorted


Unnamed: 0,Correct_Word,Misspelled_Word,Misspelled_Cleaned
36751,abandon,abondon,abondon
36752,abandoned,abondoned,abondoned
36753,abandoning,abondoning,abondoning
36754,abandons,abondons,abondons
212,abattoir,abbatoir,abbatoir
...,...,...,...
36130,youth,yuth,yuth
36131,zealous,zelous,zelous
36132,zenith,zeenith,zeenith
36748,zionist,Sionist,Sionist


In [186]:
# Save the cleaned dataset
df_sorted.to_csv("spelling_errors_sorted.csv", index=False)

print("✅ Merged dataset saved as 'spelling_errors_sorted.csv'")

✅ Merged dataset saved as 'spelling_errors_sorted.csv'


In [187]:
# Apply feature extraction using cleaned words
df_features = df_sorted.apply(
    lambda row: extract_features(row["Correct_Word"], row["Misspelled_Word"], row["Misspelled_Cleaned"]),
    axis=1
)

# Convert extracted features into DataFrame
df_features = pd.DataFrame(df_features.tolist())

# df_features

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
occoasionaly
occcosonoly
occosonolly
ocosonoly
occconsially
occonsiallly
oconsially
occonsialy
occconsionally
occonsionlally
oconsionally
occonsionaly
occcosionally
occosionallly
ocosionally
occosionaly
occcoumely
occoumelyl
ocoumely
occctionally
occtionallly
octionally
occtionaly
occkationaly
ockationally
ockaysionaly
okaysionally
ockenously
okenouslyl
acccosinly
accosinlyl
acosinly
occcusionaly
occusionally
ocusionaly
occconaly
occonalyl
oconaly
occcasionaly
occasionally
occcasiole
occasiolel
ocasiole
occcasely
occaselyl
ocasely
Occcasionlly
Occasionllly
Ocasionlly
Occasionly
accanelly
acanellyl
acanely
accationaly
acationally
accationly
acationlyl
acccagionaly
accagionally
acagionaly
acccasionally
accasionallly
acasionally
accasionaly
acccassionally
accassionlally
acassionally
accasionally
accassionaly
accconazilyg
acconazillyg
aconazilyg
accconlly
acconllyl
aconlly
acconly
accconnly
acconnlyl
aconnly
acconly
acccossin

In [188]:
df_sorted = pd.read_csv("spelling_errors_sorted.csv")

# Merge extracted features with original dataset
df = pd.concat([df_sorted, df_features], axis=1)

df

Unnamed: 0,Correct_Word,Misspelled_Word,Misspelled_Cleaned,edit_distance,length_diff,phonetic_match,phonetic_edit_distance,jaccard_similarity,common_prefix_len,common_suffix_len,vowel_diff_count,consonant_diff_count,num_insertions,num_deletions,double_letter_error,cook_error_type
0,abandon,abondon,abondon,1,0,1,0,1.000000,2,4,1,1,0,0,3,Substitution Error
1,abandoned,abondoned,abondoned,1,0,1,0,1.000000,2,6,1,1,0,0,3,Substitution Error
2,abandoning,abondoning,abondoning,1,0,1,0,1.000000,2,7,1,1,0,0,3,Substitution Error
3,abandons,abondons,abondons,1,0,1,0,1.000000,2,5,1,1,0,0,3,Substitution Error
4,abattoir,abbatoir,abbatoir,2,0,1,0,1.000000,2,4,0,0,0,0,3,Multiple Errors
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37641,youth,yuth,yuth,1,1,1,0,0.800000,1,3,100,100,0,1,3,Omission Error
37642,zealous,zelous,zelous,1,1,1,0,0.857143,2,4,100,100,0,1,3,Omission Error
37643,zenith,zeenith,zeenith,1,1,1,0,1.000000,2,5,100,100,1,0,2,Insertion Error
37644,zionist,Sionist,Sionist,1,0,1,0,0.714286,0,6,1,1,0,0,3,Substitution Error




In [189]:
df["cook_error_type"].value_counts()

Unnamed: 0_level_0,count
cook_error_type,Unnamed: 1_level_1
Multiple Errors,25301
Substitution Error,4854
Omission Error,4402
Insertion Error,2861
No Error,228


In [190]:
# Save the cleaned dataset
df.to_csv("spelling_errors_features.csv", index=False)

print("✅ Merged dataset saved as 'spelling_errors_features.csv'")

✅ Merged dataset saved as 'spelling_errors_features.csv'


In [191]:
df.columns

Index(['Correct_Word', 'Misspelled_Word', 'Misspelled_Cleaned',
       'edit_distance', 'length_diff', 'phonetic_match',
       'phonetic_edit_distance', 'jaccard_similarity', 'common_prefix_len',
       'common_suffix_len', 'vowel_diff_count', 'consonant_diff_count',
       'num_insertions', 'num_deletions', 'double_letter_error',
       'cook_error_type'],
      dtype='object')

# Rule-Based System

In [22]:
# Load the suffixes from the CSV file
suffixes_df = pd.read_csv('/content/suffixes.csv')
suffixes = suffixes_df['suffixes'].tolist()  # Convert the suffix column to a list

In [23]:
# Load the prefixes from the CSV file
prefixes_df = pd.read_csv('/content/prefixes.csv')
prefixes = prefixes_df['prefixes'].tolist()  # Assuming the column name is 'prefix'

In [26]:
# Function to check if a word has any suffix from the list
def detect_all_suffixes(word, suffixes):
    matched_suffixes = []  # List to store all matched suffixes
    for suffix in suffixes:
        if word.endswith(suffix):
            matched_suffixes.append(suffix)

    if matched_suffixes:
        return f"Detected suffixes: {', '.join(matched_suffixes)}"
    else:
        return "No common suffix detected."

In [27]:
# Function to check if a word has any prefix from the list
def detect_all_prefixes(word, prefixes):
    matched_prefixes = []  # List to store all matched prefixes
    for prefix in prefixes:
        if word.startswith(prefix):
            matched_prefixes.append(prefix)

    if matched_prefixes:
        return f"Detected prefixes: {', '.join(matched_prefixes)}"
    else:
        return "No common prefix detected."

In [28]:
# Test the function with a sample word
word = "unhappiness"
print(detect_all_suffixes(word, suffixes))
print(detect_all_prefixes(word, prefixes))

Detected suffixes: ess, ness
Detected prefixes: un


In [136]:
def check_transposition_error(correct_word, misspelled_word):
    # Initialize the list of transposition errors
    transposition_errors = []

    # Make sure both words are of the same length for comparison
    min_len = min(len(correct_word), len(misspelled_word))

    # Check for transpositions by comparing adjacent characters
    for i in range(min_len - 1):  # -1 to prevent index out of range
        # Check if current and next characters in misspelled word are swapped in correct word
        if (misspelled_word[i] == correct_word[i + 1] and
            misspelled_word[i + 1] == correct_word[i]):
            transposition_errors.append((i, i + 1))  # Store indices of swapped characters

    return transposition_errors

In [135]:
def check_substitution_error(correct_word, misspelled_word):
    substitution_errors = 0

    # Ensure both words are of the same length (pad with spaces if necessary)
    min_len = min(len(correct_word), len(misspelled_word))

    # Compare each character by its position
    for i in range(min_len):
        if correct_word[i] != misspelled_word[i]:
            substitution_errors += 1  # A substitution error occurs when characters are different at the same position

    return substitution_errors