# Links to go through
- https://www.youtube.com/results?search_query=spell+check+using+trie
- https://medium.com/@yashj302/spell-check-and-correction-nlp-python-f6a000e3709d
- https://www.scaler.com/topics/nlp/create-a-spell-check-with-nlp/
- https://www.geeksforgeeks.org/correcting-words-using-nltk-in-python/
- https://www.youtube.com/watch?v=OIJBKqzrlX0
- https://medium.com/@surmenok/deep-learning-for-spell-checking-2ffdbad65554#:~:text=Just%20create%20artificial%20dataset%20by,errors%20to%20a%20correct%20text.

# First Attempt

In [1]:
import nltk
from nltk.util import ngrams
from nltk.metrics.distance import edit_distance

In [2]:
# Fill-in-the-blank test set: each key is a sentence containing the placeholder
# token "xyz", and each value is the pair of candidate words for that blank.
sentences = {
    "Mr Patrick is our new xyz.": ["principal", "principle"],
    "The company xyz all the terms.": ["accepted", "excepted"],
    "Please don’t keep your dog on the xyz.": ["loose", "lose"],
    "The xyz is my best friend.": ["latter", "later"],
    "I need some xyz products for my craftwork.": ["stationery", "stationary"],
    "The actor xyz the Oscar.": ["accepted", "excepted"],
    "I will call you xyz in the evening.": ["later", "latter"],
    "Covid xyz the lungs.": ["affects", "effects"],
    "The xyz of the ministers were sworn in yesterday.": ["council", "counsel"],
    "Robert xyz wants to accompany us to the park.": ["too", "to"],
    "Mia will xyz me about choosing fashion as my career.": ["counsel", "council"],
    "The xyz at the zoo was very playful.": ["bear", "bare"],
    "The sheep have a lot of xyz that keeps them warm.": ["fur", "far"],
    "The hot spring is at the xyz corner of the street.": ["farthest", "furthest"],
    "Can you xyz me on how to study for exams?": ["advise", "advice"],
    "The team will xyz the match if they don’t play well.": ["lose", "loose"],
    "Can you go xyz the market for me?": ["to", "too"],
    "The teachers asked the students to keep xyz.": ["quiet", "quite"],
    "The xyz of garbage should be cleaned immediately.": ["heap", "hip"],
    "This is xyz house.": ["their", "there"],
}

In [3]:
def find_best_fit_word(sentence, choices):
    n = len(sentence.split())
    sentence_ngrams = list(ngrams(sentence.split(), n))
    best_fit_word = None
    min_distance = float('inf') 
    for choice in choices:
        choice_ngrams = list(ngrams(choice.split(), n))
        distance = edit_distance(sentence_ngrams, choice_ngrams) 
        if distance < min_distance:
            min_distance = distance
            best_fit_word = choice 
    return best_fit_word

In [4]:
# Map every test sentence to the word chosen for its blank.
results = {
    sentence: find_best_fit_word(sentence, choices)
    for sentence, choices in sentences.items()
}

In [5]:
# Show each sentence with the placeholder replaced by the selected word.
for template, chosen_word in results.items():
    completed = template.replace('xyz', chosen_word)
    print(completed)

Mr Patrick is our new principal.
The company accepted all the terms.
Please don’t keep your dog on the loose.
The latter is my best friend.
I need some stationery products for my craftwork.
The actor accepted the Oscar.
I will call you later in the evening.
Covid affects the lungs.
The council of the ministers were sworn in yesterday.
Robert too wants to accompany us to the park.
Mia will counsel me about choosing fashion as my career.
The bear at the zoo was very playful.
The sheep have a lot of fur that keeps them warm.
The hot spring is at the farthest corner of the street.
Can you advise me on how to study for exams?
The team will lose the match if they don’t play well.
Can you go to the market for me?
The teachers asked the students to keep quiet.
The heap of garbage should be cleaned immediately.
This is their house.


# Using Tries for Context Words (Given a sentence, find the correct spelling)
Reference implementation: [LaVivien/SpellingCorrector](https://github.com/LaVivien/SpellingCorrector)

# Using Distance Metrics for Individual Words

In [7]:
import pandas as pd
import numpy as np
%pip install textdistance
import textdistance
import re
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


In [18]:
# Read the corpus line by line and collect lowercase word tokens
# (runs of \w characters).
words = []
with open('./resources/spelling_correction.txt', 'r') as f:
    for line in f:
        words.extend(re.findall(r'\w+', line.lower()))


# This is our vocabulary: the distinct tokens seen in the corpus.
V = set(words)
# Format the count with an f-string. Writing {len(V)} outside an f-string
# built a one-element set, so the cell displayed "{17647}" instead of 17647.
print(f"Total Unique words are : {len(V)}")

Total Unique words are :  {17647}


In [19]:
# Word frequency counter: maps each token to its number of occurrences.
# (The earlier `word_freq = {}` was overwritten immediately and is dropped.)
word_freq = Counter(words)
# most_common(10) returns the same ten pairs as most_common()[0:10]
# without sorting the whole vocabulary first.
print(word_freq.most_common(10))

[('the', 14703), ('of', 6742), ('and', 6517), ('a', 4799), ('to', 4707), ('in', 4238), ('that', 3081), ('it', 2534), ('his', 2530), ('i', 2120)]


In [20]:
# Relative frequency of each word: its count divided by the total token count.
Total = sum(word_freq.values())
probs = {word: count / Total for word, count in word_freq.items()}

In [21]:
# Rank vocabulary words by Jaccard similarity to the input word
def my_autocorrect(input_word, top_n=5):
    """Suggest corrections for ``input_word`` from the corpus vocabulary.

    Args:
        input_word: The word to check; it is lowercased before lookup.
        top_n: Number of suggestions to return (default 5, matching the
            previous ``DataFrame.head()`` behaviour).

    Returns:
        The string ``'Your word seems to be correct'`` when the word is in the
        vocabulary ``V``. Otherwise a DataFrame with columns ``Word``,
        ``Prob`` and ``Similarity`` holding the ``top_n`` candidates, sorted
        by similarity and then by relative frequency, both descending.
    """
    input_word = input_word.lower()
    if input_word in V:
        return 'Your word seems to be correct'

    # Build the metric once instead of once per vocabulary word.
    # qval=2 makes it compare the words as sets of 2-character q-grams.
    jaccard = textdistance.Jaccard(qval=2)

    # Take words, probabilities and similarities from the same iteration so
    # the three columns are aligned by construction, rather than relying on
    # word_freq and probs happening to share the same key order.
    candidates = list(probs)
    df = pd.DataFrame({
        'Word': candidates,
        'Prob': [probs[word] for word in candidates],
        'Similarity': [1 - jaccard.distance(word, input_word) for word in candidates],
    })
    return df.sort_values(['Similarity', 'Prob'], ascending=False).head(top_n)

In [22]:
my_autocorrect('neverteless')

Unnamed: 0,Word,Prob,Similarity
2571,nevertheless,0.000225,0.75
13657,boneless,1.3e-05,0.416667
12684,elevates,4e-06,0.416667
1105,never,0.000925,0.4
7136,level,0.000108,0.4
