In [36]:
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from string import punctuation

In [37]:
# Task 1: Get a list of valid words in English
nltk.download('words')
valid_words = set(words.words()[:20000])  # Using the first 20,000 entries

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [38]:
# Task 2: Display the first 20 words
print("First 20 words in the list:", list(valid_words)[:20])

First 20 words in the list: ['avania', 'aly', 'autocratrix', 'acaudal', 'another', 'adolescently', 'Attic', 'Anglicization', 'airfoil', 'afterhope', 'altared', 'Anthophila', 'absconded', 'apocrenic', 'arcuale', 'agglutinogen', 'acephalia', 'aln', 'Ainu', 'Agdistis']


In [39]:
# Task 3: Normalize  casing for all terms
valid_words_normalized = set(word.lower() for word in valid_words)

In [40]:
# Task 4: Create a unique list after normalizing
print("Unique list of valid words (after normalization):", list(valid_words_normalized)[:20])


Unique list of valid words (after normalization): ['arabophil', 'avania', 'aly', 'autocratrix', 'antiaris', 'acaudal', 'another', 'alejandro', 'adolescently', 'airfoil', 'afterhope', 'altared', 'absconded', 'apocrenic', 'arcuale', 'agglutinogen', 'acephalia', 'aln', 'anthracosis', 'aethusa']


In [41]:
# Task 5: Create a list of stop words
stop_words = set(nltk.corpus.stopwords.words('english') + list(punctuation))


In [42]:
# Task 6: Define a function to get the correct term
def get_correct_term(term):
    # Calculate edit distance with each term in the first 20,000 entries
    edit_distances = {valid_word: nltk.edit_distance(term, valid_word) for valid_word in list(valid_words_normalized)[:20000]}

    # Sort the dictionary by edit distance in ascending order
    sorted_distances = sorted(edit_distances.items(), key=lambda x: x[1])

    # Return the first entry (term with minimum edit distance)
    return sorted_distances[0][0]

In [43]:
# Task 7: Make a set from the list of valid words for faster lookup
valid_words_set = set(valid_words_normalized)

In [44]:
# Task 8: Define a function for spelling correction
def correct_spelling(sentence):
    # Tokenize the sentence after making all terms lowercase
    tokenized_sentence = word_tokenize(sentence.lower())

    # Check and correct each term
    corrected_sentence = [word if word in valid_words_set else get_correct_term(word) for word in tokenized_sentence]

    # Return the joined string as output
    return ' '.join(corrected_sentence)

In [45]:
# Task 9: Test the function
input_sentence = "The new abacos is great"
output_sentence = correct_spelling(input_sentence)
print("Corrected Sentence:", output_sentence)

Corrected Sentence: ahem anew abacus as area
