In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import words

# Sample corpus
corpus = [
    "This is a sample document.",
    "Another example document.",
    "Yet another document."
]

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Get the vocabulary and idf scores
vocabulary = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary mapping words to their idf scores
idf_scores = dict(zip(vocabulary, tfidf_vectorizer.idf_))

# Function to suggest words based on similarity to misspelled word
def suggest_words(misspelled_word, vocabulary, idf_scores, top_n=3):
    similar_words = []
    if misspelled_word in vocabulary:
        return [misspelled_word]

    # Calculate cosine similarity between the misspelled word and all words in the vocabulary
    for word in vocabulary:
        similarity = cosine_similarity(tfidf_vectorizer.transform([misspelled_word]), tfidf_vectorizer.transform([word]))[0][0]
        similar_words.append((word, similarity))

    # Sort similar words based on cosine similarity
    similar_words.sort(key=lambda x: x[1], reverse=True)

    # Return top n similar words
    return [word[0] for word in similar_words[:top_n]]

# Function to check if a word is spelled correctly
def spell_check(word, vocabulary):
    return word in vocabulary

# Example usage
misspelled_word = "documnet"
if spell_check(misspelled_word, vocabulary):
    print("The word is spelled correctly.")
else:
    suggestions = suggest_words(misspelled_word, vocabulary, idf_scores)
    print("The word is misspelled. Suggestions:", suggestions)


The word is misspelled. Suggestions: ['another', 'document', 'example']
