In [13]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [24]:
pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [50]:
import re
from fuzzywuzzy import process

# Load Tamil words from your dataset
def load_tamil_words(dataset_path):
    with open(dataset_path, 'r', encoding='utf-8') as file:
        tamil_words = set(file.read().splitlines())  # Read words and store them in a set for quick lookup
    return tamil_words

# Function to correct spelling with an increased threshold for fuzzy matching
def correct_spelling(word, tamil_words, threshold=90):  # Increase threshold for stricter matching
    closest_word, score = process.extractOne(word, tamil_words)  # Fuzzy matching
    return closest_word if score >= threshold else word  # Only correct if score is high enough

# Function to correct a paragraph
def correct_paragraph(paragraph, tamil_words):
    words = re.findall(r'\w+', paragraph)  # Extract words from the paragraph
    corrected_words = [correct_spelling(word, tamil_words) for word in words]  # Correct each word
    corrected_paragraph = ' '.join(corrected_words)  # Join words back into a paragraph
    return corrected_paragraph

# Function to calculate accuracy based on corrected words
def calculate_accuracy(original, corrected):
    original_words = original.split()
    corrected_words = corrected.split()
    
    # Count how many words were actually corrected
    corrected_count = sum(1 for orig, corr in zip(original_words, corrected_words) if orig != corr)
    accuracy = (1 - corrected_count / len(original_words)) * 100 if original_words else 0
    return accuracy

# Test the spell checker
def test_spell_checker(paragraphs, dataset_path):
    # Load Tamil words and initialize the spell checker
    tamil_words = load_tamil_words(dataset_path)

    results = []
    for paragraph in paragraphs:
        corrected_paragraph = correct_paragraph(paragraph, tamil_words)
        accuracy = calculate_accuracy(paragraph, corrected_paragraph)
        results.append((corrected_paragraph, accuracy))
    
    return results

# Example usage
dataset_path = r'C:\Users\STRANGERS\Desktop\Semester 7 - 2024\EC9640-Artificial Intelligence\Project\TamilWordsDataset.txt'
# Example paragraphs to test (replace with your actual paragraphs)
paragraphs = [
    "தமிழ் கலை மற்றும் இலக்கயம் மிகப்பெரிய செல்வமாகும்.",
    "மக்கள் கல்வி பெறுவதற்கான முயற்சகளில் ஈடுபட்டுள்ளனர்.",
]
results = test_spell_checker(paragraphs, dataset_path)

# Print only corrected paragraph and accuracy for each paragraph
for i, (corrected, acc) in enumerate(results, 1):
    print(f"Corrected Paragraph {i}: {corrected}")
    print(f"Accuracy of paragraph {i}: {acc:.2f}%\n")


Corrected Paragraph 1: தமி குடக்குழி கல மறி ஈறுகட்டி மை இலகடம் கயம் மை கபி பூ ரு யே சோ லோ தேவமேரையாய் கொ மை
Accuracy of paragraph 1: 0.00%

Corrected Paragraph 2: மகீ களி கல வி பூ ஈறுகட்டி வதறுதல் கொ படைச்சனம் மை இயற்றமிழ் சகி லோ ஈடை படி டீ தகளி ரு
Accuracy of paragraph 2: 0.00%

