In [None]:
import random
import re
from collections import Counter, defaultdict
import unicodedata

def preprocess(file):
    """Read a UTF-8 text file and return its Devanagari words.

    Characters are kept only when they are whitespace or when their Unicode
    name contains ``DEVANAGARI``; everything else (Latin letters, digits,
    punctuation, ...) is removed without inserting a separator. The
    remaining text is then split on whitespace.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: The whitespace-separated tokens of the filtered text.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    def is_kept(symbol):
        # Unnamed code points fall back to '' and are therefore dropped
        # unless they count as whitespace.
        return 'DEVANAGARI' in unicodedata.name(symbol, '') or symbol.isspace()

    return ''.join(filter(is_kept, raw_text)).split()

In [None]:
# Design a bigram model

def generate_bigram_model(words):
    """Estimate additively smoothed bigram probabilities from a token list.

    The token sequence is wrapped in a single ``'<START>'`` / ``'<END>'``
    pair and every adjacent pair is counted. For each observed bigram the
    stored value is::

        (count(w1, w2) + k) / (count(w1 as a left context) + k * V)

    with ``k = 0.1`` and ``V`` the number of distinct words in *words*.

    Args:
        words: Iterable of tokens (the whole corpus as one sequence).

    Returns:
        tuple: ``(bigram_prob, word_counts)`` where ``bigram_prob`` is a
        ``defaultdict`` keyed by ``(w1, w2)`` whose default for unseen
        pairs is the constant ``1 / (V + 1)``, and ``word_counts`` is
        ``Counter(words)`` (boundary markers are not included).
    """
    words = list(words)
    # Add sentence boundaries.
    padded_words = ['<START>'] + words + ['<END>']

    bigram_counts = Counter(zip(padded_words, padded_words[1:]))
    word_counts = Counter(words)
    # Every token except the final '<END>' opens exactly one bigram. Counting
    # contexts on the padded sequence gives '<START>' its real count (1)
    # instead of the 0 it has in word_counts, which previously inflated the
    # first bigram's probability above 1 and divided by zero on an empty
    # corpus.
    context_counts = Counter(padded_words[:-1])

    smoothing_factor = 0.1
    vocab_size = len(word_counts)

    # Constant fallback for bigrams never seen in training. Note that
    # indexing the defaultdict with a missing key stores this default.
    bigram_prob = defaultdict(lambda: 1 / (vocab_size + 1))
    for (w1, w2), count in bigram_counts.items():
        bigram_prob[(w1, w2)] = (
            (count + smoothing_factor)
            / (context_counts[w1] + smoothing_factor * vocab_size)
        )

    return bigram_prob, word_counts

'''def generate_trigram_model(words):
    # Add sentence boundaries
    padded_words = ['<START>'] + words + ['<END>']

    trigrams = []
    for i in range(len(padded_words)-2):
        trigram = (padded_words[i], padded_words[i+1], padded_words[i+2])
        trigrams.append(trigram)

    trigram_counts = Counter(trigrams)
    word_counts = Counter(words)

    # Add smoothing factor to handle unseen bigrams
    smoothing_factor = 0.1
    vocab_size = len(word_counts)

    trigram_prob = defaultdict(lambda: 1/(vocab_size + 1))  # Default probability for unseen bigrams
    for (w1, w2), count in trigram_counts.items():
        trigram_prob[(w1, w2)] = (count + smoothing_factor) / (word_counts[w1] + smoothing_factor * vocab_size)

    return trigram_prob, word_counts

def generate_fourgram_model(words):
    # Add sentence boundaries
    padded_words = ['<START>'] + words + ['<END>']

    fourgrams = []
    for i in range(len(padded_words)-2):
        fourgram = (padded_words[i], padded_words[i+1], padded_words[i+2])
        fourgrams.append(fourgram)

    fourgram_counts = Counter(fourgrams)
    word_counts = Counter(words)

    # Add smoothing factor to handle unseen bigrams
    smoothing_factor = 0.1
    vocab_size = len(word_counts)

    fourgram_prob = defaultdict(lambda: 1/(vocab_size + 1))  # Default probability for unseen bigrams
    for (w1, w2), count in fourgram_counts.items():
        fourgram_prob[(w1, w2)] = (count + smoothing_factor) / (word_counts[w1] + smoothing_factor * vocab_size)

    return fourgram_prob, word_counts'''

def generate_trigram_model(words):
    """Estimate additively smoothed trigram probabilities from a token list.

    The sequence is padded with two ``'<START>'`` markers and one
    ``'<END>'`` marker, so the very first word is predicted from the
    context ``('<START>', '<START>')``. For each observed trigram the
    stored value is::

        (count(w1, w2, w3) + k) / (count(w1, w2 as a context) + k * V)

    with ``k = 0.1`` and ``V`` the number of distinct words in *words*.

    Args:
        words: Iterable of tokens (the whole corpus as one sequence).

    Returns:
        tuple: ``(trigram_prob, word_counts)`` where ``trigram_prob`` is a
        ``defaultdict`` keyed by ``(w1, w2, w3)`` whose default for unseen
        triples is the constant ``1 / (V + 1)``, and ``word_counts`` is
        ``Counter(words)``.
    """
    words = list(words)
    # n - 1 start markers give the first real word a full-length history.
    padded_words = ['<START>'] * 2 + words + ['<END>']

    trigram_counts = Counter(
        zip(padded_words, padded_words[1:], padded_words[2:])
    )
    # The conditioning event of a trigram is its two-word history, so the
    # denominator must count (w1, w2) pairs, not occurrences of w1 alone.
    # The slice stops before '<END>' so that only pairs that actually open a
    # trigram are counted.
    context_counts = Counter(zip(padded_words, padded_words[1:-1]))
    word_counts = Counter(words)

    smoothing_factor = 0.1
    vocab_size = len(word_counts)

    # Constant fallback for trigrams never seen in training.
    trigram_prob = defaultdict(lambda: 1 / (vocab_size + 1))
    for (w1, w2, w3), count in trigram_counts.items():
        trigram_prob[(w1, w2, w3)] = (
            (count + smoothing_factor)
            / (context_counts[(w1, w2)] + smoothing_factor * vocab_size)
        )

    return trigram_prob, word_counts

# Fourgram Model
def generate_fourgram_model(words):
    """Estimate additively smoothed fourgram probabilities from a token list.

    The sequence is padded with three ``'<START>'`` markers and one
    ``'<END>'`` marker, so the very first word is predicted from the
    context ``('<START>', '<START>', '<START>')``. For each observed
    fourgram the stored value is::

        (count(w1..w4) + k) / (count(w1, w2, w3 as a context) + k * V)

    with ``k = 0.1`` and ``V`` the number of distinct words in *words*.

    Args:
        words: Iterable of tokens (the whole corpus as one sequence).

    Returns:
        tuple: ``(fourgram_prob, word_counts)`` where ``fourgram_prob`` is
        a ``defaultdict`` keyed by ``(w1, w2, w3, w4)`` whose default for
        unseen tuples is the constant ``1 / (V + 1)``, and ``word_counts``
        is ``Counter(words)``.
    """
    words = list(words)
    # n - 1 start markers give the first real word a full-length history.
    padded_words = ['<START>'] * 3 + words + ['<END>']

    fourgram_counts = Counter(
        zip(padded_words, padded_words[1:], padded_words[2:], padded_words[3:])
    )
    # The conditioning event is the three-word history, so the denominator
    # counts (w1, w2, w3) triples rather than occurrences of w1 alone. The
    # last slice stops before '<END>' so only histories that actually open a
    # fourgram are counted.
    context_counts = Counter(
        zip(padded_words, padded_words[1:], padded_words[2:-1])
    )
    word_counts = Counter(words)

    smoothing_factor = 0.1
    vocab_size = len(word_counts)

    # Constant fallback for fourgrams never seen in training.
    fourgram_prob = defaultdict(lambda: 1 / (vocab_size + 1))
    for (w1, w2, w3, w4), count in fourgram_counts.items():
        fourgram_prob[(w1, w2, w3, w4)] = (
            (count + smoothing_factor)
            / (context_counts[(w1, w2, w3)] + smoothing_factor * vocab_size)
        )

    return fourgram_prob, word_counts

In [None]:
def edits1(word):
    """Return all strings one deletion, insertion or replacement from *word*.

    The candidate alphabet is a fixed set of Devanagari vowels and
    consonants plus dependent signs (matras, anusvara, visarga,
    chandrabindu and virama). Two placement rules restrict the output:

    * a dependent sign is never inserted at the start of the word or
      directly after another dependent sign;
    * a replacement only swaps like for like (sign for sign, base letter
      for base letter).

    Args:
        word: The string to perturb.

    Returns:
        set[str]: Every generated variant. The input itself is included
        whenever a character is "replaced" by the same character.
    """
    # Dependent signs that need a preceding base character.
    signs = 'ािीुूेैोौंःृँ्'
    alphabet = ('अआइईउऊएऐओऔकखगघचछजझटठडढतथदधनपफबभमयरलवशषसह'
                'ािीुूेैोौंःृँ्')

    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # A sign may follow only a non-empty prefix that ends in a base letter.
        sign_fits_here = bool(head) and head[-1] not in signs

        if tail:
            # Deletion of the character at the cut position.
            variants.add(head + tail[1:])
            tail_starts_with_sign = tail[0] in signs

        for symbol in alphabet:
            symbol_is_sign = symbol in signs

            # Insertion at the cut position.
            if sign_fits_here or not symbol_is_sign:
                variants.add(head + symbol + tail)

            # Replacement of the character at the cut position, same class only.
            if tail and symbol_is_sign == tail_starts_with_sign:
                variants.add(head + symbol + tail[1:])

    return variants

In [None]:
def known(words, word_counts):
    """Return the subset of *words* that occur as keys of *word_counts*."""
    return {candidate for candidate in words if candidate in word_counts}

def correct_word(word, previous_word, bigram_prob, word_counts):
    """Return the best in-vocabulary correction of *word* given its predecessor.

    A word that is already a key of *word_counts* is returned unchanged, as
    is a word for which ``edits1`` yields no in-vocabulary candidate.
    Otherwise the candidate with the highest
    ``bigram_prob[(previous_word, candidate)]`` wins.

    Args:
        word: The possibly misspelled token.
        previous_word: The token immediately to the left (already corrected).
        bigram_prob: Mapping from ``(w1, w2)`` to a probability; it is
            indexed with ``[]``, so it must supply a value for unseen pairs.
        word_counts: Mapping from known words to their corpus frequency.

    Returns:
        str: The chosen correction, or *word* itself.
    """
    if word in word_counts:
        return word

    # Generate and filter candidates.
    candidates = known(edits1(word), word_counts)
    if not candidates:
        return word

    def rank(candidate):
        # Unseen bigrams all score the same, and max() over a set would then
        # pick whichever candidate the hash order happens to yield first.
        # Break ties by corpus frequency, then by the string itself, so the
        # result is deterministic and prefers the more common word.
        return (
            bigram_prob[(previous_word, candidate)],
            word_counts[candidate],
            candidate,
        )

    return max(candidates, key=rank)

def correct_word_trigram(word, prev_words, trigram_prob, word_counts):
    """Return the best in-vocabulary correction of *word* given two predecessors.

    A word that is already a key of *word_counts* is returned unchanged, as
    is a word for which ``edits1`` yields no in-vocabulary candidate.
    Otherwise the candidate with the highest trigram probability wins;
    trigrams missing from *trigram_prob* score 0.

    Args:
        word: The possibly misspelled token.
        prev_words: Sequence whose first two items are the two tokens to the
            left of *word*, oldest first.
        trigram_prob: Mapping from ``(w1, w2, w3)`` to a probability.
        word_counts: Mapping from known words to their corpus frequency.

    Returns:
        str: The chosen correction, or *word* itself.
    """
    if word in word_counts:
        return word

    candidates = known(edits1(word), word_counts)
    if not candidates:
        return word

    def rank(candidate):
        # Most candidates have no matching trigram and tie at 0; max() over a
        # set would then return an arbitrary one. Fall back to corpus
        # frequency, then to the string itself, for a deterministic choice.
        return (
            trigram_prob.get((prev_words[0], prev_words[1], candidate), 0),
            word_counts[candidate],
            candidate,
        )

    return max(candidates, key=rank)


def correct_word_fourgram(word, prev_words, fourgram_prob, word_counts):
    """Return the best in-vocabulary correction of *word* given three predecessors.

    A word that is already a key of *word_counts* is returned unchanged, as
    is a word for which ``edits1`` yields no in-vocabulary candidate.
    Otherwise the candidate with the highest fourgram probability wins;
    fourgrams missing from *fourgram_prob* score 0.

    Args:
        word: The possibly misspelled token.
        prev_words: Sequence whose first three items are the three tokens to
            the left of *word*, oldest first.
        fourgram_prob: Mapping from ``(w1, w2, w3, w4)`` to a probability.
        word_counts: Mapping from known words to their corpus frequency.

    Returns:
        str: The chosen correction, or *word* itself.
    """
    if word in word_counts:
        return word

    candidates = known(edits1(word), word_counts)
    if not candidates:
        return word

    def rank(candidate):
        # Most candidates have no matching fourgram and tie at 0; max() over
        # a set would then return an arbitrary one. Fall back to corpus
        # frequency, then to the string itself, for a deterministic choice.
        history = (prev_words[0], prev_words[1], prev_words[2], candidate)
        return (fourgram_prob.get(history, 0), word_counts[candidate], candidate)

    return max(candidates, key=rank)

In [None]:
def evaluate_accuracy(test_text, bigram_prob, word_counts):
    """Measure how well the bigram corrector restores randomly corrupted text.

    The text is reduced to Devanagari characters and whitespace and split
    into words. Each word is, with probability 0.3, replaced by a random
    member of ``edits1(word)``. The corrupted sequence is then corrected
    left to right with ``correct_word`` (each corrected word becomes the
    context of the next) and compared position by position with the
    original words.

    Args:
        test_text: Raw text to evaluate on.
        bigram_prob: Bigram probability mapping passed to ``correct_word``.
        word_counts: Known-word frequency mapping passed to ``correct_word``.

    Returns:
        tuple: ``(accuracy, corrected_text)`` with ``accuracy`` the
        percentage of positions whose corrected word equals the original,
        and ``corrected_text`` the corrected words joined by spaces. Text
        without any Devanagari word yields ``(0.0, '')``.
    """
    # Preprocess test text.
    cleaned = ''.join(ch for ch in test_text
                      if 'DEVANAGARI' in unicodedata.name(ch, '') or ch.isspace())
    words = cleaned.split()

    # Nothing to score: avoid dividing by zero below.
    if not words:
        return 0.0, ''

    test_words = []
    for word in words:
        if random.random() < 0.3:
            # Sorted so that a seeded `random` picks the same typo on every
            # run, independent of set iteration order.
            possible_typos = sorted(edits1(word))
            test_words.append(random.choice(possible_typos) if possible_typos else word)
        else:
            test_words.append(word)

    corrected_words = []
    previous_word = '<START>'
    for word in test_words:
        corrected_word = correct_word(word, previous_word, bigram_prob, word_counts)
        corrected_words.append(corrected_word)
        previous_word = corrected_word

    # Calculate accuracy.
    correct = sum(
        original == corrected
        for original, corrected in zip(words, corrected_words)
    )
    accuracy = (correct / len(words)) * 100
    return accuracy, ' '.join(corrected_words)


def evaluate_accuracy_trigram(test_text, trigram_prob, word_counts):
    """Measure how well the trigram corrector restores randomly corrupted text.

    The text is reduced to Devanagari characters and whitespace and split
    into words. Each word is, with probability 0.3, replaced by a random
    member of ``edits1(word)``. The corrupted sequence is then corrected
    left to right with ``correct_word_trigram``, starting from the history
    ``('<START>', '<START>')`` and sliding it over the corrected words, and
    compared position by position with the original words.

    Args:
        test_text: Raw text to evaluate on.
        trigram_prob: Trigram probability mapping passed to the corrector.
        word_counts: Known-word frequency mapping passed to the corrector.

    Returns:
        tuple: ``(accuracy, corrected_text)`` with ``accuracy`` the
        percentage of positions whose corrected word equals the original,
        and ``corrected_text`` the corrected words joined by spaces. Text
        without any Devanagari word yields ``(0.0, '')``.
    """
    # Preprocess test text.
    cleaned = ''.join(ch for ch in test_text
                      if 'DEVANAGARI' in unicodedata.name(ch, '') or ch.isspace())
    words = cleaned.split()

    # Nothing to score: avoid dividing by zero below.
    if not words:
        return 0.0, ''

    test_words = []
    for word in words:
        if random.random() < 0.3:
            # Sorted so that a seeded `random` picks the same typo on every
            # run, independent of set iteration order.
            possible_typos = sorted(edits1(word))
            test_words.append(random.choice(possible_typos) if possible_typos else word)
        else:
            test_words.append(word)

    corrected_words = []
    prev_words = ('<START>', '<START>')
    for word in test_words:
        corrected_word = correct_word_trigram(word, prev_words, trigram_prob, word_counts)
        corrected_words.append(corrected_word)
        # Slide the two-word history forward.
        prev_words = (prev_words[1], corrected_word)

    # Calculate accuracy.
    correct = sum(
        original == corrected
        for original, corrected in zip(words, corrected_words)
    )
    accuracy = (correct / len(words)) * 100
    return accuracy, ' '.join(corrected_words)


def evaluate_accuracy_fourgram(test_text, fourgram_prob, word_counts):
    """Measure how well the fourgram corrector restores randomly corrupted text.

    The text is reduced to Devanagari characters and whitespace and split
    into words. Each word is, with probability 0.3, replaced by a random
    member of ``edits1(word)``. The corrupted sequence is then corrected
    left to right with ``correct_word_fourgram``, starting from a history of
    three ``'<START>'`` markers and sliding it over the corrected words, and
    compared position by position with the original words.

    Args:
        test_text: Raw text to evaluate on.
        fourgram_prob: Fourgram probability mapping passed to the corrector.
        word_counts: Known-word frequency mapping passed to the corrector.

    Returns:
        tuple: ``(accuracy, corrected_text)`` with ``accuracy`` the
        percentage of positions whose corrected word equals the original,
        and ``corrected_text`` the corrected words joined by spaces. Text
        without any Devanagari word yields ``(0.0, '')``.
    """
    # Preprocess test text.
    cleaned = ''.join(ch for ch in test_text
                      if 'DEVANAGARI' in unicodedata.name(ch, '') or ch.isspace())
    words = cleaned.split()

    # Nothing to score: avoid dividing by zero below.
    if not words:
        return 0.0, ''

    test_words = []
    for word in words:
        if random.random() < 0.3:
            # Sorted so that a seeded `random` picks the same typo on every
            # run, independent of set iteration order.
            possible_typos = sorted(edits1(word))
            test_words.append(random.choice(possible_typos) if possible_typos else word)
        else:
            test_words.append(word)

    corrected_words = []
    prev_words = ('<START>', '<START>', '<START>')  # Initialize with start tokens
    for word in test_words:
        corrected_word = correct_word_fourgram(word, prev_words, fourgram_prob, word_counts)
        corrected_words.append(corrected_word)
        # Slide the three-word history forward.
        prev_words = (prev_words[1], prev_words[2], corrected_word)

    # Calculate accuracy.
    correct = sum(
        original == corrected
        for original, corrected in zip(words, corrected_words)
    )
    accuracy = (correct / len(words)) * 100
    return accuracy, ' '.join(corrected_words)

In [None]:
    # Driver: train the bigram, trigram and fourgram models on one corpus and
    # report each corrector's accuracy on a fixed sample sentence.
    # The evaluate_* helpers corrupt words at random, so the figures differ
    # from run to run unless the `random` module is seeded beforehand.
    file_path='tourism.hi.txt'
    words=preprocess(file_path)
    bigram_prob,word_counts=generate_bigram_model(words)

    # Test with sample text
    # The sentence contains a comma and a digit; the evaluators strip every
    # non-Devanagari, non-whitespace character before splitting into words.
    test_text = """आज भारत के नक्शे में प्रमुख पर्यटन स्थल भरतपुर, 5वीं शताब्दी से कई स्थितियों में गया है कई परिवहन साध अद्भुत दृश्य प्रस्तु"""
    bigram_accuracy, corrected_text = evaluate_accuracy(test_text, bigram_prob, word_counts)
    print("\nTesting with Bigrams:")

    print(f"Bigram Accuracy: {bigram_accuracy:.2f}%")

    print("Corrected Hindi Text:")
    print(corrected_text)
    # Test with trigrams. word_counts is rebound here, but every generate_*
    # function returns Counter(words), so its contents stay the same.
    trigram_prob, word_counts = generate_trigram_model(words)
    print("\nTesting with Trigrams:")
    trigram_accuracy, trigram_corrected = evaluate_accuracy_trigram(test_text, trigram_prob, word_counts)
    print(f"Trigram Accuracy: {trigram_accuracy:.2f}%")
    print("Trigram Corrected Text:")
    print(trigram_corrected)

    # Test with fourgrams
    fourgram_prob, word_counts = generate_fourgram_model(words)
    print("\nTesting with Fourgrams:")
    fourgram_accuracy, fourgram_corrected = evaluate_accuracy_fourgram(test_text, fourgram_prob, word_counts)
    print(f"Fourgram Accuracy: {fourgram_accuracy:.2f}%")
    print("Fourgram Corrected Text:")
    print(fourgram_corrected)

    # Compare results
    # Each model was scored on its own independently corrupted copy of the
    # sentence, so the three percentages are not measured on identical input.
    print("\nComparison Summary:")
    print(f"Bigram Accuracy:  {bigram_accuracy:.2f}%")
    print(f"Trigram Accuracy: {trigram_accuracy:.2f}%")
    print(f"Fourgram Accuracy: {fourgram_accuracy:.2f}%")


Testing with Bigrams:
Bigram Accuracy: 86.96%
Corrected Hindi Text:
आज भारत के नक्शे में प्रमुख पर्यटक स्थल भरतपुर वीं शताब्दी से कई स्थितियों में गया है कई परिवहन साधु अद्भुत दृश्य प्रस्तुत

Testing with Trigrams:
Trigram Accuracy: 82.61%
Trigram Corrected Text:
फज भारत का नक्शे में प्रमुख पर्यटन स्थल भरतपुर वीं शताब्दी से कई स्थितियों में गया है कई परिवहन साथ अद्भुत दृश्य प्रस्गु

Testing with Fourgrams:
Fourgram Accuracy: 91.30%
Fourgram Corrected Text:
आज भारत के नक्शे में प्रमुख पर्यटन स्थल भरतपुर वीं शताब्दी से कई स्थितियों में गया है कई परिवहन साधु अद्भुत दृश्य प्रस्तुत

Comparison Summary:
Bigram Accuracy:  86.96%
Trigram Accuracy: 82.61%
Fourgram Accuracy: 91.30%


In [None]:

# Second experiment: evaluate the three correctors on 50 words drawn at random
# from the training corpus itself. `file_path` must already be defined (it is
# assigned in the previous cell).
words = preprocess(file_path)
# Rebuild all three models; word_counts is rebound by each call, and every
# generate_* function returns Counter(words).
bigram_prob, word_counts = generate_bigram_model(words)
fourgram_prob, word_counts = generate_fourgram_model(words)
trigram_prob, word_counts = generate_trigram_model(words)
num_words = 50
# random.sample picks positions without replacement (repeated corpus words may
# still recur) and raises ValueError if the corpus has fewer than num_words
# tokens. The sampled words are in random order, not in their original context.
random_words = random.sample(words, num_words)
test_text = ' '.join(random_words)

print("Randomly Generated Test Text:\n")
print(test_text)


# Each evaluator injects its own random typos, so the three scores are
# computed on differently corrupted copies of the same text.
bigram_accuracy, corrected_text = evaluate_accuracy(test_text, bigram_prob, word_counts)
trigram_accuracy, trigram_corrected = evaluate_accuracy_trigram(test_text, trigram_prob, word_counts)
fourgram_accuracy, fourgram_corrected = evaluate_accuracy_fourgram(test_text, fourgram_prob, word_counts)

print(f"Bigram Accuracy: {bigram_accuracy:.2f}%\n")
print("Corrected Hindi Text:\n")
print(corrected_text)
print(f"\nTrigram Accuracy: {trigram_accuracy:.2f}%")
print("Corrected Hindi Text trigram:\n")
print(trigram_corrected)
print(f"\nFourgram Accuracy: {fourgram_accuracy:.2f}%")
print("Corrected Hindi Text fourgram:\n")
print(fourgram_corrected)



Randomly Generated Test Text:

उद्यान धार्मिक के की अधिक जलमहल में में एक नहीं माइक्रोनिशिया पहाड़ी का उसका आया। भगवान से शिंगरी ग्रेव्स के विन्टेज पहाड़ों तश्तरियों को तरह नीचे विस्मयकारी के के और पुरूष पहले व बड़ा केरला की के है बाजार बोट हैं। की है। विशाल के नव भूमिका अधिकतर खोजा। सिर्फ
Bigram Accuracy: 92.00%

Corrected Hindi Text:

उद्यान धार्मिक के की अधिक जलमहल में में एक नहीं माइक्रोनिशिया पहाड़ी की उसका आया। भगवान से शिंगरी ग्रेव्स कैट विन्टेज पहाड़ों तश्तरियों को तरह नीचे विस्मयकारी ले कि और पुरूष पहले व बड़ा केरला की के है बाजार बोट हैं। की है। विशाल के नव भूमिका अधिकतर खोजा। सिर्फ

Trigram Accuracy: 90.00%
Corrected Hindi Text trigram:

उद्यान धार्मिक के की अधिक जलमहल में में एक नहीं माइक्रोनिशिया पहाड़ी का उसका आया। भगवान से शिंगरी ग्रेव्स के विन्टेज पहाड़ों तश्तरियों को तरह नीचे विस्मयकारी के ले और पुरूष पहले व बडा केरला कि कटे है बाजार बोट हैं। की है। विशाल ढके नव भूमिका अधिकतर खोजा। सिर्फ

Fourgram Accuracy: 88.00%
Corrected Hindi Text fourgram:

उद्यान धार्मिक इसे की अ