<a href="https://colab.research.google.com/github/saumyea/NLP-Mini-Project/blob/main/Final_NLP_PROJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Drive**

In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Installing the necessary libraries**

In [2]:
!pip install indic-nlp-library
!pip install nltk                    #Natural Language Toolkit - for handling tokenization
!pip install numpy                   #For handling arrays
!pip install python-Levenshtein      #For applying edit distance approach
!pip install gradio                  #For GUI

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.1-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-8.1.3-py3-none-any.whl.metadata (6.4 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.21.2-py3-none-any.whl.metadata (2.8 kB)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
!pip install swalign

Collecting swalign
  Downloading swalign-0.3.7.tar.gz (7.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swalign
  Building wheel for swalign (setup.py) ... [?25l[?25hdone
  Created wheel for swalign: filename=swalign-0.3.7-py3-none-any.whl size=8499 sha256=5380ef143ca6b57d27585a7b631a9faa07e27fe16a9bb77bad244ab9e859c88a
  Stored in directory: /root/.cache/pip/wheels/e8/f6/21/329277dca6b477bbad22bedb5ce92ce474f6b75252713b06ca
Successfully built swalign
Installing collected packages: swalign
Successfully installed swalign-0.3.7


# **Importing the necessary modules**

In [5]:
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from indicnlp import common     #Provides common functions and configuration for Indic NLP Library.
import string
import pandas as pd
import numpy as np
import Levenshtein
import gradio as gr
import swalign
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# **Setting Indic NLP Resource path**

In [6]:
# Point the Indic NLP Library at its resources directory.
# NOTE(review): nothing visible in this notebook downloads or clones anything to this
# path — confirm the directory exists, or that the tokenizers used here do not need it.
INDIC_NLP_RESOURCES = "/root/indic_nlp_resources"
common.set_resources_path(INDIC_NLP_RESOURCES)       # Registers the resources path with the Indic NLP Library.

# **Loading the CSV File**

In [7]:
# Load the Hindi corpus CSV from the mounted Google Drive (full path required).
df = pd.read_csv('/content/drive/MyDrive/sem7/NLP/new try project/main/hindi_dataset.csv', encoding='utf-8')

# NOTE: df.head() is not the last expression of this cell, so its result is
# discarded and no preview is actually displayed.
df.head()

# The text lives in the 'text' column; keep only the first 1000 entries as the working corpus.
top_1000 = df['text'][:1000]

# **Sentence Tokenization**

In [8]:
def tokenize_sentences(top_1000):
    """Split each text entry into Hindi sentences and return them as one flat list.

    Args:
        top_1000: Iterable of text entries (for example a pandas Series or a
            list of strings). Missing values (None, NaN, pd.NA) are skipped;
            any other non-string value is converted with str().

    Returns:
        list: Sentences from all entries, in input order.
    """
    tokenized_sentences = []

    for entry in top_1000:
        if not isinstance(entry, str):
            # Skip missing values: str(NaN) would otherwise inject the literal
            # word "nan" into the corpus.
            if entry is None or (pd.api.types.is_scalar(entry) and pd.isna(entry)):
                continue
            entry = str(entry)

        # Sentence tokenization with the Indic NLP Library (Hindi)
        tokenized_sentences.extend(sentence_tokenize.sentence_split(entry, lang='hi'))

    return tokenized_sentences

# Build the corpus sentence list from the first 1000 dataset entries.
tokenized_sentences = tokenize_sentences(top_1000)

print(tokenized_sentences[:5])  # Sanity check: print the first 5 tokenized sentences

['भारतीय राजनीति में पिछड़ेपन के विचार का सूत्रीकरण उसे खत्म करने के मकसद से किया गया था', 'लेकिन अब ऐसा लग रहा है कि यह समझ बैकवर्डनेस खत्म करने के बजाय उसकी विभिन्न किस्मों के बीच होड़ में पतित हो गई है', 'जो पिछडे़ की श्रेणी में है, वह पिछड़ा बने रहने पर तुला है', 'जिन व्यक्तियों और समुदायों की आत्मछवि पिछड़े की नहीं थी, वे भी पिछड़ा बनने में जुट गए हैं', 'व्यावहारिक धरातल पर पिछड़ापन मानवीय अस्तित्व की अवमानना न हो कर उसके लिए लाभकारी मान लिया गया है']


# **Word Tokenization**

In [9]:
def tokenize_words(sentences):
    """Tokenize every sentence into words and return all tokens as one flat list.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        list: Tokens produced by indic_tokenize.trivial_tokenize, in sentence order.
    """
    return [
        token
        for sentence in sentences
        for token in indic_tokenize.trivial_tokenize(sentence)
    ]

# Flatten the corpus sentences into a single list of word tokens.
tokenized_words = tokenize_words(tokenized_sentences)

print(tokenized_words[:5])  # Print the first 5 tokenized words

# Wider preview; it starts with the same 5 tokens shown by the previous print.
print(tokenized_words[:50])  # Print the first 50 tokenized words

['भारतीय', 'राजनीति', 'में', 'पिछड़ेपन', 'के']
['भारतीय', 'राजनीति', 'में', 'पिछड़ेपन', 'के', 'विचार', 'का', 'सूत्रीकरण', 'उसे', 'खत्म', 'करने', 'के', 'मकसद', 'से', 'किया', 'गया', 'था', 'लेकिन', 'अब', 'ऐसा', 'लग', 'रहा', 'है', 'कि', 'यह', 'समझ', 'बैकवर्डनेस', 'खत्म', 'करने', 'के', 'बजाय', 'उसकी', 'विभिन्न', 'किस्मों', 'के', 'बीच', 'होड़', 'में', 'पतित', 'हो', 'गई', 'है', 'जो', 'पिछडे़', 'की', 'श्रेणी', 'में', 'है', ',', 'वह']


# **Punctuation & White Space Removal**

In [10]:
def remove_punctuation(tokenized_words):
    """Strip surrounding whitespace from tokens and drop punctuation and empty tokens.

    Whitespace is stripped BEFORE the punctuation check, so a padded mark such
    as ' । ' is removed and whitespace-only tokens do not survive as ''.

    Args:
        tokenized_words: Iterable of token strings.

    Returns:
        list: Cleaned, non-empty, non-punctuation tokens in input order.
    """
    # Hindi / typographic punctuation not covered by string.punctuation
    hindi_punctuation = {'।', '॥', '“', '”', '‘', '’', '(', ')', '[', ']', '{', '}', ',', ';', ':', '?', '!', '...'}

    # Combine English (ASCII) and Hindi punctuation for removal
    all_punctuation = set(string.punctuation) | hindi_punctuation

    cleaned_words = []
    for token in tokenized_words:
        token = token.strip()  # WHITE SPACE REMOVAL
        if token and token not in all_punctuation:
            cleaned_words.append(token)

    return cleaned_words

# Drop punctuation tokens from the corpus word list and preview the first 50 remaining tokens.
cleaned_words = remove_punctuation(tokenized_words)
print(cleaned_words[:50])

['भारतीय', 'राजनीति', 'में', 'पिछड़ेपन', 'के', 'विचार', 'का', 'सूत्रीकरण', 'उसे', 'खत्म', 'करने', 'के', 'मकसद', 'से', 'किया', 'गया', 'था', 'लेकिन', 'अब', 'ऐसा', 'लग', 'रहा', 'है', 'कि', 'यह', 'समझ', 'बैकवर्डनेस', 'खत्म', 'करने', 'के', 'बजाय', 'उसकी', 'विभिन्न', 'किस्मों', 'के', 'बीच', 'होड़', 'में', 'पतित', 'हो', 'गई', 'है', 'जो', 'पिछडे़', 'की', 'श्रेणी', 'में', 'है', 'वह', 'पिछड़ा']


# **Stop Words Removal**

In [11]:
STOP_WORDS_URL = "https://raw.githubusercontent.com/stopwords-iso/stopwords-hi/refs/heads/master/stopwords-hi.txt"

# url -> set of stop words; filled on first use so the list is downloaded only once
_STOP_WORDS_CACHE = {}


def _load_stop_words(url=STOP_WORDS_URL):
    """Return the stop-word set for `url`, downloading it only on the first call."""
    if url not in _STOP_WORDS_CACHE:
        stop_words_df = pd.read_csv(url, header=None)
        # dropna() keeps NaN (blank / NA-like rows) out of the set
        _STOP_WORDS_CACHE[url] = set(stop_words_df[0].dropna().astype(str).str.strip())
    return _STOP_WORDS_CACHE[url]


def stop_word_removal(cleaned_words, stop_words=None):
    """Remove Hindi stop words from a list of tokens.

    Args:
        cleaned_words: Iterable of token strings.
        stop_words: Optional collection of stop words to use. When None
            (the default), the stopwords-iso Hindi list is loaded from
            STOP_WORDS_URL; it is fetched over the network on the first call
            and reused afterwards.

    Returns:
        list: The tokens that are not stop words, in input order.
    """
    if stop_words is None:
        stop_words = _load_stop_words()
    else:
        stop_words = set(stop_words)

    # Remove stop words from the cleaned tokenized words
    return [word for word in cleaned_words if word not in stop_words]

filtered_words = stop_word_removal(cleaned_words)

# Display the first 50 filtered words
print(filtered_words[0:50])

['भारतीय', 'राजनीति', 'पिछड़ेपन', 'विचार', 'सूत्रीकरण', 'खत्म', 'मकसद', 'अब', 'ऐसा', 'लग', 'समझ', 'बैकवर्डनेस', 'खत्म', 'बजाय', 'उसकी', 'विभिन्न', 'किस्मों', 'बीच', 'होड़', 'पतित', 'गई', 'पिछडे़', 'श्रेणी', 'पिछड़ा', 'बने', 'रहने', 'तुला', 'व्यक्तियों', 'समुदायों', 'आत्मछवि', 'पिछड़े', 'पिछड़ा', 'बनने', 'जुट', 'गए', 'व्यावहारिक', 'धरातल', 'पिछड़ापन', 'मानवीय', 'अस्तित्व', 'अवमानना', 'लाभकारी', 'मान', 'लिया', 'कम', 'पिछड़ा', 'खुद', 'ज्यादा', 'पिछड़ा', 'बता']


# **Duplicates Removal**

In [12]:
def duplicates_removal(filtered_words):
    """Return the words with repeats removed, keeping each word's first occurrence.

    Args:
        filtered_words: Sequence of tokens.

    Returns:
        list: Unique tokens in first-seen order.
    """
    words = pd.Series(filtered_words)
    is_first_occurrence = ~words.duplicated()
    return words[is_first_occurrence].tolist()

# Collapse repeated corpus words so each word appears once.
unique_filtered_words = duplicates_removal(filtered_words)

# Display the first 50 unique filtered words
print(unique_filtered_words[:50])

['भारतीय', 'राजनीति', 'पिछड़ेपन', 'विचार', 'सूत्रीकरण', 'खत्म', 'मकसद', 'अब', 'ऐसा', 'लग', 'समझ', 'बैकवर्डनेस', 'बजाय', 'उसकी', 'विभिन्न', 'किस्मों', 'बीच', 'होड़', 'पतित', 'गई', 'पिछडे़', 'श्रेणी', 'पिछड़ा', 'बने', 'रहने', 'तुला', 'व्यक्तियों', 'समुदायों', 'आत्मछवि', 'पिछड़े', 'बनने', 'जुट', 'गए', 'व्यावहारिक', 'धरातल', 'पिछड़ापन', 'मानवीय', 'अस्तित्व', 'अवमानना', 'लाभकारी', 'मान', 'लिया', 'कम', 'खुद', 'ज्यादा', 'बता', 'निहित', 'स्वार्थ', 'बनाने', 'जघन्य']


# **Storing Tokens as NumPy Array Elements**

In [13]:
# Store the unique, stop-word-free corpus words as a NumPy array.
# This array is the vocabulary that every spell-check lookup in this notebook searches.
tokenized_array = np.array(unique_filtered_words)

# Display the (abbreviated) array and the vocabulary size
print(tokenized_array)
print(len(tokenized_array))

['भारतीय' 'राजनीति' 'पिछड़ेपन' ... 'काला' 'समानांतर' 'गुना']
4359


# **User Input**

In [14]:
# Read the text to spell-check from the user; execution waits here until a line is entered.
user_input = input("Enter your text: ")

# Example input: भारतीय भरत विवध संस्कृतियो, परंपरा और परिदृश्यों

Enter your text: भारतीय भरत विवध संस्कृतियो, परंपरा और परिदृश्यों


# **Preprocessing the User Input**

In [15]:
# Tokenize the user's text into sentences (wrapped in a list because
# tokenize_sentences iterates over a collection of text entries)
ts = tokenize_sentences([user_input])

# Tokenize those sentences into words
tw = tokenize_words(ts)

print(ts)
print(tw)

['भारतीय भरत विवध संस्कृतियो, परंपरा और परिदृश्यों']
['भारतीय', 'भरत', 'विवध', 'संस्कृतियो', ',', 'परंपरा', 'और', 'परिदृश्यों']


In [16]:
# Punctuation removal on the user's tokens (same function as used for the corpus)
text_wo_punct= remove_punctuation(tw)
print(text_wo_punct)

['भारतीय', 'भरत', 'विवध', 'संस्कृतियो', 'परंपरा', 'और', 'परिदृश्यों']


In [17]:
# Stop word removal on the user's tokens (same function as used for the corpus)
text_wo_stopw=stop_word_removal(text_wo_punct)
print(text_wo_stopw)

['भारतीय', 'भरत', 'विवध', 'संस्कृतियो', 'परंपरा', 'परिदृश्यों']


In [18]:
# Duplicates removal, so each distinct user word is checked once
text_wo_duplicate=duplicates_removal(text_wo_stopw)
print(text_wo_duplicate)

['भारतीय', 'भरत', 'विवध', 'संस्कृतियो', 'परंपरा', 'परिदृश्यों']


In [19]:
# Store the cleaned user tokens as a NumPy array, mirroring how the corpus vocabulary is stored.
user_array = np.array(text_wo_duplicate)
print("Tokenized Array:", user_array)
print("Length of Tokenized Array:", len(user_array))

Tokenized Array: ['भारतीय' 'भरत' 'विवध' 'संस्कृतियो' 'परंपरा' 'परिदृश्यों']
Length of Tokenized Array: 6


# **Accuracy calculation function**

In [20]:
def calculate_accuracy(correct_words, total_words):
    """Return correct_words as a percentage of total_words.

    Args:
        correct_words: Number of words counted as correct.
        total_words: Total number of words checked.

    Returns:
        The percentage (correct_words / total_words) * 100, or 0 when
        total_words is not positive (avoids division by zero).
    """
    if total_words > 0:
        return (correct_words / total_words) * 100
    return 0

# **Prediction using Levenshtein Distance**

In [21]:
def find_closest_words(target_word, word_list, top_n=5):
    """Look a word up in the vocabulary, or suggest the nearest words by edit distance.

    Args:
        target_word: The word to check.
        word_list: Iterable of vocabulary words.
        top_n: Maximum number of suggestions to return.

    Returns:
        The string "correct" if target_word occurs in word_list. Otherwise a
        list of up to top_n (word, distance) tuples sorted by ascending
        Levenshtein distance; words at equal distance keep their word_list order.
    """
    candidates = list(word_list)

    # Test for an exact match first, so no edit distances are computed (and
    # then thrown away) for the words that precede the match.
    if any(candidate == target_word for candidate in candidates):
        return "correct"

    distances = [
        (candidate, Levenshtein.distance(target_word, candidate))
        for candidate in candidates
    ]

    # Stable sort by edit distance (ascending order)
    distances.sort(key=lambda pair: pair[1])

    return distances[:top_n]

# Counters for the accuracy figure: a word counts as "correct" only when it is
# found verbatim in the corpus vocabulary.
total_words = len(user_array)
correct_predictions_levenshtein = 0

# Check every unique user word against the vocabulary
for word in user_array:
    closest_words = find_closest_words(word, tokenized_array)

    if closest_words != "correct":
        # Not in the vocabulary: list the nearest candidates
        print(f"\nWord '{word}' is incorrect.")
        print("\nSuggestions with Levenshtein edit distances:")
        for closest_word, distance in closest_words:
            print(f"Word: '{closest_word}', Edit Distance: {distance}")

        # The first suggestion has the smallest edit distance
        predicted_word = closest_words[0][0]
        print(f"\nPredicted word for '{word}': '{predicted_word}'")
        continue

    correct_predictions_levenshtein += 1
    print(f"\nWord '{word}' is correct.")

# Report the share of user words that were found in the vocabulary
accuracy_levenshtein = calculate_accuracy(correct_predictions_levenshtein, total_words)
print(f"\nLevenshtein Accuracy: {accuracy_levenshtein:.2f}%")


Word 'भारतीय' is correct.

Word 'भरत' is incorrect.

Suggestions with Levenshtein edit distances:
Word: 'भूत', Edit Distance: 1
Word: 'भारत', Edit Distance: 1
Word: 'भरा', Edit Distance: 1
Word: 'भर', Edit Distance: 1
Word: 'भरे', Edit Distance: 1

Predicted word for 'भरत': 'भूत'

Word 'विवध' is incorrect.

Suggestions with Levenshtein edit distances:
Word: 'विविध', Edit Distance: 1
Word: 'विरोध', Edit Distance: 2
Word: 'विल', Edit Distance: 2
Word: 'विवाद', Edit Distance: 2
Word: 'विफल', Edit Distance: 2

Predicted word for 'विवध': 'विविध'

Word 'संस्कृतियो' is incorrect.

Suggestions with Levenshtein edit distances:
Word: 'स्थितियों', Edit Distance: 5
Word: 'हस्तियों', Edit Distance: 5
Word: 'व्यक्तियों', Edit Distance: 6
Word: 'सदस्य', Edit Distance: 6
Word: 'संस्थागत', Edit Distance: 6

Predicted word for 'संस्कृतियो': 'स्थितियों'

Word 'परंपरा' is correct.

Word 'परिदृश्यों' is incorrect.

Suggestions with Levenshtein edit distances:
Word: 'विदेशियों', Edit Distance: 4
Word: 'परि

# **Prediction using Smith-Waterman**

In [22]:
# INBUILT - SWALIGN
# Smith-Waterman scoring through the third-party swalign package.

# Scoring parameters handed to swalign below
match_score = 2          # passed as the scoring matrix's match value
mismatch_penalty = -1    # passed as the scoring matrix's mismatch value
gap_penalty = -1         # passed to LocalAlignment as its gap penalty

# Define scoring matrix
# NOTE(review): the class name refers to nucleotides, but it is applied to Hindi words here;
# presumably it only compares characters for equality — confirm against the swalign source.
scoring = swalign.NucleotideScoringMatrix(match_score, mismatch_penalty)
aligner = swalign.LocalAlignment(scoring, gap_penalty)  # Smith-Waterman is a local alignment algorithm

def smith_waterman_swalign(s1, s2):
    """Return the local-alignment score of s1 against s2 from the module-level swalign aligner."""
    return aligner.align(s1, s2).score

def find_closest_words_sw_swalign(target_word, word_list, top_n=5):
    """Look a word up in the vocabulary, or suggest the best swalign Smith-Waterman matches.

    Args:
        target_word: The word to check.
        word_list: Iterable of vocabulary words.
        top_n: Maximum number of suggestions to return.

    Returns:
        The string "correct" if target_word occurs in word_list. Otherwise a
        list of up to top_n (word, score) tuples sorted by descending score
        (higher score means closer match); words with equal scores keep their
        word_list order.
    """
    candidates = list(word_list)

    # Test for an exact match first, so no alignments are computed (and then
    # thrown away) for the words that precede the match.
    if any(candidate == target_word for candidate in candidates):
        return "correct"

    scores = [
        (candidate, smith_waterman_swalign(target_word, candidate))
        for candidate in candidates
    ]

    # Stable sort by score in descending order
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return scores[:top_n]

# Counters for the accuracy figure: a word counts as "correct" only when it is
# found verbatim in the corpus vocabulary.
correct_predictions_sw = 0
total_words = len(user_array)

for word in user_array:
    closest_words_sw = find_closest_words_sw_swalign(word, tokenized_array)

    if closest_words_sw != "correct":
        # Not in the vocabulary: list the best-scoring candidates
        print(f"\nWord '{word}' is incorrect.")
        print("\nSuggestions with Smith-Waterman scores:")
        for closest_word, score in closest_words_sw:
            print(f"Word: '{closest_word}', Score: {score}")

        # The first suggestion has the highest Smith-Waterman score
        predicted_word_sw = closest_words_sw[0][0]
        print(f"\nPredicted word for '{word}': '{predicted_word_sw}'")
        continue

    correct_predictions_sw += 1
    print(f"\nWord '{word}' is correct.")

# Report the share of user words that were found in the vocabulary
accuracy_sw = calculate_accuracy(correct_predictions_sw, total_words)
print(f"\nSmith-Waterman Accuracy: {accuracy_sw:.2f}%")


Word 'भारतीय' is correct.

Word 'भरत' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'उभरती', Score: 6
Word: 'भरते', Score: 6
Word: 'निर्भरता', Score: 6
Word: 'भारतीय', Score: 5
Word: 'भारत', Score: 5

Predicted word for 'भरत': 'उभरती'

Word 'विवध' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'विविध', Score: 7
Word: 'विवादित', Score: 6
Word: 'विवाद', Score: 6
Word: 'रविवार', Score: 6
Word: 'विवादों', Score: 6

Predicted word for 'विवध': 'विविध'

Word 'संस्कृतियो' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'व्यक्तियों', Score: 10
Word: 'परिस्थितियों', Score: 10
Word: 'स्थितियों', Score: 10
Word: 'हस्तियों', Score: 10
Word: 'नियुक्तियों', Score: 9

Predicted word for 'संस्कृतियो': 'व्यक्तियों'

Word 'परंपरा' is correct.

Word 'परिदृश्यों' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'दृश्य', Score: 10
Word: 'विदेशियों', Score: 10
Word: 'क्योंकि', Score: 8
Word: 'क्यों', Score: 8
Word: 'तथ्यों', Score: 8

Predicted word for 

In [23]:
# MANUAL

# Smith-Waterman implementation
def smith_waterman_score(s1, s2, match=2, mismatch=-1, gap=-1):
    """Return the best Smith-Waterman local-alignment score between two strings.

    Only the score is needed (no traceback), so just the previous and current
    rows of the scoring table are kept, as plain Python lists.

    Args:
        s1: First string.
        s2: Second string.
        match: Score added when two characters are equal.
        mismatch: Score added when two characters differ.
        gap: Score added for skipping a character in either string.

    Returns:
        float: The highest cell value of the scoring table; 0.0 when either
        string is empty or nothing aligns with a positive score.
    """
    n = len(s2)
    previous_row = [0] * (n + 1)
    max_score = 0

    for ch1 in s1:
        current_row = [0] * (n + 1)
        for j in range(1, n + 1):
            step = match if ch1 == s2[j - 1] else mismatch

            # Local alignment: a cell never drops below 0
            cell = max(
                0,
                previous_row[j - 1] + step,  # diagonal: align the two characters
                previous_row[j] + gap,       # up: gap in s2
                current_row[j - 1] + gap,    # left: gap in s1
            )
            current_row[j] = cell

            # Track the max score
            if cell > max_score:
                max_score = cell

        previous_row = current_row

    # Always a float, so callers format every score the same way
    return float(max_score)

def find_closest_words_sw(target_word, word_list, top_n=5):
    """Look a word up in the vocabulary, or suggest the best Smith-Waterman matches.

    Args:
        target_word: The word to check.
        word_list: Iterable of vocabulary words.
        top_n: Maximum number of suggestions to return.

    Returns:
        The string "correct" if target_word occurs in word_list. Otherwise a
        list of up to top_n (word, score) tuples sorted by descending score
        (higher score means closer match); words with equal scores keep their
        word_list order.
    """
    candidates = list(word_list)

    # Test for an exact match first, so no alignments are computed (and then
    # thrown away) for the words that precede the match.
    if any(candidate == target_word for candidate in candidates):
        return "correct"

    scores = [
        (candidate, smith_waterman_score(target_word, candidate))
        for candidate in candidates
    ]

    # Stable sort by score in descending order
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return scores[:top_n]

# Counter for the accuracy figure. total_words is not set in this cell; it is
# the value assigned by an earlier cell.
correct_predictions_sw = 0

for word in user_array:
    # Same lookup structure as the Levenshtein version
    closest_words_sw = find_closest_words_sw(word, tokenized_array)

    if closest_words_sw != "correct":
        # Not in the vocabulary: list the best-scoring candidates
        print(f"\nWord '{word}' is incorrect.")
        print("\nSuggestions with Smith-Waterman scores:")
        for closest_word, score in closest_words_sw:
            print(f"Word: '{closest_word}', Score: {score}")

        # The first suggestion has the highest Smith-Waterman score
        predicted_word_sw = closest_words_sw[0][0]
        print(f"\nPredicted word for '{word}': '{predicted_word_sw}'")
        continue

    correct_predictions_sw += 1
    print(f"\nWord '{word}' is correct.")

# Report the share of user words that were found in the vocabulary
accuracy_sw = calculate_accuracy(correct_predictions_sw, total_words)
print(f"\nSmith-Waterman Accuracy: {accuracy_sw:.2f}%")


Word 'भारतीय' is correct.

Word 'भरत' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'उभरती', Score: 6.0
Word: 'भरते', Score: 6.0
Word: 'निर्भरता', Score: 6.0
Word: 'भारतीय', Score: 5.0
Word: 'भारत', Score: 5.0

Predicted word for 'भरत': 'उभरती'

Word 'विवध' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'विविध', Score: 7.0
Word: 'विवादित', Score: 6.0
Word: 'विवाद', Score: 6.0
Word: 'रविवार', Score: 6.0
Word: 'विवादों', Score: 6.0

Predicted word for 'विवध': 'विविध'

Word 'संस्कृतियो' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'व्यक्तियों', Score: 10.0
Word: 'परिस्थितियों', Score: 10.0
Word: 'स्थितियों', Score: 10.0
Word: 'हस्तियों', Score: 10.0
Word: 'नियुक्तियों', Score: 9.0

Predicted word for 'संस्कृतियो': 'व्यक्तियों'

Word 'परंपरा' is correct.

Word 'परिदृश्यों' is incorrect.

Suggestions with Smith-Waterman scores:
Word: 'दृश्य', Score: 10.0
Word: 'विदेशियों', Score: 10.0
Word: 'क्योंकि', Score: 8.0
Word: 'क्यों', Score: 8.0
Word: '

# **GUI using Gradio**

In [24]:
import gradio as gr
def process_text_with_sw(user_input):
    """Spell-check Hindi text with both algorithms and return two text reports.

    The input goes through the same preprocessing as the corpus (sentence and
    word tokenization, punctuation removal, stop-word removal, de-duplication).
    Each remaining word is then looked up in the module-level `tokenized_array`
    vocabulary with the Levenshtein and the manual Smith-Waterman finders.

    Args:
        user_input: The text entered by the user.

    Returns:
        tuple: (Levenshtein report, Smith-Waterman report). Each report has one
        entry per word followed by an accuracy line.
    """
    def _check_word(word, finder, label):
        """Return (report entry, 1 if the word was found in the vocabulary else 0)."""
        suggestions = finder(word, tokenized_array)
        if suggestions == "correct":
            return f"\nWord '{word}' is correct ({label}).\n", 1
        # The first suggestion is the finder's best candidate
        best_guess = suggestions[0][0]
        return f"\nWord '{word}' is incorrect ({label}).\nPredicted word: '{best_guess}'\n", 0

    # Preprocess the user's text
    sentences = tokenize_sentences([user_input])
    words = tokenize_words(sentences)
    words = remove_punctuation(words)
    words = stop_word_removal(words)
    user_array = np.array(duplicates_removal(words))
    total_words = len(user_array)

    lev_parts, sw_parts = [], []
    lev_correct = 0
    sw_correct = 0

    for word in user_array:
        # Levenshtein distance
        entry, found = _check_word(word, find_closest_words, "Levenshtein")
        lev_parts.append(entry)
        lev_correct += found

        # Smith-Waterman score
        entry, found = _check_word(word, find_closest_words_sw, "Smith-Waterman")
        sw_parts.append(entry)
        sw_correct += found

    # Accuracy = share of checked words that were found in the vocabulary
    lev_accuracy = calculate_accuracy(lev_correct, total_words)
    sw_accuracy = calculate_accuracy(sw_correct, total_words)

    lev_parts.append(f"\nLevenshtein Accuracy: {lev_accuracy:.2f}% ({lev_correct}/{total_words})")
    sw_parts.append(f"\nSmith-Waterman Accuracy: {sw_accuracy:.2f}% ({sw_correct}/{total_words})")

    return "".join(lev_parts), "".join(sw_parts)

# Define the Gradio interface for both algorithms:
# one text input, two text outputs (one report per algorithm).
iface_sw = gr.Interface(
    fn=process_text_with_sw,  # called with the textbox contents; returns the two reports
    inputs=gr.Textbox(label="Enter Hindi Text Here :"),
    outputs=[
        gr.Textbox(label="Levenshtein Results"),
        gr.Textbox(label="Smith-Waterman Results")
    ],
    title="SHUDDH लेखन : Hindi Spelling Checker",
    description="Enter your Hindi text to tokenize it, remove punctuation and stop words, and find the closest words based on Levenshtein Algorithm and Smith-Waterman algorithm."
)

# Launch the Gradio interface.
# NOTE: share=True requests a public URL (the recorded output shows a *.gradio.live
# link that expires in 72 hours); the app is reachable through that link while it is active.
iface_sw.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e891c34f89c76bad43.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


