# Group BackBenchers
    Shubham Agrawal C0911596
    Tanmay Sharma C0912911  
    Rahul Mehta C0910406
    Hardik C0913846 


# TeleGram Bot URL: https://t.me/VerbaForge_bot

In [1]:
#importing all necessary libraries
import nest_asyncio
nest_asyncio.apply()

from telegram import Update
from telegram.ext import Application, CommandHandler, MessageHandler, filters
from tensorflow.keras.models import load_model
from sentence_transformers import SentenceTransformer
import numpy as np
import PyPDF2
import docx
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
from textstat import flesch_reading_ease
import pdfplumber
import os
from tqdm import tqdm
import cohere




In [2]:
#Load the trained hybrid model and the SentenceTransformer model
hybrid_model = load_model('hybrid_model.keras')
embedding_model = SentenceTransformer('embedding_model')

print("Hybrid model and embedding model loaded successfully!")

Hybrid model and embedding model loaded successfully!


In [3]:
# Initialize the Cohere client
cohere_client = cohere.Client('Use API')

In [4]:
#Defining the text cleaning function 
def clean_text(text, update=None):
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    #converting to lowercase
    text = text.lower()
    #tokenize text into words
    words = word_tokenize(text)
    total_words = len(words)
    
    #progress bar 
    processed_words = []
    for word in tqdm(words, desc="Processing words", total=total_words, ncols=100):
        if word.isalnum() and word not in stop_words:
            processed_words.append(lemmatizer.lemmatize(word))
        if update and len(processed_words) % 100 == 0:  #update every 100 words
            update.message.reply_text(f"Processing {len(processed_words)} words...")

    return ' '.join(processed_words)

In [5]:
#defining a function for preprocessing new essays
def preprocess_essay(essay, embedding_model, update=None):
    #cleaning the essay
    cleaned_text = clean_text(essay, update)
    
    #extracting features
    num_sentences = len(sent_tokenize(essay))    #number of sentences in the essay
    words = word_tokenize(essay)                  #tokenize essay into words
    num_words = len(words)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0      #avg sentence length
    punctuation_density = sum(1 for char in essay if char in string.punctuation) / len(words) if num_words > 0 else 0       #punction density
    vocab_richness = len(set(cleaned_text.split())) / len(cleaned_text.split()) if len(cleaned_text.split()) > 0 else 0     #vaocab richness
    readability = flesch_reading_ease(cleaned_text)      #readability score

    #extracting embeddings
    embedding = embedding_model.encode([cleaned_text])[0]

    #combining features into the correct format
    features = np.array([[num_sentences, avg_sentence_length, punctuation_density, vocab_richness, readability]])
    return embedding, features

In [6]:
#function for extracting text from PDF file
def extract_text_from_pdf(file_path, update):
    with pdfplumber.open(file_path) as pdf:
        text = ""                         #storing extracted data
        total_pages = len(pdf.pages)      #getting total pages in PDF
        for i, page in enumerate(pdf.pages):
            text += page.extract_text()         #extract text from the current page
            if (i + 1) % 2 == 0 and update:     #send progress update every 2 pages
                update.message.reply_text(f"Extracting page {i + 1}/{total_pages}...")
    return text

In [7]:
#function to extract text from Word document (.docx)
def extract_text_from_word(file_path, update):
    doc = docx.Document(file_path)
    text = ""
    total_paragraphs = len(doc.paragraphs)
    for i, para in enumerate(doc.paragraphs):
        text += para.text               #append the text of the current paragraph to the accumulated text
        if (i + 1) % 5 == 0 and update:  #send progress update every 5 paragraphs
            update.message.reply_text(f"Extracting paragraph {i + 1}/{total_paragraphs}...")
    return text

In [8]:
#function for handling uploaded files
async def handle_file(update: Update, context):
    #retrieving file details from the user's message
    file = update.message.document
    file_id = file.file_id
    file_name = file.file_name
    file_extension = file_name.split('.')[-1].lower()  #extracting file extension

    #downloading the file
    new_file = await context.bot.get_file(file_id)
    file_path = f"temp.{file_extension}"      # save the file with a temporary name and extension
    await new_file.download_to_drive(file_path)

    #extracting text based on file type
    if file_extension == "pdf":
        text = extract_text_from_pdf(file_path, update)     #extracting text from PDF

    elif file_extension == "docx":
        text = extract_text_from_word(file_path, update)    #extracting from word document
    elif file_extension == "txt":
        with open(file_path, 'r') as f:                     #read text from plain text file           
            text = f.read()
    else:
        await update.message.reply_text("Unsupported file type. Please upload a PDF, DOCX, or TXT file.")
        #notify the user about unsupported file types
        os.remove(file_path)         #clean the temporary file
        return

    #preprocess the extracted text and predict the score
    embedding, features = preprocess_essay(text, embedding_model, update)  #extract features,embeddings
    embedding = np.array([embedding])  #wrap the embedding into a batch of size 1
    predicted_score = hybrid_model.predict([embedding, features])     #predicting the essay score

    #send the predicted score
    await update.message.reply_text(f"Score for the essay out of 6: {predicted_score[0][0]:.2f}")

    #ask if feedback is needed by the user
    await update.message.reply_text("Do you want feedback on the essay? Type 'feedback' to receive it.")

    #saving text for feedback generation
    context.user_data['last_essay'] = text

    #clean up the file
    os.remove(file_path)

In [9]:
#function for generating feedback
async def generate_feedback(update: Update, context):
    if 'last_essay' not in context.user_data:         #checking if essay is available for feedback
        await update.message.reply_text("No essay found for feedback. Please upload an essay or provide text input first.")
        return

    essay = context.user_data['last_essay']        #retrieving essay from user data

    try:
        #creating the prompt for detailed feedback
        prompt = (
            "Provide detailed and constructive feedback for the following essay. Include:\n"
            "- Strengths of the essay.\n"
            "- Areas for improvement in content, structure, and grammar.\n"
            "- Suggestions for enhancing clarity and readability.\n\n"
            f"Essay: {essay}"
        )
        
        #generate feedback using Cohere API
        response = cohere_client.generate(
            model='command',
            prompt=prompt,
            max_tokens=500  #we can adjust this if more or less feedback is needed
        )
        feedback = response.generations[0].text.strip()

        #send the feedback
        await update.message.reply_text(f"Here is the feedback for your essay:\n{feedback}")
    except Exception as e:
        #handling errors during feedback generation
        await update.message.reply_text("An error occurred while generating feedback. Please try again later.")
        print(f"Error: {e}")        

In [10]:
#defining a function to handle greetings
async def handle_greeting(update: Update, context):
    await update.message.reply_text("Hello! Hope you are doing well. Please upload an essay to score or provide text input.")

In [11]:
#command to score the essay or respond to greetings
async def score_essay(update: Update, context):
    user_message = update.message.text.strip().lower()   #normalize the user's input text

    #defining a list of greetings
    greetings = ["hi", "hello", "hey", "greetings", "good morning", "good evening", "good afternoon"]

    #check if the input is a greeting
    if user_message in greetings:
        await handle_greeting(update, context)     #handle greeting msg
    elif user_message == "feedback":
        await generate_feedback(update, context)   #handling feedback request
    else:
        #if not a greeting or feedback request, treat the input as an essay
        new_essay = update.message.text
        #preprocessing the essay for feature and embedding extraction
        embedding, features = preprocess_essay(new_essay, embedding_model, update)
        embedding = np.array([embedding])  #wrap the embedding into a batch of size 1

        #predicting the score using the hybrid model
        predicted_score = hybrid_model.predict([embedding, features])
        await update.message.reply_text(f"Predicted Score for the essay: {predicted_score[0][0]:.2f}")

        #saving the essay for feedback generation
        context.user_data['last_essay'] = new_essay
        await update.message.reply_text("Do you want feedback on the essay? Type 'feedback' to receive it.")

In [None]:
#function for starting the bot
async def main():
    #Telegram bot token
    bot_token = 'Use api'

    #creating an application object and initialzing with bot's token
    application = Application.builder().token(bot_token).build()

    #command to start the bot
    application.add_handler(CommandHandler("start", handle_greeting))

    #handle text messages (greetings or essays)
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, score_essay))

    #handle file uploads (PDF, DOCX, TXT)
    application.add_handler(MessageHandler(filters.Document.ALL, handle_file))

    #starting the bot
    await application.run_polling()

await main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  update.message.reply_text(f"Processing {len(processed_words)} words...")
Processing words: 100%|████████████████████████████████████████| 1105/1105 [00:07<00:00, 145.70it/s]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing words: 100%|███████████████████████████████████████| 668/668 [00:00<00:00, 147633.84it/s]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing words: 100%|████████████████████████████████████████| 439/439 [00:00<00:00, 97043.29it/s]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul.RA01/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Processing words: 100%|███████████████████████████████████████| 579/579 [00:00<00:00, 163041.42it/s]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
