In [2]:
import pandas as pd
import nltk
nltk.download('punkt_tab')
import string
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load a Hugging Face Transformer model (BERT-like)
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient and lightweight
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Lemmatizer for better text normalization
lemmatizer = WordNetLemmatizer()

# Function to preprocess text with lemmatization
def preprocess_text(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    return set(tokens)  # Using a set for faster keyword checking

# Function to compute TF-IDF similarity
def compute_tfidf_similarity(model_answers, student_answer):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(model_answers + [student_answer])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]
    return similarities

# Function to compute BERT-based similarity using Hugging Face
def compute_bert_similarity(model_answers, student_answer):
    def get_embedding(text):
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # Using mean pooling with attention mask
        attention_mask = inputs['attention_mask']
        embeddings = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
        masked_embeddings = embeddings * mask
        summed = torch.sum(masked_embeddings, dim=1)
        summed_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
        return summed / summed_mask

    student_embedding = get_embedding(student_answer)
    similarities = [torch.cosine_similarity(student_embedding, get_embedding(ans)).item() for ans in model_answers]
    return similarities

# Main grading function
def grade_essays(questions_file, responses_file):
    # Load questions and responses
    questions_df = pd.read_csv(questions_file)
    responses_df = pd.read_csv(responses_file)

    # Align responses with questions
    student_responses = responses_df.set_index('Question')['Response'].to_dict()

    final_scores = []
    feedbacks = []

    for idx, row in questions_df.iterrows():
        question = row['Question']
        model_answer = row['Answer']
        student_answer = student_responses.get(question, "")

        # If no response, assign zero score
        if not student_answer:
            final_scores.append(0.0)
            feedbacks.append("No response provided.")
            continue

        # Preprocess student response
        student_words = preprocess_text(student_answer)

        # Compute similarities
        tfidf_scores = compute_tfidf_similarity([model_answer], student_answer)
        bert_scores = compute_bert_similarity([model_answer], student_answer)

        # Extract keywords for this question
        keywords = row['Keywords'].split(',') if pd.notna(row['Keywords']) else []
        keywords = [word.strip().lower() for word in keywords]

        # Keyword Matching Score (0 to 10)
        matched_keywords = sum(1 for word in keywords if word in student_words)
        keyword_score = (matched_keywords / len(keywords)) * 10 if keywords else 0.0

        # TF-IDF Score (scaled to 10)
        tfidf_score = round(tfidf_scores[0] * 10, 2)

        # BERT Semantic Score (scaled to 10)
        bert_score = round(bert_scores[0] * 10, 2)

        # Final Score (Weighted Combination)
        final_score = round((0.4 * keyword_score) + (0.3 * tfidf_score) + (0.3 * bert_score), 2)
        final_scores.append(final_score)

        # Feedback Mechanism
        if final_score >= 8:
            feedback = "Excellent response! Demonstrates a strong understanding of the concept."
        elif final_score >= 6:
            feedback = "Good response. Minor improvements could enhance clarity and depth."
        elif final_score >= 4:
            feedback = "Fair response. Needs further development and elaboration on key points."
            missing_keywords = [kw for kw in keywords if kw not in student_words]
            if missing_keywords:
                feedback += f" Consider including these keywords: {', '.join(missing_keywords)}."
        else:
            feedback = "Response needs significant improvement. Focus on addressing the core concepts of the question."
            if keywords:
                feedback += f" Make sure to include key terms like: {', '.join(keywords)}."
        feedbacks.append(feedback)

    # Assign scores and feedback
    questions_df['score'] = final_scores
    questions_df['Feedback'] = feedbacks

    # Calculate total score
    total_score = questions_df['score'].sum()

    return questions_df[['Question', 'score', 'Feedback']], total_score

# Example Usage:
questions_file = '/content/python_essay_short_questions_answers.csv'
responses_file = '/content/questions_responses_5.csv'
result_df, total_score = grade_essays(questions_file, responses_file)

print(result_df)
print(f"Total Score: {total_score}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

                                             Question  score  \
0   Explain the difference between lists and tuple...   3.37   
1         Describe the role of indentation in Python.   3.50   
2              What are Python's built-in data types?   2.92   
3       Explain how Python handles memory management.   3.82   
4   What are Python decorators, and how are they u...   4.20   
5   What is the difference between deep copy and s...   3.74   
6               What are Python modules and packages?   4.89   
7   Explain the Global Interpreter Lock (GIL) in P...   3.56   
8         How does exception handling work in Python?   1.91   
9   What is the purpose of the `__init__` method i...   4.34   
10  How do you create a virtual environment in Pyt...   3.54   
11  What is the difference between `is` and `==` i...   0.00   
12         How do you open and read a file in Python?   0.00   
13               What is a lambda function in Python?   0.00   
14        How does list comprehension wo