<a href="https://colab.research.google.com/github/sherinshaban/Question-Generation-Evaluation/blob/main/Project_Defense_Simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install transformers
!pip install sentencepiece
!pip install PyPDF2
!pip install python-docx

In [None]:
!pip install -U transformers accelerate huggingface_hub

In [None]:
# -*- coding: utf-8 -*-

import PyPDF2
import docx
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login
import os
import json

from google.colab import drive
# Mount Google Drive so files under /content/drive are reachable. Any failure
# (including "already mounted") is reported with a message and does not stop
# the script.
try:
    drive.mount('/content/drive')
except Exception:
    print("Drive already mounted or error occurred during mounting.")

# Directory on the mounted Drive where save_questions_to_json writes its output.
DRIVE_PATH = '/content/drive/MyDrive/Project_Files_For_Graduation/'

# Number of whitespace-separated words per chunk handed to split_text.
CHUNK_SIZE = 800
# Page limit passed to read_pdf / read_docx (read_docx converts it to a word
# budget of 500 words per page).
MAX_PAGES = 30

def read_pdf(file_path, max_pages):
    """Extract the text of the first ``max_pages`` pages of a PDF file.

    Args:
        file_path: Path of the PDF file to open.
        max_pages: Maximum number of pages to read, counted from the start of
            the document. Zero or a negative value reads no pages.

    Returns:
        The extracted page texts joined by newlines. Failures are reported as
        a message string rather than an exception: a message starting with
        ``"Error:"`` when the file does not exist, or one starting with
        ``"An error occurred"`` for any other problem.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Clamp at zero so a negative limit cannot turn into a
            # "drop the last pages" range.
            num_pages = max(0, min(len(reader.pages), max_pages))
            # extract_text() may yield None/"" for pages without a text layer.
            page_texts = [
                reader.pages[index].extract_text() or ""
                for index in range(num_pages)
            ]
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."
    except Exception as e:
        return f"An error occurred while reading the file: {e}"
    # A newline between pages keeps the last word of one page from fusing with
    # the first word of the next when the text is later split on whitespace.
    return "\n".join(page_texts)

def read_docx(file_path, max_pages):
    """Read paragraph text from a DOCX file up to an approximate page limit.

    DOCX files carry no page boundaries, so the limit is approximated as
    ``max_pages * 500`` words; reading stops after the paragraph that reaches
    that budget.

    Args:
        file_path: Path of the DOCX file to open.
        max_pages: Approximate number of pages to read (500 words per page).

    Returns:
        The paragraph texts, each followed by a newline. Failures are reported
        as a message string rather than an exception: a message starting with
        ``"Error:"`` when the file does not exist, or one starting with
        ``"An error occurred"`` for any other problem.
    """
    # python-docx reports a missing path with its own PackageNotFoundError,
    # not FileNotFoundError, so check explicitly to give callers the
    # "Error: ... not found" message they look for.
    if not os.path.exists(file_path):
        return f"Error: The file at {file_path} was not found."

    paragraphs = []
    try:
        doc = docx.Document(file_path)
        limit_words = max_pages * 500
        words_count = 0
        for para in doc.paragraphs:
            paragraphs.append(para.text + "\n")
            words_count += len(para.text.split())
            if words_count >= limit_words:
                break
    except FileNotFoundError:
        # Still possible if the file disappears between the check and the open.
        return f"Error: The file at {file_path} was not found."
    except Exception as e:
        return f"An error occurred while reading the file: {e}"
    return "".join(paragraphs)

def split_text(text, chunk_size):
    """Break ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` produces, and each chunk is those words
    re-joined with single spaces. Empty or whitespace-only text gives an
    empty list.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

def generate_questions_with_mistral(text, prompt, model, tokenizer):
    """Generate questions for one piece of project text with an instruct model.

    The instruction and the project text are wrapped in an ``[INST] ... [/INST]``
    prompt, sampled from ``model``, and the model's reply is split into lines.

    Args:
        text: Project content the questions should be about.
        prompt: Instruction describing what to generate.
        model: Already-loaded causal language model exposing ``generate`` and
            ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list of the non-empty, stripped lines of the model's reply. If the
        reply is empty, or any exception is raised during inference, a
        one-element list holding an explanatory message is returned instead.
    """
    try:
        # Create a prompt to instruct the model
        full_prompt = f"""
        [INST]
        {prompt}

        Project Content:
        {text}
        [/INST]
        """
        # Tokenize the prompt and generate questions
        encoded_input = tokenizer(full_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
        prompt_length = encoded_input["input_ids"].shape[1]
        generated_ids = model.generate(
            **encoded_input,
            max_new_tokens=4000,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1
        )

        # The returned sequence starts with the prompt tokens. Decode only the
        # newly generated tail instead of searching the decoded text for
        # "[/INST]", which would cut at the wrong place whenever the project
        # content itself contains that marker.
        new_token_ids = generated_ids[0][prompt_length:]
        questions_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

        # Split the output by lines, remove empty lines, and return a list
        questions_list = [q.strip() for q in questions_text.split('\n') if q.strip()]
        if not questions_list:
            return ["No questions could be generated. The model response was not as expected."]
        return questions_list

    except Exception as e:
        return [f"An error occurred during model inference: {e}"]

def generate_essay_questions_only(text, model, tokenizer):
    """Ask the model for seven essay questions (with sample answers) on ``text``.

    Returns a dict with the single key ``'Essay Questions'`` mapped to the list
    produced by ``generate_questions_with_mistral``.
    """
    essay_prompt = "Generate exactly 7 in-depth essay questions that require a detailed, comprehensive answer about the project's core concepts or impact. Provide a sample model answer for each question."
    essay_items = generate_questions_with_mistral(text, essay_prompt, model, tokenizer)
    return {'Essay Questions': essay_items}

def save_questions_to_json(questions_dict, file_name="essay_questions_and_answers.json"):
    """Write ``questions_dict`` as indented UTF-8 JSON under ``DRIVE_PATH``.

    The destination directory is created when missing. A confirmation line is
    printed on success; an ``IOError`` is reported with a printed message
    instead of being raised.
    """
    destination = os.path.join(DRIVE_PATH, file_name)
    parent_dir = os.path.dirname(destination)

    try:
        os.makedirs(parent_dir, exist_ok=True)
        with open(destination, 'w', encoding='utf-8') as handle:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(questions_dict, handle, ensure_ascii=False, indent=4)
        print(f"Essay questions and answers have been successfully saved to the file: {destination}")
    except IOError as e:
        print(f"An error occurred while trying to save the file: {e}")

def main():
    """Generate essay questions for a user-supplied PDF or DOCX file.

    Prompts for a file path, reads the document, splits it into chunks of
    ``CHUNK_SIZE`` words, asks the Mistral instruct model for essay questions
    on each chunk, and saves all collected lines to a JSON file on Drive.
    Returns early with a printed message when the file type is unsupported,
    the file cannot be read, or no text could be extracted.
    """
    file_path = input("Please enter the path to your PDF or DOCX file (e.g., /content/drive/MyDrive/Project.pdf): ").strip()

    # Compare the extension case-insensitively so "Report.PDF" is accepted.
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        document_text = read_pdf(file_path, MAX_PAGES)
    elif lowered_path.endswith('.docx'):
        document_text = read_docx(file_path, MAX_PAGES)
    else:
        print("Error: Unsupported file type. Please use a .pdf or .docx file.")
        return

    # The readers report failure through their return string. Missing files
    # start with "Error", every other failure with "An error occurred while
    # reading the file"; both must stop here so an error message is never fed
    # to the model as document content.
    if document_text.startswith(("Error", "An error occurred while reading the file")):
        print(document_text)
        return

    # Nothing to ask about (e.g. a scanned PDF without a text layer): stop
    # before paying for the model download and load.
    if not document_text.strip():
        print("Error: No extractable text was found in the document.")
        return

    print("\n-----------------------------------------------------")
    print("Generating only 7 Essay Questions with Mistral...")
    print("-----------------------------------------------------")

    # --- Load the model and tokenizer ONCE ---
    try:
        login()
    except Exception as e:
        # Login is best-effort, but say so instead of failing silently; a
        # later model download error is then easier to diagnose.
        print(f"Warning: Hugging Face login did not complete ({e}). Continuing without it.")

    model_id = "mistralai/Mistral-7B-Instruct-v0.2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16, device_map="auto")

    text_chunks = split_text(document_text, CHUNK_SIZE)

    all_questions = {
        'Essay Questions': []
    }

    for i, chunk in enumerate(text_chunks, start=1):
        print(f"Processing chunk {i}/{len(text_chunks)}...")
        chunk_questions = generate_essay_questions_only(chunk, model, tokenizer)
        all_questions['Essay Questions'].extend(chunk_questions.get('Essay Questions', []))

    save_questions_to_json(all_questions)


if __name__ == "__main__":
    main()

In [None]:
# -*- coding: utf-8 -*-

import PyPDF2
import docx
import json
import os
import re
from huggingface_hub import login
from google.colab import drive
import torch

# NEW LIBRARIES FOR FAST SIMILARITY EVALUATION
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Connect Google Drive. Any failure (including "already mounted") is reported
# with a message and does not stop the script.
try:
    drive.mount('/content/drive')
except Exception:
    print("Drive already mounted or error occurred during mounting.")

# Directory and file name that load_questions_from_json joins to locate the
# questions JSON file.
DRIVE_PATH = '/content/drive/MyDrive/Project_Files_For_Graduation/'
QUESTIONS_FILE_NAME = "essay_questions_and_answers.json"

# Settings for the Similarity Model
# Model name handed to SentenceTransformer in load_similarity_model.
SIMILARITY_MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_questions_from_json(file_name=QUESTIONS_FILE_NAME):
    """Read the questions JSON file stored under ``DRIVE_PATH``.

    Returns the parsed data when it contains an ``'Essay Questions'`` key.
    Returns ``None`` after printing a message when the key is absent, the
    file is missing, or any other error occurs while loading.
    """
    source_path = os.path.join(DRIVE_PATH, file_name)
    try:
        with open(source_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

            if 'Essay Questions' not in payload:
                print("Error: JSON file structure is missing 'Essay Questions' key.")
                return None
            return payload

    except FileNotFoundError:
        print(f"Error: Questions file not found at {source_path}. Please generate questions first.")
    except Exception as e:
        print(f"An error occurred while loading the file: {e}")
    return None

def parse_qa_string(qa_string):
    """
    Separate a combined question/answer string into ``(question, model_answer)``.

    The split happens at the first "Model Answer", "Correct Answer" or
    "True or False" marker (case-insensitive, optional colon). A leading
    "<number>. Question <number>:" label is removed from the question. When no
    marker is present, the stripped input is returned as the question together
    with a placeholder answer.
    """
    found = re.search(r'(.*?)(Model Answer|Correct Answer|True or False)\s*:?\s*(.*)', qa_string, re.DOTALL | re.IGNORECASE)

    if found is None:
        return qa_string.strip(), "N/A - Model Answer not found in expected format."

    raw_question, _marker, raw_answer = found.groups()
    # Drop a numbered "Question" label such as "3. Question 3:" from the front.
    cleaned_question = re.sub(r'^\d+\.?\s*Question\s*\d*\s*:?\s*', '', raw_question.strip(), flags=re.IGNORECASE).strip()
    return cleaned_question, raw_answer.strip()

def load_similarity_model():
    """Build the SentenceTransformer named by ``SIMILARITY_MODEL_NAME`` on ``DEVICE``.

    Returns the model, or ``None`` after printing the error if construction
    fails for any reason.
    """
    try:
        return SentenceTransformer(SIMILARITY_MODEL_NAME, device=DEVICE)
    except Exception as e:
        print(f"Error loading similarity model: {e}")
    return None

def evaluate_by_similarity(model_answer, student_answer, similarity_model):
    """
    Grade a student's answer by its cosine similarity to the model answer.

    Both answers are embedded with ``similarity_model.encode`` and the cosine
    similarity of the two vectors is mapped onto a 0-5 scale.

    Args:
        model_answer: Reference answer text.
        student_answer: Answer text supplied by the student.
        similarity_model: Loaded embedding model exposing ``encode``, or
            ``None`` when loading failed.

    Returns:
        A ``(score, feedback)`` tuple. ``score`` is between 0 and 5, rounded to
        one decimal place; ``feedback`` is a short verdict that includes the
        raw similarity. A missing model or a blank student answer yields a
        score of 0 with an explanatory message.
    """
    if similarity_model is None:
        return 0, "Error: Similarity model not loaded."

    # A blank answer has nothing to compare; embedding it would still produce
    # an arbitrary non-zero similarity, so grade it as zero directly.
    if not student_answer or not student_answer.strip():
        return 0.0, "Needs improvement. No answer was provided, so there is nothing to compare with the model answer."

    sentences = [model_answer, student_answer]
    embeddings = similarity_model.encode(sentences)

    similarity_score = float(
        cosine_similarity(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]
    )

    # Cosine similarity ranges over [-1, 1]; clamp so unrelated answers cannot
    # receive a negative grade and rounding noise cannot exceed 5.
    bounded_similarity = min(1.0, max(0.0, similarity_score))
    score_out_of_5 = round(bounded_similarity * 5, 1)

    if score_out_of_5 >= 4.5:
        feedback = "Excellent! Your answer is highly accurate and comprehensive (Similarity: {:.2f}).".format(similarity_score)
    elif score_out_of_5 >= 3.5:
        feedback = "Very Good. Your answer is largely correct, but slightly misses some key details (Similarity: {:.2f}).".format(similarity_score)
    elif score_out_of_5 >= 2.5:
        feedback = "Fair. Your answer shows a partial understanding of the concept (Similarity: {:.2f}).".format(similarity_score)
    else:
        feedback = "Needs improvement. Your answer has significant discrepancies or is incomplete (Similarity: {:.2f}).".format(similarity_score)

    return score_out_of_5, feedback

# Prefix of the placeholder parse_qa_string returns when an entry has no answer.
_NO_REFERENCE_PREFIX = "N/A - Model Answer not found"


def _pair_questions_with_answers(raw_entries):
    """Turn stored entries into ``[question, reference_or_None]`` pairs.

    Stored entries may hold a question and its answer together or on separate
    consecutive entries. An answer-only entry is attached to the preceding
    question if that question has no reference yet; otherwise it is dropped.
    """
    pairs = []
    for raw_entry in raw_entries:
        question, answer = parse_qa_string(raw_entry)
        has_reference = bool(answer) and not answer.startswith(_NO_REFERENCE_PREFIX)
        if question:
            pairs.append([question, answer if has_reference else None])
        elif has_reference and pairs and pairs[-1][1] is None:
            pairs[-1][1] = answer
    return pairs


def run_simulation(questions_dict, similarity_model):
    """Run the interactive defense simulation and grade each answer.

    Each question is printed, the student's answer is read from standard
    input, and the answer is scored against the reference answer with
    ``evaluate_by_similarity``. Questions without a reference answer are still
    asked but reported as not graded, because scoring against a placeholder
    would be meaningless.

    Args:
        questions_dict: Dict whose ``'Essay Questions'`` entry is a list of
            question/answer strings.
        similarity_model: Embedding model passed through to the grader.
    """

    print("\n=====================================================")
    print("Starting Project Defense Simulation (FAST EVALUATION) ")
    print("=====================================================")

    target_q_type = 'Essay Questions'

    qa_pairs = _pair_questions_with_answers(questions_dict.get(target_q_type, []))
    if not qa_pairs:
        print("No questions are available for the simulation.")
        return

    print(f"Displaying ALL {len(qa_pairs)} questions of type: {target_q_type}")

    for number, (question, model_answer) in enumerate(qa_pairs, start=1):

        print(f"\n--- Question {number} ---")
        print(f"QUESTION: {question}")

        student_answer = input("Your Answer (Student): ")

        if model_answer is None:
            print("\n--- Evaluation Result ---")
            print("SCORE: N/A")
            print("FEEDBACK: No reference answer is available for this question, so it was not graded.")
            print("------------------------------------------\n")
            continue

        print("\n[FAST GRADER is calculating similarity score...]")

        score, feedback = evaluate_by_similarity(model_answer, student_answer, similarity_model)

        print("\n--- Evaluation Result ---")
        print(f"SCORE: {score}/5")
        print(f"FEEDBACK: {feedback}")
        print(f"\nModel Answer (Reference): {model_answer}")
        print("------------------------------------------\n")

# =========================================================
# === Main Execution Function (loads only the similarity model) ===
# =========================================================

def main():
    """Load the similarity model and the saved questions, then run the simulation.

    Stops quietly if the similarity model cannot be loaded, and prints a fatal
    error if the questions file cannot be loaded.
    """
    print("\n Loading fast Similarity Model...")
    similarity_model = load_similarity_model()

    if similarity_model is not None:
        print("Similarity Model loaded successfully.")

        # Load the questions produced by the generation step.
        all_questions = load_questions_from_json()

        if all_questions is None:
            print("\nFATAL ERROR: Questions file not loaded. Cannot run simulation.")
        elif all_questions:
            # Run the interactive question/answer session with grading.
            run_simulation(all_questions, similarity_model)


if __name__ == "__main__":
    main()
