### Grading Tool code and Experimental setup.

In [1]:
import os
import pandas as pd
import json
import re
import requests
import openai
from dotenv import load_dotenv

In [2]:
# Load environment variables from the .env file before the key is looked up
load_dotenv()

# Endpoint that every chat-completion request is posted to
api_endpoint = "https://api.openai.com/v1/chat/completions"
api_key = os.getenv("API_KEY")

# Headers for the API request: bearer-token auth first, then the JSON content type
headers = {}
headers["Authorization"] = f"Bearer {api_key}"
headers["Content-Type"] = "application/json"

In [3]:
def read_metadata(file_path):
    """
    Loads the exam metadata and grading criteria from an Excel workbook.

    file_path is passed straight to pandas.read_excel and the resulting
    DataFrame is returned unchanged.
    """
    metadata_frame = pd.read_excel(file_path)
    return metadata_frame

def read_questions(file_path):
    """
    Loads the questions, correct answers, and max marks from an Excel workbook.

    file_path is passed straight to pandas.read_excel and the resulting
    DataFrame is returned unchanged.
    """
    questions_frame = pd.read_excel(file_path)
    return questions_frame

def read_student_answers(file_path):
    """
    Loads the processed student-answer workbook, indexed by its "Student Id" column.

    The file is read with pandas.read_excel, then "Student Id" becomes the
    index so rows can be looked up by student id.
    """
    answers = pd.read_excel(file_path)
    indexed_answers = answers.set_index("Student Id")
    return indexed_answers

In [4]:
def create_system_content(metadata, rubric_level = "full"):
    """
    Creates the system content based on metadata and grading rubric criteria (no, partial, full).

    Parameters:
        metadata: table-like object indexed as metadata[column][0]; the 'Topics'
            column is iterated in full, every other column is read at row 0 only.
        rubric_level: 'no' gives a generic grading prompt, 'partial' adds the
            professor's grading remarks, and any other value (default "full")
            builds the full prompt with subject, level, major, topics, remarks,
            the individual criteria and the grading strictness.

    Returns:
        The system prompt as a string.
    """
    if rubric_level == 'no':
        prompt = """
        You are an AI assistant specialized in grading student exam answers.
        Grade the student answers based on given correct answer and maximum marks.
        """
    elif rubric_level == 'partial':
        prompt = f"""
        You are an AI assistant specialized in grading student exam answers based on specified criteria.
        Evaluate answers provided by students based on the following exam instructions from the professor: "{metadata['Grading remarks'][0]}".
        """
    else:
        # The Topics column can be shorter than the table (blank cells load as
        # NaN/None) or hold non-string values; str.join would raise TypeError on
        # those, so blanks are skipped and the rest are converted to text.
        topics = ', '.join(str(topic) for topic in metadata['Topics'] if pd.notna(topic))
        prompt = f"""
        You are an AI assistant specialized in grading student exam answers based on specified criteria.
        Evaluate answers provided by students based on provided correct answers and grading instructions.
        You are grading for the subject: {metadata['Subject'][0]}, at the {metadata['Class Level'][0]} level 
        who are doing a major in {metadata['Major'][0]}, focusing on the following topics: {topics}.
        Strongly consider the grading remarks from the professor for this particular exam: "{metadata['Grading remarks'][0]}"
        
        Additional Grading criteria to consider:
        - Correctness: {metadata['Correctness'][0]}
        - Argument Quality and Analysis: {metadata['Argument Quality and Analysis'][0]}
        - Grammar: {metadata['Grammar'][0]}
        - Clarity and Structure: {metadata['Clarity and Structure'][0]}

        Based on the selected criterias marked 'Yes' and {metadata['Grading strictness'][0]} level of grading strictness, 
        create a detailed internal rubric for yourself to evaluate the student's response.
        Consider it for grading and feedback of each student.
        """

    return prompt


# Grading strictness options: very low, low, moderate, high, very high

In [5]:
def user_question_prompt(question, correct_answer, student_answer, max_marks):
    """
    Builds the user prompt that asks the model to grade one student answer.

    The prompt quotes the question, the correct answer/guideline and the
    student's answer, states the maximum marks, and requests a reply in the
    "Grade : / Feedback :" format. No API call is made here; the prompt text
    is only assembled and returned.
    """
    return f"""
    Based on the passage text.
    Question: "{question}"
    
    Correct Answer/guideline: "{correct_answer}"
    
    Student's Answer: "{student_answer}"
    
    Provide a grade out of {max_marks}, which can be fractional, and a very concise constructive feedback on the answer 
    if incorrect or only partially correct, explaining what was wrong. If right, just say "Correct" in the feedback.
    Note - Donot reveal the exact grading criteria in the feedback.
    
    Follow response format:
    Grade : 
    Feedback : 
    """


In [6]:
def log_token_usage(total_tokens, file_path='token_usage.json'):
    """
    Adds total_tokens to the running total stored in a JSON file.

    The file holds a single object with a "total_tokens" counter. When the
    file does not exist yet the counter starts at 0; the updated object is
    then written back to the same path.
    """
    # Start from zero unless a previous total is already on disk
    usage = {"total_tokens": 0}
    if os.path.exists(file_path):
        with open(file_path, 'r') as handle:
            usage = json.load(handle)

    # Accumulate this call's tokens onto the stored total
    usage["total_tokens"] = usage["total_tokens"] + total_tokens

    # Persist the new total
    with open(file_path, 'w') as handle:
        json.dump(usage, handle)

In [21]:
def call_openai_api(message_history, rubric_level, model_name, iter, timeout=120):
    """
    Function to call OpenAI API and log token usage to a JSON file.

    Parameters:
        message_history: list of chat messages sent as the "messages" field.
        rubric_level: used only to name the token-usage log file.
        model_name: sent as the "model" field and used (sanitized) in the log file name.
        iter: experiment iteration number, used only to name the log file.
        timeout: seconds passed to requests.post so a stalled connection cannot
            hang the whole grading run.

    Returns:
        The content of the first choice's message in the API response.

    Raises:
        RuntimeError: if the API answers with an unsuccessful HTTP status.
    """

    payload = {
        "model": model_name,
        "messages": message_history,
        "temperature": 0.1
    }

    # Make the API call
    response = requests.post(api_endpoint, headers=headers, json=payload, timeout=timeout)
    response_data = response.json()
    print(response_data)

    # A failed request has no usage/choices to read; report the status and body
    # instead of failing below with an opaque KeyError.
    if not response.ok:
        raise RuntimeError(
            f"OpenAI API request failed with HTTP {response.status_code}: {response_data}"
        )

    total_tokens = response_data['usage']['total_tokens']

    # Sanitize model name for file compatibility
    safe_model_name = re.sub(r'[^A-Za-z0-9]', '_', model_name)
    # Log the token usage
    log_file = f"/Users/rrishabh/Documents/Thesis related docs/Thesis Data/Output/token_usage_{safe_model_name}_{rubric_level}_{iter}.json"
    log_token_usage(total_tokens, log_file)

    # Return the API response content
    return response_data['choices'][0]['message']['content']

In [8]:
# Leading (optionally signed) integer or decimal, e.g. the "3.5" in "3.5/5"
_GRADE_NUMBER = re.compile(r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)')


def _parse_grade(text):
    """
    Convert the text after "Grade:" to a float, or None if it has no leading number.
    """
    try:
        return float(text)
    except ValueError:
        pass
    # Replies such as "4/5" or "3.5 out of 5" are not valid floats on their own;
    # take the number they start with as the awarded grade.
    match = _GRADE_NUMBER.match(text.strip("*_` "))
    return float(match.group()) if match else None


def parse_response(response):
    """
    Parse the API response to extract grade and feedback.

    Each line is split at its first colon. A line labelled "grade" (case
    insensitive) supplies the grade and a line labelled "feedback" supplies
    the feedback; when a label occurs more than once the last one wins.
    Markdown emphasis around the label (e.g. "**Grade:** 4") is tolerated.

    Parameters:
        response: the raw text returned by the model.

    Returns:
        {"Grade": float or None, "Feedback": str}. Grade is None when no grade
        line is present or it does not start with a number; Feedback is ""
        when no feedback line is present.
    """
    grade = None
    feedback = ""
    for line in response.split('\n'):
        raw_label, separator, raw_value = line.partition(":")
        if not separator:
            continue

        label = raw_label.strip()
        value = raw_value.strip()

        plain_label = label.strip("*_ ")
        if plain_label != label:
            # With "**Grade:** 4" the closing markers end up at the start of the value
            value = value.lstrip("*_ ")

        label = plain_label.lower()
        if label == "grade":
            grade = _parse_grade(value)
        elif label == "feedback":
            feedback = value
    return {"Grade": grade, "Feedback": feedback}

In [9]:
# def process_passage_questions(student_id, passage, questions_df, student_answers_df, system_context):
def process_passage_questions(student_id, passage, questions_df, student_answers_df, system_context, rubric_level, model_name, iter):
    """
    Grades every question of one passage for one student in a single conversation.

    The conversation starts with the system context and the passage text. For
    each row of questions_df the grading prompt is appended, the API is
    called, and the reply is both parsed into the results and appended back
    to the conversation so later questions see the earlier exchanges.

    Returns a tuple (results, message_history): results maps each
    "Question ID" to the parsed grade/feedback dict, and message_history is
    the full conversation including the model replies.
    """

    # Seed the conversation: system context, the passage, and a canned acknowledgement
    message_history = [
        {"role": "system", "content": system_context},
        {"role": "user", "content": f"Following questions are based on the given passage text: {passage}"},
        {"role": "assistant", "content": "Understood"},
    ]
    results = {}

    for _, row in questions_df.iterrows():
        qid = row["Question ID"]
        # The answers table is indexed by student id with one column per question id
        answer = student_answers_df.loc[student_id, qid]

        prompt = user_question_prompt(
            question=row["Question"],
            correct_answer=row["Correct Answer"],
            student_answer=answer,
            max_marks=row["Maximum Marks"],
        )

        # Ask the model with the whole conversation so far
        message_history.append({"role": "user", "content": prompt})
        reply = call_openai_api(message_history, rubric_level, model_name, iter)

        # Keep the parsed grade/feedback and record the raw reply in the conversation
        results[qid] = parse_response(reply)
        message_history.append({"role": "assistant", "content": reply})

    return results, message_history


In [10]:
def generate_overall_feedback(message_history, rubric_level, model_name, iter):
    """
    Requests a summary of the student's overall performance from the model.

    The summary request is appended to message_history in place (the caller's
    list is modified), then the whole conversation is sent through
    call_openai_api and its return value is returned.
    """

    summary_instructions = """
    Considering all the data so far, generate an in-depth summary of the student’s performance, 
    highlighting key weaknesses and deeper knowledge gaps they can address to improve their subject understanding. 
    Limit feedback to 1-2 paragraphs and keep it very concise."""

    summary_request = {"role": "user", "content": summary_instructions}
    message_history.append(summary_request)

    # Optional debugging aid: save message history to a JSON file for analysis.
    # file_path = "/Users/rrishabh/Documents/Thesis related docs/Thesis Data/Output/message_history.json"
    # with open(file_path, "w") as file:
    #     json.dump(message_history, file, indent=4)

    overall_feedback = call_openai_api(message_history, rubric_level, model_name, iter)
    return overall_feedback

In [11]:
def export_results_to_excel(results, rubric_level, model_name, iter):
    """
    Export the grading results to an Excel file, one row per student.

    Parameters:
        results: dict mapping student id -> dict of passage id -> dict of
            question id -> {"Grade": ..., "Feedback": ...}; the special key
            "Overall Feedback" holds the student's overall feedback text
            instead of per-question results.
        rubric_level, model_name, iter: used only to build the output file
            name (model_name is sanitized to letters, digits and underscores).

    Each row holds "Student ID", a "<question id>_Grade" and
    "<question id>_Feedback" column per question, "Overall Feedback" when
    present, and "Total_grade" as the sum of the student's numeric grades.
    """
    data = []
    for student_id, student_data in results.items():
        student_row = {"Student ID": student_id}
        total_grade = 0

        for passage_id, questions in student_data.items():
            if passage_id == "Overall Feedback":
                student_row["Overall Feedback"] = questions
                continue

            for question_id, result in questions.items():
                grade = result["Grade"]
                student_row[f"{question_id}_Grade"] = grade
                student_row[f"{question_id}_Feedback"] = result["Feedback"]
                # Grade is None when no numeric grade could be parsed from the
                # model's reply; adding None would raise TypeError and abort the
                # export, so such questions are left out of the total.
                if grade is not None:
                    total_grade += grade

        student_row["Total_grade"] = total_grade
        data.append(student_row)

    df = pd.DataFrame(data)
    # Sanitize model name for file compatibility
    safe_model_name = re.sub(r'[^A-Za-z0-9]', '_', model_name)
    file_name = f"/Users/rrishabh/Documents/Thesis related docs/Thesis Data/Output/student_grades_{safe_model_name}_{rubric_level}_{iter}.xlsx"

    df.to_excel(file_name, index=False)

In [12]:
def grade_students(metadata_file, questions_file, student_answers_file, rubric_level = "full", model_name = "gpt-4o-mini", iter=1):
    """
    Main function to execute grading for all students.

    Reads the metadata, questions and student answers, grades every student
    passage by passage, requests an overall feedback per student, and finally
    exports all results to Excel.

    Parameters:
        metadata_file, questions_file, student_answers_file: paths handed to
            read_metadata, read_questions and read_student_answers.
        rubric_level: forwarded to create_system_content and to the API/export helpers.
        model_name: model identifier forwarded to the API/export helpers.
        iter: experiment iteration number forwarded to the API/export helpers.

    Returns nothing; the results are written by export_results_to_excel.
    """

    # Read data and setup
    metadata = read_metadata(metadata_file)
    questions_df = read_questions(questions_file)
    student_answers_df = read_student_answers(student_answers_file)
    system_context = create_system_content(metadata, rubric_level)
    global_results = {}

    # groupby drops rows whose key is NaN by default, which would silently skip
    # every question without a "Passage ID" and make the empty-passage fallback
    # below unreachable; dropna=False keeps them as their own group.
    # The grouping is the same for every student, so it is built once.
    passage_groups = list(questions_df.groupby("Passage ID", dropna=False))

    # Process each student
    for student_id in student_answers_df.index:
        student_results = {}
        complete_message_history = [{"role": "system", "content": system_context}]

        # Process each passage
        for passage_id, passage_questions in passage_groups:
            # Questions without a passage id are graded with an empty passage text
            passage_text = passage_questions.iloc[0]["Passage text"] if pd.notnull(passage_id) else ""
            results, message_history = process_passage_questions(student_id, passage_text, passage_questions, student_answers_df, system_context, rubric_level, model_name, iter)
            student_results[passage_id] = results

            # Append passage's message history to the complete message history
            complete_message_history.extend(message_history[1:])  # Skip the initial system context in each call

        # Generate overall feedback using complete message history
        overall_feedback = generate_overall_feedback(complete_message_history, rubric_level, model_name, iter)
        student_results["Overall Feedback"] = overall_feedback
        global_results[student_id] = student_results

    # Convert global results to DataFrame and save to Excel
    export_results_to_excel(global_results, rubric_level, model_name, iter)

### Running the experiments

In [22]:
# Input Files: all three workbooks live in the same input directory
input_dir = "/Users/rrishabh/Documents/Thesis related docs/Thesis Data/Input"

metadata_file = f"{input_dir}/metadata.xlsx"
questions_file = f"{input_dir}/questions.xlsx"

student_answers_file = f"{input_dir}/compiled_student_answers.xlsx"
# Smaller answer set for trial runs:
# student_answers_file = f"{input_dir}/compiled_student_answers_test.xlsx"

In [None]:
# Define the parameters for the experiment

# rubric_levels = ["full", "partial", "no"]
# models = ["gpt-4o", "gpt-4o-mini"]  # Add other relevant OpenAI models as needed
iterations = 1 #4
rubric_levels = ["full"]
models = ["gpt-4o"]

# List every (rubric level, model, iteration) combination up front;
# iterations are numbered from 1 and vary fastest, rubric levels slowest
experiment_runs = [
    (rubric_level, model_name, iter_num)
    for rubric_level in rubric_levels
    for model_name in models
    for iter_num in range(1, iterations + 1)
]

# Run the grading pipeline once per combination
for rubric_level, model_name, iter_num in experiment_runs:
    print(f"Running experiment with rubric level: {rubric_level}, model: {model_name}, iteration: {iter_num}")
    grade_students(
        metadata_file,
        questions_file,
        student_answers_file,
        rubric_level=rubric_level,
        model_name=model_name,
        iter=iter_num,
    )

### Back up prompts

In [None]:
# # -------------------------- Backup Cell --------------

# def create_system_content(metadata, rubric_level = "full"):
#     """
#     Creates the system content based on metadata and grading rubric criteria (no, partial, full).
#     """
#     if rubric_level == 'no':

#         prompt = """
#         You are an AI assistant specialized in grading student exam answers.
#         Grade the student answers based on given correct answer and maximum marks.
#         """

#     elif rubric_level == 'partial':
       
#         prompt = f"""
#         You are an AI assistant specialized in grading student exam answers based on 
#         specified criteria.
#         Evaluate answers provided by students based on the following exam instructions 
#         from the professor: "{metadata['Grading remarks'][0]}".
#         """

#     else:

#         prompt = f"""
#         You are an AI assistant specialized in grading student exam answers based on specified criteria.
#         Evaluate answers provided by students based on provided correct answers.
#         You are grading for the subject: {metadata['Subject'][0]}, at the {metadata['Class Level'][0]} level 
#         who are doing a major in {metadata['Major'][0]}, focusing on the following topics: {', '.join(metadata['Topics'])}.
#         Strongly consider the grading remarks from the professor for this particular exam: "{metadata['Grading remarks'][0]}"
        
#         Additional Grading criteria to consider:
#         - Completion: {metadata['Completion'][0]}
#         - Correctness: {metadata['Correctness'][0]}
#         - Argument Quality and Analysis: {metadata['Argument Quality and Analysis'][0]}
#         - Originality and Creative: {metadata['Originality and Creative'][0]}
#         - Grammar: {metadata['Grammar'][0]}
#         - Clarity and Structure: {metadata['Clarity and Structure'][0]}
#         - Length & Conciseness: {metadata['Length & Conciseness'][0]}
#         - Evidence use: {metadata['Evidence use'][0]}
#         - Grading strictness: {metadata['Grading strictness'][0]}

#         Based on the selected criterias marked 'Yes' and given level of Grading strictness, 
#         create a detailed internal rubric for yourself to evaluate the student's response.
#         Consider it for grading and feedback of each student.
#         """

#     return prompt


# # - Grading strictness #-> very low, low, moderate, high, very high

In [None]:
# Grading instructions by professor.
# For numerical questions, if the answer is within ±1% of the correct value, give the full designated marks; if it is close, give partial marks; otherwise give 0.
# For theoretical questions, the sample answer consists of keywords/key arguments. Give full marks if 70% of the key arguments are discussed; otherwise give marks in proportion to the percentage of keywords/key arguments present in the student's answer.