In [3]:
import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import pandas as pd
from openai import OpenAI
import anthropic
import os

In [4]:
from llms import CallLLM

In [5]:
from get_content_from_app_prop import read_properties
# Read 'application.properties' through the project-local read_properties helper.
# NOTE(review): `props` is not referenced by any later cell visible in this
# notebook — confirm it is still needed (e.g. by llms.CallLLM) before keeping it.
props = read_properties('application.properties')

In [6]:
# Shared sentence-embedding model: used by create_faiss_index to embed the
# chunks and by evaluate_with_rubric to embed the question, so both live in
# the same vector space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
def load_pdf_text(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Value passed straight to ``PdfReader``.

    Returns:
        str: The extracted text of each page, in page order, with a newline
        appended after every page. A page that yields no text contributes
        just that newline.
    """
    reader = PdfReader(pdf_path)
    # Guard against a falsy result from extract_text() (e.g. an image-only
    # page) so one unreadable page cannot abort the whole load with a
    # TypeError on ``None + "\n"``.
    page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

In [8]:
def chunk_text(text, chunk_size=550, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk shares its last ``overlap`` words with the start of the next one.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks, each a single-space-joined run of words.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give range() a zero step; a larger
        # overlap would give a negative step and silently return no chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would be a
        # short tail already fully contained in this one — stop here so the
        # index does not get a redundant near-duplicate chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

In [9]:
def create_faiss_index(chunks):
    """Embed the chunks and store them in a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed with the notebook-level
            ``embedder``.

    Returns:
        tuple: ``(index, embeddings)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per chunk (added as
        float32) and ``embeddings`` is the array returned by the embedder.

    Raises:
        ValueError: If ``chunks`` is empty, since the embedding dimension
            cannot be determined from zero vectors.
    """
    if len(chunks) == 0:
        # Fail with a clear message instead of an IndexError on shape[1].
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings.astype("float32"))
    return index, embeddings

In [10]:
def rubric_generation(rubric_file_csv):
    """Render a rubric CSV as the bullet-list text embedded in the grading prompt.

    The CSV must provide the columns ``Criterion``, ``Weight``, ``Level`` and
    ``Description``. Criteria appear in order of first occurrence; under each
    criterion, levels appear in the order they first occur anywhere in the file.

    Args:
        rubric_file_csv: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        str: One section per criterion of the form::

            - Criterion: <name> (Weight: <weight>)
              - Level => <level>: <description>

        with each section followed by a blank line.
    """
    rubric = pd.read_csv(rubric_file_csv)
    levels = rubric["Level"].unique().tolist()

    sections = []
    for criterion, rows in rubric.groupby("Criterion", sort=False):
        weight = rows["Weight"].iloc[0]
        lines = [f"- Criterion: {criterion} (Weight: {weight})"]
        for level in levels:
            descriptions = rows.loc[rows["Level"] == level, "Description"]
            if descriptions.empty:
                # Not every criterion has to define every level; skip the
                # missing ones instead of failing on an empty selection.
                continue
            lines.append(f"  - Level => {level}: {descriptions.iloc[0]}")
        sections.append("\n".join(lines) + "\n\n")

    return "".join(sections)


In [None]:
# Strictness guidance text
def get_strictness_tone(strictness):
    """Return the grader instruction text for a strictness level.

    Args:
        strictness: Integer from 1 (most lenient) to 5 (most strict).

    Returns:
        str: The tone instruction for that level.

    Raises:
        ValueError: If ``strictness`` is not one of 1, 2, 3, 4 or 5.
    """
    tones = {
        1: '''Be lenient. Assume partial understanding even if not perfectly phrased. Give the benefit of the doubt when the student shows general relevance to the topic.''',
        2: "Be moderately lenient. Focus on comprehension and intent over precise phrasing. Minor off-topic points or missing citations should not heavily reduce scores.",
        3: "Be balanced and fair. Follow the rubric exactly, rewarding relevance and clarity, but penalizing factual errors or unsupported claims moderately.",
        4: "Be rigorous. Deduct marks for vague or unsupported statements. Only award high scores for detailed, well-supported, and precise answers.",
        5: "Be very strict. Grade as a top-tier academic evaluator. Do not give credit unless the student's response exactly matches information in the notes. Heavily penalize unsupported or off-topic claims.",
    }
    if strictness not in tones:
        # Previously an unknown level left `tone` unbound and surfaced as an
        # UnboundLocalError; report the real problem instead.
        raise ValueError(
            f"strictness must be one of {sorted(tones)}, got {strictness!r}"
        )
    return tones[strictness]

In [28]:
def evaluate_with_rubric(question,
                         student_answer,
                         chunks, index,
                         rubric_path,
                         minimum_word_requirement=50,
                         overall_score=100,
                         k=3, llm="gpt",
                         strictness=None,
                         model=None):
    """Grade a student answer against a rubric using retrieved course context.

    The question is embedded with the notebook-level ``embedder``, the ``k``
    nearest chunks are fetched from ``index`` as context, the rubric CSV is
    rendered with ``rubric_generation``, and the resulting prompt is sent to
    the selected ``CallLLM`` autograder.

    Args:
        question: The question the student answered.
        student_answer: The student's answer text.
        chunks: The text chunks that ``index`` was built from, in the same order.
        index: Index whose ``search(queries, k)`` returns ``(distances, ids)``.
        rubric_path: Rubric CSV location passed to ``rubric_generation``.
        minimum_word_requirement: Minimum answer length stated in the prompt.
        overall_score: Maximum score the final weighted score is scaled to.
        k: Number of context chunks to retrieve.
        llm: Which grader to call: ``"gpt"``, ``"claude"`` or ``"ollama"``.
        strictness: Optional level (1-5) passed to ``get_strictness_tone``;
            when given, the tone text is added to the prompt. ``None`` leaves
            the prompt without any strictness instruction.
        model: Optional model name forwarded to the grader as ``model=``.

    Returns:
        Whatever the selected ``CallLLM`` autograder method returns.

    Raises:
        ValueError: If ``llm`` is not one of the supported grader names.
    """
    grader_methods = {
        "gpt": "autograder_openai",
        "claude": "autograder_anthropic",
        "ollama": "autograder_ollama",
    }
    if llm not in grader_methods:
        # Previously an unknown name fell through every branch and the
        # function returned None without grading anything.
        raise ValueError(
            f"Unsupported llm {llm!r}; expected one of {sorted(grader_methods)}"
        )

    q_embedding = embedder.encode(question, convert_to_numpy=True)

    # retrieve relevant context
    _distances, neighbor_ids = index.search(np.array([q_embedding]).astype("float32"), k)
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # chunks[-1] would silently inject the last chunk, so drop those slots.
    retrieved_chunks = [chunks[i] for i in neighbor_ids[0] if i >= 0]

    context = "\n\n".join(retrieved_chunks)

    print("Retrieved Context for Grading:\n", context)

    # build rubric text
    rubric = rubric_generation(rubric_path)

    # Empty by default so the prompt is unchanged unless strictness is requested.
    if strictness is None:
        strictness_section = ""
    else:
        strictness_section = (
            f"\nGrading strictness (level {strictness}): {get_strictness_tone(strictness)}\n"
        )

    prompt = f"""
You are grading a student's answer using the following rubric:

Rubric:
{rubric}

Minimum word requirement for the answer: {minimum_word_requirement} words.
{strictness_section}
Context from notes:
{context}

Question: {question}
Student Answer: {student_answer}

Student's Answer Length: {len(student_answer.split())} words.

Instructions:
- Score each criterion separately on a scale of 0 to 1.
- Provide reasoning for each score.
- Compute the weighted final score out of {overall_score}.
- If something is wrong or missing, explain why.

I want you to give the output in a JSON format only. I want you to follow the following template strictly:

{{
  "criteria": [
    {{
      "criterion": "{{criterion_name}}",
      "weight": "{{weight}}",
      "feedback": "{{feedback}}",
      "score_received": "{{score_received}}"
    }}
  ],
  "final_weighted_score_calculation": [
    {{
      "criterion": "{{criterion_name}}",
      "calculation": "{{score_received}} * {{weight}}",
      "result": "{{calculation_result}}"
    }}
  ],
  "total_score": {{
    "calculation": "{{score1}} + {{score2}} + ... + {{scoreN}}",
    "result": "{{total_score}}"
  }},
  "final_score": {{
    "out_of": "{{overall_score}}",
    "score": "{{final_score}}"
  }},
  "overall_feedback": "{{overall_feedback}}"
}}
"""
    llms = CallLLM()
    grade = getattr(llms, grader_methods[llm])
    if model is None:
        return grade(prompt)
    # NOTE(review): assumes the CallLLM autograder_* methods accept a `model`
    # keyword — confirm against llms.py.
    return grade(prompt, model=model)
    


In [12]:
# Build the retrieval corpus: extract the PDF text, split it into overlapping
# word chunks, and index the chunk embeddings for nearest-neighbour search.
# Swap the active pdf_path line to grade against a different source document.
# pdf_path = "data/Lecture Native American Cosmologies.pdf"
pdf_path = "data/history_10th_ssc_textbook.pdf"
text = load_pdf_text(pdf_path)
chunks = chunk_text(text)  # default chunk_size=550 words, overlap=50 words
index, embeddings = create_faiss_index(chunks)



# Define rubric
# rubric = {
#     "criteria": [
#         {"name": "Accuracy", "weight": 0.5},
#         {"name": "Clarity", "weight": 0.3},
#         {"name": "Completeness", "weight": 0.2}
#     ]
# }

In [13]:
# Path to the rubric CSV; passed as `rubric_path` to evaluate_with_rubric,
# which renders it via rubric_generation.
rubric = "data/rubric.csv"

In [12]:
# Example question
question = "Why is land considered central to the religion and cultural identity of the Dakota people?"

# correct answer a
student_answer_correct_a = "Land is central to Dakota religion and cultural identity because it represents the center of history and culture, where time and place are contained. Sacred spaces, rivers, lakes, rocks, and village sites carry generational and spiritual meaning, linking humans to their origin and the star tribe. Loss of land through trickery, wars, and urban development has been experienced as a loss of cultural identity, making indigenous religion inseparable from geographic spaces. The Dakota view land not just as territory but as memory and sacred power, holding stories, language, and ceremonies that maintain their spiritual and cultural continuity."

# correct answer b
student_answer_correct_b = "Land is central to Dakota religion and cultural identity because it is seen as the place of human origin and the center of the world, where sacred stories, ceremonies, and generational memory connect people to their ancestors and the spiritual realm. Loss of land disrupts this connection, making geography inseparable from their religious and cultural life."

# wrong answer (contradicts the notes)
# Mis-decoded apostrophes ("‚Äô") restored to "’" so the grader sees clean text.
student_answer_wrong = "Land is not important to the Dakota people’s religion or culture; they primarily focus on reading sacred texts in temples, and their spiritual identity does not depend on geographic locations or natural features."

# unclear answer (on topic but vague)
student_answer_unclear = "Land matters somehow because it has stories and things tied to people and history, and also rivers and rocks and ceremonies, which all connect in ways that maybe affect memory and identity, though it’s complicated and not exactly like just owning or using the land."

# short answer
student_answer_short = "Land is sacred to the Dakota because it holds their history, stories, and spiritual power."

# gemini answer (LLM-written reference answer)
# Mis-decoded em dashes ("‚Äî") restored to "—" so the grader sees clean text.
student_answer_gemini = '''land is considered central to the religion and cultural identity of the Dakota people for several profound reasons.

The Dakota view MniSotaMakoce (Minnesota) as the place of human origin—the center of the world and the Prime Meridian of their cosmology (religion). The geography is where the earth is conjoined with the star tribe and where humans come from.

Land is essentially memory and history; it contains time and place and is marked by sacred stories and ceremonies. The entirety of the environment—rivers, lakes, rocks, and village sites—all hold generational and spiritual meaning. Therefore, the geographic space is seen as the basis of their sacred power. Because of this intimate spiritual connection, the loss of land is experienced directly as a loss of cultural identity. The Dakota believe the land remembers their stories, even if the people themselves have forgotten.'''


In [34]:
# Example question
question = "Write the stages of arranging history in the history research method?"

# correct answer a (history research method question)
# Mis-decoded dash and curly quotes ("‚Äî", "‚Äú", "‚Äù") restored so the grader sees clean text.
student_answer_correct_a = '''The historical research method is a very detailed and systematic process that historians use to study, analyse, and arrange the events of the past in a meaningful and scientific manner. It helps in understanding how societies developed, how cultures changed, and how human life evolved through different periods of time. The process of arranging history begins with examining the relevant references of the available historical information. This includes reading old documents, inscriptions, manuscripts, coins, travel accounts, and various written records that provide clues about the past. After that, historians move to the second stage, which is collecting historical information. In this stage, they gather as much data as possible and highlight the processes that led to important historical transitions, such as wars, reforms, revolutions, and discoveries. They also carry out comparative analysis between different time periods or regions to identify similarities and differences.
The third stage involves understanding the references regarding time and space of the given historical events and the conceptual frameworks used in the research. This means historians must carefully study when and where an event happened and what social, political, and cultural ideas influenced it. The fourth stage is formulating relevant questions based on the historical references — for example, “Why did this event happen?” or “What were its effects?” These questions guide the direction of the research.
After this, historians proceed to the fifth stage, which is formulating hypotheses. A hypothesis is a possible explanation or assumption that can later be tested with evidence. It helps historians focus on proving or disproving certain ideas about the past. The next important step is critically examining various sources of history, which means checking how reliable, authentic, and unbiased the collected data is. Historians compare different sources to avoid mistakes or false information.
Finally, after completing all these steps, they write a historical narrative, which arranges the information in order and explains the causes and effects of events clearly. This whole process ensures that history is not just a story but a well-organized study based on facts, logic, and careful research. Thus, the stages of the historical research method help us to understand human civilization and the journey of mankind from the past to the present.
'''

In [26]:
student_answer_irrelevent = "My name is Srinivasan P and I am a student at the University of Auckland. I am pursuing a degree in Computer Science and have a keen interest in artificial intelligence and machine learning. In my free time, I enjoy hiking, reading science fiction novels, and experimenting with new programming languages. I believe that technology has the power to transform lives and I am excited to be part of this ever-evolving field."

In [40]:
result = evaluate_with_rubric(question, student_answer_correct_a, chunks, index, rubric)
print(result)

Retrieved Context for Grading:
 analysis. Thus, the scope of historiography kept continuously expanding. Writing of histories of various subjects like literature, architecture, sculpture, drawing and painting, music, dance, drama, films and television, etc. came into practice. Michel Foucault 61. (A) Choose the correct option from the given options and complete the statement. (1) It may be said that ‚Ä¶‚Ä¶.. was the founder of modern historiography. (a) Voltaire (b) Ren√© Descartes (c) Leopol d Rank√© (d) Karl Marx (2) ‚Ä¶‚Ä¶‚Ä¶‚Ä¶ wrote the book entitled ‚ÄòArchaeology of Knowledge‚Äô. (a) Karl Marx (b) Michel Foucault (c) Lucien Febvre (d) Voltaire (B) Identify and write the wrong pair in the following set. (1) Georg W ilhelm Friedrich Hegel - ‚ÄòReason in History‚Äô (2) Leopold von Rank√© - ‚ÄòThe theory and Practice of History‚Äô (3) Herodotus - ‚ÄòThe Histories‚Äô (4) Karl Marx - ‚ÄòDiscourse on the Method‚Äô 2. Write short notes. (1) Dialectics (2) Annales School 3. Explain the f

In [26]:
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, strictness=2)
print(result)

```json
{
  "criteria": [
    {
      "criterion": "Critical Analysis (understanding of course materials)",
      "weight": "0.35",
      "feedback": "The student demonstrates a proficient understanding of the course materials, explaining the centrality of land to Dakota religion and cultural identity with relevant connections.",
      "score_received": "0.7"
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": "0.35",
      "feedback": "The student's response is clear, concise, and free from errors. Although citations are not present, the response is within the word limit and otherwise well-prepared.",
      "score_received": "0.7"
    },
    {
      "criterion": "Portrays Insight (follows instructional questions)",
      "weight": "0.3",
      "feedback": "The response is on-topic and addresses the instructional question directly, providing insight into the importance of land for the Dakota people.",
      "score_received": "0.8"
    }
  ],
  "final_

In [27]:
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, strictness=3)
print(result)

```json
{
  "criteria": [
    {
      "criterion": "Critical Analysis",
      "weight": "0.35",
      "feedback": "The student demonstrates a good understanding of the course materials by identifying the central role of land in Dakota religion and cultural identity. The response effectively connects land to human origin, sacred stories, and spiritual connections. However, more detailed examples or deeper contextual connections could enhance the analysis.",
      "score_received": "0.8"
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": "0.35",
      "feedback": "The student's response is clear and free from errors. The answer is concise and meets the minimum word count requirement. There are no citations, but given the nature of the response, they may not be necessary. Overall, the presentation is well-prepared.",
      "score_received": "0.9"
    },
    {
      "criterion": "Portrays Insight",
      "weight": "0.3",
      "feedback": "The student pr

In [28]:
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, strictness=4)
print(result)

```json
{
  "criteria": [
    {
      "criterion": "Critical Analysis (understanding of course materials)",
      "weight": "0.35",
      "feedback": "The student's answer demonstrates a good understanding of the course material, connecting land to Dakota religion and cultural identity. However, it lacks specific examples or deeper contextual explanations that would elevate it to exemplary.",
      "score_received": "0.8"
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": "0.35",
      "feedback": "The answer is clearly articulated, free from grammatical errors, and meets the word count requirement. However, there are no citations included, which would be necessary for a higher score.",
      "score_received": "0.7"
    },
    {
      "criterion": "Portrays Insight (follows instructional questions)",
      "weight": "0.3",
      "feedback": "The answer addresses the question directly and follows the instructional directive. However, it provides limit

In [29]:
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, strictness=5)
print(result)

```json
{
  "criteria": [
    {
      "criterion": "Critical Analysis (understanding of course materials)",
      "weight": "0.35",
      "feedback": "The student's response accurately captures the connection between land and Dakota religion and cultural identity, mentioning sacred stories, ceremonies, and generational memory. It reflects an understanding of the course materials.",
      "score_received": "1"
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": "0.35",
      "feedback": "The student's answer is clear and free from errors. It meets the minimum word count requirement and aligns with the expected academic style. However, there is no citation since no direct references are made.",
      "score_received": "0.9"
    },
    {
      "criterion": "Portrays Insight (follows instructional questions)",
      "weight": "0.3",
      "feedback": "The student provides a direct response to the question, demonstrating an understanding of the course mate

### Question with CORRECT answer A | ChatGPT-4o-mini

In [26]:
result = evaluate_with_rubric(question, student_answer_correct_a, chunks, index, rubric)
print("\nEvaluation Correct answer A:\n", result)



Evaluation Correct answer A:
 {
  "criteria": [
    {
      "criterion": "Critical Analysis (understanding of course materials)",
      "weight": 0.35,
      "feedback": "The student's answer demonstrates an excellent understanding of the interconnections between land, culture, and religion among the Dakota people. Insightful references to historical events and the significance of land provide relevant and contextual explanations.",
      "score_received": 1
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": 0.35,
      "feedback": "The presentation is clear and the ideas are articulated well without errors. The response follows proper formatting and the length requirement is met. However, there are no citations included for direct references to course materials.",
      "score_received": 0.9
    },
    {
      "criterion": "Portrays Insight (follows instructional questions)",
      "weight": 0.3,
      "feedback": "The answer engages with the instr

### Question with CORRECT answer B | ChatGPT-4o-mini

In [14]:
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, overall_score=25)
print("\nEvaluation Correct answer B:\n", result)



Evaluation Correct answer B:
 ```json
{
  "criteria": [
    {
      "criterion": "Critical Analysis (understanding of course materials)",
      "weight": "0.35",
      "feedback": "The student demonstrates a proficient understanding of course materials, relating the importance of land to Dakota religion and cultural identity. Some contextual connections to sacred stories and generational memory are made, showing insight into the interconnectedness of culture and geography.",
      "score_received": "0.7"
    },
    {
      "criterion": "Academic and Scholarly Presentation",
      "weight": "0.35",
      "feedback": "The response is clear and largely free of errors, following proper formatting and meeting the word count requirement. However, there could be minor improvements in articulation for even greater clarity.",
      "score_received": "0.8"
    },
    {
      "criterion": "Portrays Insight (follows instructional questions)",
      "weight": "0.3",
      "feedback": "The answer p

### Question with WRONG answer | ChatGPT-4o-mini

In [25]:
result = evaluate_with_rubric(question, student_answer_wrong, chunks, index, rubric)
print("\nEvaluation wrong answer:\n", result)



Evaluation wrong answer:
 Criterion: Critical Analysis (understanding of course materials) - 0.35  
Feedback: The student's response contradicts the provided course materials, which underscore the importance of land in the religion and cultural identity of the Dakota people. The answer lacks any relevant connections or supporting details from the readings, ultimately demonstrating a poor understanding of the topic.  
Score Received: 0  

Criterion: Academic and Scholarly Presentation - 0.35  
Feedback: The answer is clear and free of grammatical errors; however, it does not meet the minimum word requirement of 50 words, as it is only 33 words long. Consequently, it falls short of the required length and lacks citations or references to support its claims.  
Score Received: 0  

Criterion: Portrays Insight (follows instructional questions) - 0.3  
Feedback: The student did not engage with the instructional question appropriately. The response dismisses the importance of land, which is 

### Question with UNCLEAR answer | ChatGPT-4o-mini

In [26]:
result = evaluate_with_rubric(question, student_answer_unclear, chunks, index, rubric)
print("\nEvaluation unclear answer A:\n", result)



Evaluation unclear answer A:
 Criterion: Critical Analysis (understanding of course materials) - 0.35  
Feedback: The student's answer shows a basic understanding of why land is important to the Dakota people, referencing stories, history, and ceremonies. However, the analysis is limited and lacks depth. There are no specific examples or insights drawn from the course materials that illustrate the unique relationship the Dakota have with the land.  
Score Received: 0.4  

Criterion: Academic and Scholarly Presentation - 0.35  
Feedback: The response is somewhat clear but contains vague wording ("matters somehow") and lacks academic rigor. There are no grammatical errors, but the informality and ambiguity reduce clarity. Additionally, the answer does not meet the minimum word count of 50 words.  
Score Received: 0.2  

Criterion: Portrays Insight (follows instructional questions) - 0.3  
Feedback: The answer attempts to engage with the instructional question regarding the centrality of

### Question with SHORT answer | ChatGPT-4o-mini

In [27]:
result = evaluate_with_rubric(question, student_answer_short, chunks, index, rubric)
print("\nEvaluation short answer:\n", result)


Evaluation short answer:
 Criterion: Critical Analysis (understanding of course materials) - 0.35  
Feedback: The student's answer demonstrates a minimal understanding of the course materials. While it touches on the significance of land to the Dakota in terms of sacredness and history, it lacks depth and does not provide specific examples, contextual explanations, or relevant experiences that clearly convey the complexity of their relationship with land. The response is too brief to showcase a comprehensive understanding.  
Score Received: 0.2  

Criterion: Academic and Scholarly Presentation - 0.35  
Feedback: The student's response is written clearly and is free from grammatical errors, but the length falls significantly short of the minimum word requirement of 50 words, which indicates a lack of effort in presentation. Since it is well under the minimum word count and does not adequately develop ideas, it fails to demonstrate proper scholarly presentation.  
Score Received: 0.1  


### Question with GEMINI answer | ChatGPT-4o-mini

In [28]:
# Grade the Gemini-written answer, scaled to a 25-point maximum.
result = evaluate_with_rubric(question, student_answer_gemini, chunks, index, rubric, overall_score=25)
# Label corrected: this cell grades the Gemini answer, not the short one.
print("\nEvaluation Gemini answer:\n", result)


Evaluation short answer:
 Criterion: Critical Analysis (understanding of course materials) - 0.35  
Feedback: The student demonstrates a strong understanding of the course materials, providing insightful connections between land, memory, and Dakota identity. They articulate the significance of "Mni Sota Makoce" as a central aspect of Dakota cosmology and effectively integrate concepts of memory and cultural identity. However, a few more examples or personal reflections could further enhance the depth of analysis.  
Score Received: 0.8  

Criterion: Academic and Scholarly Presentation - 0.35  
Feedback: The answer is clear and well-articulated, free from grammatical errors. It adheres to proper formatting and exceeds the minimum word count requirement. However, the phrase "MniSotaMakoce" should include spaces for clarity (i.e., "Mni Sota Makoce"). The overall style is appropriate, though additional citation of sources could enhance academic rigor.  
Score Received: 0.9  

Criterion: Po

In [73]:
# import ollama

# response = ollama.chat(
#     model="llama3",  # make sure you have pulled this model: `ollama pull llama3`
#     messages=[{"role": "user", "content": "How are you?"}]
# )
# print(response["message"]["content"])

### Question with CORRECT answer A | llama3

In [29]:
result = evaluate_with_rubric(question, student_answer_correct_a, chunks, index, rubric, llm="ollama", model="llama3")
print("\nEvaluation Correct answer A:\n", result)


Evaluation Correct answer A:
 Here is the grading:

**Criterion: Critical Analysis**

Feedback: The student's answer demonstrates a good understanding of the course materials and its applications. They provide relevant connections between land, culture, and religion, using examples from Dakota history and practices. While not exhaustive, the answer provides sufficient insight into the importance of land in Dakota culture.

Score Received: 0.8

**Criterion: Academic and Scholarly Presentation**

Feedback: The student's answer is well-prepared, with clear articulation and proper citation (although only briefly mentioned). Minor formatting errors are present, but overall, the style of presentation follows the prompt. Word count exceeds the minimum requirement.

Score Received: 0.85

**Criterion: Portrays Insight**

Feedback: The student's answer provides good insight into the instructional questions, demonstrating a proficiency in understanding course materials. They engage with Dakota c

### Question with CORRECT answer B | llama3

In [137]:
# Grade correct answer B with the Ollama grader.
result = evaluate_with_rubric(question, student_answer_correct_b, chunks, index, rubric, llm="ollama", model="llama3")
# Label corrected: this cell grades answer B, not answer A.
print("\nEvaluation Correct answer B:\n", result)


Evaluation Correct answer A:
 Here is the grading and feedback for each criterion:

Criterion: Critical Analysis (understanding of course materials) - Weight: 0.35
Feedback: The student demonstrates an excellent understanding of Dakota people's relationship with land, summarizing key points about their cultural identity and sacred spaces.
Score Received: 1

Criterion: Academic and Scholarly Presentation (Weight: 0.35)
Feedback: The answer is well-prepared, easy to understand, and free from errors. However, it could benefit from more supporting details and examples to further illustrate the student's points.
Score Received: 0.8

Criterion: Portrays Insight (follows instructional questions) - Weight: 0.3
Feedback: The student answers the question directly by explaining how land is central to Dakota religion and cultural identity, providing a clear and concise response that demonstrates their understanding of the material.
Score Received: 1

Final Weighted Score Calculation:

* Critical 

### Question with WRONG answer | llama3

In [138]:
# Grade the deliberately wrong answer with the Ollama grader.
result = evaluate_with_rubric(question, student_answer_wrong, chunks, index, rubric, llm="ollama", model="llama3")
# Label corrected: this cell grades the wrong answer, not correct answer A.
print("\nEvaluation wrong answer:\n", result)


Evaluation Correct answer A:
 Here is the grading report:

**Criterion: Critical Analysis**

Feedback: The student's answer does not demonstrate an understanding of the course materials. The statement "Land is not important to the Dakota people‚Äôs religion or culture" is a clear misunderstanding of the material provided, which emphasizes the significance of land in Dakota culture and spirituality.

Score Received: 0

**Criterion: Academic and Scholarly Presentation**

Feedback: The student's answer lacks proper formatting and does not follow the instructions. The response is brief and fails to address the question adequately. No sources are cited, and no attempt is made to provide supporting details or examples.

Score Received: 0

**Criterion: Portrays Insight (follows instructional questions)**

Feedback: The student's answer does not engage with the course materials or demonstrate an understanding of the instructional questions. The response is superficial and fails to address the

### Question with UNCLEAR answer | llama3

In [139]:
# Grade the vague/unclear answer with the Ollama grader.
result = evaluate_with_rubric(question, student_answer_unclear, chunks, index, rubric, llm="ollama", model="llama3")
# Label corrected: this cell grades the unclear answer, not correct answer A.
print("\nEvaluation unclear answer:\n", result)


Evaluation Correct answer A:
 Here is the grading:

**Criterion: Critical Analysis (understanding of course materials)** - **0.35**

Feedback: The student's answer shows some understanding of the Dakota people's relationship with land, but it lacks depth and connection to the broader context. While they mention "stories and things tied to people and history," they do not demonstrate a clear understanding of how land is central to their religion and cultural identity.

Score Received: **0.6**

**Criterion: Academic and Scholarly Presentation (Weight: 0.35)**

Feedback: The student's answer is concise, but it lacks proper formatting and includes some minor errors in grammar and punctuation. Additionally, the student does not provide sufficient evidence to support their claims, such as specific examples or quotes.

Score Received: **0.7**

**Criterion: Portrays Insight (follows instructional questions)** - **0.3**

Feedback: The student's answer only partially addresses the question, pro

### Question with SHORT answer | llama3

In [140]:
# Grade the too-short answer with the Ollama grader.
result = evaluate_with_rubric(question, student_answer_short, chunks, index, rubric, llm="ollama", model="llama3")
# Label corrected: this cell grades the short answer, not correct answer A.
print("\nEvaluation short answer:\n", result)


Evaluation Correct answer A:
 Here's the grading:

**Criterion: Critical Analysis**

Feedback: The student's answer is brief and lacks depth in explaining why land is considered central to the religion and cultural identity of the Dakota people. While they mention that land holds their history, stories, and spiritual power, they do not provide any specific examples or connections to course materials.

Score Received: 0.20 ( Limited)

**Criterion: Academic and Scholarly Presentation**

Feedback: The student's answer is under the minimum word count requirement of 50 words. Additionally, there are no proper citations or references provided.

Score Received: 0.15 (Limited)

**Criterion: Portrays Insight**

Feedback: The student's answer does not fully engage with the instructional question and lacks insight into the topic. They only provide a brief statement without providing any specific examples or connections to course materials.

Score Received: 0.25 (Limited)

Final Weighted Score Ca

In [None]:
import pandas as pd
# Preview the raw rubric table. Bound to `rubric_df` rather than `rubric`:
# `rubric` holds the CSV *path* that the grading cells pass to
# evaluate_with_rubric, and rebinding it to a DataFrame here would break any
# grading cell run after this one.
rubric_df = pd.read_csv("data/rubric.csv")

rubric_df.head()

Unnamed: 0,Criterion,Weight,Level,Description
0,Critical Analysis (understanding of course mat...,0.35,Exemplary,Reflection demonstrates an excellent understan...
1,Critical Analysis (understanding of course mat...,0.35,Proficient,Reflection displays an understanding of course...
2,Critical Analysis (understanding of course mat...,0.35,Limited,Reflection repeats and summarizes course mater...
3,Critical Analysis (understanding of course mat...,0.35,Unacceptable,Reflection shows little or no evidence that th...
4,Academic and Scholarly Presentation,0.35,Exemplary,Clear articulation free from errors. Easy to u...


- Criterion: Critical Analysis (understanding of course materials) (Weight: 0.35)
  - Level => Exemplary: Reflection demonstrates an excellent understanding of course materials and its applications. Insightful and relevant connections are made through contextual explanations, examples, or experiences.
  - Level => Proficient: Reflection displays an understanding of course materials and its applications. Connections are made through explanations, examples, or experiences.
  - Level => Limited: Reflection repeats and summarizes course materials with few connections made and lacks supporting details.
  - Level => Unacceptable: Reflection shows little or no evidence that the course materials have been read or understood. No examples or connections are made.

- Criterion: Academic and Scholarly Presentation (Weight: 0.35)
  - Level => Exemplary: Clear articulation free from errors. Easy to understand ideas. Follows proper formatting. Where source material is referenced, proper citation is i

In [71]:
rubric = "data/rubric.csv"

In [79]:
import json
def parse_llm_output(raw_text):
    """
    Parse an LLM reply that is expected to contain a JSON document.

    The reply may be wrapped in a Markdown code fence, with or without a
    ``json`` language tag. When the text starts with a fence, only the first
    fenced section is parsed and a leading ``json`` tag is removed from it.

    Parameters
    ----------
    raw_text : str
        Raw text returned by the model.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the cleaned text.

    Raises
    ------
    ValueError
        If the cleaned text is not valid JSON. The cleaned text is printed
        first to help with debugging.
    """
    fence = "```"
    cleaned = raw_text.strip()
    if cleaned.startswith(fence):
        # split() yields ["", <first fenced section>, ...]; index 1 always
        # exists because the text starts with the fence.
        cleaned = cleaned.split(fence)[1].strip()
        # Drop only the language tag that directly follows the opening fence.
        # Replacing every "json" substring would also corrupt keys and values
        # that legitimately contain that word.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:].strip()

    # Parse JSON
    try:
        return json.loads(cleaned)
    except Exception as e:
        print("Could not parse model response as JSON. Raw text below:\n", cleaned)
        raise ValueError(f"Failed to parse LLM response as JSON: {e}") from e

In [101]:
import pandas as pd
import json

def adapt_rubric_for_strictness_csv(csv_path, grading_strictness, llm="gpt"):
    """
    Rephrase a rubric's descriptions for a grading strictness level via an LLM.

    The rubric is read from ``csv_path``, sent to the selected LLM as JSON
    together with a strictness-specific instruction, and the model's parsed
    JSON reply is returned as a DataFrame. Nothing is written back to disk.

    Parameters
    ----------
    csv_path : str
        Path of the rubric CSV.
    grading_strictness : str
        One of "lenient", "moderate" or "strict".
    llm : str, default "gpt"
        Backend to call: "gpt", "claude" or "ollama".

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the parsed model reply.

    Raises
    ------
    ValueError
        If ``llm`` is not a supported backend, or (from ``parse_llm_output``)
        if the model reply is not valid JSON.
    KeyError
        If ``grading_strictness`` is not a known level.
    """
    # Validate the backend up front: previously an unknown value skipped every
    # branch below and crashed later on an unbound `response`.
    supported_llms = ("gpt", "claude", "ollama")
    if llm not in supported_llms:
        raise ValueError(
            f"Unsupported llm {llm!r}; expected one of {', '.join(supported_llms)}"
        )

    # Strictness instructions
    strictness_guidelines = {
        "lenient": "Rephrase each description to be slightly forgiving. Emphasize understanding over precision or structure, while keeping meaning and weight unchanged.",
        "moderate": "Keep the tone balanced and neutral. Maintain original intent, neither stricter nor more lenient.",
        "strict": "Rephrase each description to emphasize precision, rigor, and completeness. Penalize vague or incomplete responses, but keep meaning and weight unchanged."
    }
    # Look the instruction up before any I/O so an unknown level fails fast.
    instruction = strictness_guidelines[grading_strictness]

    # Load the rubric and convert it to a list of row dicts for the prompt
    rubric_df = pd.read_csv(csv_path)
    rubric_list = rubric_df.to_dict(orient="records")

    # Build the LLM prompt
    prompt = f"""
You are rephrasing the 'Description' column of this grading rubric based on strictness level.

Strictness level: {grading_strictness}
Instruction: {instruction}

Rubric:
{json.dumps(rubric_list, indent=2)}

Return the rubric in **valid JSON format**, preserving the same Criterion names and Weights, 
but with the 'Description' values rephrased according to the strictness.
"""

    # Call the selected backend
    llms = CallLLM()
    if llm == "gpt":
        response = llms.autograder_openai(prompt, max_tokens=1500)
    elif llm == "claude":
        response = llms.autograder_anthropic(prompt)
    else:  # "ollama" — the only remaining value after validation
        response = llms.autograder_ollama(prompt)

    # parse_llm_output raises ValueError when the reply is not valid JSON
    adapted_rubric = parse_llm_output(response)

    # Echo the adapted rubric so the rephrased wording shows in the cell output
    print(str(adapted_rubric))

    return pd.DataFrame(adapted_rubric)


In [87]:
def rubric_generation_based_on_strictness(path, strictness="moderate", llm="gpt"):
    """
    Build the rubric section of a grading prompt from a strictness-adapted rubric.

    The rubric at ``path`` is first rephrased by
    ``adapt_rubric_for_strictness_csv`` (one LLM call), then rendered as text:
    one "- Criterion: ... (Weight: ...)" header per criterion followed by one
    "  - Level => ...: ..." line per level, with a blank line between criteria.

    Parameters
    ----------
    path : str
        Path of the rubric CSV.
    strictness : str, default "moderate"
        Strictness level forwarded to ``adapt_rubric_for_strictness_csv``.
    llm : str, default "gpt"
        LLM backend forwarded to ``adapt_rubric_for_strictness_csv``.

    Returns
    -------
    str
        The rendered rubric text.
    """
    rubric = adapt_rubric_for_strictness_csv(path, strictness, llm)

    criteria = rubric["Criterion"].unique().tolist()
    levels = rubric["Level"].unique().tolist()

    sections = []
    for criterion_name in criteria:
        criterion_rows = rubric[rubric["Criterion"] == criterion_name]
        weight = criterion_rows["Weight"].values[0]
        section = f"- Criterion: {criterion_name} (Weight: {weight})"
        for level in levels:
            descriptions = criterion_rows[criterion_rows["Level"] == level]["Description"].values
            # `levels` is collected across all criteria, so a given criterion
            # may not define every level; skip it instead of raising IndexError.
            if len(descriptions) == 0:
                continue
            # Intermediate variable avoids nesting double quotes inside the
            # f-string, which is a SyntaxError before Python 3.12.
            section += f"\n  - Level => {level}: {descriptions[0]}"
        sections.append(section + "\n\n")

    return "".join(sections)

In [103]:
def evaluate_with_dynamic_rubric(question,
                                 student_answer,
                                 chunks, index,
                                 rubric_path,
                                 strictness="moderate",
                                 minimum_word_requirement=50,
                                 overall_score=100,
                                 k=3, llm="gpt"):
    """
    Grade a student answer with an LLM, using retrieved notes and a
    strictness-adapted rubric.

    Steps: embed the question, retrieve the ``k`` nearest chunks from
    ``index``, build the rubric text with
    ``rubric_generation_based_on_strictness`` (which itself calls the LLM),
    then ask the selected backend to grade and answer in a JSON template.

    Parameters
    ----------
    question : str
        The question that was asked.
    student_answer : str
        The answer to grade; its whitespace-separated word count is included
        in the prompt.
    chunks : list of str
        Text chunks, positionally aligned with the vectors in ``index``.
    index : object
        Vector index exposing ``search(vectors, k)``; presumably the FAISS
        index built by ``create_faiss_index`` — confirm with the caller.
    rubric_path : str
        Path of the rubric CSV.
    strictness : str, default "moderate"
        Strictness level forwarded to the rubric builder.
    minimum_word_requirement : int, default 50
        Minimum answer length stated in the prompt.
    overall_score : int, default 100
        Maximum final score stated in the prompt.
    k : int, default 3
        Number of chunks to retrieve as context.
    llm : str, default "gpt"
        Backend to call: "gpt", "claude" or "ollama".

    Returns
    -------
    object
        Whatever the selected ``CallLLM.autograder_*`` method returns
        (presumably the raw model text).

    Raises
    ------
    ValueError
        If ``llm`` is not a supported backend.
    """
    # Fail fast instead of doing retrieval and LLM work for a backend that
    # does not exist (previously this path could fall through and return None).
    supported_llms = ("gpt", "claude", "ollama")
    if llm not in supported_llms:
        raise ValueError(
            f"Unsupported llm {llm!r}; expected one of {', '.join(supported_llms)}"
        )

    q_embedding = embedder.encode(question, convert_to_numpy=True)

    # retrieve relevant context
    _, neighbor_ids = index.search(np.array([q_embedding]).astype("float32"), k)
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # without the filter, -1 would silently pick chunks[-1].
    retrieved_chunks = [chunks[i] for i in neighbor_ids[0] if i >= 0]

    context = "\n\n".join(retrieved_chunks)

    # build rubric text
    rubric = rubric_generation_based_on_strictness(rubric_path, strictness, llm)

    # The {'{name}'} expressions below emit literal "{name}" placeholders in
    # the JSON template shown to the model.
    prompt = f"""
You are grading a student's answer using the following rubric:

Rubric:
{rubric}

Minimum word requirement for the answer: {minimum_word_requirement} words.

Context from notes:
{context}

Question: {question}
Student Answer: {student_answer}

Student's Answer Length: {len(student_answer.split())} words.

Instructions:
- Score each criterion separately on a scale of 0 to 1.
- Provide reasoning for each score.
- Compute the weighted final score out of {overall_score}.
- If something is wrong or missing, explain why.

I want you to give the output in a JSON format only. I want you to follow the following template strictly:

{{
  "criteria": [
    {{
      "criterion": "{'{criterion_name}'}",
      "weight": "{'{weight}'}",
      "feedback": "{'{feedback}'}",
      "score_received": "{'{score_received}'}"
    }}
  ],
  "final_weighted_score_calculation": [
    {{
      "criterion": "{'{criterion_name}'}",
      "calculation": "{'{score_received} * {weight}'}",
      "result": "{'{calculation_result}'}"
    }}
  ],
  "total_score": {{
    "calculation": "{'{score1} + {score2} + ... + {scoreN}'}",
    "result": "{'{total_score}'}"
  }},
  "final_score": {{
    "out_of": "{'{overall_score}'}",
    "score": "{'{final_score}'}"
  }},
  "overall_feedback": "{'{overall_feedback}'}"
}}
"""
    llms = CallLLM()
    if llm == "gpt":
        return llms.autograder_openai(prompt, max_tokens=1500)

    if llm == "claude":
        return llms.autograder_anthropic(prompt)

    # "ollama" — the only remaining value after validation
    return llms.autograder_ollama(prompt)

In [119]:
def save_text_to_file(text, filename):
    """
    Write ``text``, stripped of surrounding whitespace, to
    ``data/outputs/<filename>.txt``.

    ``filename`` may contain sub-folders; missing parent folders are created.
    A failure while opening or writing is reported with ``print`` rather
    than raised, and the function returns ``None`` either way.
    """
    target_path = os.path.join("data", "outputs", f"{filename}.txt")
    parent_dir = os.path.dirname(target_path)
    os.makedirs(parent_dir, exist_ok=True)

    try:
        with open(target_path, "w", encoding="utf-8") as out_file:
            out_file.write(text.strip())
        print(f"Saved successfully at: {target_path}")
    except Exception as err:
        print(f"Failed to save file: {err}")

In [124]:
def run_grader_for_testing(typeofevaluation, question, answer, natureofanswer, number_of_graders=5, llm="gpt"):
    """
    Grade the same answer several times and save each result to its own file.

    Each run calls ``evaluate_with_dynamic_rubric`` and stores the result via
    ``save_text_to_file`` under
    ``<natureofanswer>/<typeofevaluation>_evaluation/<llm>/result_<i>``.

    Relies on the notebook globals ``chunks``, ``index`` and ``rubric``
    (``rubric`` is passed as the rubric path), which must be defined first.

    Parameters
    ----------
    typeofevaluation : str
        Strictness level; also used in the output folder name.
    question : str
        The question being graded.
    answer : str
        The student answer to grade.
    natureofanswer : str
        Label for the answer; used as the top-level output folder.
    number_of_graders : int, default 5
        How many independent grading runs to perform.
    llm : str, default "gpt"
        Backend used for grading and as the output sub-folder name. It was
        previously hard-coded to "gpt" in both places.
    """
    for run_number in range(number_of_graders):
        result = evaluate_with_dynamic_rubric(
            question, answer, chunks, index, rubric,
            strictness=typeofevaluation, llm=llm,
        )
        output_name = f"{natureofanswer}/{typeofevaluation}_evaluation/{llm}/result_{run_number}"
        save_text_to_file(result, output_name)

In [125]:
# Grade the same answer once per strictness level, in the same order as before.
for strictness_level in ("lenient", "moderate", "strict"):
    run_grader_for_testing(strictness_level, question, student_answer_correct_b, "student_answer_correct_b")

[{'Criterion': 'Critical Analysis (understanding of course materials)', 'Weight': 0.35, 'Level': 'Exemplary', 'Description': 'Reflection shows a wonderful grasp of the course materials and how they can be applied. The connections made are thoughtful and meaningful, supported by contextual explanations, examples, or personal experiences.'}, {'Criterion': 'Critical Analysis (understanding of course materials)', 'Weight': 0.35, 'Level': 'Proficient', 'Description': 'Reflection reveals a good understanding of the course materials and their uses. Connections are generally made through explanations, examples, or experiences.'}, {'Criterion': 'Critical Analysis (understanding of course materials)', 'Weight': 0.35, 'Level': 'Limited', 'Description': 'Reflection touches on the course materials with some repetition and summaries. There are few connections and limited supporting details.'}, {'Criterion': 'Critical Analysis (understanding of course materials)', 'Weight': 0.35, 'Level': 'Unacceptab