# Pure Vibe Coding 

## Installation and Imports

In [1]:
%%capture
# Environment setup for the notebook. `%%capture` hides the pip output of this cell.
# Remove any preinstalled numpy first so the pinned build below is the only one present.
!pip uninstall -y numpy

# Reinstall a pinned numpy before the other packages (ABI compatibility).
!pip install numpy==1.26.4 --force-reinstall

# Pinned transformers / vLLM, then the pinned torch stack from the CUDA 12.1 wheel index.
!pip install -U transformers==4.45.2 vllm==0.6.0
!pip install -U torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121

# Swap the `pynvml` package for `nvidia-ml-py`.
!pip uninstall -y pynvml
!pip install nvidia-ml-py

# Extra packages for quantized checkpoints (unpinned).
# NOTE(review): the active checkpoint in this notebook is AWQ; auto-gptq is presumably only
# needed for the commented-out GPTQ checkpoint - confirm before keeping it.
!pip install optimum auto-gptq

In [None]:
import re
from typing import List, Optional
import torch
import vllm
from vllm import SamplingParams
from collections import defaultdict, Counter
from typing import List

## Loading Model and Tokenizer

In [None]:
# Checkpoint to load. The commented-out line is an alternative (GPTQ) checkpoint.
# NOTE(review): switching to it presumably also requires changing `quantization` below - confirm.
# model_id = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"
model_id = "Qwen/Qwen2.5-14B-Instruct-AWQ"

# Build the vLLM engine once; later cells reuse the global `llm`.
# NOTE(review): `tensor_parallel_size` is the number of CUDA devices torch reports, which is
# 0 on a machine without a GPU - confirm the notebook is only run on GPU hosts.
llm = vllm.LLM(
    model_id,
    quantization="awq",
    max_model_len=4096,
    enable_prefix_caching=True,
    tensor_parallel_size=torch.cuda.device_count(),
)

# Tokenizer taken from the engine; `llm_engine` uses it to render chat templates.
tokenizer = llm.get_tokenizer()

## Defining System Prompt

In [4]:
# System prompt for the grader model. It fixes the three-section reply layout
# (`Evaluation:` / `Explanation:` / `Final Score:`) and the `<marks>` / `<score>` tags that
# `parse_evaluation_response` looks for, and shows two worked examples.
# The section label is spelled `Final Score:` everywhere because the parser matches that label.
SYS_PROMPT = """You are an evaluator that checks student answers based on the given question and rubric.

For every input, follow these steps in order:
1. **Evaluation**: Go through each rubric item and assign marks accordingly. 
   - Enclose marks in `<marks>` tags like this: `<marks>1/2</marks>`.
2. **Explanation**: For each rubric item, explain the reasoning behind the assigned marks.
3. **Final Score**: Sum the individual rubric scores and provide a total, using `<score>` tags. 
   - Format: `Final Score: <score>7/8</score>`

Always use these three clearly labeled sections:  
`Evaluation:`  
`Explanation:`  
`Final Score:`  

Be objective, fair, and detailed. Do not skip any rubric.

Below are examples for you to learn the format:

**Example 1:**

question = Find the derivative of f(x) = x³ + 3x² + 1.

rubric = 1. States the general power rule (2 points)
2. Applies the power rule to x³ correctly (2 points)
3. Applies the power rule to 3x² correctly (2 points)
4. Simplifies the derivative properly (2 points)

answer = f'(x) = 3x² + 6x

Response:

Evaluation:
1. The student did not explicitly state the general power rule. <marks>0/2</marks>
2. The student correctly applied the power rule to x³. <marks>2/2</marks>
3. The student correctly applied the power rule to 3x². <marks>2/2</marks>
4. The simplification is correct. <marks>2/2</marks>

Explanation:
1. The general rule (d/dx[xⁿ] = n·xⁿ⁻¹) was not stated, though applied correctly.
2. d/dx[x³] = 3x², which the student wrote.
3. d/dx[3x²] = 6x, which is also correct.
4. There are no like terms, and the expression is simplified properly.

Final Score: <score>6/8</score>

**Example 2:**

question = What is the capital of France?

rubric = 1. Identifies the capital correctly (2 points)
2. Provides any additional context (1 point)

answer = Paris is the capital of France. It is known for the Eiffel Tower.

Response:

Evaluation:
1. Correct capital is stated. <marks>2/2</marks>
2. Additional context about Eiffel Tower is provided. <marks>1/1</marks>

Explanation:
1. Paris is indeed the capital of France.
2. Mentioning the Eiffel Tower adds informative context about the city.

Final Score: <score>3/3</score>

"""

## LLM Engine

In [5]:
def llm_engine(
    list_of_messages: List[List[dict]],
    stop_sequences: Optional[List[str]] = None,
    start_sequence: Optional[str] = None,
    temperature: float = 0.2,
    max_tokens: int = 2048
) -> List[str]:
    """Generate one completion per conversation using the global `llm` and `tokenizer`.

    Args:
        list_of_messages: Conversations in chat format (a list of role/content dicts each).
        stop_sequences: Forwarded to `SamplingParams(stop=...)`; the stop string is kept in
            the output (`include_stop_str_in_output=True`).
        start_sequence: Optional text appended to every rendered prompt and prepended to
            every returned completion. `None` and `""` both mean "no prefix".
        temperature: Sampling temperature.
        max_tokens: Maximum number of generated tokens per completion.

    Returns:
        One whitespace-stripped completion per conversation, in input order.
    """
    generation_config = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
        stop=stop_sequences,
        include_stop_str_in_output=True,
    )

    # A falsy start_sequence collapses to the empty string, so concatenation is a no-op.
    prefix = start_sequence if start_sequence else ""

    rendered_prompts = []
    for conversation in list_of_messages:
        chat_text = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        rendered_prompts.append(chat_text + prefix)

    request_outputs = llm.generate(rendered_prompts, generation_config, use_tqdm=False)

    # Only the first candidate of each request is used.
    return [prefix + request.outputs[0].text.strip() for request in request_outputs]


## Parsing Raw Responses

In [6]:
def parse_evaluation_response(response: str):
    """Extract rubric-wise marks, explanation lines and the final score from one model reply.

    Expected layout (as requested by the system prompt):

        Evaluation:
        1. comment <marks>X/Y</marks>
        2. comment <marks>X/Y</marks>
        Explanation:
        1. ...
        2. ...
        Final Score: <score>X/Y</score>

    Tolerated deviations:
    - the reply stops before `Explanation:` / `Final Score:` (e.g. cut off by `max_tokens`):
      the Evaluation section then runs to the end of the text;
    - the last header is written `Final score:`;
    - whitespace inside the `<marks>` / `<score>` tags;
    - fractional marks such as `1.5/2`.

    Args:
        response: Raw completion text.

    Returns:
        A tuple `(rubric_scores, explanations, total_score)` where
        - `rubric_scores` is a list of `(text_before_marks, score, total)`, one per line of the
          Evaluation section that carries a `<marks>` tag;
        - `explanations` is the list of non-empty, stripped lines of the Explanation section;
        - `total_score` is `(score, total)` from the first `<score>` tag, or `None`.
        Whole-number marks are returned as `int`; marks with a decimal point as `float`.
    """
    number = r"(\d+(?:\.\d+)?)"

    def as_number(text):
        # Keep plain integers as int so existing callers see the same types as before.
        return float(text) if "." in text else int(text)

    # `$` lets a section run to the end of the reply when the next header is missing.
    eval_section = re.search(
        r"Evaluation:(.*?)(?:Explanation:|Final [Ss]core:|$)", response, re.DOTALL
    )
    explanation_section = re.search(
        r"Explanation:(.*?)(?:Final [Ss]core:|$)", response, re.DOTALL
    )
    final_score = re.search(
        r"<score>\s*" + number + r"\s*/\s*" + number + r"\s*</score>", response
    )

    marks_pattern = re.compile(
        r"(.*)<marks>\s*" + number + r"\s*/\s*" + number + r"\s*</marks>"
    )

    rubric_scores = []
    if eval_section:
        for line in eval_section.group(1).strip().split("\n"):
            match = marks_pattern.search(line.strip())
            if match:
                rubric_scores.append(
                    (match.group(1).strip(), as_number(match.group(2)), as_number(match.group(3)))
                )

    explanations = []
    if explanation_section:
        lines = explanation_section.group(1).strip().split("\n")
        explanations = [line.strip() for line in lines if line.strip()]

    total_score = None
    if final_score:
        total_score = (as_number(final_score.group(1)), as_number(final_score.group(2)))

    return rubric_scores, explanations, total_score


### Batchwise Parsing

In [7]:
def parse_batch_evaluations(responses: list[str]):
    """Parse every raw model reply with `parse_evaluation_response`.

    Args:
        responses: Raw completion texts.

    Returns:
        One dict per reply, in input order, with the keys
        - "response_index": position of the reply in `responses`;
        - "rubric_scores": list of (rubric_text, score, total);
        - "explanations": list of explanation strings;
        - "final_score": tuple (score, total) or None.
    """
    parsed_replies = map(parse_evaluation_response, responses)
    return [
        {
            "response_index": position,
            "rubric_scores": per_rubric,
            "explanations": explanation_lines,
            "final_score": overall,
        }
        for position, (per_rubric, explanation_lines, overall) in enumerate(parsed_replies)
    ]


### Helper function to make messages for LLM

In [10]:
def make_messages(question: str, answer: str, rubric: str) -> list[dict]:
    """Build the two-turn chat (system + user) for grading one answer.

    The system turn carries `SYS_PROMPT`; the user turn lists the question, the student
    answer and the rubric under the `QUESTION:`, `STUDENT ANSWER:` and `RUBRIC:` labels.
    """
    user_prompt = f"""QUESTION:
{question}

STUDENT ANSWER:
{answer}

RUBRIC:
{rubric}
"""
    system_turn = {"role": "system", "content": SYS_PROMPT}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

## Function for Majority Voting

In [9]:
def majority_vote_with_explanations(parsed_batch):
    """Majority-vote the marks of each rubric position across several parsed replies.

    Rubric items are matched by position: the i-th `<marks>` line of every reply votes for
    rubric i. The winning `(score, total)` pair is the most frequent one; on a tie the pair
    seen first wins (`Counter.most_common` keeps first-seen order).

    The returned text and explanation are taken only from replies that voted for the winning
    pair, so they never justify a different mark than the one reported. A reply that has no
    explanation line for a rubric position contributes an empty explanation instead of
    raising `IndexError`.

    Args:
        parsed_batch: List of dicts as produced by `parse_batch_evaluations`
            (keys "rubric_scores" and "explanations" are read).

    Returns:
        List of tuples `(rubric_text, voted_score, total, explanation)`, one per rubric
        position; an empty list when no reply contains any marks.
    """
    # rubric position -> one (score, total, rubric_text, explanation) vote per reply
    votes_by_rubric = defaultdict(list)

    for entry in parsed_batch:
        explanations = entry.get('explanations') or []
        for i, (rubric_text, score, total) in enumerate(entry['rubric_scores']):
            explanation = explanations[i] if i < len(explanations) else ""
            votes_by_rubric[i].append((score, total, rubric_text, explanation))

    results = []
    # Positions are contiguous from 0 because each reply enumerates its own marks from 0.
    for i in range(len(votes_by_rubric)):
        votes = votes_by_rubric[i]

        # Majority vote on the (score, total) pair
        pair_counter = Counter((score, total) for score, total, _, _ in votes)
        (voted_score, voted_total), _ = pair_counter.most_common(1)[0]

        agreeing = [vote for vote in votes if (vote[0], vote[1]) == (voted_score, voted_total)]
        voted_text = agreeing[0][2]

        # Most common non-empty explanation among the agreeing replies
        agreeing_explanations = [vote[3] for vote in agreeing if vote[3]]
        if agreeing_explanations:
            voted_explanation, _ = Counter(agreeing_explanations).most_common(1)[0]
        else:
            voted_explanation = ""

        results.append((voted_text, voted_score, voted_total, voted_explanation))

    return results


## Testing Individual Questions

In [10]:
# Sample input (English, kinematics). Sets the globals `question`, `answer` and `rubric`
# that the single-question grading cell passes to `make_messages`.
question = "A car initially at rest, achieved a speed of 60 km/h in a minute. Calculate the accelaration of the car"
answer = """Given u=0m/s,t=60s
v=60/3.6=16.67m/s
a=0.278ms^-2
"""
rubric = """1. Identifies the given informations correctly (3 points)
2. Mentions the formula for accelaration explicitely (2 points)
3. Uses the correct formula for accelaration (3 points)
4. Calculates the correct accelaration(Might be in m/s or km/h).(2 points)
"""

In [None]:
# Sample input (Bangla, definite integral with a LaTeX answer). Sets the globals
# `question`, `answer` and `rubric` used by the single-question grading cell.
question = """
0 থেকে 2π পর্যন্ত y=sinx দ্বারা আবদ্ধ মোট ক্ষেত্রফল কত?
"""
# Raw string: in a normal string literal `\right`, `\boxed` and `\text` are read as the escape
# sequences \r, \b and \t, which silently corrupts the LaTeX sent to the model.
answer = r"""
ধরি, 
\[
A = \int_{0}^{2\pi} \sin x \, dx
\]

আমরা জানি,
\[
\int \sin x \, dx = -\cos x + C
\]

অতএব,
\[
A = \left[-\cos x\right]_{0}^{2\pi}
= \left(-\cos 2\pi\right) - \left(-\cos 0\right)
\]

\[
= -(1) - (-1)
= -1 + 1 = 0
\]

সুতরাং,
\[
\boxed{ \text{ক্ষেত্রফল} = 0 \text{ একক} }
\]
"""

rubric = """
1. ক্ষেত্রফলের সাথে সম্পর্কিত নির্দিষ্ট ইন্টিগ্রাল চিহ্নিত করে। (4 points)
2. ক্ষেত্রফল ইন্টিগ্রেট করার জন্য সঠিক সূত্র ব্যবহার করে। (3 points)
3. সঠিক ক্ষেত্রফল গণনা করে। (3 points)
"""

In [None]:
# Sample input (English, short essay). The answer text contains spelling and grammar
# mistakes as written; rubric item 2 scores grammar and spelling.
# Sets the globals `question`, `answer` and `rubric` used by the single-question grading cell.
question = "Write a short paragraph about industrial revolution"

answer = """The Industrial Revolushion was a time of big changes in the 1800s, when things started to be made by machines insted of by hand. 
            It started in Britain and spreaded to other countries. 
            It help make new technology and factories, but also caused problem like pollution and poor working conditions.
            It has great impact in the history of human kind. Without industrial revolution, nothing would be possible
        """

rubric = """
            1.Information is historically correct and relevant. (2 points)
    
            2.Sentences are grammatically correct with proper spelling. (2 points)
            
            3.Ideas are clearly expressed and logically connected. (2 points)
            
            4.Content stays focused on the Industrial Revolution. (2 points)
            
            5.Paragraph is brief but includes key details. (2 points)
        """

In [None]:
# Sample input with a single-item rubric.
# Sets the globals `question`, `answer` and `rubric` used by the single-question grading cell.
question = "Who wrote Hamlet?"
answer = "Kazi Nazrul Islam"
rubric = "1. Provides the correct answer (2 points)"

In [None]:
# Sample input (Bangla paragraph on the Sundarbans, first variant) with a four-item rubric.
# Sets the globals `question`, `answer` and `rubric` used by the single-question grading cell.
question = "Write a paragraph on Sundarban in Bangla"
answer = """
    সুন্দরবন একটি সুন্দর বন যা পাহাড়ে অবস্থিত। এটি সবসময় ঠান্ডা থাকে এবং এখানে বরফ পড়ে। 
    সুন্দরবনে অনেক ফুল ফোটে এবং মানুষ সেখানে বেড়াতে যায় পাহাড় দেখতে। সুন্দরবন মূলত একটি বড় পার্কের মতো, যেখানে পশুপাখি খাঁচায় থাকে।
"""
rubric = """
    1.তথ্যগত সঠিকতা (4 points) – সুন্দরবন সম্পর্কিত সঠিক তথ্য প্রদান করা হয়েছে কি না (অবস্থান, বৈশিষ্ট্য, প্রাণী)।

    2.বিষয়বস্তুর প্রাসঙ্গিকতা (2 points) – অনুচেদটি সুন্দরবন বিষয়ে কেন্দ্রভিত্তিক ও প্রাসঙ্গিক কি না।
    
    3.ভাষার গঠন ও ব্যাকরণ (2 points) – সঠিক ব্যাকরণ, বানান ও বাক্য গঠন রয়েছে কি না।
    
    4.পরিভাষার যথাযথ ব্যবহার (2 points) – পরিবেশ, বন্যপ্রাণী ও ভূগোল সম্পর্কিত সঠিক শব্দ ও পরিভাষা ব্যবহার করা হয়েছে কি না।
"""

In [None]:
# Sample input (Bangla paragraph on the Sundarbans, second variant); same question and rubric
# as the previous sample cell, different answer text.
# Sets the globals `question`, `answer` and `rubric` used by the single-question grading cell.
question = "Write a paragraph on Sundarban in Bangla"
answer = """
    সুন্দরবন বিশ্বের সর্ববৃহৎ ম্যানগ্রোভ বন, যা বাংলাদেশ ও ভারতের একটি অংশ জুড়ে বিস্তৃত। এটি রয়েল বেঙ্গল টাইগার, চিত্রা হরিণ, কুমিরসহ অনেক বিরল প্রাণীর আবাসস্থল। 
    সুন্দরবন তার জটিল নদী-নালা, খাল এবং লবণাক্ত পরিবেশের জন্য পরিচিত। 
    এটি প্রাকৃতিক দুর্যোগ থেকে উপকূলীয় এলাকাগুলোকে রক্ষা করে এবং বাংলাদেশের পরিবেশ ও জীববৈচিত্র্যের জন্য অত্যন্ত গুরুত্বপূর্ণ।
"""
rubric = """
    1.তথ্যগত সঠিকতা (4 points) – সুন্দরবন সম্পর্কিত সঠিক তথ্য প্রদান করা হয়েছে কি না (অবস্থান, বৈশিষ্ট্য, প্রাণী)।

    2.বিষয়বস্তুর প্রাসঙ্গিকতা (2 points) – অনুচেদটি সুন্দরবন বিষয়ে কেন্দ্রভিত্তিক ও প্রাসঙ্গিক কি না।
    
    3.ভাষার গঠন ও ব্যাকরণ (2 points) – সঠিক ব্যাকরণ, বানান ও বাক্য গঠন রয়েছে কি না।
    
    4.পরিভাষার যথাযথ ব্যবহার (2 points) – পরিবেশ, বন্যপ্রাণী ও ভূগোল সম্পর্কিত সঠিক শব্দ ও পরিভাষা ব্যবহার করা হয়েছে কি না।
"""

In [11]:
# Grade the currently defined `question` / `answer` / `rubric`: sample the model several
# times with the same prompt, parse every reply, and majority-vote each rubric item.
# NOTE: with an even number of samples a rubric item can tie; the mark seen first then wins.
NUM_VOTING_SAMPLES = 10

messages_batch = [make_messages(question, answer, rubric)] * NUM_VOTING_SAMPLES

responses = llm_engine(messages_batch)
parsed_results = parse_batch_evaluations(responses)
results = majority_vote_with_explanations(parsed_results)

majority_scores = []
total_mark = 0
for rubric_text, score, total, explanation in results:
    majority_scores.append(score)
    total_mark += total
    print(f"{rubric_text}. Score => {score}/{total}")
    print()


print(f"\nFinal Score according to majority voting: {sum(majority_scores)}/{total_mark}")

1. The student correctly identifies the given information: initial velocity (u=0 m/s), time (t=60 s), and final velocity (v=16.67 m/s).. Score => 3/3

2. The student does not explicitly mention the formula for acceleration.. Score => 0/2

3. The student uses the correct formula for acceleration, which is \( a = \frac{v - u}{t} \).. Score => 3/3

4. The student calculates the correct acceleration: \( a = \frac{16.67 \, \text{m/s} - 0 \, \text{m/s}}{60 \, \text{s}} = 0.278 \, \text{m/s}^2 \).. Score => 2/2


Final Score according to majority voting: 8/10


### The following cell prints the raw responses

In [None]:
# Debug cell: draw 5 completions for the current `question` / `answer` / `rubric` and print
# each raw reply without parsing it.
# The commented-out lines below are an alternative sample (derivative problem); uncomment them
# to grade that sample instead of whatever the globals currently hold.
# question = "Find the derivative of f(x) = x³ + 3x² + 1."
# answer = "f'(x) = 3x² + 6x."
# rubric = """1. States the general power rule (d/dx[xⁿ]=n·xⁿ⁻¹) (2 points)
# 2. Applies the power rule correctly to x³ (2 points)
# 3. Applies the power rule correctly to 3x² (2 points)
# 4. Writes the final result properly(no need to write 0 in the final result) (2 points)"""

messages_batch = [make_messages(question, answer, rubric)] * 5  # 5 samples of the same prompt

responses = llm_engine(messages_batch)

for response in responses:
    print(response)

# Evaluating Multiple Questions at Once

In [6]:
# Batch of grading inputs: one dict per question with the keys "question", "answer", "rubric".
# Strings that contain LaTeX are raw strings (r"""...""") so that `\right`, `\boxed` and `\text`
# are not read as the escape sequences \r, \b and \t.
questions_answers_list = [
    {
        "question" : """
        0 থেকে π পর্যন্ত y=sinx দ্বারা আবদ্ধ ক্ষেত্রফল কত?
        """,
        "answer" : r"""
        ধরি, 
        \[
        A = \int_{0}^{\pi} \sin x \, dx
        \]
        
        আমরা জানি,
        \[
        \int \sin x \, dx = -\cos x + C
        \]
        
        অতএব,
        \[
        A = \left[-\cos x\right]_{0}^{\pi}
        = \left(-\cos \pi\right) - \left(-\cos 0\right)
        \]
    
        \[
        = -(-1) - (-1)
        = 1 + 1 = 2
        \]
        
        সুতরাং,
        \[
        \boxed{ \text{ক্ষেত্রফল} = 2 \text{ একক} }
        \]
        """,
    
        "rubric" : """
        1. ক্ষেত্রফলের সাথে সম্পর্কিত নির্দিষ্ট ইন্টিগ্রাল চিহ্নিত করে। (4 points)
        2. ক্ষেত্রফল ইন্টিগ্রেট করার জন্য সঠিক সূত্র ব্যবহার করে। (3 points)
        3. সঠিক ক্ষেত্রফল গণনা করে। (3 points)
        """
    },
    {
        "question" : "A car initially at rest, achieved a speed of 60 km/h in a minute. Calculate the accelaration of the car",
        "answer" : """Given u=0m/s,t=60s
        v=60/3.6=16.67m/s
        a=0.278ms^-2
        """,
        "rubric" : """1. Identifies the given informations correctly (3 points)
        2. Mentions the formula for accelaration explicitely (2 points)
        3. Uses the correct formula for accelaration (3 points)
        4. Calculates the correct accelaration(Might be in m/s or km/h).(2 points)
        """
    },
    {
        "question" : """The time period of a simple pendulum in earth is 2 seconds. What will be its time period
        in space?
        """,
        "answer" : "The time period will be unchanged. It will still be 2 seconds",
        "rubric" : "1. Provides the correct answer (2 points)"
    },
    {
        "question" : """
            0 থেকে 2π পর্যন্ত y=sinx দ্বারা আবদ্ধ মোট ক্ষেত্রফল কত?
            """,
        "answer" : r"""
            ধরি, 
            \[
            A = \int_{0}^{2\pi} \sin x \, dx
            \]
            
            আমরা জানি,
            \[
            \int \sin x \, dx = -\cos x + C
            \]
            
            অতএব,
            \[
            A = \left[-\cos x\right]_{0}^{2\pi}
            = \left(-\cos 2\pi\right) - \left(-\cos 0\right)
            \]
            
            \[
            = -(1) - (-1)
            = -1 + 1 = 0
            \]
            
            সুতরাং,
            \[
            \boxed{ \text{ক্ষেত্রফল} = 0 \text{ একক} }
            \]
            """,

        "rubric" : """
        1. ক্ষেত্রফলের সাথে সম্পর্কিত নির্দিষ্ট ইন্টিগ্রাল চিহ্নিত করে। (4 points)
        2. ক্ষেত্রফল ইন্টিগ্রেট করার জন্য সঠিক সূত্র ব্যবহার করে। (3 points)
        3. সঠিক ক্ষেত্রফল গণনা করে। (3 points)
        """
    },
    {
        "question" : "Write a short paragraph about industrial revolution",
        "answer":"""The Industrial Revolushion was a time of big changes in the 1800s, when things started to be made by machines insted of by hand. 
            It started in Britain and spreaded to other countries. 
            It help make new technology and factories, but also caused problem like pollution and poor working conditions.
            It has great impact in the history of human kind. Without industrial revolution, nothing would be possible.
        """,
        "rubric":"""
            1.Information is historically correct and relevant. (2 points)
    
            2.Sentences are grammatically correct with proper spelling. (2 points)
            
            3.Ideas are clearly expressed and logically connected. (2 points)
            
            4.Content stays focused on the Industrial Revolution. (2 points)
            
            5.Paragraph is brief but includes key details. (2 points)
        """
    },
    {
        "question" : r"Why benzene is stable despite having $\pi$ bonds?",
        "answer" : """
            Benzene is stable because its π bonds are stronger than normal double bonds and are fixed between alternating carbon atoms. 
            The structure doesn't change because the double bonds stay in place, and this makes it stable. 
            Also, since it’s a ring, the shape helps it be more balanced and less reactive.
        """,
        "rubric": """
            1.Explains correct reasons (e.g., resonance, delocalization, aromaticity, Huckel's rule). (5 points)

            2.Shows understanding of electron delocalization and bond behavior in benzene. (3 points)
            
            3.Uses correct scientific terms appropriately (e.g., π bonds, resonance, aromaticity). (2 points)
        """
    },
    {
        "question" : "Write a paragraph on Sundarban in Bangla",
        "answer" : """
            সুন্দরবন বিশ্বের সর্ববৃহৎ ম্যানগ্রোভ বন, যা বাংলাদেশ ও ভারতের একটি অংশ জুড়ে বিস্তৃত। এটি রয়েল বেঙ্গল টাইগার, চিত্রা হরিণ, কুমিরসহ অনেক বিরল প্রাণীর আবাসস্থল। 
            সুন্দরবন তার জটিল নদী-নালা, খাল এবং লবণাক্ত পরিবেশের জন্য পরিচিত। 
            এটি প্রাকৃতিক দুর্যোগ থেকে উপকূলীয় এলাকাগুলোকে রক্ষা করে এবং বাংলাদেশের পরিবেশ ও জীববৈচিত্র্যের জন্য অত্যন্ত গুরুত্বপূর্ণ।
        """,
        "rubric" : """
            1.তথ্যগত সঠিকতা (4 points) – সুন্দরবন সম্পর্কিত সঠিক তথ্য প্রদান করা হয়েছে কি না (অবস্থান, বৈশিষ্ট্য, প্রাণী)।
        
            2.বিষয়বস্তুর প্রাসঙ্গিকতা (2 points) – অনুচেদটি সুন্দরবন বিষয়ে কেন্দ্রভিত্তিক ও প্রাসঙ্গিক কি না।
            
            3.ভাষার গঠন ও ব্যাকরণ (2 points) – সঠিক ব্যাকরণ, বানান ও বাক্য গঠন রয়েছে কি না।
            
            4.পরিভাষার যথাযথ ব্যবহার (2 points) – পরিবেশ, বন্যপ্রাণী ও ভূগোল সম্পর্কিত সঠিক শব্দ ও পরিভাষা ব্যবহার করা হয়েছে কি না।
        """
    }
]

In [None]:
# Script-level version of `batch_messages_together` (defined in the next cell): builds the
# flat prompt list with 5 copies of each question's messages.
# NOTE(review): this loop rebinds the globals `question`, `answer` and `rubric`, so the
# single-question cells above grade the last list entry if they are re-run afterwards.
messages_batches = []
for qa in questions_answers_list: # This list is a list of dictionaries(for now)
    question, answer, rubric = qa["question"], qa["answer"], qa["rubric"]
    single_message = make_messages(question, answer, rubric)
    batch = [single_message] * 5  # 5 copies per question
    messages_batches.extend(batch)  # flatten for vllm batch

In [2]:
def batch_messages_together(questions_answers_list, num_votes: int = 5):
    """Build the flat prompt list for a batch of questions.

    Args:
        questions_answers_list: List of dicts with the keys "question", "answer", "rubric".
        num_votes: How many copies of each question's messages to emit (one per voting
            sample). Must equal the `group_size` later passed to `run_llm_in_batches`,
            otherwise replies are regrouped across question boundaries.

    Returns:
        A flat list with `num_votes` consecutive entries per question, in input order.
    """
    messages_batches = []
    for qa in questions_answers_list:
        single_message = make_messages(qa["question"], qa["answer"], qa["rubric"])
        # The same message list object is repeated; it is only read downstream.
        messages_batches.extend([single_message] * num_votes)
    return messages_batches

## Function to run LLM for batches

In [None]:
def chunked(iterable, size):
    """Yield consecutive slices of `iterable`, each at most `size` items long.

    `iterable` must support `len()` and slicing (list, tuple, str, ...); the last slice
    may be shorter than `size`.
    """
    yield from (
        iterable[offset:offset + size]
        for offset in range(0, len(iterable), size)
    )

def run_llm_in_batches(messages_batches: List[list], batch_size: int = 48, group_size: int = 5) -> List[List[str]]:
    """Generate completions chunk by chunk, then regroup them for majority voting.

    Args:
        messages_batches (List[list]): Flat list of chat-format conversations.
        batch_size (int): How many conversations go into each `llm_engine` call.
        group_size (int): How many consecutive completions belong to one question.

    Returns:
        List[List[str]]: Completions split into consecutive groups of `group_size`
        (the last group is shorter if the total is not a multiple of `group_size`).
    """
    flat_outputs = []
    for message_chunk in chunked(messages_batches, batch_size):
        flat_outputs += llm_engine(message_chunk)

    # Slice the flat list back into per-question groups (e.g. 5 votes per question).
    group_starts = range(0, len(flat_outputs), group_size)
    return [flat_outputs[start:start + group_size] for start in group_starts]

## Function for Majority voting in Grouped Responses

In [13]:
def aggregate_majority_votes(grouped_responses):
    """Parse each group of replies and majority-vote its rubric marks.

    Args:
        grouped_responses (List[List[str]]): Raw replies, one inner list per question.

    Returns:
        List[List[Tuple[str, int, int, str]]]: For every question, the list of
        (rubric_text, majority_score, total, explanation) tuples produced by
        `majority_vote_with_explanations`.
    """
    return [
        majority_vote_with_explanations(parse_batch_evaluations(reply_group))
        for reply_group in grouped_responses
    ]

## Helper function to print the final Evaluation

In [14]:
def print_majority_vote_results(final_results, questions=None):
    """Print the voted marks and explanation of every rubric item, question by question.

    Args:
        final_results: One list per question of (rubric_text, score, total, explanation).
        questions: Optional question texts; when given, `questions[i]` is appended to the
            header of question i.
    """
    separator = "-" * 50

    for number, voted_items in enumerate(final_results, start=1):
        header = f"\n📌 Question {number}:"
        if questions:
            header = f"\n📌 Question {number}: {questions[number - 1]}"
        print(header)

        earned, possible = 0, 0
        for item_number, (rubric, score, total, explanation) in enumerate(voted_items, start=1):
            print(f"\n🧾 Rubric {item_number}: {rubric}")
            print(f"✅ Score: {score}/{total}")
            print(f"🗒️ Explanation: {explanation}")
            earned, possible = earned + score, possible + total

        print(f"\n🏁 Final Score: {earned}/{possible}")
        print(separator)

In [21]:
# End-to-end run over `questions_answers_list`: build the flat prompt list (5 copies per
# question), generate in chunks of 48, regroup the replies 5 per question, vote per rubric
# item and print the report. `group_size` has to equal the number of copies per question.
messages_batches = batch_messages_together(questions_answers_list)
grouped_outputs = run_llm_in_batches(messages_batches, batch_size=48, group_size=5)
final_results = aggregate_majority_votes(grouped_outputs)
print_majority_vote_results(final_results)


📌 Question 1:

🧾 Rubric 1: 1. The student correctly identifies the integral for the area under the curve \( y = \sin x \) from \( 0 \) to \( \pi \).
✅ Score: 4/4
🗒️ Explanation: 1. The student correctly sets up the integral \( A = \int_{0}^{\pi} \sin x \, dx \) to find the area under the curve.

🧾 Rubric 2: 2. The student uses the correct formula for integrating \( \sin x \), which is \( \int \sin x \, dx = -\cos x + C \).
✅ Score: 3/3
🗒️ Explanation: 2. The student applies the correct integration formula \( \int \sin x \, dx = -\cos x + C \) and evaluates the definite integral from 0 to \( \pi \).

🧾 Rubric 3: 3. The student calculates the area correctly as 2 square units.
✅ Score: 3/3
🗒️ Explanation: 3. The student evaluates the definite integral correctly and arrives at the area of 2 square units.

🏁 Final Score: 10/10
--------------------------------------------------

📌 Question 2:

🧾 Rubric 1: 1. The student correctly identifies the given information: initial velocity (u=0 m/s),

# Server

## Installation

In [None]:
%%capture
# Packages for the server section; `%%capture` hides the pip output. Versions are not pinned.
!pip install flask pyngrok

In [None]:
%%capture
# Same environment setup as the install cell at the top of the notebook, repeated for the
# server section. `%%capture` hides the pip output of this cell.
# Remove any preinstalled numpy first so the pinned build below is the only one present.
!pip uninstall -y numpy

# Reinstall a pinned numpy before the other packages (ABI compatibility).
!pip install numpy==1.26.4 --force-reinstall

# Pinned transformers / vLLM, then the pinned torch stack from the CUDA 12.1 wheel index.
!pip install -U transformers==4.45.2 vllm==0.6.0
!pip install -U torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121

# Swap the `pynvml` package for `nvidia-ml-py`.
!pip uninstall -y pynvml
!pip install nvidia-ml-py

# Extra packages for quantized checkpoints (unpinned).
# NOTE(review): the active checkpoint in this notebook is AWQ; auto-gptq is presumably only
# needed for the commented-out GPTQ checkpoint - confirm before keeping it.
!pip install optimum auto-gptq

In [None]:
import re
from typing import List, Optional
import torch
import vllm
from vllm import SamplingParams
from collections import defaultdict, Counter
from typing import List

# Checkpoint to load. The commented-out line is an alternative (GPTQ) checkpoint.
# NOTE(review): switching to it presumably also requires changing `quantization` below - confirm.
# model_id = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"
model_id = "Qwen/Qwen2.5-14B-Instruct-AWQ"

# Build the vLLM engine once; the functions below reuse the global `llm`.
# NOTE(review): `tensor_parallel_size` is the number of CUDA devices torch reports, which is
# 0 on a machine without a GPU - confirm the notebook is only run on GPU hosts.
llm = vllm.LLM(
    model_id,
    quantization="awq",
    max_model_len=4096,
    enable_prefix_caching=True,
    tensor_parallel_size=torch.cuda.device_count(),
)

# Tokenizer taken from the engine; `llm_engine` uses it to render chat templates.
tokenizer = llm.get_tokenizer()

In [20]:
# System prompt for the grader model. It fixes the three-section reply layout
# (`Evaluation:` / `Explanation:` / `Final Score:`) and the `<marks>` / `<score>` tags that
# `parse_evaluation_response` looks for, and shows two worked examples.
# The section label is spelled `Final Score:` everywhere because the parser matches that label.
SYS_PROMPT = """You are an evaluator that checks student answers based on the given question and rubric.

For every input, follow these steps in order:
1. **Evaluation**: Go through each rubric item and assign marks accordingly. 
   - Enclose marks in `<marks>` tags like this: `<marks>1/2</marks>`.
2. **Explanation**: For each rubric item, explain the reasoning behind the assigned marks.
3. **Final Score**: Sum the individual rubric scores and provide a total, using `<score>` tags. 
   - Format: `Final Score: <score>7/8</score>`

Always use these three clearly labeled sections:  
`Evaluation:`  
`Explanation:`  
`Final Score:`  

Be objective, fair, and detailed. Do not skip any rubric.

Below are examples for you to learn the format:

**Example 1:**

question = Find the derivative of f(x) = x³ + 3x² + 1.

rubric = 1. States the general power rule (2 points)
2. Applies the power rule to x³ correctly (2 points)
3. Applies the power rule to 3x² correctly (2 points)
4. Simplifies the derivative properly (2 points)

answer = f'(x) = 3x² + 6x

Response:

Evaluation:
1. The student did not explicitly state the general power rule. <marks>0/2</marks>
2. The student correctly applied the power rule to x³. <marks>2/2</marks>
3. The student correctly applied the power rule to 3x². <marks>2/2</marks>
4. The simplification is correct. <marks>2/2</marks>

Explanation:
1. The general rule (d/dx[xⁿ] = n·xⁿ⁻¹) was not stated, though applied correctly.
2. d/dx[x³] = 3x², which the student wrote.
3. d/dx[3x²] = 6x, which is also correct.
4. There are no like terms, and the expression is simplified properly.

Final Score: <score>6/8</score>

**Example 2:**

question = What is the capital of France?

rubric = 1. Identifies the capital correctly (2 points)
2. Provides any additional context (1 point)

answer = Paris is the capital of France. It is known for the Eiffel Tower.

Response:

Evaluation:
1. Correct capital is stated. <marks>2/2</marks>
2. Additional context about Eiffel Tower is provided. <marks>1/1</marks>

Explanation:
1. Paris is indeed the capital of France.
2. Mentioning the Eiffel Tower adds informative context about the city.

Final Score: <score>3/3</score>

"""

def llm_engine(
    list_of_messages: List[List[dict]],
    stop_sequences: Optional[List[str]] = None,
    start_sequence: Optional[str] = None,
    temperature: float = 0.2,
    max_tokens: int = 2048
) -> List[str]:
    """Generate one completion per conversation using the global `llm` and `tokenizer`.

    Args:
        list_of_messages: Conversations in chat format (a list of role/content dicts each).
        stop_sequences: Forwarded to `SamplingParams(stop=...)`; the stop string is kept in
            the output (`include_stop_str_in_output=True`).
        start_sequence: Optional text appended to every rendered prompt and prepended to
            every returned completion. `None` and `""` both mean "no prefix".
        temperature: Sampling temperature.
        max_tokens: Maximum number of generated tokens per completion.

    Returns:
        One whitespace-stripped completion per conversation, in input order.
    """
    generation_config = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
        stop=stop_sequences,
        include_stop_str_in_output=True,
    )

    # A falsy start_sequence collapses to the empty string, so concatenation is a no-op.
    prefix = start_sequence if start_sequence else ""

    rendered_prompts = []
    for conversation in list_of_messages:
        chat_text = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        rendered_prompts.append(chat_text + prefix)

    request_outputs = llm.generate(rendered_prompts, generation_config, use_tqdm=False)

    # Only the first candidate of each request is used.
    return [prefix + request.outputs[0].text.strip() for request in request_outputs]

def parse_evaluation_response(response: str):
    """Extract rubric-wise marks, explanation lines and the final score from one model reply.

    Expected layout (as requested by the system prompt):

        Evaluation:
        1. comment <marks>X/Y</marks>
        2. comment <marks>X/Y</marks>
        Explanation:
        1. ...
        2. ...
        Final Score: <score>X/Y</score>

    Tolerated deviations:
    - the reply stops before `Explanation:` / `Final Score:` (e.g. cut off by `max_tokens`):
      the Evaluation section then runs to the end of the text;
    - the last header is written `Final score:`;
    - whitespace inside the `<marks>` / `<score>` tags;
    - fractional marks such as `1.5/2`.

    Args:
        response: Raw completion text.

    Returns:
        A tuple `(rubric_scores, explanations, total_score)` where
        - `rubric_scores` is a list of `(text_before_marks, score, total)`, one per line of the
          Evaluation section that carries a `<marks>` tag;
        - `explanations` is the list of non-empty, stripped lines of the Explanation section;
        - `total_score` is `(score, total)` from the first `<score>` tag, or `None`.
        Whole-number marks are returned as `int`; marks with a decimal point as `float`.
    """
    number = r"(\d+(?:\.\d+)?)"

    def as_number(text):
        # Keep plain integers as int so existing callers see the same types as before.
        return float(text) if "." in text else int(text)

    # `$` lets a section run to the end of the reply when the next header is missing.
    eval_section = re.search(
        r"Evaluation:(.*?)(?:Explanation:|Final [Ss]core:|$)", response, re.DOTALL
    )
    explanation_section = re.search(
        r"Explanation:(.*?)(?:Final [Ss]core:|$)", response, re.DOTALL
    )
    final_score = re.search(
        r"<score>\s*" + number + r"\s*/\s*" + number + r"\s*</score>", response
    )

    marks_pattern = re.compile(
        r"(.*)<marks>\s*" + number + r"\s*/\s*" + number + r"\s*</marks>"
    )

    rubric_scores = []
    if eval_section:
        for line in eval_section.group(1).strip().split("\n"):
            match = marks_pattern.search(line.strip())
            if match:
                rubric_scores.append(
                    (match.group(1).strip(), as_number(match.group(2)), as_number(match.group(3)))
                )

    explanations = []
    if explanation_section:
        lines = explanation_section.group(1).strip().split("\n")
        explanations = [line.strip() for line in lines if line.strip()]

    total_score = None
    if final_score:
        total_score = (as_number(final_score.group(1)), as_number(final_score.group(2)))

    return rubric_scores, explanations, total_score

def parse_batch_evaluations(responses: list[str]):
    """Parse every raw model reply with `parse_evaluation_response`.

    Args:
        responses: Raw completion texts.

    Returns:
        One dict per reply, in input order, with the keys
        - "response_index": position of the reply in `responses`;
        - "rubric_scores": list of (rubric_text, score, total);
        - "explanations": list of explanation strings;
        - "final_score": tuple (score, total) or None.
    """
    parsed_replies = map(parse_evaluation_response, responses)
    return [
        {
            "response_index": position,
            "rubric_scores": per_rubric,
            "explanations": explanation_lines,
            "final_score": overall,
        }
        for position, (per_rubric, explanation_lines, overall) in enumerate(parsed_replies)
    ]

def make_messages(question: str, answer: str, rubric: str) -> list[dict]:
    """
    Build the two-turn chat for one grading request.

    The first turn carries ``SYS_PROMPT`` under the "system" role; the second
    is a "user" turn containing the question, the student answer and the
    rubric under their respective headings.
    """
    user_content = f"""QUESTION:
{question}

STUDENT ANSWER:
{answer}

RUBRIC:
{rubric}
"""
    system_turn = {"role": "system", "content": SYS_PROMPT}
    user_turn = {"role": "user", "content": user_content}
    return [system_turn, user_turn]

def majority_vote_with_explanations(parsed_batch):
    """
    Takes parsed batch outputs and returns majority-voted rubric scores with explanations.

    Args:
        parsed_batch: Iterable of dicts with a 'rubric_scores' list of
            (rubric_text, score, total) tuples and an 'explanations' list of
            strings. Explanations are matched to rubric items by position.

    Returns:
    - List of tuples: (rubric_text, voted_score, total, explanation)
      The explanation is the most frequent one seen at that rubric position,
      or "" when no response supplied an explanation for it.
    """
    rubric_scores = defaultdict(list)        # i -> [(score, total)]
    rubric_explanations = defaultdict(list)  # i -> [explanation]
    rubric_texts = []

    for entry in parsed_batch:
        explanations = entry.get('explanations') or []
        for i, (rubric_text, score, total) in enumerate(entry['rubric_scores']):
            rubric_scores[i].append((score, total))
            # A response may contain fewer explanation lines than <marks>
            # lines (e.g. a missing Explanation section); skip instead of
            # raising IndexError so the remaining votes still count.
            if i < len(explanations):
                rubric_explanations[i].append(explanations[i])
            # The first response that reaches position i provides its label.
            if len(rubric_texts) <= i:
                rubric_texts.append(rubric_text)

    results = []
    for i in range(len(rubric_scores)):
        # Majority vote on the (score, total) pair.
        (voted_score, voted_total), _ = Counter(rubric_scores[i]).most_common(1)[0]

        # Most common explanation, if any response provided one.
        # NOTE(review): the chosen explanation is not guaranteed to come from
        # a response that awarded the voted score.
        candidates = rubric_explanations.get(i)
        if candidates:
            voted_explanation, _ = Counter(candidates).most_common(1)[0]
        else:
            voted_explanation = ""

        results.append((rubric_texts[i], voted_score, voted_total, voted_explanation))

    return results

def batch_messages_together(questions_answers_list, votes_per_question: int = 5):
    """
    Build a flat list of chat prompts with repeated copies of each question.

    Args:
        questions_answers_list: List of dicts with the keys "question",
            "answer" and "rubric".
        votes_per_question: How many times each prompt is repeated, so that
            several completions per question can be grouped for voting.
            Keep it equal to the ``group_size`` used when grouping responses.

    Returns:
        Flat list of message lists; the copies of one question are adjacent.

    Raises:
        ValueError: If ``votes_per_question`` is smaller than 1.
    """
    if votes_per_question < 1:
        raise ValueError("votes_per_question must be at least 1")

    messages_batches = []
    for qa in questions_answers_list:
        single_message = make_messages(qa["question"], qa["answer"], qa["rubric"])
        # The same message list object is repeated (not copied) for each vote;
        # flattening keeps the result usable as one batch.
        messages_batches.extend([single_message] * votes_per_question)
    return messages_batches

def chunked(iterable, size):
    """Lazily split a sliceable sequence into consecutive pieces of at most ``size`` items."""
    starts = range(0, len(iterable), size)
    yield from (iterable[begin:begin + size] for begin in starts)

def run_llm_in_batches(messages_batches: List[list], batch_size: int = 48, group_size: int = 5) -> List[List[str]]:
    """
    Send prompts to ``llm_engine`` chunk by chunk, then regroup the outputs.

    Args:
        messages_batches (List[list]): Flat list of chat-format prompts.
        batch_size (int): Maximum number of prompts passed to one ``llm_engine`` call.
        group_size (int): Number of consecutive responses that form one group
            (the completions belonging to a single question).

    Returns:
        grouped_responses (List[List[str]]): Responses split into consecutive
        groups of ``group_size``; the last group may be shorter.
    """
    # Collect every response in prompt order across all chunks.
    flat_outputs = []
    for prompt_chunk in chunked(messages_batches, batch_size):
        flat_outputs += llm_engine(prompt_chunk)

    # Slice the flat list into per-question groups.
    grouped_responses = []
    for start in range(0, len(flat_outputs), group_size):
        grouped_responses.append(flat_outputs[start:start + group_size])
    return grouped_responses

def aggregate_majority_votes(grouped_responses):
    """
    Turn each group of raw LLM responses into its majority-voted rubric result.

    Every group is parsed with ``parse_batch_evaluations`` and the parsed
    entries are reduced with ``majority_vote_with_explanations``.

    Args:
        grouped_responses (List[List[str]]): Responses grouped per question.

    Returns:
        List[List[Tuple[str, int, int, str]]]: one list per question, holding
        tuples of (rubric_text, majority_score, total, explanation).
    """
    return [
        majority_vote_with_explanations(parse_batch_evaluations(group))
        for group in grouped_responses
    ]

In [None]:
from flask import Flask, request, jsonify

# Flask application object; the @app.route handlers in this cell register on it.
app = Flask(__name__)

@app.route("/evaluate", methods=["POST"])
def evaluate():
    """
    Grade a list of question/answer/rubric items.

    Expected JSON body: a list of objects, each with the keys "question",
    "answer" and "rubric", e.g.
        [{"question": "", "answer": "", "rubric": ""}, ...]

    Returns:
        A JSON list with one object per input item containing
        "question_index", "rubric_scores" (objects with "rubric_text",
        "score", "total", "explanation"), "total_score" and "total_possible".
        A body that does not match the expected shape yields a 400 response
        with an "error" message.
    """
    data = request.get_json()

    # Reject malformed payloads before any model work is started; otherwise a
    # wrong shape only surfaces as an unhandled exception (HTTP 500).
    required_keys = ("question", "answer", "rubric")
    payload_ok = isinstance(data, list) and all(
        isinstance(item, dict) and all(key in item for key in required_keys)
        for item in data
    )
    if not payload_ok:
        return jsonify({
            "error": "Expected a JSON list of objects with 'question', 'answer' and 'rubric' keys."
        }), 400

    messages_batches = batch_messages_together(data)
    grouped_outputs = run_llm_in_batches(messages_batches, batch_size=48, group_size=5)
    final_results = aggregate_majority_votes(grouped_outputs)

    # Structure for return data
    response_data = []
    for i, question_results in enumerate(final_results):
        rubric_items = [
            {
                "rubric_text": rubric_text,
                "score": score,
                "total": total,
                "explanation": explanation,
            }
            for rubric_text, score, total, explanation in question_results
        ]
        response_data.append({
            "question_index": i,
            "rubric_scores": rubric_items,
            "total_score": sum(item["score"] for item in rubric_items),
            "total_possible": sum(item["total"] for item in rubric_items),
        })

    return jsonify(response_data)

@app.route("/evaluate_question_set", methods=["POST"])
def evaluate_question_set():
    """
    Grade every question of one submission and return per-question and overall totals.

    Expected JSON body:
        {
            "question_set_id": "uuid",
            "submission_id": "uuid",
            "questions": [
                {"question_id": "uuid", "question": "text",
                 "answer": "text", "rubric": "text"},
                ...
            ]
        }

    Returns:
        A JSON object echoing "question_set_id" and "submission_id", with
        "overall_score", "overall_possible" and "question_evaluations" (one
        object per question holding "question_id", "rubric_scores",
        "total_score" and "total_possible").
        A body that does not match the expected shape yields a 400 response
        with an "error" message.
    """
    data = request.get_json()

    # Validate everything that is read later *before* running the model, so a
    # missing id cannot fail the request after the evaluation has already run.
    question_keys = ("question_id", "question", "answer", "rubric")
    payload_ok = (
        isinstance(data, dict)
        and "question_set_id" in data
        and "submission_id" in data
        and isinstance(data.get("questions"), list)
        and all(
            isinstance(item, dict) and all(key in item for key in question_keys)
            for item in data["questions"]
        )
    )
    if not payload_ok:
        return jsonify({
            "error": (
                "Expected a JSON object with 'question_set_id', 'submission_id' and a "
                "'questions' list of objects with 'question_id', 'question', 'answer' "
                "and 'rubric' keys."
            )
        }), 400

    questions = data["questions"]

    # Prepare data for evaluation (only the fields the prompt builder needs).
    eval_data = [
        {
            "question": question_data["question"],
            "answer": question_data["answer"],
            "rubric": question_data["rubric"],
        }
        for question_data in questions
    ]

    # Evaluate all questions in the set
    messages_batches = batch_messages_together(eval_data)
    grouped_outputs = run_llm_in_batches(messages_batches, batch_size=48, group_size=5)
    final_results = aggregate_majority_votes(grouped_outputs)

    # Structure results
    response_data = {
        "question_set_id": data["question_set_id"],
        "submission_id": data["submission_id"],
        "overall_score": 0,
        "overall_possible": 0,
        "question_evaluations": []
    }

    # Results come back in the same order as the submitted questions.
    for question_results, question_data in zip(final_results, questions):
        rubric_items = [
            {
                "rubric_text": rubric_text,
                "score": score,
                "total": total,
                "explanation": explanation,
            }
            for rubric_text, score, total, explanation in question_results
        ]
        question_eval = {
            "question_id": question_data["question_id"],
            "rubric_scores": rubric_items,
            "total_score": sum(item["score"] for item in rubric_items),
            "total_possible": sum(item["total"] for item in rubric_items),
        }

        response_data["question_evaluations"].append(question_eval)
        response_data["overall_score"] += question_eval["total_score"]
        response_data["overall_possible"] += question_eval["total_possible"]

    return jsonify(response_data)

In [23]:
import os
import threading

from pyngrok import ngrok, conf

# The ngrok authtoken is a credential: read it from the environment rather
# than embedding it in the notebook, where it would be shared with the file.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_auth_token:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before starting the tunnel.")
conf.get_default().auth_token = ngrok_auth_token

# Stop any ngrok process left over from a previous run of this cell.
ngrok.kill()

# One constant keeps the tunnel target and the Flask server on the same port.
FLASK_PORT = 5000

# Expose the Flask app to the internet
public_url = ngrok.connect(FLASK_PORT)
print(f"Your public URL is: {public_url}")

# Run Flask in a separate thread to avoid blocking
threading.Thread(target=app.run, kwargs={"port": FLASK_PORT, "use_reloader": False}).start()

Exception in thread Thread-8 (_monitor_process):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/pyngrok/process.py", line 139, in _monitor_process
    self._log_line(self.proc.stdout.readline())
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/encodings/ascii.py", line 26, in decode
    return codecs.ascii_decode(input, self.errors)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 184: ordinal not in range(128)


Your public URL is: NgrokTunnel: "https://a23a-34-59-154-2.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off
