In [3]:
import os
import re
import sys
import requests
import numpy as np
import pandas as pd
from collections import Counter
import random

#getting the test dev data from the file
dev_path = "cse476_final_project_dev_data.json"
test_path = "cse_476_final_project_test_data.json"
data = pd.read_json(test_path)
print(data.head())

#method to call the LLM API endpoint (using the given func in tutorial for now) #need to change
API_KEY = "cse476"
API_BASE = "http://10.4.58.53:41701/v1"
MODEL = "bens_model"
def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.0,
                                timeout: int = 60) -> dict:
    """
    Calls an OpenAI-style /v1/chat/completions endpoint and returns:
    { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int, 'error': str or None, 'headers': dict }
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": 128,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}
        else:
            # try best-effort to surface error text
            err_text = None
            try:
                err_text = resp.json()
            except Exception:
                err_text = resp.text
            return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except requests.RequestException as e:
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

                                               input
0  A student walks to school one morning and noti...
1  A marketing company pays its employees on a co...
2  Which record label released Van Morrison's "I'...
3  Jane has saved $4 of her allowance every week ...
4  About 61.1% of Victorians describe themselves ...


In [33]:
#function to classify the kind of question that is being asked (domain in the dev dataset)
########################## CONSTANTS OUTSIDE THE MAIN CODE BLOCKS ###################################
router_labels = {
    "math",
    "coding",
    "future_prediction",
    "planning",
    "common_sense"
}
#the prompt for the LLM to figure out what kind of question is being asked
router_layer_prompt = '''
You are a question classifier.

You will need to do the following:
- Read the input question
- Figure out which one of the following domains best describes the question
- Only reply with the domain name, nothing else

This is the list of valid domains, and a short description of the characteristics of the domain:
- math: questions that require any mathematical calculation, equations, inequalities, or any numerical reasoning.
- coding: question that ask about programs, code, algorithms, or debugging code.
- future_prediction: questions that ask about what will happen in the future, creating forecasts, or hypothetical events.
- planning: questions about making plans, schedules, or step-by-step strategies.
- common_sense: everyday reasoning, intuitive judgements, or logic questions that do not require math or coding to solve.

Always respond with only the name of the domain, which is one of:
math, coding, future_prediction, planning, common_sense
'''.strip()

########################## START OF THE METHODS USED BY THE AGENT ###################################

def build_routing_question(question):
    prompt = f"""
    Question:
    {question}

    Classify this question into one of the following domains:
    math, coding, future_prediction, planning, common_sense

    Reply with only the domain name.
    """.strip()
    return prompt

#building the full prompt
def classify_question(question):
    system_prompt = router_layer_prompt
    question_prompt = build_routing_question(question)

    response = call_model_chat_completions(
        prompt=question_prompt,
        system=system_prompt,
        temperature=0.0,
        timeout=5
    )
    domain = (response.get("text") or "").lower().strip()
    if domain in router_labels:
        return domain

    for label in router_labels:
        if label in domain:
            return label

    #worst case repsonse if nothing matches
    return "common_sense"


'''         These are the domain specific prompts for each of the router labels that were defined above         '''
def build_domain_system_prompt(domain):
    """
    math prompt idea:
    - use CoT for better thinking process
    - use self-consistency to ensure that the majority of models converge on a single answer
    - then format final answer after the majority ans is decided
    """
    sys_prompt = (
        "You are a helpful assistant.\n"
        "Follow the user's question prompt exactly.\n"
        "Take time to reason in a step-by-step manner internally.\n"
        "The output should be returned only in the format requested, with no thought process or explanation.\n"
    )
    if domain == "math":
        sys_prompt = (
            "You are the best mathematical assistant.\n"
            "For each math question, do the following:\n"
            "- Think about the math problem step-by-step.\n"
            "- Carefully check the algebra, arithmetic, and logic done during each step\n"
            "- Make sure the the solution meets the criteria listed in the question.\n"
            "- If the answer is a number, output just that number."
            "If it is an expression, output just that expression.\n"
            "Do NOT include phrases like 'The answer is' or any reasoning."
        )
    elif domain == "coding":
        sys_prompt = (
            "You are an expert software engineer.\n"
            "You MUST follow these rules for every coding problem:\n"
            "1. Read the problem carefully and understand the required behavior.\n"
            "2. Plan the solution internally (in your head) before you write code.\n"
            "3. Output ONLY the final code solution, with no explanation or comments.\n"
            "4. Do NOT wrap the code in markdown fences (no ```), just plain code.\n"
            "5. Match the language and any format specified in the problem statement "
            "(for example, if it says 'in Python', write Python code).\n"
            "6. If a specific function signature or class name is given, use it exactly.\n"
        )
    elif domain in {"future_prediction", "planning", "common_sense"}:
        sys_prompt = (
            "You are a precise reasoning assistant.\n"
            "You will think through the question carefully and internally, "
            "and then output ONLY the final answer in the requested format "
            "(usually a single word, short phrase, or number). "
            "Do NOT show your reasoning."
        )
        if domain == "future_prediction":
            sys_prompt = (
                "You are a precise commonsense reasoner.\n"
                "For each question, decide whether the correct answer is yes or no.\n"
                "Internally, think it through carefully.\n"
                "Then output ONLY:\n"
                "  true   if the correct answer is yes\n"
                "  false  if the correct answer is no\n"
                "Use lowercase, no punctuation, no extra words."
                "Only answer true or false.\n"
            )
        elif domain == "planning":
            sys_prompt += "\nFocus on producing a clear, actionable plan or decision."

        elif domain == "common_sense":
            sys_prompt += "\nFocus on everyday commonsense and intuitive reasoning."

    return sys_prompt

########################## MATH DOMAIN SPECIFIC CoT + Self Consistency ###################################

def call_math_once(question, temp):
    system_prompt  = build_domain_system_prompt("math")
    response = call_model_chat_completions(
        prompt=question,
        system = system_prompt,
        temperature = temp
    )
    return (response.get("text") or "").strip()

def math_formatting(answer):
    answer = answer.strip().lower()
    answer = re.sub(r"[^0-9\-\/\.]", "", answer)
    return answer

def math_self_consistency(question, polls, temp):
    answers = []
    for _ in range(polls):
        curr_answer = call_math_once(question, temp)
        answers.append(curr_answer)
    return answers

def math_majority_vote(answers):
    if not answers:
        return ""
    formatted = [math_formatting(ans) for ans in answers if ans.strip()]
    if not formatted:
        return answers[0]

    votes = Counter(formatted)
    best_match,_ = votes.most_common(1)[0]

    for ans in answers:
        if math_formatting(ans) == best_match:
            return ans

    return answers[0]

def extract_final_answer_math(text: str) -> str:
    s = text.strip()

    m = re.findall(r"####\s*([^\n]+)", s)
    if m:
        return m[-1].strip()

    frac_matches = re.findall(r"\\frac\{([^}]+)\}\{([^}]+)\}", s)
    if frac_matches:
        num, den = frac_matches[-1]
        return f"\\frac{{{num}}}{{{den}}}"

    num_matches = re.findall(r"-?\d+(?:\.\d+)?", s)
    if num_matches:
        return num_matches[-1]

    lines = [line.strip() for line in s.splitlines() if line.strip()]
    if lines:
        return lines[-1]
    return s

def math_finalize(answer):
    prompt = (
        f"Current answer: {answer}\n"
        "Is this answer in its simplest form. If it is a fraction, try to reduce it to an integer if possible"
    )
    system_prompt = "Your job is to reduce and simplify math answers if possible. Only simplify answers if possible"
    response = call_model_chat_completions(
        prompt=prompt,
        system = system_prompt,
        temperature = 0
    )
    final_answer = (response.get("text") or "").strip()
    return final_answer

def math_cot(question, curr_ans):
    system_prompt = (
        "You are a mathematical validator.\n"
        "Your job is to verify a proposed answer to a math question.\n"
        "You need to think step-by-step to decide whether the answer is valid for the question.\n"
        "If the proposed answer is correct, then only output that answer with no change.\n"
        "If it is incorrect, then recompute the correct answer, and only output the correct answer.\n"
        "In any case, only output the single final answer, expression, or format requested\n"
        "No explanations, no sentences, no thought process, no testing, only the final answer."
    )
    prompt = (
        f"Problem: \n {question}\n"
        f"Proposed answer: {curr_ans}\n"
        "Carefully verify whether the answer is valid or not.\n"
        "If needed, recompute and output the correct answer.\n"
    )

    response = call_model_chat_completions(
        prompt=prompt,
        system = system_prompt,
        temperature = 0
    )
    cot_checked = (response.get("text") or "").strip()
    return cot_checked

def math_main(question):
    answers = math_self_consistency(question, polls=3, temp=0.3)
    majority_vote = math_majority_vote(answers)
    simplified = math_finalize(majority_vote)
    CoT_check = math_cot(question, simplified)
    return extract_final_answer_math(CoT_check)

########################## CODING DOMAIN SPECIFIC SYSTEM PROMPTS ###################################

def call_coding_once(question, temp):
    system_prompt  = build_domain_system_prompt("coding")
    response = call_model_chat_completions(
        prompt=question,
        system = system_prompt,
        temperature = temp
    )
    return (response.get("text") or "").strip()

def coding_main(question):
    answer = call_coding_once(question, temp=0)
    return answer

########################## REASONING DOMAIN SPECIFIC (self-consistency used here for common sense questions) ###################################

'''----------------------- Common sense methods ------------------------'''
def common_sense_booleans(answer):
    answer = answer.strip().lower()
    if "true" in answer or "yes" in answer:
        return "true"
    elif "false" in answer or "no" in answer:
        return "false"
    else:
        return ""

def common_sense_main(question, polls=3):
    system_prompt = build_domain_system_prompt("common_sense")
    answers = []

    for _ in range(polls):
        response = call_model_chat_completions(
            prompt=question,
            system = system_prompt,
            temperature = 0.3
        )
        answer = (response.get("text") or "").strip()
        answers.append(answer)

    votes = Counter(common_sense_booleans(answer) for answer in answers)
    votes.pop("", None)

    if not votes:
        return answers[0]

    best_match,_ = votes.most_common(1)[0]
    for answer in answers:
        if common_sense_booleans(answer) == best_match:
            return answer

    return answers[0]

'''---------------------- Simple llm call for planning and prediction domains ----------------------'''
def planning_predictions(domain, question):
    system_prompt = build_domain_system_prompt(domain)
    response = call_model_chat_completions(
        prompt=question,
        system = system_prompt,
        temperature = 0
    )
    answer = (response.get("text") or "").strip()
    return answer

'''---------------------- Main method for al three reasoning domains ----------------------'''
def reasoning_main(domain, question):
    if domain == "common_sense":
        return common_sense_main(question=question)
    elif domain in {"future_prediction", "planning"}:
        return planning_predictions(domain=domain, question=question)
    else:
        return planning_predictions(domain="common_sense", question=question)

########################## METHOD TO ROUTE QUESTIONS TO RESPECTIVE MAINS ###################################

'''---------------------- routing the question to specific process after the classification is done ----------------------'''
def route(domain: str, question: str) -> str:
    if domain == "math":
        return math_main(question)
    if domain == "coding":
        return coding_main(question)
    if domain in {"common_sense", "planning", "future_prediction"}:
        return reasoning_main(domain, question)
    return reasoning_main("common_sense", question)

def main(question):
    domain = classify_question(question)
    answer = route(domain, question)
    if answer=="yes" or answer=="Yes":
        answer = "true"
    if answer=="no" or answer=="No":
        answer = "false"
    return answer


In [35]:
def extract_final_answer_from_gold(gold: str) -> str:
    s = str(gold).strip()

    # If it uses the '#### answer' pattern, grab that
    m = re.findall(r"####\s*([^\n]+)", s)
    if m:
        return m[-1].strip()

    # Otherwise, treat the whole thing as the final answer
    # (for entries like '9', 'x^3+3x-6', '\frac{243}{625}', '144', etc.)
    return s

# --- The updated main loop ---

math_rows = data[data["domain"] == "math"].sample(n=300, random_state=42)

# Initialize accuracy tracking variables
correct_predictions = 0
total_questions = 0

for _, row in math_rows.iterrows():
    q = row["input"]
    gold_raw = row["output"]
    gold = extract_final_answer_from_gold(gold_raw)
    pred = math_main(q)

    # Check for a match and update counters
    is_match = (gold == pred)
    if is_match:
        correct_predictions += 1
    total_questions += 1

    # Calculate running accuracy
    running_accuracy = (correct_predictions / total_questions) * 100 if total_questions > 0 else 0

    print("QUESTION:")
    print(q)
    print("Ground Truth:", gold)
    print("LLM answer:", pred)
    print(f"MATCH: {is_match}")
    # Print running accuracy in a formatted way
    print(f"RUNNING ACCURACY: {running_accuracy:.2f}% ({correct_predictions}/{total_questions})")
    print("-" * 60)

# Optional: Print final accuracy at the very end of the loop
print(f"\nFinal Accuracy: {running_accuracy:.2f}% ({correct_predictions}/{total_questions})")


QUESTION:
How many positive whole-number divisors does 196 have?
Ground Truth: 9
LLM answer: 9
MATCH: True
RUNNING ACCURACY: 100.00% (1/1)
------------------------------------------------------------
QUESTION:
Expand and simplify completely: \begin{align*}
x\left(x(1+x)+2x\right)-3(x^2-x+2)
\end{align*}
Ground Truth: x^3+3x-6
LLM answer: 6
MATCH: False
RUNNING ACCURACY: 50.00% (1/2)
------------------------------------------------------------
QUESTION:
Arnel had ten boxes of pencils with the same number of pencils in each box.  He kept ten pencils and shared the remaining pencils equally with his five friends. If his friends got eight pencils each, how many pencils are in each box?
Ground Truth: 5
LLM answer: 5
MATCH: True
RUNNING ACCURACY: 66.67% (2/3)
------------------------------------------------------------
QUESTION:
A gardener plants three maple trees, four oaks, and five birch trees in a row. He plants them in random order, each arrangement being equally likely. Let $\frac m n$

KeyboardInterrupt: 

In [None]:
|