# MetaMind CoT

In [None]:
import json
from pathlib import Path
from typing import Dict

# Location of the JSON file that holds the MetaMind CoT prompt configuration.
PROMPT_PATH = Path("../false_belief_cleaning/MetaMindCoT.json")
def build_system_prompt(prompt_cfg: Dict[str, str]) -> str:
    """Assemble the system prompt from the sections of a prompt config.

    The ``sys_instruction``, ``thinking_steps`` and ``return_format`` entries
    are read in that order, stripped of surrounding whitespace and joined
    with a blank line between them. Missing or empty sections are skipped.

    Args:
        prompt_cfg: Mapping from section name to section text.

    Returns:
        The combined prompt, or an empty string when no section has content.
    """
    section_keys = ("sys_instruction", "thinking_steps", "return_format")
    sections = []
    for section_key in section_keys:
        text = prompt_cfg.get(section_key, "").strip()
        if text:
            sections.append(text)
    return "\n\n".join(sections)

# Read the prompt configuration from disk and parse it as JSON.
prompt_cfg = json.loads(PROMPT_PATH.read_text(encoding="utf-8"))

# Combine the configured sections into the single system prompt string.
system_prompt = build_system_prompt(prompt_cfg)
# Uncomment to inspect the assembled prompt:
# print(system_prompt)

# JSONL file containing the benchmark entries to evaluate.
EVAL_JSONL_PATH = Path("../false_belief_cleaning/output.jsonl")
def load_jsonl(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``; lines that contain
    only whitespace are skipped.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed value per non-blank line, in file order.
    """
    records = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records
# Load every benchmark entry, then print the count and the first entry
# as a quick sanity check of the parsed data.
benchmarks = load_jsonl(EVAL_JSONL_PATH)
print(f"Loaded {len(benchmarks)} benchmark entries.")
first_entry = benchmarks[0]
print("Sample entry:", first_entry)

def make_messages(benchmark: Dict, system_prompt: str):
    """Build the two-message chat payload for one benchmark entry.

    The user message lists the entry's ``INDEX``, its stripped ``STORY`` and
    ``QUESTION`` texts, and one ``key: value`` line for every key that starts
    with ``OPTION-`` (in the mapping's iteration order).

    Args:
        benchmark: A single benchmark record.
        system_prompt: Text used verbatim as the system message content.

    Returns:
        A list ``[system_message, user_message]`` of role/content dicts.
    """
    story_text = benchmark.get("STORY", "").strip()
    question_text = benchmark.get("QUESTION", "").strip()

    # Collect the answer options as "OPTION-X: text" lines.
    option_lines = []
    for field in benchmark:
        if field.startswith("OPTION-"):
            option_lines.append(f"{field}: {benchmark.get(field, '')}")
    options_block = "\n".join(option_lines)

    user_content = "".join([
        f"id: {benchmark.get('INDEX')}\n\n",
        f"Story:\n{story_text}\n\n",
        f"Question:\n{question_text}\n\n",
        f"Options:\n{options_block}",
    ])

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

# Sanity check: show the chat messages built for the first benchmark entry.
sample_messages = make_messages(benchmarks[0], system_prompt)
print("Sample messages:", sample_messages)

from openai import OpenAI


# NOTE: the key below is a placeholder value, not a usable credential.
client = OpenAI(api_key="YOUR_API_KEY_HERE")

def llm_query(messages):
    """Send ``messages`` to the chat-completions API and return the reply text.

    The request goes through the module-level ``client`` with a fixed model,
    temperature 0.0, a 2048-token completion limit and the JSON-object
    response format.

    Args:
        messages: Chat messages to send, passed through unchanged.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    request_options = {
        "model": "gpt-5.2",
        "messages": messages,
        "temperature": 0.0,
        "max_completion_tokens": 2048,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
import pandas as pd
from tqdm import tqdm
# Evaluate the MetaMind CoT system prompt on the first 100 benchmark entries
# and collect one result row per entry.
rows = []

# Wrap the sliced list (not the enumerate object) so tqdm can read its length
# and display a real progress bar with a total.
for i, benchmark in enumerate(tqdm(benchmarks[:100])):
    messages = make_messages(benchmark, system_prompt)

    raw_response = llm_query(messages)      # expected to be a JSON string
    try:
        response = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError: the reply is not a string (e.g. None);
        # ValueError: the reply is not valid JSON.
        response = {}
    if not isinstance(response, dict):
        # Valid JSON that is not an object carries no "pred" field.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # Model output is untrusted: only accept a string prediction so a
    # null/number "pred" cannot abort the whole run with AttributeError.
    pred_value = response.get("pred")
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    # A missing prediction never counts as correct, even if gold is empty.
    result = bool(pred) and pred == correct_answer

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response,
    })

MetaMindCoT = pd.DataFrame(rows)
# NOTE(review): the other evaluation cell writes under
# ../false_belief_cleaning/eval_result/ — confirm which directory is intended.
metamind_csv_path = Path("../eval_result/MetaMindCoT_ChatGPT.csv")
# Create the output directory up front so the results are not lost to a
# missing-directory error after all the API calls have completed.
metamind_csv_path.parent.mkdir(parents=True, exist_ok=True)
MetaMindCoT.to_csv(metamind_csv_path, index=False)

# General Prompt

In [None]:
# Evaluate a plain, general-purpose system prompt on the first 100 benchmark
# entries and collect one result row per entry.
rows = []

# The prompt does not depend on the benchmark entry, so build it once
# instead of on every loop iteration.
general_prompt = (
    "You are a socially intelligent assistant answering multiple-choice reasoning questions.\n"
    "Return ONLY valid JSON.\n\n"
    "JSON schema:\n"
    "{\n"
    "  \"pred\": \"A|B|C|D\"\n"
    "}\n\n"
    "Rules:\n"
    "- pred must be exactly one of A, B, C, D.\n"
    "- Do not output anything outside the JSON."
)

# Wrap the sliced list (not the enumerate object) so tqdm can read its length
# and display a real progress bar with a total.
for i, benchmark in enumerate(tqdm(benchmarks[:100])):
    messages = make_messages(benchmark, general_prompt)

    raw_response = llm_query(messages)      # expected to be a JSON string
    try:
        response = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError: the reply is not a string (e.g. None);
        # ValueError: the reply is not valid JSON.
        response = {}
    if not isinstance(response, dict):
        # Valid JSON that is not an object carries no "pred" field.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # Model output is untrusted: only accept a string prediction so a
    # null/number "pred" cannot abort the whole run with AttributeError.
    pred_value = response.get("pred")
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    # A missing prediction never counts as correct, even if gold is empty.
    result = bool(pred) and pred == correct_answer

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response,
    })

GenGPT = pd.DataFrame(rows)
general_csv_path = Path("../false_belief_cleaning/eval_result/Gen_ChatGPT.csv")
# Create the output directory up front so the results are not lost to a
# missing-directory error after all the API calls have completed.
general_csv_path.parent.mkdir(parents=True, exist_ok=True)
GenGPT.to_csv(general_csv_path, index=False)

In [None]:
import matplotlib.pyplot as plt
# Accuracy per prompt variant: the mean of each frame's "correct" column.
# float() is used instead of .astype(float) because .astype exists only on
# NumPy scalars; float() accepts any numeric scalar and yields a plain Python
# float that prints cleanly.
acc_summary = {
    "GenGPT": float(GenGPT["correct"].mean()),
    "MetaMind CoT": float(MetaMindCoT["correct"].mean()),
}
print(acc_summary)
