# MetaMind CoT - Claude

In [49]:
import json
from pathlib import Path
from typing import Dict

PROMPT_PATH = Path("MetaMindCoT.json") # prompt config file; opened and parsed as JSON further down
def build_system_prompt(prompt_cfg: Dict[str, str]) -> str:
    """Assemble the system prompt from the sections of the prompt config.

    The "sys_instruction", "thinking_steps" and "return_format" entries are
    read in that order, stripped of surrounding whitespace, and joined with a
    blank line between them. Sections that are missing or blank are skipped.
    """
    section_keys = ("sys_instruction", "thinking_steps", "return_format")
    sections = []
    for key in section_keys:
        text = prompt_cfg.get(key, "").strip()
        if text:
            sections.append(text)
    return "\n\n".join(sections)

# Read the prompt configuration (UTF-8 JSON) from disk.
prompt_cfg = json.loads(PROMPT_PATH.read_text(encoding="utf-8"))

# Join its sections into the single system prompt string.
system_prompt = build_system_prompt(prompt_cfg)
# Uncomment to inspect the assembled prompt:
# print(system_prompt)

# Benchmark file; read by load_jsonl further down.
EVAL_JSONL_PATH = Path("output.jsonl")
def load_jsonl(path: Path):
    """Parse a JSON Lines file into a list, one parsed value per line.

    The file is read as UTF-8. Lines containing only whitespace are skipped.
    """
    records = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records
benchmarks = load_jsonl(EVAL_JSONL_PATH)
print(f"Loaded {len(benchmarks)} benchmark entries.")
# Fail with a clear message instead of a bare IndexError on benchmarks[0]
# when the file contains no entries.
if not benchmarks:
    raise ValueError(f"No benchmark entries found in {EVAL_JSONL_PATH}")
print("Sample entry:", benchmarks[0])

def make_messages(benchmark: Dict, system_prompt: str):
    '''
    Build the [system_message, user_message] pair for one benchmark entry.

    The user message carries the entry's INDEX, its stripped STORY and
    QUESTION, and one "KEY: value" line for every key that starts with
    "OPTION-", in the order the keys appear in the entry.

    returns: the "messages" list to pass to llm_query
    '''
    story = benchmark.get("STORY", "").strip()
    query = benchmark.get("QUESTION", "").strip()

    # Collect the answer choices in the entry's own key order.
    option_lines = []
    for name, text in benchmark.items():
        if name.startswith("OPTION-"):
            option_lines.append(f"{name}: {text}")
    choices = "\n".join(option_lines)

    user_text = (
        f"id: {benchmark.get('INDEX')}\n\n"
        f"Story:\n{story}\n\n"
        f"Question:\n{query}\n\n"
        f"Options:\n{choices}"
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text},
    ]

# Preview the message pair built for the first benchmark entry.
print("Sample messages:", make_messages(benchmarks[0], system_prompt))

# Anthropic Claude in place of other models
from anthropic import Anthropic
import os
from dotenv import load_dotenv

# load_dotenv() runs before the key lookup; the looked-up value is None when
# ANTHROPIC_API_KEY is not set in the environment.
load_dotenv()
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def llm_query(messages, model="claude-sonnet-4-5-20250929", max_tokens=2048):
    '''
    Send one question to the model and return the reply text.

    messages: a list [system_message, user_message] where
        system_message = {"role": "system", "content": system_prompt}
        user_message   = {"role": "user", "content": "..."}
      Only the system message's "content" is used; it is passed through the
      separate `system` argument of the API call, and the user message is
      sent as the sole entry of `messages`.
    model: model identifier passed to the API call.
    max_tokens: output token cap passed to the API call.

    returns: the text of the first text block in the response, or "" when the
      response holds no text block (so callers see an unparsable reply rather
      than an IndexError/AttributeError). The request asks for output that
      matches a JSON schema with a single required "pred" field restricted
      to "A", "B", "C" or "D".
    '''

    system_prompt = messages[0]['content'] # extract system prompt
    user_messages = [messages[1]]

    # Schema for the structured reply: exactly one answer letter.
    answer_schema = {
        "type": "object",
        "properties": {
            "pred": {
                "type": "string",
                "enum": ["A", "B", "C", "D"],
                "description": "The predicted answer"
            }
        },
        "required": ["pred"],
        "additionalProperties": False
    }

    response = client.messages.create(
        max_tokens=max_tokens,
        system=system_prompt,
        messages=user_messages,
        model=model,
        output_config={
            "format": {
                "type": "json_schema",
                "schema": answer_schema
            }
        }
    )

    # Do not assume the first content block exists or is a text block.
    for block in response.content:
        if getattr(block, "type", None) == "text":
            return block.text
    return ""

Loaded 600 benchmark entries.
Sample entry: {'ABILITY': 'Belief: Location false beliefs', 'INDEX': 1, 'STORY': 'Xiaogang and Xiaoming are wandering in the bedroom, they see a handbag, a briefcase, and a backpack, they find cabbage in the handbag, Xiaoming leaves the bedroom, Xiaogang moves the cabbage to the backpack.', 'QUESTION': 'Where is the cabbage now?', 'OPTION-A': 'Backpack', 'OPTION-B': 'Handbag', 'OPTION-C': 'Tote bag', 'OPTION-D': 'Briefcase', 'ANSWER': 'A'}
Sample messages: [{'role': 'system', 'content': 'You are a structured social reasoning system that models human social cognition using explicit Theory-of-Mind inference and norm-aware reasoning. Given a user input and optional context, infer latent mental states, refine them using social and ethical considerations, and generate a grounded response.\n\nOutput ONLY valid JSON.\n\nInstructions:\n1) Generate 3-5 distinct hypotheses about the user\'s latent mental states. Each hypothesis must include: id, type âˆˆ ["belief","

In [50]:
# Extract answers from raw_response and store to pandas dataframe
import pandas as pd
from tqdm import tqdm
rows = []

# Wrap the list itself (not the enumerate object) so tqdm knows the total
# and can render a real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(benchmarks[:100])):
    messages = make_messages(benchmark, system_prompt)

    raw_response = llm_query(messages)      # string

    print("=" * 50)
    print("Raw response:")
    print(raw_response)
    print("=" * 50)

    try:
        response = json.loads(raw_response) # dict
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; report it instead of hiding it.
        print(f"JSON parse error: {e}")
        response = {}
    if not isinstance(response, dict):
        # Valid JSON that is not an object (e.g. a list) has no "pred" field.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    pred = response.get("pred", "")
    # A missing/null/non-string prediction counts as an empty (wrong) answer.
    pred = pred.strip() if isinstance(pred, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

MetaMindCoT = pd.DataFrame(rows)
# Create the output folder first: to_csv does not create missing directories,
# and failing here would lose the results of every API call above.
Path("eval_result").mkdir(parents=True, exist_ok=True)
MetaMindCoT.to_csv("eval_result/MetaMindCoT_Claude.csv", index=False)

1it [00:01,  1.19s/it]

Raw response:
{"pred":"A"}





# General Prompt (No MetaMind-style prompt)

In [51]:
# The baseline prompt is the same for every question, so build it once
# instead of on every loop iteration.
general_prompt = (
    "You are a socially intelligent assistant answering multiple-choice reasoning questions.\n"
    "Return ONLY valid JSON.\n\n"
    "JSON schema:\n"
    "{\n"
    "  \"pred\": \"A|B|C|D\"\n"
    "}\n\n"
    "Rules:\n"
    "- pred must be exactly one of A, B, C, D.\n"
    "- Do not output anything outside the JSON."
)

rows = []
# Wrap the list itself (not the enumerate object) so tqdm knows the total
# and can render a real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(benchmarks[:100])):
    messages = make_messages(benchmark, general_prompt)

    raw_response = llm_query(messages)      # string

    print("=" * 50)
    print("Raw response:")
    print(raw_response)
    print("=" * 50)

    try:
        response = json.loads(raw_response) # dict
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError.
        print(f"JSON parse error: {e}")
        response = {}
    if not isinstance(response, dict):
        # Valid JSON that is not an object (e.g. a list) has no "pred" field.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    pred = response.get("pred", "")
    # A missing/null/non-string prediction counts as an empty (wrong) answer.
    pred = pred.strip() if isinstance(pred, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

GenClaude = pd.DataFrame(rows)
# Create the output folder first: to_csv does not create missing directories,
# and failing here would lose the results of every API call above.
Path("eval_result").mkdir(parents=True, exist_ok=True)
GenClaude.to_csv("eval_result/Gen_Claude.csv", index=False)

1it [00:01,  1.85s/it]

Raw response:
{"pred": "A"}


2it [00:02,  1.30s/it]

Raw response:
{"pred": "B"}


3it [00:04,  1.31s/it]

Raw response:
{"pred": "A"}


4it [00:05,  1.49s/it]

Raw response:
{"pred":"D"}


5it [00:06,  1.33s/it]

Raw response:
{"pred":"B"}


6it [00:07,  1.22s/it]

Raw response:
{"pred":"B"}


7it [00:09,  1.18s/it]

Raw response:
{"pred": "D"}


8it [00:10,  1.33s/it]

Raw response:
{"pred": "A"}


9it [00:12,  1.33s/it]

Raw response:
{"pred":"A"}


10it [00:13,  1.26s/it]

Raw response:
{"pred": "A"}


11it [00:15,  1.61s/it]

Raw response:
{"pred": "B"}


12it [00:16,  1.53s/it]

Raw response:
{"pred": "A"}


13it [00:18,  1.55s/it]

Raw response:
{"pred":"C"}


14it [00:20,  1.67s/it]

Raw response:
{"pred":"D"}


15it [00:22,  1.69s/it]

Raw response:
{"pred":"B"}


16it [00:23,  1.67s/it]

Raw response:
{"pred": "A"}


17it [00:25,  1.66s/it]

Raw response:
{"pred": "D"}


18it [00:27,  1.68s/it]

Raw response:
{"pred": "B"}


19it [00:28,  1.62s/it]

Raw response:
{"pred":"D"}


20it [00:29,  1.40s/it]

Raw response:
{"pred":"D"}


21it [00:31,  1.54s/it]

Raw response:
{"pred":"C"}


22it [00:32,  1.45s/it]

Raw response:
{"pred":"D"}


23it [00:34,  1.53s/it]

Raw response:
{"pred":"B"}


24it [00:36,  1.64s/it]

Raw response:
{"pred":"B"}


25it [00:37,  1.45s/it]

Raw response:
{"pred": "B"}


26it [00:38,  1.35s/it]

Raw response:
{"pred": "B"}


27it [00:39,  1.28s/it]

Raw response:
{"pred":"D"}


28it [00:41,  1.41s/it]

Raw response:
{"pred": "C"}


29it [00:42,  1.33s/it]

Raw response:
{"pred": "B"}


30it [00:43,  1.39s/it]

Raw response:
{"pred": "D"}


31it [00:45,  1.35s/it]

Raw response:
{"pred":"A"}


32it [00:46,  1.43s/it]

Raw response:
{"pred":"C"}


33it [00:48,  1.41s/it]

Raw response:
{"pred":"D"}


34it [00:49,  1.53s/it]

Raw response:
{"pred": "C"}


35it [00:51,  1.44s/it]

Raw response:
{"pred":"D"}


36it [00:52,  1.38s/it]

Raw response:
{"pred":"C"}


37it [00:53,  1.37s/it]

Raw response:
{"pred":"C"}


38it [00:55,  1.50s/it]

Raw response:
{"pred":"B"}


39it [00:56,  1.43s/it]

Raw response:
{"pred":"A"}


40it [00:57,  1.30s/it]

Raw response:
{"pred":"D"}


41it [00:58,  1.27s/it]

Raw response:
{"pred":"C"}


42it [01:00,  1.33s/it]

Raw response:
{"pred": "C"}


43it [01:01,  1.24s/it]

Raw response:
{"pred":"D"}


44it [01:02,  1.18s/it]

Raw response:
{"pred": "C"}


45it [01:03,  1.12s/it]

Raw response:
{"pred": "A"}


46it [01:04,  1.07s/it]

Raw response:
{"pred": "A"}


47it [01:05,  1.21s/it]

Raw response:
{"pred": "B"}


48it [01:06,  1.15s/it]

Raw response:
{"pred":"A"}


49it [01:07,  1.09s/it]

Raw response:
{"pred": "B"}


50it [01:09,  1.24s/it]

Raw response:
{"pred": "B"}


51it [01:10,  1.19s/it]

Raw response:
{"pred":"C"}


52it [01:12,  1.38s/it]

Raw response:
{"pred":"C"}


53it [01:13,  1.30s/it]

Raw response:
{"pred": "D"}


54it [01:15,  1.42s/it]

Raw response:
{"pred": "B"}


55it [01:17,  1.53s/it]

Raw response:
{"pred": "A"}


56it [01:18,  1.44s/it]

Raw response:
{"pred": "B"}


57it [01:19,  1.34s/it]

Raw response:
{"pred":"B"}


58it [01:20,  1.38s/it]

Raw response:
{"pred": "B"}


59it [01:22,  1.33s/it]

Raw response:
{"pred":"D"}


60it [01:24,  1.53s/it]

Raw response:
{"pred":"C"}


61it [01:25,  1.61s/it]

Raw response:
{"pred":"C"}


62it [01:27,  1.49s/it]

Raw response:
{"pred": "C"}


63it [01:28,  1.38s/it]

Raw response:
{"pred":"D"}


64it [01:29,  1.37s/it]

Raw response:
{"pred": "B"}


65it [01:31,  1.54s/it]

Raw response:
{"pred": "A"}


66it [01:32,  1.51s/it]

Raw response:
{"pred":"A"}


66it [01:34,  1.43s/it]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
# Print the fraction of correct predictions (0.0-1.0) for both prompt variants.
# float(...) works whether mean() returns a NumPy scalar or a plain Python
# number, and makes the dict print as 1.0 rather than np.float64(1.0).
acc_summary = {
    "GenClaude": float(GenClaude["correct"].mean()),
    "MetaMind CoT": float(MetaMindCoT["correct"].mean()),
}
print(acc_summary)


{'GenClaude': np.float64(1.0), 'MetaMind CoT': np.float64(1.0)}


In [None]:
# Show the MetaMind CoT results built in the MetaMind evaluation cell.
# Do not rebuild the frame from `rows` here: `rows` is reset and refilled by
# the general-prompt cell, so rebuilding would overwrite MetaMindCoT with the
# general-prompt results.
MetaMindCoT


Unnamed: 0,id,benchmark_id,pred,gold,correct,raw_response
0,0,1,A,A,True,"{""pred"": ""A""}"
