# MetaMind CoT

In [3]:
import json
from pathlib import Path
from typing import Dict

# Location of the JSON file holding the MetaMind CoT prompt sections.
PROMPT_PATH = Path("../false_belief_cleaning/MetaMindCoT.json")
def build_system_prompt(prompt_cfg: Dict[str, str]) -> str:
    """Assemble the system prompt from the sections of a prompt config.

    The sections are read in a fixed order: ``sys_instruction``,
    ``thinking_steps``, ``return_format``. Each one is stripped of surrounding
    whitespace; missing or blank sections are left out.

    Args:
        prompt_cfg: Mapping from section name to section text.

    Returns:
        The non-empty sections joined by a blank line, or ``""`` when none
        of them has content.
    """
    section_keys = ("sys_instruction", "thinking_steps", "return_format")
    sections = []
    for key in section_keys:
        text = prompt_cfg.get(key, "").strip()
        if text:
            sections.append(text)
    return "\n\n".join(sections)

# Read the prompt configuration from PROMPT_PATH. build_system_prompt calls .get() on the
# result, so the file is expected to hold a JSON object.
with PROMPT_PATH.open("r", encoding="utf-8") as f:
    prompt_cfg = json.load(f)

# build system prompt string
system_prompt = build_system_prompt(prompt_cfg)
# sanity check
# print(system_prompt)

# Location of the JSON Lines file holding the benchmark entries.
EVAL_JSONL_PATH = Path("../false_belief_cleaning/output.jsonl")
def load_jsonl(path: Path):
    """Read a JSON Lines file into a list.

    Args:
        path: File to read as UTF-8 text, one JSON document per line.

    Returns:
        The parsed value of every line that is not blank (whitespace-only
        lines are skipped), in file order.
    """
    records = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records
# Load every benchmark record, then print the count and the first record as a sanity check.
# benchmarks[0] raises IndexError if the file contained no records.
benchmarks = load_jsonl(EVAL_JSONL_PATH)
print(f"Loaded {len(benchmarks)} benchmark entries.")
print("Sample entry:", benchmarks[0])

def make_messages(benchmark: Dict, system_prompt: str):
    """Build the two-message chat payload for one benchmark entry.

    Args:
        benchmark: Benchmark record. ``STORY`` and ``QUESTION`` are stripped
            (defaulting to ``""``), ``INDEX`` is echoed as the id, and every
            key starting with ``OPTION-`` becomes one ``key: value`` line, in
            the record's own key order.
        system_prompt: Text used verbatim as the system message.

    Returns:
        ``[system_message, user_message]`` as role/content dicts.
    """
    story = benchmark.get("STORY", "").strip()
    query = benchmark.get("QUESTION", "").strip()

    option_lines = []
    for name, value in benchmark.items():
        if name.startswith("OPTION-"):
            option_lines.append(f"{name}: {value}")
    option_text = "\n".join(option_lines)

    user_content = (
        f"id: {benchmark.get('INDEX')}\n\n"
        f"Story:\n{story}\n\n"
        f"Question:\n{query}\n\n"
        f"Options:\n{option_text}"
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

# Sanity check: show the chat messages built for the first benchmark entry.
print("Sample messages:", make_messages(benchmarks[0], system_prompt))

from openai import OpenAI


import os

# Take the API key from the OPENAI_API_KEY environment variable so no credential has to be
# typed into (and committed with) the notebook. The placeholder is kept as the fallback so
# the cell still constructs a client when the variable is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY_HERE"))

def llm_query(messages):
    """Send ``messages`` to the chat-completions endpoint of the module-level ``client``.

    The request uses model ``gpt-5.2`` at temperature 0.0, a completion limit
    of 2048 tokens, and asks for a ``json_object`` response format.

    Args:
        messages: Chat messages, e.g. the list returned by ``make_messages``.

    Returns:
        The ``content`` of the first choice's message.
    """
    request_options = {
        "model": "gpt-5.2",
        "messages": messages,
        "temperature": 0.0,
        "max_completion_tokens": 2048,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

Loaded 600 benchmark entries.
Sample entry: {'ABILITY': 'Belief: Location false beliefs', 'INDEX': 1, 'STORY': 'Xiaogang and Xiaoming are wandering in the bedroom, they see a handbag, a briefcase, and a backpack, they find cabbage in the handbag, Xiaoming leaves the bedroom, Xiaogang moves the cabbage to the backpack.', 'QUESTION': 'Where is the cabbage now?', 'OPTION-A': 'Backpack', 'OPTION-B': 'Handbag', 'OPTION-C': 'Tote bag', 'OPTION-D': 'Briefcase', 'ANSWER': 'A'}
Sample messages: [{'role': 'system', 'content': 'You are a structured social reasoning system that models human social cognition using explicit Theory-of-Mind inference and norm-aware reasoning. Given a user input and optional context, infer latent mental states, refine them using social and ethical considerations, and generate a grounded response.\n\nOutput ONLY valid JSON.\n\nInstructions:\n1) Generate 3-5 distinct hypotheses about the user\'s latent mental states. Each hypothesis must include: id, type ∈ ["belief","de

In [None]:
import pandas as pd
from tqdm import tqdm
# Score the MetaMind CoT system prompt on the first 100 benchmark entries.
rows = []
eval_benchmarks = benchmarks[:100]

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, system_prompt)

    raw_response = llm_query(messages)      # string
    try:
        response = json.loads(raw_response) # dict when the model followed the schema
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

MetaMindCoT = pd.DataFrame(rows)
# NOTE(review): the general-prompt cell writes under ../false_belief_cleaning/eval_result/ —
# confirm that ../eval_result/ is the intended directory for this file.
MetaMindCoT.to_csv("../eval_result/MetaMindCoT_ChatGPT.csv", index=False)

# General Prompt

In [None]:
# Baseline: score a plain "answer in JSON" system prompt on the first 100 benchmark entries.
# The prompt is identical for every entry, so it is built once outside the loop.
general_prompt = (
    "You are a socially intelligent assistant answering multiple-choice reasoning questions.\n"
    "Return ONLY valid JSON.\n\n"
    "JSON schema:\n"
    "{\n"
    "  \"pred\": \"A|B|C|D\"\n"
    "}\n\n"
    "Rules:\n"
    "- pred must be exactly one of A, B, C, D.\n"
    "- Do not output anything outside the JSON."
)

rows = []
eval_benchmarks = benchmarks[:100]

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, general_prompt)

    raw_response = llm_query(messages)      # string
    try:
        response = json.loads(raw_response) # dict when the model followed the schema
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

GenGPT = pd.DataFrame(rows)
GenGPT.to_csv("../false_belief_cleaning/eval_result/Gen_ChatGPT.csv", index=False)

In [None]:
import matplotlib.pyplot as plt
# Accuracy per prompt: the mean of the boolean "correct" column is the fraction of
# entries answered correctly.
acc_summary = {
    label: results["correct"].mean().astype(float)
    for label, results in (
        ("GenGPT", GenGPT),
        ("MetaMind CoT", MetaMindCoT),
    )
}
print(acc_summary)


# Gemini Implementation

In [None]:
import google.generativeai as genai
import json

import os

# Read the Gemini API key from the environment instead of embedding it in the notebook.
# SECURITY: a literal key used to be committed on this line; treat it as leaked and revoke it.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running the Gemini cells.")
genai.configure(api_key=_gemini_api_key)

# Rate limit: 30 requests per minute, i.e. at least 60 / 30 = 2 seconds between requests.
# Deriving the delay from the limit keeps the two from drifting apart.
REQUESTS_PER_MINUTE = 30
DELAY_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE

def make_gemini_prompt_from_messages(messages):
    """Flatten chat messages into one prompt string.

    Only ``system`` and ``user`` messages are used; when a role appears more
    than once, the last occurrence wins. Other roles are ignored.

    Args:
        messages: Iterable of dicts with a ``role`` key and, for system/user
            messages, a ``content`` key.

    Returns:
        The system content and the user content separated by a blank line
        (either part is ``""`` when that role is absent).
    """
    latest_by_role = {"system": "", "user": ""}
    for message in messages:
        role = message["role"]
        if role in latest_by_role:
            latest_by_role[role] = message["content"]
    return f"{latest_by_role['system']}\n\n{latest_by_role['user']}"

def extract_json_simple(text):
    """Return the JSON portion of a model reply as a string.

    Resolution order:
      1. If ``text`` as a whole is valid JSON, it is returned unchanged.
      2. Otherwise the first ``{`` from which the JSON parser can decode a
         complete object is located and exactly that object's text is
         returned. Using the parser (rather than counting braces) keeps
         braces inside string values from truncating the object, and lets
         the search skip brace groups that are not JSON.
      3. If nothing decodes, the brace-balanced span starting at the first
         ``{`` is returned (or everything from that ``{`` when the braces
         never balance), and ``text`` itself when it contains no ``{``.

    Args:
        text: Raw model output.

    Returns:
        A string; callers still need to ``json.loads`` it and handle failure.
    """
    try:
        json.loads(text)  # Validate it's JSON
        return text
    except (TypeError, ValueError):
        # Not pure JSON; fall through and search for an embedded object.
        pass

    first_brace = text.find('{')
    if first_brace == -1:
        return text

    decoder = json.JSONDecoder()
    start = first_brace
    while start != -1:
        candidate = text[start:]
        try:
            # raw_decode parses one document at the start of the string and reports
            # where it ended, ignoring whatever trails it.
            _, length = decoder.raw_decode(candidate)
        except ValueError:
            start = text.find('{', start + 1)
        else:
            return candidate[:length]

    # No decodable object anywhere: best-effort brace matching from the first brace.
    depth = 0
    for i in range(first_brace, len(text)):
        if text[i] == '{':
            depth += 1
        elif text[i] == '}':
            depth -= 1
            if depth == 0:
                return text[first_brace:i+1]
    return text[first_brace:]

def gemini_query(prompt: str, max_retries: int = 3, retry_delay: float = 2.0):
    """Query the Gemma model and return the JSON portion of its reply.

    Args:
        prompt: Full prompt text (system and user parts already combined).
        max_retries: Maximum number of attempts. This used to be read from an
            undefined global, which raised NameError on a fresh kernel.
        retry_delay: Seconds to wait after the first failed attempt; the wait
            doubles after each further failure.

    Returns:
        The string produced by ``extract_json_simple`` for the first
        successful attempt, or ``None`` when every attempt raised.
    """
    import time  # local import: the cell defining this function does not import time

    model = genai.GenerativeModel("models/gemma-3-12b-it")

    generation_config = {
        "temperature": 0.0,
        "max_output_tokens": 2048,
        # Gemma doesn't support JSON mode, will parse from text
    }

    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                prompt,
                generation_config=generation_config
            )
            return extract_json_simple(response.text)
        except Exception as e:
            # Best-effort: report the failure and try again rather than aborting the run.
            error_str = str(e)
            print(f"Error: {error_str}")
            if attempt + 1 < max_retries:
                # Back off before retrying; an immediate retry tends to hit the same failure.
                time.sleep(retry_delay * (2 ** attempt))

    # Every attempt failed; callers treat a None reply as an unparseable response.
    return None
    

# Build the Gemini prompt for the first benchmark entry and print it for inspection.
sample_messages = make_messages(benchmarks[0], system_prompt)
sample_prompt = make_gemini_prompt_from_messages(sample_messages)

print("Sample entry:", benchmarks[0])
# repr() keeps newlines visible as \n so the prompt layout can be checked.
print("Sample Gemini prompt:\n", repr(sample_prompt))


Sample entry: {'ABILITY': 'Belief: Location false beliefs', 'INDEX': 1, 'STORY': 'Xiaogang and Xiaoming are wandering in the bedroom, they see a handbag, a briefcase, and a backpack, they find cabbage in the handbag, Xiaoming leaves the bedroom, Xiaogang moves the cabbage to the backpack.', 'QUESTION': 'Where is the cabbage now?', 'OPTION-A': 'Backpack', 'OPTION-B': 'Handbag', 'OPTION-C': 'Tote bag', 'OPTION-D': 'Briefcase', 'ANSWER': 'A'}
Sample Gemini prompt:
 'You are a structured social reasoning system that models human social cognition using explicit Theory-of-Mind inference and norm-aware reasoning. Given a user input and optional context, infer latent mental states, refine them using social and ethical considerations, and generate a grounded response.\n\nOutput ONLY valid JSON.\n\nInstructions:\n1) Generate 3-5 distinct hypotheses about the user\'s latent mental states. Each hypothesis must include: id, type ∈ ["belief","desire","intention","emotion","social_concern"], hypothes

In [None]:
# Gemini MetaMind CoT Prompting evaluation 
import pandas as pd
from tqdm import tqdm
import time

# Score the MetaMind CoT system prompt with Gemini on the first 100 benchmark entries.
rows = []
eval_benchmarks = benchmarks[:100]
last_index = len(eval_benchmarks) - 1

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, system_prompt)
    prompt = make_gemini_prompt_from_messages(messages)

    raw_response = gemini_query(prompt)
    try:
        response = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

    # Rate limiting: wait between requests (except after the last one). The bound comes
    # from the evaluated slice, so it stays right if the slice size changes.
    if i < last_index:
        time.sleep(DELAY_BETWEEN_REQUESTS)

MetaMindCoT_Gemini = pd.DataFrame(rows)
MetaMindCoT_Gemini.to_csv("eval_result/MetaMindCoT_Gemini.csv", index=False)
print(f"\nMetaMind CoT (Gemini) Accuracy: {MetaMindCoT_Gemini['correct'].mean():.2f}")


Starting evaluation with 3s delay between requests...
Estimated time for 100 predictions: ~5.0 minutes


100it [16:03,  9.64s/it]


MetaMind CoT (Gemini) Accuracy: 0.78





In [None]:
# Gemini General prompt evaluation
# Baseline: score a plain "answer in JSON" prompt with Gemini on the first 100 entries.
rows = []
general_prompt = (
    "You are a socially intelligent assistant answering multiple-choice reasoning questions.\n"
    "Return ONLY valid JSON.\n\n"
    "JSON schema:\n"
    "{\n"
    "  \"pred\": \"A|B|C|D\"\n"
    "}\n\n"
    "Rules:\n"
    "- pred must be exactly one of A, B, C, D.\n"
    "- Do not output anything outside the JSON."
)

eval_benchmarks = benchmarks[:100]
last_index = len(eval_benchmarks) - 1

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, general_prompt)
    prompt = make_gemini_prompt_from_messages(messages)

    raw_response = gemini_query(prompt)
    try:
        response = json.loads(raw_response)
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""
    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })
    # Pause between requests, but not after the final one. The bound comes from the
    # evaluated slice, so it stays right if the slice size changes.
    if i < last_index:
        time.sleep(DELAY_BETWEEN_REQUESTS)

Gen_Gemini = pd.DataFrame(rows)
Gen_Gemini.to_csv("eval_result/Gen_Gemini.csv", index=False)
print(f"General Prompt (Gemini) Accuracy: {Gen_Gemini['correct'].mean():.4f}")


Starting General evaluation with 2s delay between requests...
Estimated time for 100 predictions: ~3.3 minutes


100it [14:52,  8.92s/it]

General Prompt (Gemini) Accuracy: 0.7000





In [40]:
# Gemini accuracy summary, as percentages: mean of the boolean "correct" column times 100.
acc_summary_gemini = {
    label: results["correct"].mean()*100
    for label, results in (
        ("Gen_Gemini", Gen_Gemini),
        ("MetaMindCoT_Gemini", MetaMindCoT_Gemini),
    )
}
print(acc_summary_gemini)


{'Gen_Gemini': np.float64(70.0), 'MetaMindCoT_Gemini': np.float64(78.0)}


# DeepSeek Implementation

##### MetaMind Prompt

In [None]:
import json
from pathlib import Path
from typing import Dict

# Location of the JSON file holding the MetaMind CoT prompt sections.
PROMPT_PATH = Path("../false_belief_cleaning/MetaMindCoT.json")
def build_system_prompt(prompt_cfg: Dict[str, str]) -> str:
    """Assemble the system prompt from the sections of a prompt config.

    The sections are read in a fixed order: ``sys_instruction``,
    ``thinking_steps``, ``return_format``. Each one is stripped of surrounding
    whitespace; missing or blank sections are left out.

    Args:
        prompt_cfg: Mapping from section name to section text.

    Returns:
        The non-empty sections joined by a blank line, or ``""`` when none
        of them has content.
    """
    section_keys = ("sys_instruction", "thinking_steps", "return_format")
    sections = []
    for key in section_keys:
        text = prompt_cfg.get(key, "").strip()
        if text:
            sections.append(text)
    return "\n\n".join(sections)

# Read the prompt configuration from PROMPT_PATH. build_system_prompt calls .get() on the
# result, so the file is expected to hold a JSON object.
with PROMPT_PATH.open("r", encoding="utf-8") as f:
    prompt_cfg = json.load(f)

# build system prompt string
system_prompt = build_system_prompt(prompt_cfg)
# sanity check
# print(system_prompt)

# Location of the JSON Lines file holding the benchmark entries.
EVAL_JSONL_PATH = Path("../false_belief_cleaning/output.jsonl")
def load_jsonl(path: Path):
    """Read a JSON Lines file into a list.

    Args:
        path: File to read as UTF-8 text, one JSON document per line.

    Returns:
        The parsed value of every line that is not blank (whitespace-only
        lines are skipped), in file order.
    """
    records = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records
# Load every benchmark record, then print the count and the first record as a sanity check.
# benchmarks[0] raises IndexError if the file contained no records.
benchmarks = load_jsonl(EVAL_JSONL_PATH)
print(f"Loaded {len(benchmarks)} benchmark entries.")
print("Sample entry:", benchmarks[0])

def make_messages(benchmark: Dict, system_prompt: str):
    """Build the two-message chat payload for one benchmark entry.

    Args:
        benchmark: Benchmark record. ``STORY`` and ``QUESTION`` are stripped
            (defaulting to ``""``), ``INDEX`` is echoed as the id, and every
            key starting with ``OPTION-`` becomes one ``key: value`` line, in
            the record's own key order.
        system_prompt: Text used verbatim as the system message.

    Returns:
        ``[system_message, user_message]`` as role/content dicts.
    """
    story = benchmark.get("STORY", "").strip()
    query = benchmark.get("QUESTION", "").strip()

    option_lines = []
    for name, value in benchmark.items():
        if name.startswith("OPTION-"):
            option_lines.append(f"{name}: {value}")
    option_text = "\n".join(option_lines)

    user_content = (
        f"id: {benchmark.get('INDEX')}\n\n"
        f"Story:\n{story}\n\n"
        f"Question:\n{query}\n\n"
        f"Options:\n{option_text}"
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

# Sanity check: show the chat messages built for the first benchmark entry.
print("Sample messages:", make_messages(benchmarks[0], system_prompt))

from openai import OpenAI


import os

# DeepSeek is reached through the OpenAI client by overriding base_url. The key is taken from
# the DEEPSEEK_API_KEY environment variable so no credential has to be typed into the
# notebook; the placeholder is kept as the fallback when the variable is unset.
# NOTE: this rebinds the module-level `client`, so cells run afterwards talk to DeepSeek.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", 'your-api-key'), base_url="https://api.deepseek.com")

def llm_query(messages):
    """Send ``messages`` to the chat-completions endpoint of the module-level ``client``.

    The request uses model ``deepseek-chat`` at temperature 0.0, a completion
    limit of 2048 tokens, and asks for a ``json_object`` response format.

    Args:
        messages: Chat messages, e.g. the list returned by ``make_messages``.

    Returns:
        The ``content`` of the first choice's message.
    """
    request_options = {
        "model": "deepseek-chat",
        "messages": messages,
        "temperature": 0.0,
        "max_completion_tokens": 2048,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message.content

Loaded 600 benchmark entries.
Sample entry: {'ABILITY': 'Belief: Location false beliefs', 'INDEX': 1, 'STORY': 'Xiaogang and Xiaoming are wandering in the bedroom, they see a handbag, a briefcase, and a backpack, they find cabbage in the handbag, Xiaoming leaves the bedroom, Xiaogang moves the cabbage to the backpack.', 'QUESTION': 'Where is the cabbage now?', 'OPTION-A': 'Backpack', 'OPTION-B': 'Handbag', 'OPTION-C': 'Tote bag', 'OPTION-D': 'Briefcase', 'ANSWER': 'A'}
Sample messages: [{'role': 'system', 'content': 'You are a structured social reasoning system that models human social cognition using explicit Theory-of-Mind inference and norm-aware reasoning. Given a user input and optional context, infer latent mental states, refine them using social and ethical considerations, and generate a grounded response.\n\nOutput ONLY valid JSON.\n\nInstructions:\n1) Generate 3-5 distinct hypotheses about the user\'s latent mental states. Each hypothesis must include: id, type ∈ ["belief","de

In [None]:
import pandas as pd
from tqdm import tqdm
# Score the MetaMind CoT system prompt with DeepSeek on the first 100 benchmark entries.
rows = []
eval_benchmarks = benchmarks[:100]

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, system_prompt)

    raw_response = llm_query(messages)      # string
    try:
        response = json.loads(raw_response) # dict when the model followed the schema
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

# NOTE: the ChatGPT section binds its results to the same name, MetaMindCoT; this
# assignment replaces that frame in the kernel.
MetaMindCoT = pd.DataFrame(rows)
MetaMindCoT.to_csv("../false_belief_cleaning/eval_result/MetaMindCoT_DeepSeek.csv", index=False)

##### Simple Prompt

In [8]:
# Baseline: score a plain "answer in JSON" system prompt with DeepSeek on the first 100 entries.
# The prompt is identical for every entry, so it is built once outside the loop.
general_prompt = (
    "You are a socially intelligent assistant answering multiple-choice reasoning questions.\n"
    "Return ONLY valid JSON.\n\n"
    "JSON schema:\n"
    "{\n"
    "  \"pred\": \"A|B|C|D\"\n"
    "}\n\n"
    "Rules:\n"
    "- pred must be exactly one of A, B, C, D.\n"
    "- Do not output anything outside the JSON."
)

rows = []
eval_benchmarks = benchmarks[:100]

# Wrap the list itself (not the enumerate object) so tqdm knows the total and can show a
# real progress bar with an ETA.
for i, benchmark in enumerate(tqdm(eval_benchmarks)):
    messages = make_messages(benchmark, general_prompt)

    raw_response = llm_query(messages)      # string
    try:
        response = json.loads(raw_response) # dict when the model followed the schema
    except (TypeError, ValueError):
        # TypeError covers a non-string reply such as None; ValueError covers malformed JSON.
        response = {}

    correct_answer = benchmark.get("ANSWER", "").strip()
    # A reply can be valid JSON without being an object, and "pred" can be null or a
    # non-string. Score those as a miss instead of raising mid-run and losing every row
    # collected so far.
    pred_value = response.get("pred") if isinstance(response, dict) else None
    pred = pred_value.strip() if isinstance(pred_value, str) else ""

    result = (pred == correct_answer)

    rows.append({
        "id": i,
        "benchmark_id": benchmark.get("INDEX"),
        "pred": pred,
        "gold": correct_answer,
        "correct": result,
        "raw_response": raw_response
    })

GenDeepSeek = pd.DataFrame(rows)
GenDeepSeek.to_csv("../false_belief_cleaning/eval_result/Gen_DeepSeek.csv", index=False)

100it [02:51,  1.72s/it]


##### Comparison

In [9]:
import matplotlib.pyplot as plt
# Accuracy per prompt: the mean of the boolean "correct" column is the fraction of
# entries answered correctly.
acc_summary = {
    label: results["correct"].mean().astype(float)
    for label, results in (
        ("GenDeepSeek", GenDeepSeek),
        ("DeepSeek with MetaMind CoT", MetaMindCoT),
    )
}
print(acc_summary)


Matplotlib is building the font cache; this may take a moment.


{'GenDeepSeek': np.float64(0.94), 'DeepSeek with MetaMind CoT': np.float64(0.94)}
