In [None]:
import pathlib

# Directories shared by every cell below. The project root is taken to be the
# second ancestor of the (resolved) current working directory.
BASE_DIR = pathlib.Path.cwd().resolve().parents[1]
DATA_DIR = BASE_DIR.joinpath("data")
OUT_DIR = BASE_DIR.joinpath("inference", "outputs")

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (python-dotenv), then expose each provider
# credential as a notebook-level constant. A key that is not set comes back as None.
load_dotenv()

HF_TOKEN = os.environ.get("HF_TOKEN")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRAL_KEY = os.environ.get("MISTRAL_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

## Preparing GPQA Dataset for Inference

In [None]:
import random
from datasets import load_dataset
import pandas as pd
import pathlib

# Fetch the extended GPQA configuration and materialise its train split as a DataFrame.
dataset = load_dataset("Idavidrein/gpqa", "gpqa_extended")
df = dataset["train"].to_pandas()

# Normalise the schema: lower-case question column, the correct answer text on
# its own, and the four answer columns collapsed into one list per question
# (correct answer first). Only the first 300 questions are kept.
df = df.rename(columns={"Question": "question"})
df["correct_text"] = df["Correct Answer"]
df["choices"] = df[
    ["Correct Answer", "Incorrect Answer 1", "Incorrect Answer 2", "Incorrect Answer 3"]
].values.tolist()
df = df.loc[:, ["question", "choices", "correct_text"]].head(300)

# Strip surrounding whitespace so the correct option can later be located by text.
df["choices"] = df["choices"].map(lambda options: [str(option).strip() for option in options])
df["correct_text"] = df["correct_text"].map(lambda answer: str(answer).strip())

# Seed the global RNG so the option shuffle that follows is reproducible.
random.seed(42)

def shuffle_and_label(row):
    """Shuffle one question's options and record the letter of the correct one.

    Args:
        row: A Series (or mapping with ``.copy()``) holding a "choices" list
            and a "correct_text" string.

    Returns:
        A copy of ``row`` whose "choices" are shuffled with the module-level
        ``random`` state and which gains an "answer" field containing the
        letter (A-D) of the correct option. ``None`` is returned when
        "correct_text" is not one of the choices. The input row is left
        untouched.
    """
    opts = list(row["choices"])
    random.shuffle(opts)
    try:
        correct_index = opts.index(row["correct_text"])
    except ValueError:
        return None
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to DataFrame.apply.
    labelled = row.copy()
    labelled["choices"] = opts
    labelled["answer"] = "ABCD"[correct_index]
    return labelled

# Shuffle the options of every question, then drop any row that contains
# missing values.
df = df.apply(shuffle_and_label, axis=1).dropna()

# Expose the surviving row index as an explicit id and fix the column order.
df = df.assign(id=df.index)[["id", "question", "choices", "answer"]]

output_file = DATA_DIR / "gpqa_300.csv"
df.to_csv(output_file, index=False)

print(f" cleaned, shuffled, and saved: {len(df)} rows → {output_file}")
df.head()

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Reload the cleaned questions and split them: 85% goes to the evaluation set
# (test_size), the remaining 15% to calibration. The fixed seed keeps the
# split stable across runs.
df = pd.read_csv(DATA_DIR / "gpqa_300.csv")

calib_df, eval_df = train_test_split(
    df,
    test_size=0.85,
    shuffle=True,
    random_state=42,
)

for split_frame, split_file in (
    (calib_df, "gpqa_calibration_split.csv"),
    (eval_df, "gpqa_evaluation_split.csv"),
):
    split_frame.to_csv(DATA_DIR / split_file, index=False)

print("Split complete:")
print("Calibration:", len(calib_df), "rows")
print("Evaluation :", len(eval_df), "rows")

In [None]:
# Eyeball the first three rows of each split.
for heading, split_frame in (
    ("Calibration sample:", calib_df),
    ("Evaluation sample:", eval_df),
):
    print(heading)
    display(split_frame.head(3))

In [None]:
# Reload the evaluation split from disk and copy it; every model below is run
# on this same set of questions for consistency.
eval_df = pd.read_csv(DATA_DIR / "gpqa_evaluation_split.csv")
subset_df = eval_df.copy()

# Persist the shared template that each inference cell reads back.
template_path = DATA_DIR / "gpqa_subset_for_models.csv"
template_columns = ["id", "question", "choices", "answer"]
subset_df.loc[:, template_columns].to_csv(template_path, index=False)

# sanity check
subset_df.head(2)

## QWEN

Loading model to device

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# checking if all libraries have been installed
# print(f"PyTorch: {torch.__version__}")
# print(f" Transformers: {transformers.__version__}")
# print(f" MPS available: {torch.backends.mps.is_available()}\n")

# Refuse to continue on a transformers release older than 4.37.
major, minor = map(int, transformers.__version__.split('.')[:2])
if (major, minor) < (4, 37):
    raise ValueError(f" Transformers {transformers.__version__} too old.")

MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
print(f"Loading: {MODEL_NAME}")

# Pick the best available accelerator in the same order as the Llama cell
# (CUDA, then Apple MPS, then CPU). Half precision is only used on an
# accelerator; CPU stays in float32.
if torch.cuda.is_available():
    device = torch.device("cuda")
    dtype = torch.float16
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

# loading tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    # mps doesn’t like 'auto', so the model is moved to the device explicitly
    device_map=None,
    token=HF_TOKEN
).to(device)

# Generation needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f" Model loaded successfully on {device} with dtype={dtype}")

Qwen Inference

In [None]:
import pandas as pd
import re
import time
from collections import Counter
import torch
import gc
import ast


# Confidence thresholds t quoted in the prompt; inference below is repeated once per value.
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

# Helper that creates the multiple-choice prompt for the model:
# question, lettered options, then the confidence instructions for threshold t.
def build_mcq_prompt(row, t):
    """Render one question as a lettered multiple-choice prompt.

    Args:
        row: Mapping with a "question" entry and a "choices" entry. "choices"
            may be a list/tuple or the string form of one (e.g. "['A', 'B']").
        t: Confidence threshold quoted in the answering instructions.

    Returns:
        The prompt text, lines joined with newlines.
    """
    choices = row["choices"]
    # A stringified list is parsed back; if parsing fails the raw string is kept.
    if isinstance(choices, str):
        try:
            choices = ast.literal_eval(choices)
        except Exception:
            pass

    if isinstance(choices, (list, tuple)):
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        option_lines = [f"{alphabet[pos]}. {choice}" for pos, choice in enumerate(choices)]
    else:
        # Anything that is not a sequence of options is shown verbatim.
        option_lines = [str(choices)]

    # Answer only above confidence t; otherwise say "I don't know".
    instructions = (
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) points, correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know.'\n"
        "Provide only the capital letter of your chosen answer (A, B, C, D, ...) "
        "or 'I don't know'."
    )
    return "\n".join([f"Q: {row['question']}", "Options:", *option_lines, instructions])

# Maps a raw model reply to a vote: "IDK", one of A-D, or None when nothing usable is found.
def extract_letter(text):
    """Turn a raw reply into "IDK", a letter A-D, or None.

    The reply is stripped and upper-cased. Any occurrence of "I DON'T KNOW"
    wins; otherwise the first standalone letter A-D is returned.
    """
    normalized = text.strip().upper()
    if "I DON'T KNOW" in normalized:
        return "IDK"
    found = re.search(r"\b([A-D])\b", normalized)
    if found is None:
        return None
    return found.group(1)

# Draws n generations from the local Qwen model and treats each parsed reply
# as one vote; the prediction is the majority vote.
def qwen_answer_and_conf(prompt, n=6, temperature=0.7, max_input_tokens=2048):
    """Sample the Qwen model ``n`` times and majority-vote the answer.

    Args:
        prompt: User-turn text; it is wrapped in the tokenizer's chat template.
        n: Number of generations to draw.
        temperature: Sampling temperature. A value <= 0 disables sampling.
        max_input_tokens: Token budget for the templated prompt. The tokenizer
            truncates beyond it, which would drop the trailing answer
            instructions and generation marker, so the default is generous
            (it matches the limit used by the Llama cell).

    Returns:
        ``(prediction, confidence)``. ``prediction`` is the most common parsed
        vote ("IDK" or a letter). ``confidence`` is that vote's share of the
        votes that could be parsed (unparseable generations are not counted).
        ``("IDK", 0.0)`` is returned when nothing could be parsed.
    """
    # The prompt is the same for every sample, so template and tokenize it once.
    try:
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens,
        ).to(model.device)
    except Exception as e:
        print("Error:", e)
        return "IDK", 0.0
    prompt_len = inputs["input_ids"].shape[1]

    # Only hand sampling parameters to generate() when sampling is enabled.
    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    votes = []
    for _ in range(n):
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=10,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    **sampling_kwargs,
                )
            # Decode only the newly generated continuation.
            txt = tokenizer.decode(
                outputs[0][prompt_len:], skip_special_tokens=True
            ).strip()
            letter = extract_letter(txt)
            if letter:
                votes.append(letter)
            del outputs
            torch.cuda.empty_cache()
        except Exception as e:
            # Best effort: a failed sample is reported and simply casts no vote.
            print("Error:", e)
        time.sleep(0.1)

    if not votes:
        # default to IDK
        return "IDK", 0.0
    counts = Counter(votes)
    pred = counts.most_common(1)[0][0]
    conf = counts[pred] / len(votes)
    return pred, float(conf)

subset_df = pd.read_csv(DATA_DIR/"gpqa_subset_for_models.csv")

# Results are only written once, after every threshold has finished, so make
# sure the destination folder exists before spending time on inference.
qwen_path = OUT_DIR/"qwen-gpqa.csv"
qwen_path.parent.mkdir(parents=True, exist_ok=True)

qwen_rows = []
total_q = len(subset_df)

# For every threshold and question: sample n times, convert each output to a
# letter / IDK, majority-vote the prediction, and record the vote share.
for t in THRESHOLDS:
    print(f"\nRunning inference for threshold t={t}")
    for i, row in subset_df.iterrows():
        # building prompt with curr t
        prompt = build_mcq_prompt(row, t=t)

        print(f"[t={t} | {i+1}/{total_q}] Asking Qwen2.5-1.5B ...")
        pred, conf = qwen_answer_and_conf(prompt, n=6)

        # our rows for the output file
        qwen_rows.append({
            "id": row["id"],
            "threshold": t,
            "question": row["question"],
            "choices": row["choices"],
            "answer": row["answer"],
            "predicted_answer": pred,
            "confidence": conf
        })

        # Periodically release Python garbage and cached accelerator memory.
        if (i + 1) % 5 == 0:
            gc.collect()
            torch.cuda.empty_cache()

qwen_out = pd.DataFrame(qwen_rows)
qwen_out.to_csv(qwen_path, index=False)
print(f" Qwen2.5-1.5B predictions for all thresholds saved to {qwen_path}")
qwen_out.head(5)

---

## ChatGPT

In [None]:
from openai import OpenAI
import pandas as pd
import re
import time
import ast
from collections import Counter
import os


# OpenAI API client, authenticated with the key read from the environment earlier in the notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

# Confidence thresholds t quoted in the prompt; inference below is repeated once per value.
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]


# Helper that creates the multiple-choice prompt for the model:
# inserts the threshold t and formats the choices as A/B/C/D options.
def build_mcq_prompt(row, t):
    """Build the prompt text for one question at confidence threshold ``t``.

    ``row["choices"]`` may be a list/tuple or a stringified list; a string that
    cannot be parsed is shown verbatim as a single line.
    """
    raw_options = row["choices"]
    # need to convert to list
    if isinstance(raw_options, str):
        try:
            raw_options = ast.literal_eval(raw_options)
        except Exception:
            pass

    prompt_lines = [f"Q: {row['question']}", "Options:"]
    if not isinstance(raw_options, (list, tuple)):
        prompt_lines.append(str(raw_options))
    else:
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        prompt_lines.extend(
            f"{letters[position]}. {option}" for position, option in enumerate(raw_options)
        )

    # Confidence instructions: answer only above t, otherwise say "I don't know".
    prompt_lines.append(
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) points, correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know.'\n"
        "Provide only the capital letter of your chosen answer (A, B, C, D, ...) "
        "or 'I don't know'."
    )
    return "\n".join(prompt_lines)

# Helper to extract the model's answer:
# "I don't know" anywhere in the reply -> "IDK", otherwise the first standalone A-D.
def extract_letter(text):
    """Return "IDK", a letter A-D, or None for an unparseable reply."""
    reply = text.strip().upper()
    if "I DON'T KNOW" in reply:
        return "IDK"
    hit = re.search(r"\b([A-D])\b", reply)
    return None if hit is None else hit.group(1)

# Draws n chat completions with temperature sampling and treats each parsed
# reply as one vote; the prediction is the majority vote.
def gpt4_answer_and_conf(prompt, n=6, temperature=0.7, sleep_s=0.4):
    """Query the OpenAI chat API ``n`` times and majority-vote the answer.

    Args:
        prompt: User message sent on every call.
        n: Number of completions to request.
        temperature: Sampling temperature passed to the API.
        sleep_s: Pause (seconds) after every call, successful or not.

    Returns:
        ``(prediction, confidence)`` where confidence is the winning vote's
        share of the parseable votes, or ``("IDK", 0.0)`` if none was parseable.
    """
    tally = Counter()
    for _ in range(n):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=10,
            )
            vote = extract_letter(completion.choices[0].message.content.strip())
            if vote:
                tally[vote] += 1
        except Exception as e:
            # A failed call is reported and casts no vote.
            print("Error:", e)
        time.sleep(sleep_s)

    if not tally:
        return "IDK", 0.0
    pred, top_count = tally.most_common(1)[0]
    return pred, float(top_count / sum(tally.values()))


# The file name must not start with "/": pathlib treats such a segment as an
# absolute path and drops DATA_DIR from the joined result.
subset_df = pd.read_csv(DATA_DIR/"gpqa_subset_for_models.csv")

gpt4_path = OUT_DIR/"gpt-gpqa.csv"
SAVE_EVERY = 10  # flush results to disk after this many new rows


# Resume support: rows already on disk are reloaded and their
# (id, threshold) pairs are skipped in the loop below.
if os.path.exists(gpt4_path):
    existing = pd.read_csv(gpt4_path)
    gpt_rows = existing.to_dict("records")
    done_pairs = set(zip(existing["id"], existing["threshold"]))
    print(f"Resuming: loaded {len(done_pairs)} existing rows.")
else:
    gpt_rows = []
    done_pairs = set()
    print("No existing results, starting fresh.")

total_q = len(subset_df)
new_rows_since_save = 0

for t in THRESHOLDS:
    print(f"\n--- Running inference for threshold t={t} ---")
    for i, row in subset_df.iterrows():
        key = (row["id"], float(t))
        if key in done_pairs:
            continue

        prompt = build_mcq_prompt(row, t=t)
        print(f"[t={t} | {i+1}/{total_q}] Asking GPT-4 ...")

        pred, conf = gpt4_answer_and_conf(prompt, n=6)

        gpt_rows.append({
            "id": row["id"],
            "threshold": t,
            "question": row["question"],
            "choices": row["choices"],
            "answer": row["answer"],
            "predicted_answer": pred,
            "confidence": conf
        })
        done_pairs.add(key)
        new_rows_since_save += 1

        # Periodic checkpoint so an interrupted run can be resumed.
        if new_rows_since_save >= SAVE_EVERY:
            pd.DataFrame(gpt_rows).to_csv(gpt4_path, index=False)
            print(f" Saved {len(gpt_rows)} rows so far.")
            new_rows_since_save = 0

gpt4_out = pd.DataFrame(gpt_rows)
gpt4_out.to_csv(gpt4_path, index=False)
print(f" GPT-4 predictions for all thresholds saved to {gpt4_path}")
gpt4_out.head(5)

---

## Gemini

In [None]:
import google.generativeai as genai
import pandas as pd
import re
import time
import ast
import os
from collections import Counter

# Authenticate the Google Generative AI SDK with the key read from the environment.
genai.configure(api_key=GOOGLE_API_KEY)

# NOTE: this rebinds the notebook-level name `model`, which the Qwen cell also
# uses for its local Hugging Face model.
model = genai.GenerativeModel("gemini-2.5-flash-lite" )
# Confidence thresholds t quoted in the prompt; inference below is repeated once per value.
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]


# Helper that creates the multiple-choice prompt for the model:
# inserts the threshold t and formats the choices as A/B/C/D options.
def build_mcq_prompt(row, t):
    """Build the prompt text for one question at confidence threshold ``t``.

    Args:
        row: Mapping with "question" and "choices". "choices" may be a
            list/tuple or a stringified list (as read back from CSV).
        t: Confidence threshold quoted in the answering instructions.

    Returns:
        The prompt text, lines joined with newlines.
    """
    opts = row["choices"]
    if isinstance(opts, str):
        try:
            opts = ast.literal_eval(opts)
        except Exception:
            # Best effort: an unparsable string is shown verbatim below.
            # (Not a bare `except`, so Ctrl-C / SystemExit still propagate.)
            pass

    lines = [f"Q: {row['question']}", "Options:"]
    if isinstance(opts, (list, tuple)):
        labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        for i, choice in enumerate(opts):
            lines.append(f"{labels[i]}. {choice}")
    else:
        lines.append(str(opts))

    # Confidence instructions: answer only above t, otherwise say "I don't know".
    lines.append(
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) points, correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know.'\n"
        "Provide only the capital letter of your chosen answer (A, B, C, D, ...) "
        "or 'I don't know'."
    )
    return "\n".join(lines)

# "I don't know" anywhere in the reply -> "IDK";
# otherwise the first standalone letter A-D, or None when there is none.
def extract_letter(text):
    """Return "IDK", a letter A-D, or None for an unparseable reply."""
    upper_reply = text.strip().upper()
    if "I DON'T KNOW" in upper_reply:
        return "IDK"
    letter_match = re.search(r"\b([A-D])\b", upper_reply)
    if letter_match:
        return letter_match.group(1)
    return None

# Pause (seconds) applied after every sample. It starts small and is raised to
# 4.0 by gemini_answer_and_conf the first time a rate-limit error is seen.
CURRENT_SLEEP_S = 0.1 

# generates n samples using temperature sampling
# treating each generation as vote for an answer
# final prediction = majority vote
# confidence = (# votes for majority) / (# parseable votes)
def gemini_answer_and_conf(prompt, n=6, temperature=0.7):
    """Query Gemini ``n`` times and majority-vote the answer.

    Args:
        prompt: Text sent to the model on every call.
        n: Number of samples to draw.
        temperature: Sampling temperature passed in the generation config.

    Returns:
        ``(prediction, confidence)`` where confidence is the winning vote's
        share of the votes that could be parsed, or ``("IDK", 0.0)`` when no
        vote was collected.

    Side effects:
        May raise the module-level ``CURRENT_SLEEP_S`` after a rate-limit
        error; sleeps between samples and while backing off.
    """
    global CURRENT_SLEEP_S
    votes = []
    
    for _ in range(n):
        # Each sample gets up to 3 attempts; only rate-limit errors are retried.
        retries = 3
        while retries > 0:
            try:
                response = model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        candidate_count=1,
                        max_output_tokens=10,
                        temperature=temperature
                    )
                )
                if response.text:
                    txt = response.text.strip()
                    letter = extract_letter(txt)
                    if letter: votes.append(letter)
                # The call completed (with or without a usable vote): stop retrying.
                break
            
            except Exception as e:
                error_str = str(e)
                # catch 429 errors: limit reached
                if "429" in error_str or "quota" in error_str.lower() or "ResourceExhausted" in str(type(e).__name__):
                    # Slow down all future samples, but only announce it once.
                    if CURRENT_SLEEP_S < 1.0:
                        print(f" Hit Rate Limit. Downgrading to Free Tier speed (4s delay) for future requests...")
                        CURRENT_SLEEP_S = 4.0
                    
                    print(f" Waiting 15s to cool down... (Retries left: {retries})")
                    time.sleep(15)
                    retries -= 1
                else:
                    # Any other failure is reported and this sample casts no vote.
                    print(f"Error ({type(e).__name__}): {e}")
                    break
        
        time.sleep(CURRENT_SLEEP_S)

    if not votes: return "IDK", 0.0
    counts = Counter(votes)
    pred = counts.most_common(1)[0][0]
    conf = counts[pred] / len(votes)
    return pred, float(conf)

# Shared question set and the CSV that accumulates Gemini's predictions.
subset_df = pd.read_csv(DATA_DIR/"gpqa_subset_for_models.csv")
gemini_path = OUT_DIR/"gemini-gpqa.csv"
SAVE_EVERY = 10

# Resume support: rows already on disk are reloaded and their
# (id, threshold) pairs are skipped in the loop below.
if not os.path.exists(gemini_path):
    gemini_rows, done_pairs = [], set()
    print("No existing results, starting fresh.")
else:
    existing = pd.read_csv(gemini_path)
    gemini_rows = existing.to_dict("records")
    done_pairs = set(zip(existing["id"], existing["threshold"]))
    print(f"Resuming: loaded {len(done_pairs)} existing rows.")

total_q = len(subset_df)
new_rows_since_save = 0

for t in THRESHOLDS:
    print(f"\nRunning inference for threshold t={t}")
    for i, row in subset_df.iterrows():
        key = (row["id"], float(t))
        if key not in done_pairs:
            prompt = build_mcq_prompt(row, t=t)
            print(f"[t={t} | {i+1}/{total_q}] Asking Gemini ({CURRENT_SLEEP_S}s delay)...")

            pred, conf = gemini_answer_and_conf(prompt, n=6)

            gemini_rows.append(dict(
                id=row["id"],
                threshold=t,
                question=row["question"],
                choices=row["choices"],
                answer=row["answer"],
                predicted_answer=pred,
                confidence=conf,
            ))
            done_pairs.add(key)
            new_rows_since_save += 1

            # Periodic checkpoint so an interrupted run can be resumed.
            if new_rows_since_save >= SAVE_EVERY:
                pd.DataFrame(gemini_rows).to_csv(gemini_path, index=False)
                print(f" Saved {len(gemini_rows)} rows so far.")
                new_rows_since_save = 0

gemini_out = pd.DataFrame(gemini_rows)
gemini_out.to_csv(gemini_path, index=False)
print(f" Gemini predictions saved to {gemini_path}")
gemini_out.head()

----

## Llama

Loading model to device

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

# logged in and downloaded model from hugging face already

# Prefer CUDA, then Apple MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# Half precision on an accelerator, full precision on CPU.
dtype = torch.float32 if device == "cpu" else torch.float16

# for my reference
print(f" Using device={device}, dtype={dtype}")
print(f" Transformers version: {transformers.__version__}")

# loading model from hf cache
MODEL_NAME = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=dtype, device_map=None
)
model = model.to(device)

# Generation needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.eval()
print(f"Model loaded successfully on {device}")

Running inference

In [None]:
import os
import time
from collections import Counter
import ast
import re

# Same checkpoint name as in the loading cell above.
MODEL_NAME = "meta-llama/Llama-3.2-3B"

# Confidence thresholds t quoted in the prompt; inference below is repeated once per value.
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]
# Input questions and the CSV that accumulates this model's predictions.
subset_path = DATA_DIR/"gpqa_subset_for_models.csv"
llama_path = OUT_DIR/ "llama-gpqa.csv"
# Results are flushed to disk after this many newly generated rows.
SAVE_EVERY = 10

subset_df = pd.read_csv(subset_path)

# Helper that creates the multiple-choice prompt for the model:
# inserts the threshold t and formats the choices as A/B/C/D options.
def build_mcq_prompt(row, t):
    """Build the prompt text for one question at confidence threshold ``t``.

    ``row["choices"]`` may be a list/tuple or a stringified list; a string that
    cannot be parsed is shown verbatim as a single line.
    """
    parsed = row["choices"]
    if isinstance(parsed, str):
        try:
            parsed = ast.literal_eval(parsed)
        except Exception:
            pass

    header = [f"Q: {row['question']}", "Options:"]
    if isinstance(parsed, (list, tuple)):
        tags = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        body = [f"{tags[k]}. {item}" for k, item in enumerate(parsed)]
    else:
        body = [str(parsed)]

    # Confidence instructions: answer only above t, otherwise say "I don't know".
    footer = (
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) points, correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know.'\n"
        "Provide only the capital letter of your chosen answer (A, B, C, D, ...) "
        "or 'I don't know'."
    )
    return "\n".join(header + body + [footer])

def extract_letter(text: str, valid_letters: str = "ABCD"):
    """Map a raw model reply to "IDK", an option letter, or None.

    Args:
        text: Raw reply from the model.
        valid_letters: Letters that count as answer options. Restricting the
            match keeps the pronoun "I" and other stray single letters from
            being recorded as votes, and matches the A-D parsing used by the
            other model cells.

    Returns:
        "IDK" if the reply contains "I don't know" (straight or typographic
        apostrophe), otherwise the first standalone option letter, otherwise
        ``None``.
    """
    # Normalise typographic apostrophes so "I don’t know" is recognised.
    txt = text.strip().upper().replace("\u2019", "'").replace("\u2018", "'")
    if "I DON'T KNOW" in txt:
        return "IDK"
    if not valid_letters:
        return None
    m = re.search(r"\b([" + re.escape(valid_letters.upper()) + r"])\b", txt)
    if m:
        return m.group(1)
    return None

# Draws n generations from the local Llama model with temperature sampling and
# treats each parsed reply as one vote; the prediction is the majority vote.
def llama_answer_and_conf(prompt, n=6, temperature=0.7, sleep_s=0.2):
    """Sample the Llama model ``n`` times and majority-vote the answer.

    Args:
        prompt: Raw prompt text fed to the tokenizer.
        n: Number of generations to draw.
        temperature: Sampling temperature.
        sleep_s: Pause (seconds) after every sample, successful or not.

    Returns:
        ``(prediction, confidence)`` where confidence is the winning vote's
        share of the parseable votes, or ``("IDK", 0.0)`` if none was parseable.
    """
    ballots = []

    for _ in range(n):
        try:
            encoded = tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=2048
            ).to(device)

            with torch.no_grad():
                generated = model.generate(
                    **encoded,
                    max_new_tokens=8,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id,
                )

            # Keep only the tokens produced after the prompt.
            prompt_len = encoded["input_ids"].shape[1]
            completion = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
            vote = extract_letter(completion)
            if vote:
                ballots.append(vote)
        except Exception as e:
            # A failed sample is reported and casts no vote.
            print("Error during generation:", e)

        time.sleep(sleep_s)

    if len(ballots) == 0:
        return "IDK", 0.0

    pred, hits = Counter(ballots).most_common(1)[0]
    return pred, float(hits / len(ballots))

# Resume support: rows already on disk are reloaded and their
# (id, threshold) pairs are skipped in the loop below.
if not os.path.exists(llama_path):
    llama_rows = []
    done_pairs = set()
    print("No existing Llama results, starting fresh.")
else:
    existing = pd.read_csv(llama_path)
    llama_rows = existing.to_dict("records")
    done_pairs = set(zip(existing["id"], existing["threshold"]))
    print(f"Resuming: loaded {len(done_pairs)} existing (id, threshold) rows.")

total_q = len(subset_df)
new_rows_since_save = 0


# For every threshold and question: sample n times, convert each output to a
# letter / IDK, majority-vote the prediction, and record the vote share.
for t in THRESHOLDS:
    print(f"\nThreshold t={t}")
    for i, row in subset_df.iterrows():
        key = (row["id"], float(t))
        if key in done_pairs:
            continue

        prompt = build_mcq_prompt(row, t=t)
        print(f"[t={t} | {i+1}/{total_q}] Asking Llama-3.2-3B...")

        pred, conf = llama_answer_and_conf(prompt, n=6)

        record = {field: row[field] for field in ("question", "choices", "answer")}
        llama_rows.append({
            "id": row["id"],
            "threshold": t,
            **record,
            "predicted_answer": pred,
            "confidence": conf,
        })
        done_pairs.add(key)
        new_rows_since_save += 1

        # Periodic checkpoint so an interrupted run can be resumed.
        if new_rows_since_save < SAVE_EVERY:
            continue
        pd.DataFrame(llama_rows).to_csv(llama_path, index=False)
        print(f" Saved {len(llama_rows)} rows so far.")
        new_rows_since_save = 0

llama_out = pd.DataFrame(llama_rows)
llama_out.to_csv(llama_path, index=False)
print(f"\n Llama-3.2-3B predictions for all thresholds saved to {llama_path}")

---

## Claude

In [None]:
from anthropic import Anthropic
import pandas as pd
import re
import time
import ast
from collections import Counter
import os

# Anthropic API client, authenticated with the key read from the environment.
# NOTE: this rebinds the notebook-level name `client`, which the ChatGPT cell also uses.
client = Anthropic(api_key=ANTHROPIC_API_KEY)

# Model identifier passed to every messages.create call below.
CLAUDE_MODEL = "claude-haiku-4-5"

# Confidence thresholds t quoted in the prompt; inference below is repeated once per value.
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

# Helper that creates the multiple-choice prompt for the model:
# inserts the threshold t and formats the choices as A/B/C/D options.
def build_mcq_prompt(row, t):
    """Build the prompt text for one question at confidence threshold ``t``.

    ``row["choices"]`` may be a list/tuple or a stringified list; a string that
    cannot be parsed is shown verbatim as a single line.
    """
    candidates = row["choices"]
    # needs to be a list
    if isinstance(candidates, str):
        try:
            candidates = ast.literal_eval(candidates)
        except Exception:
            pass

    parts = [f"Q: {row['question']}", "Options:"]
    if isinstance(candidates, (list, tuple)):
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        parts += [f"{alphabet[n]}. {text}" for n, text in enumerate(candidates)]
    else:
        parts += [str(candidates)]

    # Confidence instructions: answer only above t, otherwise say "I don't know".
    parts += [
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) points, correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know.'\n"
        "Provide only the capital letter (A, B, C, D, ...) or 'I don't know'."
    ]

    return "\n".join(parts)

# Helper to extract the model's answer:
# "I don't know" anywhere in the reply -> "IDK", otherwise the first standalone option letter.
def extract_letter(text, valid_letters="ABCD"):
    """Map a raw model reply to "IDK", an option letter, or None.

    Args:
        text: Raw reply from the model.
        valid_letters: Letters that count as answer options. Restricting the
            match keeps the pronoun "I" and other stray single letters from
            being recorded as votes, and matches the A-D parsing used by the
            other model cells.

    Returns:
        "IDK" if the reply contains "I don't know" (straight or typographic
        apostrophe), otherwise the first standalone option letter, otherwise
        ``None``.
    """
    # Normalise typographic apostrophes so "I don’t know" is recognised.
    text_clean = text.strip().upper().replace("\u2019", "'").replace("\u2018", "'")
    if "I DON'T KNOW" in text_clean:
        return "IDK"
    if not valid_letters:
        return None
    m = re.search(r"\b([" + re.escape(valid_letters.upper()) + r"])\b", text_clean)
    return m.group(1) if m else None

# Draws n replies from Claude with temperature sampling and treats each parsed
# reply as one vote; the prediction is the majority vote.
def claude_answer_and_conf(prompt, n=6, temperature=0.7, sleep_s=0.4):
    """Query the Anthropic messages API ``n`` times and majority-vote the answer.

    Args:
        prompt: User message sent on every call.
        n: Number of replies to request.
        temperature: Sampling temperature passed to the API.
        sleep_s: Pause (seconds) after every call, successful or not.

    Returns:
        ``(prediction, confidence)`` where confidence is the winning vote's
        share of the parseable votes, or ``("IDK", 0.0)`` if none was parseable.
    """
    ballots = []

    for _ in range(n):
        try:
            reply = client.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=10,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}],
            )
            vote = extract_letter(reply.content[0].text.strip())
            if vote:
                ballots.append(vote)
        except Exception as e:
            # A failed call is reported and casts no vote.
            print("Error:", e)

        time.sleep(sleep_s)

    if len(ballots) == 0:
        return "IDK", 0.0

    pred, hits = Counter(ballots).most_common(1)[0]
    return pred, float(hits / len(ballots))


# Shared question set and the CSV that accumulates Claude's predictions.
subset_df = pd.read_csv(DATA_DIR/"gpqa_subset_for_models.csv")

claude_path = OUT_DIR/"claude_gpqa.csv"
SAVE_EVERY = 10

# Resume support: rows already on disk are reloaded and their
# (id, threshold) pairs are skipped in the loop below.
if not os.path.exists(claude_path):
    claude_rows, done_pairs = [], set()
    print("No existing results, starting fresh.")
else:
    existing = pd.read_csv(claude_path)
    claude_rows = existing.to_dict("records")
    done_pairs = set(zip(existing["id"], existing["threshold"]))
    print(f"Resuming: loaded {len(done_pairs)} existing rows.")

total_q = len(subset_df)
new_rows_since_save = 0


# For every threshold and question: sample n times, convert each output to a
# letter / IDK, majority-vote the prediction, and record the vote share.
for t in THRESHOLDS:
    print(f"\n--- Running Claude inference for t={t} ---")
    for i, row in subset_df.iterrows():
        key = (row["id"], float(t))
        if key not in done_pairs:
            prompt = build_mcq_prompt(row, t)
            print(f"[t={t} | {i+1}/{total_q}] Asking Claude ...")

            pred, conf = claude_answer_and_conf(prompt, n=6)

            claude_rows.append(dict(
                id=row["id"],
                threshold=t,
                question=row["question"],
                choices=row["choices"],
                answer=row["answer"],
                predicted_answer=pred,
                confidence=conf,
            ))

            done_pairs.add(key)
            new_rows_since_save += 1

            # Periodic checkpoint so an interrupted run can be resumed.
            if new_rows_since_save >= SAVE_EVERY:
                pd.DataFrame(claude_rows).to_csv(claude_path, index=False)
                print(f" Saved {len(claude_rows)} rows so far.")
                new_rows_since_save = 0

claude_out = pd.DataFrame(claude_rows)
claude_out.to_csv(claude_path, index=False)
print(f" Claude predictions saved to {claude_path}")

claude_out.head(5)

---

## Mistral

In [None]:
import mistralai
import pandas as pd
import re, time, ast
from collections import Counter

# Mistral API client, authenticated with the key read from the environment.
# NOTE: this rebinds the notebook-level name `client` used by earlier API cells.
client = mistralai.Mistral(api_key=MISTRAL_KEY)

# Model identifier sent with every request; also used (with DATASET_NAME) to
# name the output file. NOTE: rebinds MODEL_NAME from the local-model cells.
MODEL_NAME = "open-mistral-7b"
DATASET_NAME = "gpqa"

# Input questions and the CSV that accumulates this model's predictions.
SUBSET_PATH = DATA_DIR / "gpqa_subset_for_models.csv"
OUTPUT_PATH = OUT_DIR / f"{MODEL_NAME}-{DATASET_NAME}.csv"

print("Saving to:", OUTPUT_PATH)

# NOTE: rebinds `eval_df` (also assigned in the data-preparation cells) to the shared subset.
eval_df = pd.read_csv(SUBSET_PATH)
print("Loaded GPQA subset:", len(eval_df), "rows")

# Helper that creates the multiple-choice prompt for the model:
# inserts the threshold t and formats the choices as A/B/C/... options.
def build_mcq_prompt(row, t):
    """Build the prompt text for one question at confidence threshold ``t``.

    Args:
        row: Mapping with "question" and "choices". "choices" may be a
            list/tuple or a stringified list (as read back from CSV).
        t: Confidence threshold quoted in the answering instructions.

    Returns:
        The prompt text, lines joined with newlines.
    """
    opts = row["choices"]
    # convert to list
    if isinstance(opts, str):
        try:
            opts = ast.literal_eval(opts)
        except Exception:
            # Best effort: an unparsable string is shown verbatim below.
            # (Not a bare `except`, so Ctrl-C / SystemExit still propagate.)
            pass

    labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lines = [f"Q: {row['question']}", "Options:"]

    if isinstance(opts, (list, tuple)):
        for i, c in enumerate(opts):
            lines.append(f"{labels[i]}. {c}")
    else:
        # Without this guard a plain string would be enumerated character by
        # character and turned into bogus one-letter options.
        lines.append(str(opts))

    # Confidence instructions: answer only above t, otherwise say "I don't know".
    lines.append(
        f"\nAnswer only if you are more than {t} confident. "
        f"Mistakes incur {t}/(1−{t}) penalty points. "
        "Correct = +1, IDK = 0. "
        "If unsure, respond exactly with 'I don't know'.\n"
        "Provide ONLY the capital letter (A, B, C, …) OR 'I don't know'."
    )
    return "\n".join(lines)

# Helper to extract the model's answer:
# "I don't know" / "IDK" anywhere in the reply -> "IDK", otherwise the first standalone option letter.
def extract_letter(text, valid_letters="ABCD"):
    """Map a raw model reply to "IDK", an option letter, or None.

    Args:
        text: Raw reply from the model.
        valid_letters: Letters that count as answer options. Restricting the
            match keeps the pronoun "I" and other stray single letters from
            being recorded as votes, and matches the A-D parsing used by the
            other model cells.

    Returns:
        "IDK" if the reply contains "I don't know" (straight or typographic
        apostrophe) or "IDK", otherwise the first standalone option letter,
        otherwise ``None``.
    """
    # Normalise typographic apostrophes so "I don’t know" is recognised.
    clean = text.strip().upper().replace("\u2019", "'").replace("\u2018", "'")

    if "I DON'T KNOW" in clean or "IDK" in clean:
        return "IDK"

    if not valid_letters:
        return None
    m = re.search(r"\b([" + re.escape(valid_letters.upper()) + r"])\b", clean)
    return m.group(1) if m else None


# Draws n replies from the Mistral API with temperature sampling and treats
# each parsed reply as one vote; the prediction is the majority vote.
def mistral_answer_and_conf(prompt, n=6, temperature=0.7, base_sleep=1.2, max_retries=5):
    """Query the Mistral chat API ``n`` times and majority-vote the answer.

    Args:
        prompt: User message sent on every call.
        n: Number of samples to draw.
        temperature: Sampling temperature passed to the API.
        base_sleep: Pause (seconds) after every successful call.
        max_retries: Maximum attempts per sample. Bounding the retries keeps a
            persistent failure (for example a rejected API key) from looping
            forever; a sample that exhausts its attempts casts no vote.

    Returns:
        ``(prediction, confidence)`` where confidence is the winning vote's
        share of the parseable votes, or ``("IDK", 0.0)`` if none was parseable.
    """
    votes = []

    for _ in range(n):
        for attempt in range(1, max_retries + 1):
            try:
                resp = client.chat.complete(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                    max_tokens=10,
                )

                txt = resp.choices[0].message.content.strip()
                letter = extract_letter(txt)

                if letter:
                    votes.append(letter)

                time.sleep(base_sleep)
                break

            except Exception as e:
                err = str(e)
                print("Mistral Error:", err)

                if attempt >= max_retries:
                    print(f"Giving up on this sample after {max_retries} attempts.")
                    break

                # Back off longer for rate limits than for other failures.
                if "429" in err or "rate" in err.lower():
                    print("Rate limit hit — sleeping 30 sec.")
                    time.sleep(30)
                else:
                    print("Unexpected error — sleeping 5 sec.")
                    time.sleep(5)

    if not votes:
        return "IDK", 0.0

    counts = Counter(votes)
    pred = counts.most_common(1)[0][0]
    conf = counts[pred] / len(votes)
    return pred, conf

# Resume support: rows already on disk are reloaded and their
# (id, threshold) pairs are skipped in the loop below.
if not OUTPUT_PATH.exists():
    saved_rows, done_pairs = [], set()
    print("Starting fresh.")
else:
    existing = pd.read_csv(OUTPUT_PATH)
    saved_rows = existing.to_dict("records")
    done_pairs = set(zip(existing["id"], existing["threshold"]))
    print("Resuming — loaded", len(saved_rows), "rows.")

SAVE_EVERY = 10
new_since_save = 0
total = len(eval_df)
THRESHOLDS = [0.25, 0.50, 0.75, 0.90]

# For every threshold and question: sample n times, convert each output to a
# letter / IDK, majority-vote the prediction, and record the vote share.
for t in THRESHOLDS:
    print(f"\nThreshold t={t}")

    for idx, row in eval_df.iterrows():
        key = (row["id"], float(t))
        if key not in done_pairs:
            prompt = build_mcq_prompt(row, t)
            print(f"[t={t}] {idx+1}/{total} Mistral API...")

            pred, conf = mistral_answer_and_conf(prompt)

            saved_rows.append(dict(
                id=row["id"],
                threshold=t,
                question=row["question"],
                choices=row["choices"],
                answer=row["answer"],
                predicted_answer=pred,
                confidence=conf,
            ))

            done_pairs.add(key)
            new_since_save += 1

            # Periodic checkpoint so an interrupted run can be resumed.
            if new_since_save >= SAVE_EVERY:
                pd.DataFrame(saved_rows).to_csv(OUTPUT_PATH, index=False)
                print(f"Saved {len(saved_rows)} rows.")
                new_since_save = 0

pd.DataFrame(saved_rows).to_csv(OUTPUT_PATH, index=False)
print("saved:", OUTPUT_PATH)