# Main experiment

## Imports and setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os

import numpy as np
import pandas as pd

from src.pgr_experiment import *
from src.utils import choose_indices
from src.config import MODELS_LIST
from src.math_dataset import load_questions

In [3]:
# Load both splits with limit=100 (train is loaded first, then test).
train_ds, test_ds = (
    load_questions(split_name, limit=100) for split_name in ("train", "test")
)
# Pick 3 positions out of the training set with a fixed seed; these are passed
# to every experiment call below together with train_ds.
indices = choose_indices(len(train_ds), 3, seed=42)

## Compare the performance of each model on the MATH dataset

In [4]:
results = {}
for model in MODELS_LIST:
    print(f"Running experiments for model: {model}")
    scores = []
    for _ in range(3):
        score = await ask_with_gold(model, train_ds, test_ds, indices)
        scores.append(score)
    results[model] = {"mean": sum(scores) / len(scores), "std": np.std(scores, ddof=1)}
    print(f"Mean score: {results[model]['mean']:.3f}, Std: {results[model]['std']:.3f}")
print(results)

Running experiments for model: gpt-5-mini
Mean score: 0.9466666666666667, Std: 0.01527525231651942
Running experiments for model: o3-mini
Mean score: 0.8366666666666666, Std: 0.011547005383792525
Running experiments for model: gpt-4.1-mini
Mean score: 0.9166666666666666, Std: 0.02081665999466128
Running experiments for model: gpt-4o-mini
Mean score: 0.5233333333333333, Std: 0.005773502691896262
Running experiments for model: gpt-4.1-nano
Mean score: 0.7799999999999999, Std: 0.01732050807568879
{'gpt-5-mini': {'mean': 0.9466666666666667, 'std': np.float64(0.01527525231651942)}, 'o3-mini': {'mean': 0.8366666666666666, 'std': np.float64(0.011547005383792525)}, 'gpt-4.1-mini': {'mean': 0.9166666666666666, 'std': np.float64(0.02081665999466128)}, 'gpt-4o-mini': {'mean': 0.5233333333333333, 'std': np.float64(0.005773502691896262)}, 'gpt-4.1-nano': {'mean': 0.7799999999999999, 'std': np.float64(0.01732050807568879)}}


In [7]:
# save results
# Persist the per-model mean/std summary as JSON.
# The output directory is created first so that a fresh checkout does not lose
# the results of the evaluation runs to a FileNotFoundError.
os.makedirs("results", exist_ok=True)
with open("results/model_math_performance.json", "w") as f:
    json.dump(results, f)

In [8]:
# Show [model, mean score] pairs in the insertion order of `results`.
[[model_name, summary["mean"]] for model_name, summary in results.items()]

[['gpt-5-mini', 0.9466666666666667],
 ['o3-mini', 0.8366666666666666],
 ['gpt-4.1-mini', 0.9166666666666666],
 ['gpt-4o-mini', 0.5233333333333333],
 ['gpt-4.1-nano', 0.7799999999999999]]

In [10]:
# Empty table that receives one row per (weak model, strong model) pair.
cross_results = pd.DataFrame(
    columns=["weak_model", "strong_model", "mean_pgr", "std_pgr"]
)

# Models ranked from highest to lowest mean score stored in `results`.
ordered_models = sorted(
    MODELS_LIST,
    key=lambda name: results[name]["mean"],
    reverse=True,
)

In [11]:
logs = {}
strong_model = ordered_models[0]
strong_mean = results[strong_model]["mean"]
for weak_model in ordered_models:
    if weak_model == strong_model:
        continue
    weak_mean = results[weak_model]["mean"]
    if weak_mean >= strong_mean:
        continue  # skip models with equal or higher accuracy
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgrs = []
    logs[(weak_model, strong_model)] = {}
    for i in range(3):
        pgr, acc_strong_weak, acc_strong_gold, acc_weak_gold = await run_pgr_experiment(
            weak_model,
            strong_model,
            train_ds,
            test_ds[:50],
            indices,
            verbose=True
        )
        pgrs.append(pgr)
        logs[(weak_model, strong_model)][i] = {
            "pgr": pgr,
            "acc_strong_weak": acc_strong_weak,
            "acc_strong_gold": acc_strong_gold,
            "acc_weak_gold": acc_weak_gold
        }
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, np.mean(pgrs), np.std(pgrs)]
    print(f"PGR: {np.mean(pgrs):.3f} ± {np.std(pgrs):.3f}")

Running cross model PGR: weak=gpt-4.1-mini, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.9200
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9600
Acc (weak model with gold labels): 0.9000
Acc (strong model with weak labels): 0.9800
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.9200
PGR: 1.000 ± 0.000
Running cross model PGR: weak=o3-mini, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9800
Acc (weak model with gold labels): 0.8600
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.8600
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9600
Acc (weak model with gold labels): 0.8400
PGR: 0.889 ± 0.157
Running cross model PGR: weak=gpt-4.1-nano, strong=gpt-5-mini
A

In [20]:
strong_model = ordered_models[1]  # update strong model
strong_mean = results[strong_model]["mean"]
for weak_model in ordered_models:
    if weak_model == strong_model:
        continue
    weak_mean = results[weak_model]["mean"]
    if weak_mean >= strong_mean:
        continue  # skip models with equal or higher accuracy
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgrs = []
    logs[(weak_model, strong_model)] = {}
    for i in range(3):
        pgr, acc_strong_weak, acc_strong_gold, acc_weak_gold = await run_pgr_experiment(
            weak_model,
            strong_model,
            train_ds,
            test_ds[:50],
            indices,
            verbose=True
        )
        pgrs.append(pgr)
        logs[(weak_model, strong_model)][i] = {
            "pgr": pgr,
            "acc_strong_weak": acc_strong_weak,
            "acc_strong_gold": acc_strong_gold,
            "acc_weak_gold": acc_weak_gold
        }
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, np.mean(pgrs), np.std(pgrs)]
    print(f"PGR: {np.mean(pgrs):.3f} ± {np.std(pgrs):.3f}")

Running cross model PGR: weak=o3-mini, strong=gpt-4.1-mini
Acc (strong model with weak labels): 0.8800
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.8200
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.8200
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.8000
PGR: 0.867 ± 0.189
Running cross model PGR: weak=gpt-4.1-nano, strong=gpt-4.1-mini
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.7800
Acc (strong model with weak labels): 0.9200
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.7400
Acc (strong model with weak labels): 0.9200




Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.7600
PGR: 1.000 ± 0.000
Running cross model PGR: weak=gpt-4o-mini, strong=gpt-4.1-mini
Acc (strong model with weak labels): 0.9200
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.5800
Acc (strong model with weak labels): 0.9000
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.5600
Acc (strong model with weak labels): 0.9200
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.5800
PGR: 0.965 ± 0.050


In [23]:
strong_model = ordered_models[2]  # update strong model
strong_mean = results[strong_model]["mean"]
for weak_model in ordered_models:
    if weak_model == strong_model:
        continue
    weak_mean = results[weak_model]["mean"]
    if weak_mean >= strong_mean:
        continue  # skip models with equal or higher accuracy
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgrs = []
    logs[(weak_model, strong_model)] = {}
    for i in range(3):
        pgr, acc_strong_weak, acc_strong_gold, acc_weak_gold = await run_pgr_experiment(
            weak_model,
            strong_model,
            train_ds,
            test_ds[:50],
            indices,
            verbose=True
        )
        pgrs.append(pgr)
        logs[(weak_model, strong_model)][i] = {
            "pgr": pgr,
            "acc_strong_weak": acc_strong_weak,
            "acc_strong_gold": acc_strong_gold,
            "acc_weak_gold": acc_weak_gold
        }
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, np.mean(pgrs), np.std(pgrs)]
    print(f"PGR: {np.mean(pgrs):.3f} ± {np.std(pgrs):.3f}")

Running cross model PGR: weak=gpt-4.1-nano, strong=o3-mini
Acc (strong model with weak labels): 0.8400
Acc (strong model with gold labels): 0.8800
Acc (weak model with gold labels): 0.7800
Acc (strong model with weak labels): 0.8600
Acc (strong model with gold labels): 0.8400
Acc (weak model with gold labels): 0.8000
Acc (strong model with weak labels): 0.8200
Acc (strong model with gold labels): 0.8400
Acc (weak model with gold labels): 0.7800
PGR: 0.756 ± 0.175
Running cross model PGR: weak=gpt-4o-mini, strong=o3-mini
Acc (strong model with weak labels): 0.8400
Acc (strong model with gold labels): 0.8200
Acc (weak model with gold labels): 0.5600
Acc (strong model with weak labels): 0.8400
Acc (strong model with gold labels): 0.8200
Acc (weak model with gold labels): 0.5400
Acc (strong model with weak labels): 0.8000
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.6400
PGR: 0.872 ± 0.181


In [None]:
strong_model = ordered_models[3]  # update strong model
strong_mean = results[strong_model]["mean"]
for weak_model in ordered_models:
    if weak_model == strong_model:
        continue
    weak_mean = results[weak_model]["mean"]
    if weak_mean >= strong_mean:
        continue  # skip models with equal or higher accuracy
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgrs = []
    logs[(weak_model, strong_model)] = {}
    for i in range(3):
        pgr, acc_strong_weak, acc_strong_gold, acc_weak_gold = await run_pgr_experiment(
            weak_model,
            strong_model,
            train_ds,
            test_ds[:50],
            indices,
            verbose=True
        )
        pgrs.append(pgr)
        logs[(weak_model, strong_model)][i] = {
            "pgr": pgr,
            "acc_strong_weak": acc_strong_weak,
            "acc_strong_gold": acc_strong_gold,
            "acc_weak_gold": acc_weak_gold
        }
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, np.nanmean(pgrs), np.nanstd(pgrs)]
    print(f"PGR: {np.mean(pgrs):.3f} ± {np.std(pgrs):.3f}")

Running cross model PGR: weak=gpt-4o-mini, strong=gpt-4.1-nano
Acc (strong model with weak labels): 0.7800
Acc (strong model with gold labels): 0.8000
Acc (weak model with gold labels): 0.5400
Acc (strong model with weak labels): 0.8400
Acc (strong model with gold labels): 0.8000
Acc (weak model with gold labels): 0.5400




Acc (strong model with weak labels): 0.8000
Acc (strong model with gold labels): 0.7200
Acc (weak model with gold labels): 0.6000
PGR: 0.974 ± 0.036


In [26]:
# Display the per-run PGR and accuracy metrics collected in `logs`,
# keyed by (weak_model, strong_model) and then by run index.
logs

{('gpt-4.1-mini',
  'gpt-5-mini'): {0: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.94,
   'acc_weak_gold': 0.92}, 1: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.96,
   'acc_weak_gold': 0.9}, 2: {'pgr': 1.0,
   'acc_strong_weak': 0.98,
   'acc_strong_gold': 0.94,
   'acc_weak_gold': 0.92}},
 ('o3-mini',
  'gpt-5-mini'): {0: {'pgr': 0.6666666666666664,
   'acc_strong_weak': 0.94,
   'acc_strong_gold': 0.98,
   'acc_weak_gold': 0.86}, 1: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.94,
   'acc_weak_gold': 0.86}, 2: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.96,
   'acc_weak_gold': 0.84}},
 ('gpt-4.1-nano',
  'gpt-5-mini'): {0: {'pgr': 0.7777777777777775,
   'acc_strong_weak': 0.94,
   'acc_strong_gold': 0.98,
   'acc_weak_gold': 0.8}, 1: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.94,
   'acc_weak_gold': 0.8}, 2: {'pgr': 1.0,
   'acc_strong_weak': 0.96,
   'acc_strong_gold': 0.96,
   '

In [27]:
# Persist the cross-model PGR table and the per-run logs.
# Both output directories are created first so that the writes cannot fail
# with FileNotFoundError after the experiment runs have completed.
os.makedirs("results", exist_ok=True)
os.makedirs("logs", exist_ok=True)
cross_results.to_csv("results/cross_model_math_pgr.csv")
with open("logs/cross_model_logs.json", "w") as f:
    # Convert tuple keys to strings for JSON serialization
    logs_str_keys = {str(k): v for k, v in logs.items()}
    json.dump(logs_str_keys, f)

## K few-shot experiment

In [31]:
k_few_shot_results = {}
logs = {}
for k in range(0, 6, 2):
    print(f"Running experiments with k={k} few-shots...")
    indices = choose_indices(len(train_ds), k, seed=42)
    pgrs = []
    logs[k] = {}
    for i in range(3):
        pgr, acc_strong_weak, acc_strong_gold, acc_weak_gold = await run_pgr_experiment(
            "o3-mini",
            "gpt-4.1-mini",
            train_ds,
            test_ds[:50],
            indices,
            verbose=True
        )
        pgrs.append(pgr)
        logs[k][i] = {
            "pgr": pgr,
            "acc_strong_weak": acc_strong_weak,
            "acc_strong_gold": acc_strong_gold,
            "acc_weak_gold": acc_weak_gold
        }
    mean_pgr = np.nanmean(pgrs)
    std_pgr = np.nanstd(pgrs, ddof=1)
    print(pgrs)
    print(f"Mean PGR: {mean_pgr}, Std: {std_pgr}")
    results[k] = {"mean": mean_pgr, "std": std_pgr}

with open("logs/few_shot_logs.json", "w") as f:
    json.dump(logs, f)
with open("results/few_shot_performance.json", "w") as f:
    json.dump(results, f)

Running experiments with k=0 few-shots...
Acc (strong model with weak labels): 0.9600
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.8200
Acc (strong model with weak labels): 0.9800
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.8000
Acc (strong model with weak labels): 0.9200
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.8200
[1.0, 1.0, 1.0]
Mean PGR: 1.0, Std: 0.0
Running experiments with k=2 few-shots...
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9800
Acc (weak model with gold labels): 0.8000
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9200
Acc (weak model with gold labels): 0.8200




Acc (strong model with weak labels): 0.9000
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.8000
[0.7777777777777775, 1.0, 0.7142857142857146]
Mean PGR: 0.8306878306878307, Std: 0.15002589289690813
Running experiments with k=4 few-shots...
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.8000
Acc (strong model with weak labels): 0.8800
Acc (strong model with gold labels): 0.9000
Acc (weak model with gold labels): 0.8000
Acc (strong model with weak labels): 0.9000
Acc (strong model with gold labels): 0.8600
Acc (weak model with gold labels): 0.8000
[1.0, 0.7999999999999998, 1.0]
Mean PGR: 0.9333333333333332, Std: 0.11547005383792526
