# Main experiment

## Imports and setup

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the local src.* helpers) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path

import pandas as pd

from src.pgr_experiment import *
from src.utils import choose_indices
from src.config import MODELS_LIST
from src.math_dataset import load_questions

In [6]:
# Experiment knobs, gathered in one place so they are easy to spot and tune.
TRAIN_LIMIT = 200  # value passed as `limit` when loading the train split
TEST_LIMIT = 200   # value passed as `limit` when loading the test split
N_INDICES = 3      # second argument of choose_indices — presumably how many train indices to pick; TODO confirm
SEED = 42          # seed forwarded to choose_indices

train_ds = load_questions("train", limit=TRAIN_LIMIT)
test_ds = load_questions("test", limit=TEST_LIMIT)

# The chosen indices are reused by every experiment cell below.
indices = choose_indices(len(train_ds), N_INDICES, seed=SEED)

## Compare performance of each model on the MATH dataset

In [12]:
results = {}
for model in MODELS_LIST:
    print(f"Running experiments for model: {model}")
    score = await ask_with_gold(model, train_ds, test_ds, indices)
    results[model] = score
print(results)

Running experiments for model: gpt-5-mini
Running experiments for model: o3-mini
Running experiments for model: gpt-4.1-mini
Running experiments for model: gpt-4o-mini
Running experiments for model: gpt-4.1-nano




{'gpt-5-mini': 0.95, 'o3-mini': 0.83, 'gpt-4.1-mini': 0.885, 'gpt-4o-mini': 0.25, 'gpt-4.1-nano': 0.765}


In [None]:
# Save the per-model scores collected in `results` as JSON.
results_path = Path("results/model_math_performance.json")
# Create the output directory first: opening a file for writing raises
# FileNotFoundError when its parent directory is missing (e.g. on a fresh
# checkout where results/ has not been created yet).
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, "w") as f:
    json.dump(results, f)

In [13]:
pgr = await run_pgr_experiment(
    "gpt-4o-mini",
    "gpt-4.1-mini",
    train_ds,
    test_ds,
    indices,
    verbose=True
)
print("====================================")
print(f"PGR           : {pgr:.3f}")
print("====================================")




Acc (strong model with weak labels): 0.8650
Acc (strong model with gold labels): 0.8850
Acc (weak model with gold labels): 0.2900
PGR           : 0.966


In [34]:
# Empty table that the cross-model cells below fill one row per
# (weak model, strong model) pair.
# NOTE(review): the third column is called "acc", but the cells that append to
# this frame store the PGR value there. The name is kept as-is so the saved CSV
# schema does not change.
cross_result_columns = ["weak_model", "strong_model", "acc"]
cross_results = pd.DataFrame(columns=cross_result_columns)

In [35]:
strong_model = "gpt-5-mini"
for weak_model in MODELS_LIST:
    if weak_model == strong_model:
        continue
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgr = await run_pgr_experiment(
        weak_model,
        strong_model,
        train_ds,
        test_ds,
        indices,
        verbose=True
    )
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, pgr]
    print(f"PGR: {pgr:.3f}")

Running cross model PGR: weak=o3-mini, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9400
Acc (strong model with gold labels): 0.9500
Acc (weak model with gold labels): 0.8100
PGR: 0.929
Running cross model PGR: weak=gpt-4.1-mini, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9300
Acc (strong model with gold labels): 0.9450




Acc (weak model with gold labels): 0.8850
PGR: 0.750
Running cross model PGR: weak=gpt-4o-mini, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9300
Acc (strong model with gold labels): 0.9400
Acc (weak model with gold labels): 0.2850
PGR: 0.985
Running cross model PGR: weak=gpt-4.1-nano, strong=gpt-5-mini
Acc (strong model with weak labels): 0.9350
Acc (strong model with gold labels): 0.9400




Acc (weak model with gold labels): 0.7700
PGR: 0.971


In [36]:
strong_model = "o3-mini"
for weak_model in MODELS_LIST:
    if weak_model == strong_model:
        continue
    if weak_model == "gpt-5-mini":
        continue
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgr = await run_pgr_experiment(
        weak_model,
        strong_model,
        train_ds,
        test_ds,
        indices,
        verbose=True
    )
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, pgr]
    print(f"PGR: {pgr:.3f}")

Running cross model PGR: weak=gpt-4.1-mini, strong=o3-mini
Acc (strong model with weak labels): 0.8100
Acc (strong model with gold labels): 0.8250




Acc (weak model with gold labels): 0.8900
PGR: 1.231
Running cross model PGR: weak=gpt-4o-mini, strong=o3-mini
Acc (strong model with weak labels): 0.8450
Acc (strong model with gold labels): 0.8100
Acc (weak model with gold labels): 0.3050
PGR: 1.069
Running cross model PGR: weak=gpt-4.1-nano, strong=o3-mini
Acc (strong model with weak labels): 0.8100
Acc (strong model with gold labels): 0.8100




Acc (weak model with gold labels): 0.7450
PGR: 1.000


In [37]:
strong_model = "gpt-4.1-mini"
for weak_model in MODELS_LIST:
    if weak_model == strong_model:
        continue
    if weak_model == "gpt-5-mini" or weak_model == "o3-mini":
        continue
    print(f"Running cross model PGR: weak={weak_model}, strong={strong_model}")
    pgr = await run_pgr_experiment(
        weak_model,
        strong_model,
        train_ds,
        test_ds,
        indices,
        verbose=True
    )
    cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, pgr]
    print(f"PGR: {pgr:.3f}")

Running cross model PGR: weak=gpt-4o-mini, strong=gpt-4.1-mini




Acc (strong model with weak labels): 0.8800




Acc (strong model with gold labels): 0.8600
Acc (weak model with gold labels): 0.3100
PGR: 1.036
Running cross model PGR: weak=gpt-4.1-nano, strong=gpt-4.1-mini
Acc (strong model with weak labels): 0.8650




Acc (strong model with gold labels): 0.8800




Acc (weak model with gold labels): 0.7350
PGR: 0.897


In [38]:
cross_results

Unnamed: 0,weak_model,strong_model,acc
0,o3-mini,gpt-5-mini,0.928571
1,gpt-4.1-mini,gpt-5-mini,0.75
2,gpt-4o-mini,gpt-5-mini,0.984733
3,gpt-4.1-nano,gpt-5-mini,0.970588
4,gpt-4.1-mini,o3-mini,1.230769
5,gpt-4o-mini,o3-mini,1.069307
6,gpt-4.1-nano,o3-mini,1.0
7,gpt-4o-mini,gpt-4.1-mini,1.036364
8,gpt-4.1-nano,gpt-4.1-mini,0.896552


In [39]:
pgr = await run_pgr_experiment(
    "gpt-4o-mini",
    "gpt-4.1-nano",
    train_ds,
    test_ds,
    indices,
    verbose=True
)
cross_results.loc[len(cross_results.index)] = [weak_model, strong_model, pgr]
print(f"PGR: {pgr:.3f}")

Running cross model PGR: weak=gpt-4.1-nano, strong=gpt-4.1-mini




Acc (strong model with weak labels): 0.7500




Acc (strong model with gold labels): 0.7750
Acc (weak model with gold labels): 0.2950
PGR: 0.948


In [41]:
# Write the cross-model PGR table to CSV (the index is written too, as before).
cross_results_path = Path("results/cross_model_math_pgr.csv")
# Create the output directory first: to_csv fails when the parent directory
# of the target path does not exist (e.g. on a fresh checkout).
cross_results_path.parent.mkdir(parents=True, exist_ok=True)
cross_results.to_csv(cross_results_path)