# Pass@k curves

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
from coderm.eval.metrics import get_pass_ks
from coderm.utils import gunzip_json_read
import numpy as np
from math import comb

def calcEstVar(n, k, c):
    """Return the plug-in variance of the pass@k estimator for one problem.

    With ``n`` samples of which ``c`` pass, the estimator of the probability
    that ``k`` samples all fail is ``X = comb(n - c, k) / comb(n, k)``
    (pass@k is ``1 - X`` and therefore has the same variance).  The number of
    passing samples is modelled as ``Binomial(n, p)`` with the observed rate
    ``p = c / n`` plugged in, and the function evaluates
    ``E[X**2] - E[X]**2`` with ``E[X] = (1 - p)**k``.

    The second moment uses the identity
    ``comb(n, i) * comb(n - i, k) / comb(n, k) == comb(n - k, i)``, which is
    why one of the two ratio factors appears as ``comb(n - k, i)``.

    Args:
        n: Number of samples drawn for the problem; must be positive.
        k: The ``k`` of pass@k; must satisfy ``0 <= k <= n``.
        c: Number of passing samples; must satisfy ``0 <= c <= n``.

    Returns:
        The estimated variance as a non-negative float.

    Raises:
        ValueError: If ``n``, ``k`` or ``c`` is outside the ranges above.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n}")
    if not 0 <= k <= n:
        raise ValueError(f"k must satisfy 0 <= k <= n, got k={k}, n={n}")
    if not 0 <= c <= n:
        raise ValueError(f"c must satisfy 0 <= c <= n, got c={c}, n={n}")

    p = c / n
    q = 1 - p
    n_choose_k = comb(n, k)  # loop-invariant denominator

    second_moment = 0
    # For i > n - k both comb(n - i, k) and comb(n - k, i) are zero, so those
    # terms contribute exactly nothing and can be skipped.
    for i in range(n - k + 1):
        second_moment += comb(n - i, k) * p**i / n_choose_k * (comb(n - k, i) * q**(n - i))

    # A variance cannot be negative; clamp so that floating-point cancellation
    # can never feed a negative value into a later square root.
    return max(second_moment - q**(2 * k), 0.0)


# Baseline result files.  These paths are concatenated with `compare`, checked
# for existence and then loaded with gunzip_json_read.  When `compare` is
# non-empty, the plotting cell draws the curves listed here with a dashed line.
# Every entry is currently commented out; uncomment a line to include that run.
paths_to_results = [ # baselines (if compare)
    # "p4o_results/default4o",
    # "p4o_results/default4o_temp0.8",
    # "p4o_results/idea_filter4o_temp0.5",
    # "p4o_results/simple_filter4o_temp0.8",
    # "p4o_results/simple_idea4o_temp0.5",
    # "p4o_results/pseudocode_temp0.5",

   # "sweeps/simpleidea_small_it0.2_ct0.2",
    # "sweeps/simpleidea_small_it0.3_ct0.3",
    # "sweeps/simpleidea_small_it0.3_ct0.4",
    # "sweeps/simpleidea_small_it0.4_ct0.4",
    # "sweeps/simpleidea_small_it0.5_ct0.5",
    # "sweeps/simpleidea_small_it0.6_ct0.6",
    # "test_results/observation_small",
    # "test_results/filter_temp0.5",
    # "test_results/ideafilter_temp0.5",
    # "test_results/simpleidea_temp0.5",
    # "test_results/default",
    # "test_results/default_temp0.8",
    # "sweeps/simpleidea_small_temp0.4",
    # "sweeps/simpleidea_small_temp0.45",
    # "sweeps/simpleidea_small_temp0.5",
    # "sweeps/simpleidea_small_temp0.55",
    # "test_results/simpleidea_small",
    # "test_results/simpleidea_small_temp0.1",
    # "test_results/simpleidea_small_temp0.2",
    # "test_results/simpleidea_small_temp0.4",
    # "test_results/simpleidea_small_temp0.5",
    # "test_results/simpleidea_small_temp0.6",
    # "test_results/simpleidea_small_temp0.8",
    # "test_results/default_small",
    # "test_results/default_small_temp0.8",
]
# Result files under comparison.  They are concatenated after
# `paths_to_results`, so they are loaded and plotted after any baselines.
# Commented-out entries are runs that are currently excluded; uncomment a line
# to add that run to the plots.
compare = [
    # "p4o_results/default4omi_temp0",
    # "p4o_results/default4omi_temp0_redo",
    # "p4o_results/default4omi_temp0_redo2",
    # "p4o_results/default4omi_temp0_redo3",

    # "p4o_results/default4omi_temp0.8",
    # "p4o_results/default4omi_temp0.8_redo",

    # "p4o_results/idea_filter4omi_temp0.5",
    # "p4o_results/simple_filter4omi_temp0.5",
    # "p4o_results/simple_filter4omi_temp0.8",
    "p4o_results/simple_idea4omi_temp0.5",
    "p4o_results/simple_idea4omi_temp0.5_zshot",
    "p4o_results/simple_idea4omi_temp1.0",
    # "p4o_results/simple_idea4omi_temp0.5_redo",
    # "p4o_results/simple_idea4omi_temp0.5_redo2",
    # "p4o_results/simple_idea4omi_temp0.5_redo3",

    "p4o_results/simple_idea4omi_temp0.8",
    "p4o_results/simple_idea4omi_temp0.8_zshot",
    # "p4o_results/simple_idea4omi_temp0.8_redo",

    # "p4o_results/simple_idea4omi_it0.8ct0.4",
    # "p4o_results/simple_idea4omi_it0.8ct0.4_redo",

    "p4o_results/simple_idea4omi_temp1.2",
    # "p4o_results/simple_idea4omi_temp1.2_redo",

    # "p4o_results/pseudocode_temp0.5",

 
    # "test_results/simpleidea_small_temp0.4",
    # "test_results/simpleidea_small_temp0.5",
    # "test_results/simpleidea_small_temp0.6",
]
from pathlib import Path
# Fail fast, before any file is loaded, if a configured result path is missing.
for p in [*paths_to_results, *compare]:
    result_file = Path(p)
    assert result_file.exists(), f"Path {p} doesn't exist!"

In [None]:
# Half-width of the uncertainty band, expressed in standard errors of the
# mean pass@k (the plotting cell draws pass@k +/- this value).
STD_BAND_SCALE = 2.5

# all_pass_ks[path][k] -> mean pass@k over all problems in that result file.
# all_std[path]        -> array indexed by k - 1 with the scaled standard error.
all_pass_ks = {}
all_std = {}
for r in (paths_to_results + compare):
    print("Reading", r)
    # Each file is decompressed and parsed once and feeds both statistics.
    items = gunzip_json_read(r)["items"]
    # NOTE(review): the sample count of the first item is used for every item;
    # this assumes all problems have the same number of results - confirm.
    upper_k = len(items[0]["results"])
    ks = range(1, upper_k + 1)

    all_pass_ks[r] = {k: np.mean(get_pass_ks(items, k)) for k in ks}

    # The variance row depends only on how many samples passed, so compute it
    # once per distinct count instead of once per problem.
    variance_row_by_correct = {}
    item_variances = []
    for item in items:
        num_correct = sum(res["passing"] for res in item["results"])
        if num_correct not in variance_row_by_correct:
            variance_row_by_correct[num_correct] = [
                calcEstVar(upper_k, k, num_correct) for k in ks
            ]
        item_variances.append(variance_row_by_correct[num_correct])

    item_variances = np.array(item_variances)
    # Variance of a mean of len(items) per-problem estimates (treated as
    # independent) is sum(var_i) / len(items)**2; its square root is the
    # standard error.
    all_std[r] = np.sqrt(np.sum(item_variances, axis=0) / len(items) ** 2) * STD_BAND_SCALE

In [None]:
def _solve_rates(items):
    """Return, for each item, the fraction of its results marked as passing."""
    rates = []
    for entry in items:
        outcomes = [res["passing"] for res in entry["results"]]
        rates.append(sum(outcomes) / len(outcomes))
    return rates


# Per-problem empirical solve rate for the simple_idea4o run
items_idea4o = gunzip_json_read("p4o_results/simple_idea4o_temp0.5")["items"]
ps_idea4o = _solve_rates(items_idea4o)

# Per-problem empirical solve rate for the simple_idea4omi run
items_idea4omi = gunzip_json_read("p4o_results/simple_idea4omi_temp0.5")["items"]
ps_idea4omi = _solve_rates(items_idea4omi)

# Overlay the two distributions as semi-transparent histograms
for rates, series_name in (
    (ps_idea4o, 'simple_idea4o'),
    (ps_idea4omi, 'simple_idea4omi'),
):
    plt.hist(rates, alpha=0.5, label=series_name)
plt.xlabel('Problem Solve Probability')
plt.ylabel('Frequency')
plt.title('Distribution of Problem Solve Probabilities')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Pass@k curves for every loaded result file, each with a shaded band of
# +/- all_std[label] around the mean.
plt.figure(figsize=(12, 6))

# Baselines are dashed only when there is something to compare them against.
# Membership is tested exactly: a substring test would also dash any compare
# run whose path merely contains a baseline path (for example
# "test_results/default" inside "test_results/default_temp0.8").
baseline_labels = set(paths_to_results)
dash_baselines = len(compare) > 0

for label, values in all_pass_ks.items():
    ks = list(values.keys())
    pass_at_k = np.array(list(values.values()))
    band = np.array(list(all_std[label]))
    linestyle = '--' if label in baseline_labels and dash_baselines else '-'
    plt.plot(ks, pass_at_k, label=Path(label), linestyle=linestyle)
    plt.fill_between(ks, pass_at_k - band, pass_at_k + band, alpha=0.2)

plt.xlabel('k')
plt.xscale('log')
plt.ylabel('Pass@k')
plt.title('Pass@k vs k for GPT-4o-mini various methods')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Temperature sweep grid: one result file per (idea temp, code temp) pair
idea_temps = [0.2, 0.3, 0.4, 0.5, 0.6]
code_temps = [0.2, 0.3, 0.4, 0.5, 0.6]
sel_k = 90


def _sweep_pass_at_k(it, ct):
    """Return the mean pass@sel_k of one sweep file, or 0 if the file is absent."""
    path = f"sweeps/simpleidea_small_it{it}_ct{ct}"
    if Path(path).exists():
        return np.mean(get_pass_ks(gunzip_json_read(path)["items"], sel_k))
    return 0


# Rows follow idea_temps, columns follow code_temps.
heatmap_data = np.array(
    [[_sweep_pass_at_k(it, ct) for ct in code_temps] for it in idea_temps],
    dtype=float,
)

# Draw the grid; origin='lower' puts the first idea temperature at the bottom
plt.figure(figsize=(10, 8))
plt.imshow(heatmap_data, cmap='plasma', origin='lower')
plt.colorbar(label=f'Pass@{sel_k}')
plt.xticks(ticks=np.arange(len(code_temps)), labels=code_temps)
plt.yticks(ticks=np.arange(len(idea_temps)), labels=idea_temps)
plt.xlabel('Code Temperature (ct)')
plt.ylabel('Idea Temperature (it)')
plt.title(f'Pass@{sel_k} Heatmap for Different Temperatures')

# Function to determine text color based on brightness
def get_text_color(value, cmap):
    """Pick 'black' or 'white' text for a heatmap cell holding ``value``.

    ``value`` is scaled with the min and max of the module-level
    ``heatmap_data`` and mapped through ``cmap``; a weighted sum of the
    resulting red, green and blue channels decides which text colour to use.
    """
    lowest, highest = heatmap_data.min(), heatmap_data.max()
    scale = plt.Normalize(vmin=lowest, vmax=highest)
    red, green, blue = cmap(scale(value))[:3]
    brightness = 0.299 * red + 0.587 * green + 0.114 * blue
    if brightness > 0.5:
        return 'black'
    return 'white'

# Write each cell's value on top of the heatmap in a readable colour.
cmap = plt.get_cmap('plasma')
for (i, j), cell_value in np.ndenumerate(heatmap_data):
    text_color = get_text_color(cell_value, cmap)
    # imshow puts columns on the x axis and rows on the y axis, hence (j, i).
    plt.text(
        j,
        i,
        f"{cell_value:.2f}",
        ha='center',
        va='center',
        color=text_color,
        fontsize=12,
    )
plt.show()