# Pass@k curves

In [None]:
from typing import Optional, TypeVar
import matplotlib.pyplot as plt
from pathlib import Path
from coderm.eval.metrics import get_pass_ks, pass_at_k, get_pass_ks_given_public, get_num_completions_per_problem, get_num_pass_public_per_problem
from coderm.utils import gunzip_json_read

from adjustText import adjust_text
import numpy as np
from math import comb
from itertools import product
import os

def calcEstVar(n, k, c):
    """Return a variance estimate for the per-problem pass@k failure estimator.

    With ``p = c / n`` as the empirical pass rate, the loop accumulates
    ``sum_i comb(n-i, k) / comb(n, k) * comb(n-k, i) * p**i * (1-p)**(n-i)``
    and the result is that sum minus ``(1-p)**(2*k)``.

    NOTE(review): this appears to be ``E[X**2] - E[X]**2`` for
    ``X = comb(n - C, k) / comb(n, k)`` with ``C ~ Binomial(n, p)`` (using
    ``comb(n, i) * comb(n-i, k) == comb(n, k) * comb(n-k, i)``) -- confirm
    against the intended derivation.

    Args:
        n: Number of samples drawn for the problem (must be non-zero).
        k: The ``k`` in pass@k.
        c: Number of passing samples among the ``n``.
    """
    pass_rate = c / n
    second_moment = 0
    for num_pass in range(n + 1):
        # Same left-to-right evaluation order as the original one-line
        # expression, so the floating-point result is bit-identical.
        weight = comb(n - num_pass, k) * pass_rate**num_pass / comb(n, k)
        tail = comb(n - k, num_pass) * (1 - pass_rate) ** (n - num_pass)
        second_moment += weight * tail
    squared_mean = (1 - pass_rate) ** (2 * k)
    return second_moment - squared_mean

def convert_basic_prompting(method: str):
    """Return the name used for ``method`` when building result paths.

    Currently an identity hook: the special case that mapped
    ``"basic_prompting"`` to ``"default"`` is disabled (kept below for
    reference), so the input is returned unchanged.
    """
    # Disabled mapping:
    # if method == "basic_prompting":
    #     return "default"
    directory_name = method
    return directory_name

T = TypeVar('T')
def split_dict_by_datasets(d: dict[tuple[str, str, str], T]) -> dict[str, dict[tuple[str, str], T]]:
    """Group a flat mapping by the first element of each key.

    Args:
        d: Mapping whose keys are tuples; the first element is used as the
            group name (the dataset) and the remaining elements (``key[1:]``)
            become the key inside that group.

    Returns:
        A nested dict ``{key[0]: {key[1:]: value}}``. Groups and their entries
        keep the insertion order of ``d``. An empty input yields ``{}``.
    """
    output_dict: dict[str, dict[tuple[str, str], T]] = {}
    for key, value in d.items():
        # setdefault creates the per-dataset bucket on first sight.
        output_dict.setdefault(key[0], {})[key[1:]] = value
    return output_dict


class Result():
    """Lazily loaded evaluation artifacts for one (dataset, method, model) run.

    Paths are derived from the constructor arguments. Nothing is read from
    disk until one of the ``get_*`` accessors is called; each loaded array is
    then cached on the instance.
    """

    # Suffix appended to every run directory name.
    _RUN_SUFFIX = "_temp0.9"

    def __init__(self, base_directory: str, diversity_directory: Optional[str], dataset: str, method: str, model: str) -> None:
        """Record identifiers and compute the on-disk locations of the run.

        Args:
            base_directory: Root under which ``<dataset>/<run name>`` holds
                the pass@k results.
            diversity_directory: Root of the diversity logs, or ``None`` when
                no diversity data is available for this run.
            dataset: Dataset name (first path component under the roots).
            method: Method name; passed through ``convert_basic_prompting``.
            model: Model name.
        """
        self.dataset = dataset
        self.method = method
        self.model = model

        # Both locations share the same run directory name.
        run_name = convert_basic_prompting(method) + "_" + model + self._RUN_SUFFIX
        self.path = os.path.join(base_directory, dataset, run_name)
        if diversity_directory is None:
            self.diversity_path = None
        else:
            self.diversity_path = os.path.join(diversity_directory, dataset, run_name, "results.npy")

        # Lazily populated caches.
        self.pass_ks = None
        self.pass_ks_given_public = None
        self.num_pass_public = None
        self.stds = None
        self.diversities = None

    def pass_k_exists(self) -> bool:
        """Return whether the pass@k result path exists on disk."""
        return Path(self.path).exists()

    def diversity_exists(self) -> bool:
        """Return whether a diversity file is configured and exists on disk."""
        if self.diversity_path is None:
            return False
        return Path(self.diversity_path).exists()

    def _load_pass_k_results(self):
        """Populate ``self.pass_ks`` with the mean pass@k for k = 1..upper_k."""
        if self.pass_ks is not None:
            return

        items = gunzip_json_read(self.path)["items"]
        # NOTE(review): this uses the largest per-item completion count, while
        # _load_pass_k_public_results uses the smallest -- confirm that
        # get_pass_ks copes with k above an item's completion count.
        upper_k = max(len(item["results"]) for item in items)
        self.pass_ks = np.array(
            [np.mean(get_pass_ks(items, k)) for k in range(1, upper_k + 1)]
        )

        # The std computation is disabled, so ``self.stds`` is left untouched
        # (``None`` unless assigned elsewhere). Kept for reference:
        # vars = []
        # for item in items:
        #     single_problem = []
        #     for k in range(1, upper_k+1):
        #         single_problem.append(calcEstVar(len(items[0]["results"]), k, sum(i["passing"] for i in item["results"])))
        #     vars.append(single_problem)
        # vars = np.array(vars)
        # self.stds = np.sqrt(np.sum(vars, axis=0) / len(items) ** 2) * 2.5

    def _load_pass_k_public_results(self):
        """Populate ``pass_ks_given_public`` and ``num_pass_public``.

        Raises:
            AssertionError: If the result path does not exist, or if
                ``get_pass_ks_given_public`` returns ``None`` for some k.
        """
        if self.pass_ks_given_public is not None and self.num_pass_public is not None:
            return

        assert self.pass_k_exists()
        items = gunzip_json_read(self.path)["items"]
        upper_k = min(len(item["results"]) for item in items)

        pass_ks_gp = []
        for k in range(1, upper_k + 1):
            public_ks = get_pass_ks_given_public(items, k)
            assert public_ks is not None
            pass_ks_gp.append(np.mean(public_ks))
        num_pass_public = np.array(get_num_pass_public_per_problem(items))

        # Assign both caches only after everything has been computed, so a
        # failure above cannot leave one populated and the other None.
        self.pass_ks_given_public = np.array(pass_ks_gp)
        self.num_pass_public = num_pass_public

    def _load_diversity_results(self):
        """Load the diversity array from ``self.diversity_path``.

        Raises:
            AssertionError: If no diversity file exists for this run.
        """
        assert self.diversity_exists()
        self.diversities = np.load(self.diversity_path)

    def get_diversities(self) -> np.ndarray:
        """Return the diversity array, loading it on first access."""
        if self.diversities is None:
            self._load_diversity_results()
        return self.diversities

    def get_pass_ks(self) -> np.ndarray:
        """Return the mean pass@k curve, loading it on first access."""
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.pass_ks

    def get_num_pass_public(self) -> Optional[np.ndarray]:
        """Return the per-problem public-pass counts, loading on first access."""
        # Guard on this accessor's own cache field rather than on
        # pass_ks_given_public, so a partially populated instance reloads
        # instead of silently returning None.
        if self.num_pass_public is None:
            self._load_pass_k_public_results()
        return self.num_pass_public

    def get_pass_ks_given_public(self) -> Optional[np.ndarray]:
        """Return the mean public-filtered pass@k curve, loading on first access."""
        if self.pass_ks_given_public is None:
            self._load_pass_k_public_results()
        return self.pass_ks_given_public

    def get_pass_ks_stds(self) -> Optional[np.ndarray]:
        """Return ``self.stds`` after ensuring the pass@k results are loaded.

        The std computation in ``_load_pass_k_results`` is currently disabled,
        so this returns ``None`` unless ``stds`` was assigned externally.
        """
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.stds

class ResultSeries():
    """A set of ``Result`` objects keyed by ``(dataset, model, method)``.

    The accessors fan out to every held ``Result`` and return plain dicts
    using the same three-element keys.
    """

    def __init__(self, base_directory: str, diversity_directory: Optional[str], datasets: list[str], models: list[str], methods: list[str]) -> None:
        """Build a ``Result`` for every dataset/model/method combination.

        Combinations whose pass@k path does not exist are skipped with a
        printed warning.
        """
        self.base_directory = base_directory
        self.diversity_directory = diversity_directory

        self.big_dict = {}
        self.datasets = datasets
        self.models = models
        self.methods = methods

        self.the_dict: dict[tuple[str, str, str], Result] = {}
        for dataset, model, method in product(self.datasets, self.models, self.methods):
            candidate = Result(self.base_directory, self.diversity_directory, dataset, method, model)
            if not candidate.pass_k_exists():
                print(f"Warning, not adding {(dataset, model, method)}.")
                continue
            self.the_dict[(dataset, model, method)] = candidate

    def add_exps(self, r: list[Result]):
        """Register extra results; duplicates assert, missing ones are skipped."""
        for extra in r:
            key = (extra.dataset, extra.model, extra.method)
            assert key not in self.the_dict
            if extra.pass_k_exists():
                self.the_dict[key] = extra
            else:
                print(f"Warning, not adding {key}.")

    def get_pass_ks(self, with_public: bool = False) -> dict[tuple[str, str, str], np.ndarray]:
        """Return the pass@k curve of every result.

        When ``with_public`` is true, each result also contributes its
        public-filtered curve under the same key with the method name
        prefixed by ``"public_filtered_"``.
        """
        curves = {}
        for key, result in self.the_dict.items():
            curves[key] = result.get_pass_ks()
            if not with_public:
                continue
            filtered = result.get_pass_ks_given_public()
            assert filtered is not None
            curves[(key[0], key[1], "public_filtered_" + key[2])] = filtered
        return curves

    def get_pass_ks_stds(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return ``get_pass_ks_stds()`` of every result, keyed like ``the_dict``."""
        stds = {}
        for key, result in self.the_dict.items():
            stds[key] = result.get_pass_ks_stds()
        return stds

    def get_num_pass_public(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return the public-pass counts of every result (asserting non-None)."""
        counts = {}
        for key, result in self.the_dict.items():
            per_problem = result.get_num_pass_public()
            assert per_problem is not None
            counts[key] = per_problem
        return counts

    def get_diversities(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return diversity arrays for the results that have a diversity file."""
        return {
            key: result.get_diversities()
            for key, result in self.the_dict.items()
            if result.diversity_exists()
        }

# Roots of the diversity logs and of the evaluation results.
DIVER_DIR = "../../other_logs/similar_logs/final_logs"
BASE_DIR = "../../final_results"

# Base-vs-instruct comparison across all three datasets.
# NOTE: `result_series` is rebound just below, so only the second series is
# what the following cells actually plot.
result_series = ResultSeries(
    BASE_DIR + "/base_v_instruct",
    DIVER_DIR + "/base_v_instruct",
    ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
    ["baby-deepseek-b_sgl", "baby-deepseek-i_sgl", "llama318b_sgl", "llama318bi_sgl"],
    ["basic_prompting225"],
)
# result_series.add_exps([Result("../../final_results/llama405bi", None, "livecodebench_lite_v3", "basic_prompting10", "llama405bi_fire")])

# Llama 405B instruct run; no diversity logs are configured for it.
result_series = ResultSeries(
    BASE_DIR + "/llama405bi",
    None,
    ["livecodebench_lite_v3"],
    ["llama405bi_fire", "llama"],
    ["basic_prompting10"],
)

# Alternative series (disabled):
# result_series = ResultSeries(BASE_DIR, DIVER_DIR, 
#     ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
#     ["gpt-4o-mini", "gpt-4o", "deepseek-coder", "sonnet-3-5"],
#     ["basic_prompting225", "simple_idea225", "combo_observation_no"]
# )

In [None]:
# Plot the mean pass@k curve of every (model, method) pair, one figure per dataset.
datasets = ["mbpp_plus", "human_eval_plus", "livecodebench_lite_v3"]

# Group the curves once instead of recomputing the split for every dataset.
pass_ks_by_dataset = split_dict_by_datasets(result_series.get_pass_ks())

for dataset in datasets:
    # result_series need not contain every dataset listed above; skip the
    # missing ones instead of failing with a KeyError.
    all_pass_ks = pass_ks_by_dataset.get(dataset)
    if not all_pass_ks:
        print(f"Warning, no pass@k results for {dataset}; skipping plot.")
        continue

    plt.figure(figsize=(12, 6))
    for label, values in all_pass_ks.items():
        # values[i] is the curve value at k = i + 1.
        ks = np.arange(1, len(values) + 1)
        plt.plot(ks, values, label='|'.join(label), linestyle='-')

    plt.xlabel('k')
    plt.xscale('log')
    plt.ylabel('Pass@k')
    plt.title(f'Pass@k vs k for various methods on {dataset}')
    plt.legend(fontsize='small', loc='lower center')
    plt.grid(True)
    plt.show()

In [None]:
# Scatter measured diversity against the pass@k gain (last k minus k=1),
# one figure per dataset.
datasets = ["mbpp_plus", "human_eval_plus", "livecodebench_lite_v3"]

# Group once up front rather than on every loop iteration.
diversities_by_dataset = split_dict_by_datasets(result_series.get_diversities())
pass_ks_by_dataset = split_dict_by_datasets(result_series.get_pass_ks())

for dataset in datasets:
    diversities = diversities_by_dataset.get(dataset, {})
    # diversities = {k: 1 - np.power(v, 4).mean() for k, v in diversities.items()}
    diversities = {k: 1 - v.mean() for k, v in diversities.items()}

    all_pass_ks = pass_ks_by_dataset.get(dataset, {})
    # Only keep runs that have BOTH a pass@k curve and a diversity value, so
    # the x and y coordinates below always refer to the same run.
    all_pass_0s = {k: v[-1] - v[0] for k, v in all_pass_ks.items() if k in diversities}
    if not all_pass_0s:
        print(f"Warning, no runs with both pass@k and diversity for {dataset}; skipping plot.")
        continue

    labels = list(all_pass_0s)
    gains = [all_pass_0s[k] for k in labels]
    measured = [diversities[k] for k in labels]

    plt.figure(figsize=(10, 6))
    # plt.hist(np.power(list(diversities.values())[3], 4))
    plt.scatter(gains, measured, c='blue', edgecolor='k', alpha=0.7, s=100)

    annotations = [
        plt.annotate(label, (gain, diversity), fontsize=9)
        for label, gain, diversity in zip(labels, gains, measured)
    ]
    # Nudge the labels apart so overlapping points stay readable.
    adjust_text(annotations, min_arrow_len=5, arrowprops=dict(arrowstyle="->", color='r', lw=0.3, alpha=0.8))

    plt.xlabel('Difference in Pass@k - Pass@1', fontsize=12)
    plt.ylabel('Measured Diversity (higher is more diverse)', fontsize=12)
    plt.title(f'Measured Diversity vs Gain Through Pass@k ({dataset})', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.6)

    plt.show()

In [None]:
def split_into_public_private(d: dict[tuple[str, str], np.ndarray]) -> dict[tuple[str, str], np.ndarray]:
    """Pair each plain curve with its ``public_filtered_`` counterpart.

    Args:
        d: Mapping keyed by ``(model, method)``. For every key whose method
            does not contain ``"public_filtered"``, the key
            ``(model, "public_filtered_" + method)`` must also be present.

    Returns:
        A dict keyed by the plain ``(model, method)`` keys. Each value stacks
        the plain array (row 0) and the public-filtered array (row 1) along a
        new leading axis.

    Raises:
        KeyError: If a plain key has no public-filtered counterpart in ``d``.
    """
    CONSTANT = "public_filtered"

    # Pass 1: select the plain (non-filtered) keys.
    plain_keys = [key for key in d if CONSTANT not in key[1]]
    # Pass 2: look up both arrays for every plain key.
    pairs = {key: [d[key], d[(key[0], CONSTANT + "_" + key[1])]] for key in plain_keys}
    # Pass 3: stack each pair into a single array.
    return {key: np.stack(pair, axis=0) for key, pair in pairs.items()}

# Stacked curves for HumanEval+ only: per (model, method), row 0 is the plain
# pass@k curve and row 1 the public-filtered one.
pass_ks_with_public = split_dict_by_datasets(result_series.get_pass_ks(with_public=True))
pp_data = split_into_public_private(pass_ks_with_public["human_eval_plus"])

MAX_LEN = 10
OFFSET = 2

# 1-based k values: the public-filtered curve is sampled at k = 1..MAX_LEN and
# the plain curve at k = OFFSET, 2*OFFSET, ..., MAX_LEN*OFFSET.
public_idx = np.arange(1, MAX_LEN + 1)
private_idx = public_idx * OFFSET


plt.figure(figsize=(9, 9))
for label, values in pp_data.items():
    # Subtract 1 to turn the 1-based k values into array positions.
    private_scores = values[0, private_idx - 1]
    public_scores = values[1, public_idx - 1]
    plt.plot(private_scores, public_scores, label='|'.join(label), linestyle='-')

# Diagonal reference line.
x = np.linspace(0.3, 1, 40)
y = x
plt.plot(x, y, label='x = y', linestyle='--', color='red')


plt.xlabel('private score')
plt.ylabel('public score')
plt.title('public vs private score')
plt.legend(fontsize='small', loc='lower center')
plt.grid(True)
plt.show()

In [None]:
# Plot the mean pass@k curve (without public-filtered variants) of every
# (model, method) pair, one figure per dataset.
datasets = ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"]

# Group the curves once instead of recomputing the split for every dataset.
pass_ks_by_dataset = split_dict_by_datasets(result_series.get_pass_ks(with_public=False))

for dataset in datasets:
    # result_series need not contain every dataset listed above; skip the
    # missing ones instead of failing with a KeyError.
    all_pass_ks = pass_ks_by_dataset.get(dataset)
    if not all_pass_ks:
        print(f"Warning, no pass@k results for {dataset}; skipping plot.")
        continue

    plt.figure(figsize=(12, 6))
    for label, values in all_pass_ks.items():
        # values[i] is the curve value at k = i + 1.
        ks = np.arange(1, len(values) + 1)
        plt.plot(ks, values, label='|'.join(label), linestyle='-')

    plt.xlabel('k')
    plt.xscale('log')
    plt.ylabel('Pass@k')
    plt.title(f'Pass@k vs k for various methods on {dataset}')
    plt.legend(fontsize='small', loc='lower center')
    plt.grid(True)
    plt.show()