# Pass@k curves

In [None]:
from typing import Optional, TypeVar
import matplotlib.pyplot as plt
from pathlib import Path
from coderm.eval.metrics import get_pass_ks, pass_at_k, get_pass_ks_given_public, get_num_completions_per_problem, get_num_pass_public_per_problem
from coderm.utils import gunzip_json_read

from adjustText import adjust_text
import numpy as np
from math import comb
from itertools import product
import os

def calcEstVar(n, k, c):
    """Return a plug-in variance for the ratio ``comb(n - C, k) / comb(n, k)``.

    The success probability is estimated as ``p = c / n`` and ``C`` is treated
    as Binomial(n, p).  The loop accumulates the second moment of the ratio,
    using the identity ``comb(n, i) * comb(n - i, k) == comb(n, k) * comb(n - k, i)``
    to fold the binomial coefficient into the summand; the squared mean
    ``(1 - p) ** (2 * k)`` is then subtracted.

    Args:
        n: number of samples drawn (must be non-zero, and ``k <= n``).
        k: the ``k`` of the pass@k-style ratio.
        c: number of passing samples among the ``n``.

    Returns:
        The variance as a float.
    """
    p = c / n
    miss = 1 - p
    total_ways = comb(n, k)

    second_moment = 0
    for passed in range(n + 1):
        # Kept in the same evaluation order as the closed form so the
        # floating-point result does not drift.
        scaled = comb(n - passed, k) * p**passed / total_ways
        second_moment += scaled * (comb(n - k, passed) * miss**(n - passed))

    return second_moment - miss**(2 * k)

def convert_basic_prompting(method: str):
    """Map a method name to the name used in on-disk run directories.

    Currently the identity: the method name is returned unchanged.  The
    remapping below is disabled and kept only for reference.
    """
    # Disabled remapping:
    # if method == "basic_prompting":
    #     return "default"
    return method

T = TypeVar('T')
def split_dict_by_datasets(d: dict[tuple[str, str, str], T]) -> dict[str, dict[tuple[str, str], T]]:
    """Group a flat mapping by the first element of each key.

    Args:
        d: mapping whose keys are tuples; the first element selects the group
            and the remaining elements become the key inside that group.

    Returns:
        A nested dict ``{key[0]: {key[1:]: value}}``.  Groups appear in the
        order their first key is encountered; an empty input yields ``{}``.
    """
    output_dict: dict[str, dict[tuple[str, str], T]] = {}
    for key, value in d.items():
        # setdefault creates the per-dataset bucket on first sight.
        output_dict.setdefault(key[0], {})[key[1:]] = value
    return output_dict


class Result():
    """Locations and lazily loaded metrics for one (dataset, method, model) run.

    ``path`` is ``<base_directory>/<dataset>/<method>_<model>_temp0.9`` and is
    read with ``gunzip_json_read``; ``diversity_path`` (when a diversity
    directory is given) is the ``results.npy`` file under the same run name.
    Nothing is read from disk until one of the ``get_*`` accessors is called,
    and each accessor caches what it loads on the instance.
    """

    def __init__(self, base_directory: str, diversity_directory: Optional[str], dataset: str, method: str, model: str) -> None:
        self.dataset = dataset
        self.method = method
        self.model = model

        # The results tree and the diversity tree share the same run name.
        run_name = convert_basic_prompting(method) + "_" + model + "_temp0.9"
        self.path = os.path.join(base_directory, dataset, run_name)
        if diversity_directory is None:
            self.diversity_path = None
        else:
            self.diversity_path = os.path.join(diversity_directory, dataset, run_name, "results.npy")

        # Lazily populated caches; None means "not loaded yet".
        self.pass_ks = None
        self.pass_ks_given_public = None
        self.num_pass_public = None
        # Never populated at the moment: the computation in
        # _load_pass_k_results is disabled, so this stays None.
        self.stds = None
        self.diversities = None

    def pass_k_exists(self) -> bool:
        """Return True when the results path exists on disk."""
        return Path(self.path).exists()

    def diversity_exists(self) -> bool:
        """Return True when a diversity path was configured and exists on disk."""
        if self.diversity_path is None:
            return False
        return Path(self.diversity_path).exists()

    def _read_items(self):
        """Read the run's items and the largest number of results on any item."""
        items = gunzip_json_read(self.path)["items"]
        upper_k = max(len(item["results"]) for item in items)
        return items, upper_k

    def _load_pass_k_results(self):
        """Fill ``self.pass_ks`` with the mean of ``get_pass_ks(items, k)`` for k = 1..upper_k."""
        if self.pass_ks is not None:
            return

        items, upper_k = self._read_items()
        # NOTE: `get_pass_ks` here is the module-level metric function, not
        # the method of the same name on this class.
        self.pass_ks = np.array(
            [np.mean(get_pass_ks(items, k)) for k in range(1, upper_k + 1)]
        )

        # Disabled standard-deviation estimate, kept for reference:
        # vars = []
        # for item in items:
        #     single_problem = []
        #     for k in range(1, upper_k+1):
        #         single_problem.append(calcEstVar(len(items[0]["results"]), k, sum(i["passing"] for i in item["results"])))
        #     vars.append(single_problem)
        # vars = np.array(vars)
        # self.stds = np.sqrt(np.sum(vars, axis=0) / len(items) ** 2) * 2.5

    def _load_pass_k_public_results(self):
        """Fill ``self.pass_ks_given_public`` and ``self.num_pass_public`` from the results file."""
        assert self.pass_k_exists(), f"results not found: {self.path}"
        items, upper_k = self._read_items()

        pass_ks_gp = []
        for k in range(1, upper_k + 1):
            # Module-level metric function (shadowed by a method name on this class).
            public_ks = get_pass_ks_given_public(items, k)
            assert public_ks is not None, f"no public-test results in {self.path}"
            pass_ks_gp.append(np.mean(public_ks))

        self.pass_ks_given_public = np.array(pass_ks_gp)
        self.num_pass_public = np.array(get_num_pass_public_per_problem(items))

    def _load_diversity_results(self):
        """Load the diversity array from ``self.diversity_path``."""
        assert self.diversity_exists(), f"diversity results not found: {self.diversity_path}"
        self.diversities = np.load(self.diversity_path)

    def get_diversities(self) -> np.ndarray:
        """Return the diversity array, loading it on first use."""
        if self.diversities is None:
            self._load_diversity_results()
        return self.diversities

    def get_pass_ks(self) -> np.ndarray:
        """Return the mean pass@k values for k = 1..upper_k, loading them on first use."""
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.pass_ks

    def get_num_pass_public(self) -> Optional[np.ndarray]:
        """Return the array built from ``get_num_pass_public_per_problem``, loading on first use."""
        # Guard on the attribute that is actually returned.
        if self.num_pass_public is None:
            self._load_pass_k_public_results()
        return self.num_pass_public

    def get_pass_ks_given_public(self) -> Optional[np.ndarray]:
        """Return the mean public-filtered pass@k values, loading them on first use."""
        if self.pass_ks_given_public is None:
            self._load_pass_k_public_results()
        return self.pass_ks_given_public

    def get_pass_ks_stds(self) -> Optional[np.ndarray]:
        """Return ``self.stds``; this is ``None`` while the stds computation is disabled."""
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.stds

class ResultSeries():
    """A collection of ``Result`` objects keyed by ``(dataset, model, method)``.

    On construction every combination of the given datasets, models and
    methods is tried; combinations whose results path does not exist are
    skipped with a printed warning.
    """

    def __init__(self, base_directory: str, diversity_directory: Optional[str], datasets: list[str], models: list[str], methods: list[str]) -> None:
        self.base_directory = base_directory
        self.diversity_directory = diversity_directory
        self.datasets = datasets
        self.models = models
        self.methods = methods
        self.big_dict = {}

        self.the_dict: dict[tuple[str, str, str], Result] = {}
        for dataset, model, method in product(self.datasets, self.models, self.methods):
            candidate = Result(self.base_directory, self.diversity_directory, dataset, method, model)
            if not candidate.pass_k_exists():
                print(f"Warning, not adding {(dataset, model, method)}.")
                continue
            self.the_dict[(dataset, model, method)] = candidate

    def add_results(self, r: list[Result]):
        """Add individual results; duplicates are rejected, missing ones skipped with a warning."""
        for result in r:
            key = (result.dataset, result.model, result.method)
            # The duplicate check runs before the existence check.
            assert key not in self.the_dict
            if result.pass_k_exists():
                self.the_dict[key] = result
            else:
                print(f"Warning, not adding {key}.")

    def add_result_series(self, rs: "ResultSeries"):
        """Merge every entry of another series into this one; keys must not collide."""
        for key, result in rs.the_dict.items():
            assert key not in self.the_dict
            self.the_dict[key] = result

    def get_pass_ks(self, with_public: bool = False) -> dict[tuple[str, str, str], np.ndarray]:
        """Return pass@k arrays per key.

        With ``with_public`` set, each key additionally contributes an entry
        under ``(dataset, model, "public_filtered_" + method)`` holding the
        public-filtered values.
        """
        collected = {}
        for key, result in self.the_dict.items():
            collected[key] = result.get_pass_ks()
            if not with_public:
                continue
            filtered = result.get_pass_ks_given_public()
            assert filtered is not None
            dataset, model, method = key[0], key[1], key[2]
            collected[(dataset, model, "public_filtered_" + method)] = filtered
        return collected

    def get_pass_ks_stds(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return whatever each result reports from ``get_pass_ks_stds``, per key."""
        stds = {}
        for key, result in self.the_dict.items():
            stds[key] = result.get_pass_ks_stds()
        return stds

    def get_num_pass_public(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return each result's ``get_num_pass_public`` array, per key."""
        counts = {}
        for key, result in self.the_dict.items():
            per_problem = result.get_num_pass_public()
            assert per_problem is not None
            counts[key] = per_problem
        return counts

    def get_diversities(self) -> dict[tuple[str, str, str], np.ndarray]:
        """Return diversity arrays for the results that have a diversity file."""
        return {
            key: result.get_diversities()
            for key, result in self.the_dict.items()
            if result.diversity_exists()
        }

# Root of the diversity logs and root of the evaluation results, relative to
# the notebook's working directory.
DIVER_DIR = "../../other_logs/similar_logs/final_logs"
BASE_DIR = "../../final_results"

# Alternative configurations, kept for reference:
# result_series = ResultSeries("../../final_results/base_v_instruct", "../../other_logs/similar_logs/final_logs/base_v_instruct",
#     ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
#     ["baby-deepseek-b_sgl", "baby-deepseek-i_sgl", "llama318b_sgl", "llama318bi_sgl", "llama3170b_sgl", "llama3170bi_sgl"],
#     ["basic_prompting225"]
# )
# result_series.add_exps([Result("../../final_results/llama405bi", None, "livecodebench_lite_v3", "basic_prompting10", "llama405bi_fire")])
# result_series = ResultSeries("../../final_results/llama405bi", None,
#     ["livecodebench_lite_v3"],
#     ["llama405bi_fire", "llama"],
#     ["basic_prompting10"]
# )
result_series = ResultSeries(BASE_DIR, DIVER_DIR,
    ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
    ["gpt-4o-mini", "gpt-4o", "deepseek-coder", "sonnet-3-5"],
    ["basic_prompting225", "simple_idea225", "combo_observation_no"]
)

# One (initially empty) stats record per loaded result, keyed by the result's
# path.  To restrict this to the combo runs, filter the keys of
# result_series.the_dict on `"combo" in key[2]`.
combo_dict = {result.path: {} for result in result_series.the_dict.values()}
# result_series.add_result_series(result_seriesd)

In [None]:
# For every results path, record how many items it holds ("num_problems") and
# the total number of entries across all items' "results" lists
# ("num_completions").
for path, stats in combo_dict.items():
    items = gunzip_json_read(path)["items"]
    stats.update(
        num_problems=len(items),
        num_completions=sum(len(item["results"]) for item in items),
    )

In [None]:
import json
import tiktoken
import tqdm
from multiprocessing import Pool, Manager

# Tokenizer used by process_path as a fallback: completions whose logged
# "num_tokens" is not positive are re-counted by encoding their text.
encoding = tiktoken.get_encoding("cl100k_base")

def process_path(path):
    """Sum the completion token counts logged for one results path.

    The query logs are looked up in the ``queries`` directory under ``path``
    after swapping the ``../../final_results/`` prefix for
    ``../../other_logs/final_logs/``.  Every file in that directory is parsed
    as JSON and each entry of its ``"queries"`` list contributes its
    completion's ``"num_tokens"``.  A non-positive count is replaced by the
    length of the tokenized completion text (and printed for inspection).

    Returns:
        ``(path, total_tokens)`` so results can be matched back to their path.
    """
    queries_dir = os.path.join(
        path.replace("../../final_results/", "../../other_logs/final_logs/"),
        "queries",
    )

    total_tokens = 0
    for log_name in os.listdir(queries_dir):
        with open(os.path.join(queries_dir, log_name), "r") as handle:
            payload = json.load(handle)

        for query in payload["queries"]:
            completion = query["completion"]
            num_tokens = completion["num_tokens"]
            if num_tokens <= 0:
                # No usable count was logged: fall back to tokenizing the text.
                num_tokens = len(encoding.encode(completion["text"]))
                print(completion)
                print(num_tokens)
            assert num_tokens > 0
            total_tokens += num_tokens

    return path, total_tokens

# Run process_path over every results path in a worker pool (with a progress
# bar), then store each path's token total in its stats record.
with Pool() as pool:
    progress = tqdm.tqdm(pool.imap(process_path, combo_dict), total=len(combo_dict))
    results = list(progress)

for path, total_tokens in results:
    combo_dict[path]["total_tokens"] = total_tokens

In [None]:
# Print total_tokens / num_completions for each results path.
for v in combo_dict.values():
    tokens_per_completion = v["total_tokens"] / v["num_completions"]
    print(tokens_per_completion)

In [None]:
# Display the per-path stats records collected so far (notebook cell output).
combo_dict

In [None]:
import numpy as np

# Mean and standard deviation, per method, of total_tokens / num_completions
# taken over all results paths belonging to that method.
averages = {}
stdevs = {}
methods = ["basic_prompting225", "simple_idea225", "combo_observation_no"]

for method in methods:
    # A results path belongs to a method when the method name occurs in it.
    matching = [data for path, data in combo_dict.items() if method in path]
    tokens_per_completion = (
        np.array([data["total_tokens"] for data in matching])
        / np.array([data["num_completions"] for data in matching])
    )
    averages[method] = np.mean(tokens_per_completion)
    stdevs[method] = np.std(tokens_per_completion)

{"avg": averages, "std": stdevs}

In [None]:
import os
import json
import tiktoken
import tqdm
from multiprocessing import Pool, Manager

# Tokenizer used by process_path as a fallback: completions whose logged
# "num_tokens" is not positive are re-counted by encoding their text.
encoding = tiktoken.get_encoding("cl100k_base")

# Query-log directory this scratch cell was written against.
_DEFAULT_QUERY_LOG_DIR = "/mnt/efs/evanwang/src/models/research/evan/search/logs/basic_prompting_09-17T00:23:34/queries"


def process_path(path, log_path=_DEFAULT_QUERY_LOG_DIR):
    """Sum the completion token counts found in one directory of query logs.

    Args:
        path: label returned unchanged alongside the total; it is not used to
            locate the logs.
        log_path: directory whose files are each parsed as JSON with a
            ``"queries"`` list.  Defaults to the single run this cell was
            written for; pass e.g.
            ``os.path.join(path.replace("../../final_results/", "../../other_logs/final_logs/"), "queries")``
            to derive it from a results path instead.

    Returns:
        ``(path, total_tokens)``.

    Each query contributes its completion's ``"num_tokens"``; a non-positive
    count is replaced by the length of the tokenized completion text (and
    printed for inspection).
    """
    total_tokens = 0
    for log in os.listdir(log_path):
        with open(os.path.join(log_path, log), "r") as f:
            data = json.load(f)
        for query in data["queries"]:
            completion = query["completion"]
            num_tokens = completion["num_tokens"]
            if num_tokens <= 0:
                # No usable count was logged: fall back to tokenizing the text.
                num_tokens = len(encoding.encode(completion["text"]))
                print(completion)
                print(num_tokens)
            assert num_tokens > 0
            total_tokens += num_tokens
    return path, total_tokens

# with Pool() as pool:
#     results = list(tqdm.tqdm(pool.imap(process_path, combo_dict.keys()), total=len(combo_dict)))
#     for path, total_tokens in results:
#         combo_dict[path]["total_tokens"] = total_tokens

# process_path()
# The "l" argument is only a placeholder label: process_path as defined in
# this cell returns it unchanged and does not use it to find the logs.
# NOTE(review): 20, 174 and 1400 are unexplained divisors — presumably run
# dimensions (e.g. counts of completions/problems); confirm before reuse.
total_completion_tokens = process_path("l")[1]
total_completion_tokens / 20 / 174 / 1400
