# Pass@k curves

In [None]:
from typing import Optional, TypeVar
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pathlib import Path
from coderm.eval.metrics import get_pass_ks, pass_at_k, get_pass_ks_given_public, get_num_completions_per_problem, get_num_pass_public_per_problem
from coderm.utils import gunzip_json_read
import json

from adjustText import adjust_text
import numpy as np
from math import comb
from itertools import product
import os

def calcEstVar(n, k, c):
    """Evaluate a closed-form variance-style expression from sample counts.

    Args:
        n: Total number of samples; the summation index runs over ``0..n``.
        k: The ``k`` used in the binomial coefficients and final exponent.
        c: Count that is converted to the rate ``c / n``.

    Returns:
        The accumulated sum minus ``(1 - c / n) ** (2 * k)``.

    NOTE(review): presumably the variance of a pass@k estimate (second
    moment minus squared mean) -- confirm against the intended derivation.
    """
    rate = c / n
    total = 0
    for hits in range(n + 1):
        # Keep the original left-to-right evaluation order so the floating
        # point result is bit-for-bit the same.
        weight = comb(n - hits, k) * rate**hits / comb(n, k)
        total += weight * (comb(n - k, hits) * (1 - rate) ** (n - hits))
    return total - (1 - rate) ** (2 * k)

def convert_basic_prompting(method: str):
    """Return the name used for ``method`` when building result paths.

    This is currently an identity pass-through: a former special case that
    mapped ``"basic_prompting"`` to ``"default"`` is disabled.
    """
    return method

T = TypeVar('T')


def split_dict_by_datasets(d: dict[tuple, T]) -> dict[str, dict[tuple, T]]:
    """Group a tuple-keyed dict by the first element of each key.

    Despite the name, the function is generic: it is applied once to split
    by dataset and again on the result to split by model.

    Args:
        d: Mapping whose keys are tuples, e.g. ``(dataset, model, method)``.

    Returns:
        A nested mapping ``{key[0]: {key[1:]: value}}``. Insertion order of
        both levels follows the order of ``d``.
    """
    grouped: dict[str, dict[tuple, T]] = {}
    for key, value in d.items():
        grouped.setdefault(key[0], {})[key[1:]] = value
    return grouped

def calc_pass_k(ns, cs, k):
    """Return ``pass_at_k(n, c, k)`` for each paired ``(n, c)``.

    Args:
        ns: Per-problem sample counts.
        cs: Per-problem correct counts, paired positionally with ``ns``.
        k: The ``k`` forwarded to ``pass_at_k``.

    Returns:
        A list with one value per pair; pairing stops at the shorter input.
    """
    return [pass_at_k(n, c, k) for n, c in zip(ns, cs)]

def count_ns(items, is_public: bool = False) -> Optional[list[int]]:
    """Count, per problem, how many completions are considered.

    Args:
        items: Problem records, each holding a ``"results"`` list of
            per-completion dicts.
        is_public: When False, every completion is counted. When True, only
            completions whose ``"passing_public"`` value is truthy are counted.

    Returns:
        One count per item, or ``None`` when ``is_public`` is True and some
        completion has no ``"passing_public"`` value (missing or ``None``).
    """
    if not is_public:
        return [len(item["results"]) for item in items]

    counts = []
    for item in items:
        results = item["results"]
        # Public-test outcomes must be present for every completion;
        # otherwise the public-filtered counts are undefined.
        if any(result.get("passing_public") is None for result in results):
            return None
        counts.append(sum(1 for result in results if result["passing_public"]))
    return counts

def count_cs(items, is_public: bool = False) -> Optional[list[int]]:
    """Count, per problem, how many completions are correct.

    Args:
        items: Problem records, each holding a ``"results"`` list of
            per-completion dicts with a ``"passing"`` entry.
        is_public: When False, sum the ``"passing"`` values. When True, count
            only completions that pass both ``"passing"`` and
            ``"passing_public"``.

    Returns:
        One count per item, or ``None`` when ``is_public`` is True and some
        completion has no ``"passing_public"`` value (missing or ``None``).

    Side effects:
        In public mode, prints a warning for every completion that is marked
        passing but not passing the public tests; such completions are not
        counted.
    """
    if not is_public:
        return [sum(ex["passing"] for ex in item["results"]) for item in items]

    counts = []
    for item in items:
        correct = 0
        for result in item["results"]:
            if result.get("passing_public") is None:
                return None
            if not result["passing"]:
                continue
            if result["passing_public"]:
                correct += 1
            else:
                print("Warning: passes private but does not pass public")
        counts.append(correct)
    return counts

def calc_pass_k(ns, cs, k):
    """Compute per-problem pass@k values.

    This repeats the ``calc_pass_k`` definition that appears earlier in the
    file with the same behaviour.

    Args:
        ns: Per-problem sample counts.
        cs: Per-problem correct counts, paired positionally with ``ns``.
        k: The ``k`` forwarded to ``pass_at_k``.

    Returns:
        A list with one ``pass_at_k`` value per ``(n, c)`` pair.
    """
    per_problem = []
    for sample_count, correct_count in zip(ns, cs):
        per_problem.append(pass_at_k(sample_count, correct_count, k))
    return per_problem

class Result():
    """Lazily loaded evaluation results for one (dataset, method, model, temp) run.

    Paths are derived in ``__init__``; nothing is read from disk until one of
    the ``get_*`` accessors is called, after which the value is cached on the
    instance.
    """

    def __init__(self, base_directory: str, diversity_directory: Optional[str], dataset: str, method: str, model: str, temp: float = 0.9) -> None:
        """Record the run identity and derive its file paths.

        Args:
            base_directory: Root directory of the pass@k result files.
            diversity_directory: Root directory of the diversity outputs, or
                ``None`` when no diversity data is available.
            dataset: Dataset name (a sub-directory of both roots).
            method: Method name, passed through ``convert_basic_prompting``.
            model: Model name.
            temp: Sampling temperature; part of the run name.
        """
        self.dataset = dataset
        self.method = method
        self.model = model
        self.temp = temp

        # The pass@k file and the diversity folder share the same run name.
        run_name = convert_basic_prompting(method) + "_" + model + f"_temp{self.temp}"
        self.path = os.path.join(base_directory, dataset, run_name)
        if diversity_directory is None:
            self.diversity_path = None
        else:
            self.diversity_path = os.path.join(diversity_directory, dataset, run_name, "results.npy")

        # Caches, filled on first access.
        self.pass_ks = None
        self.pass_ks_given_public = None
        self.num_pass_public = None
        self.stds = None
        self.diversities = None

    def pass_k_exists(self) -> bool:
        """Return True when the pass@k result path exists on disk."""
        return Path(self.path).exists()

    def diversity_exists(self) -> bool:
        """Return True when a diversity path is configured and exists on disk."""
        if self.diversity_path is not None:
            return Path(self.diversity_path).exists()
        return False

    def _load_pass_k_results(self):
        """Load the result items and cache the mean pass@k for every k.

        ``k`` ranges from 1 to the largest number of completions any single
        problem has. ``self.stds`` is left as ``None``: the variance-based
        computation is currently disabled.
        """
        if self.pass_ks is not None:
            return

        items = gunzip_json_read(self.path)["items"]
        ns = count_ns(items)
        cs = count_cs(items)
        upper_k = max(len(item["results"]) for item in items)
        self.pass_ks = np.array(
            [np.mean(calc_pass_k(ns, cs, k)) for k in range(1, upper_k + 1)]
        )

    def _load_pass_k_public_results(self):
        """Load the result items and cache the public-filtered pass@k curve.

        Raises:
            AssertionError: If the result path does not exist, or if any
                completion lacks a ``"passing_public"`` value.
        """
        assert self.pass_k_exists()
        items = gunzip_json_read(self.path)["items"]
        upper_k = max(len(item["results"]) for item in items)
        ns = count_ns(items, is_public=True)
        cs = count_cs(items, is_public=True)

        # Both counts are independent of k, so validate them once up front
        # instead of on every iteration.
        assert (ns is not None) and (cs is not None)

        self.pass_ks_given_public = np.array(
            [np.mean(calc_pass_k(ns, cs, k)) for k in range(1, upper_k + 1)]
        )
        # NOTE(review): this stores the per-problem count of completions that
        # pass both public and private tests (cs); the attribute name suggests
        # the count passing public tests (ns) may have been intended -- confirm.
        self.num_pass_public = np.array(cs)

    def _load_diversity_results(self):
        """Load the diversity array from ``self.diversity_path`` with ``np.load``."""
        assert self.diversity_exists()
        self.diversities = np.load(self.diversity_path)

    def get_diversities(self) -> np.ndarray:
        """Return the diversity array, loading it on first use."""
        if self.diversities is None:
            self._load_diversity_results()
        return self.diversities

    def get_pass_ks(self) -> np.ndarray:
        """Return the mean pass@k curve (index ``i`` holds ``k = i + 1``)."""
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.pass_ks

    def get_num_pass_public(self) -> Optional[np.ndarray]:
        """Return the per-problem array cached by the public-results loader."""
        if self.pass_ks_given_public is None:
            self._load_pass_k_public_results()
        return self.num_pass_public

    def get_pass_ks_given_public(self) -> Optional[np.ndarray]:
        """Return the public-filtered mean pass@k curve, loading it on first use."""
        if self.pass_ks_given_public is None:
            self._load_pass_k_public_results()
        return self.pass_ks_given_public

    def get_pass_ks_stds(self) -> Optional[np.ndarray]:
        """Return the cached standard deviations.

        The loader never populates ``self.stds``, so this returns ``None``
        unless the attribute was set externally.
        """
        if self.pass_ks is None:
            self._load_pass_k_results()
        return self.stds

class ResultSeries():
    """Collection of ``Result`` objects keyed by ``(dataset, model, method, temp)``."""

    def __init__(self, base_directory: str, diversity_directory: Optional[str], datasets: list[str], models: list[str], methods: list[str], temps: Optional[list[float]] = None) -> None:
        """Create a ``Result`` for every combination whose result path exists.

        Args:
            base_directory: Root directory of the pass@k result files.
            diversity_directory: Root directory of diversity outputs, or ``None``.
            datasets: Dataset names to include.
            models: Model names to include.
            methods: Method names to include.
            temps: Sampling temperatures; defaults to ``[0.9]``.

        Combinations without an existing result path are skipped and a
        warning is printed.
        """
        if temps is None:
            temps = [0.9]
        self.base_directory = base_directory
        self.diversity_directory = diversity_directory

        self.big_dict = {}
        self.datasets = datasets
        self.models = models
        self.methods = methods
        self.temps = temps

        self.the_dict: dict[tuple[str, str, str, float], "Result"] = {}
        for dataset, model, method, temp in product(self.datasets, self.models, self.methods, self.temps):
            key = (dataset, model, method, temp)
            candidate = Result(self.base_directory, self.diversity_directory, dataset, method, model, temp=temp)
            if candidate.pass_k_exists():
                self.the_dict[key] = candidate
            else:
                print(f"Warning, not adding {key}.")

    def add_results(self, r: list["Result"]):
        """Add individual results, skipping (with a warning) those without data.

        Raises:
            AssertionError: If a result with the same key is already present.
        """
        for result in r:
            # Include the temperature so the key has the same 4-tuple shape as
            # the keys created in __init__; get_pass_ks indexes k[3].
            key = (result.dataset, result.model, result.method, result.temp)
            assert key not in self.the_dict
            if not result.pass_k_exists():
                print(f"Warning, not adding {key}.")
                continue
            self.the_dict[key] = result

    def add_result_series(self, rs: "ResultSeries"):
        """Merge every entry of ``rs`` into this series.

        Raises:
            AssertionError: If any key of ``rs`` is already present.
        """
        for k, v in rs.the_dict.items():
            assert k not in self.the_dict
            self.the_dict[k] = v

    def get_pass_ks(self, with_public: bool = False, with_temp: bool = False) -> dict[tuple, np.ndarray]:
        """Return the pass@k curve of every result.

        Args:
            with_public: Also include the public-filtered curve of each
                result under the method name ``"public_filtered_" + method``.
            with_temp: Keep the temperature as the fourth key element. When
                False, keys are ``(dataset, model, method)`` and entries that
                differ only by temperature overwrite one another.

        Returns:
            Mapping from key tuple to pass@k array.
        """
        out_dict = {}
        for k, v in self.the_dict.items():
            out_dict[k] = v.get_pass_ks()
            if with_public:
                public_curve = v.get_pass_ks_given_public()
                assert public_curve is not None
                out_dict[(k[0], k[1], "public_filtered_" + k[2], k[3])] = public_curve

        if with_temp:
            return out_dict
        return {k[:3]: v for k, v in out_dict.items()}

    def get_pass_ks_stds(self) -> dict[tuple, Optional[np.ndarray]]:
        """Return each result's ``get_pass_ks_stds()`` value, keyed like ``the_dict``."""
        return {k: v.get_pass_ks_stds() for k, v in self.the_dict.items()}

    def get_num_pass_public(self) -> dict[tuple, np.ndarray]:
        """Return each result's ``get_num_pass_public()`` array, keyed like ``the_dict``.

        Raises:
            AssertionError: If any result returns ``None``.
        """
        out_dict = {}
        for k, v in self.the_dict.items():
            num_pass_public = v.get_num_pass_public()
            assert num_pass_public is not None
            out_dict[k] = num_pass_public
        return out_dict

    def get_diversities(self) -> dict[tuple, np.ndarray]:
        """Return diversity arrays for the results that have a diversity file.

        Results without an existing diversity file are omitted, so the
        returned mapping may have fewer entries than ``the_dict``.
        """
        return {
            k: v.get_diversities()
            for k, v in self.the_dict.items()
            if v.diversity_exists()
        }

# Root directory of the diversity outputs; passed to ResultSeries as the
# diversity directory (each run is expected to hold a "results.npy" there).
DIVER_DIR = "../../other_logs/similar_logs/final_logs"
# Root directory of the pass@k result files; passed to ResultSeries as the
# base directory.
BASE_DIR = "../../final_results"

# result_series.add_exps([Result("../../final_results/llama405bi", None, "livecodebench_lite_v3", "basic_prompting10", "llama405bi_fire")])
# result_series = ResultSeries("../../final_results/llama405bi", None,
#     ["livecodebench_lite_v3"],
#     ["llama405bi_fire", "llama"],
#     ["basic_prompting10"]
# )
# result_series.add_result_series(result_seriesd)

In [None]:
# Dataset key -> display name. Iteration order drives the per-dataset plot loops.
datasets = {
    "mbpp_plus": "MBPP+",
    "human_eval_plus": "HumanEval+",
    "livecodebench_lite_v3": "LiveCodeBench",
}

# Method key -> display name used in legends.
label_to_str = {
    "basic_prompting225": "Repeated Sampling",
    "simple_idea225": "IdeaSearch",
    "combo_observation_no": "PlanSearch",
}

# Model key -> display name. Key order also determines marker assignment in
# the diversity scatter plot.
model_to_str = {
    "gpt-4o-mini": "GPT-4o-mini",
    "gpt-4o": "GPT-4o",
    "deepseek-coder": "DeepSeek-Coder-V2",
    "sonnet-3-5": "Sonnet-3.5",
    "baby-deepseek-b_sgl": "DeepSeek-Coder-V2-Lite-Base",
    "baby-deepseek-i_sgl": "DeepSeek-Coder-V2-Lite-Instruct",
    "llama318b_sgl": "Llama-3.1-8B-Base",
    "llama318bi_sgl": "Llama-3.1-8B-Instruct",
    "llama3170b_sgl": "Llama-3.1-70B-Base",
    "llama3170bi_sgl": "Llama-3.1-70B-Instruct",
}

# Method key -> line/bar colour.
color_scheme = {
    'basic_prompting225': '#4DA6FF',    # Slightly darker blue
    'simple_idea225': '#A64DFF',        # Slightly darker purple
    'combo_observation_no': '#FF704D',  # Slightly darker orange
}

# NOTE(review): these two averages are not referenced in the visible cells;
# their unit and origin are not stated here -- confirm before relying on them.
BASIC_AVG = 244.05671435131
COMBO_AVG = 1427.981831636804

# Shared font sizes for the single-axes figures.
TITLE_FONT_SIZE = 25
TICK_FONT_SIZE = 16
AXIS_FONT_SIZE = 22
LEGEND_FONT_SIZE = 20
LEGEND_TITLE_FONT_SIZE = 22

# CoT

In [None]:
# Results for the chain-of-thought comparison: three datasets, four models and
# four methods (including "basic_prompting_cot225") at temperature 0.9.
# Combinations whose result path does not exist are skipped with a printed warning.
result_series = ResultSeries(BASE_DIR, DIVER_DIR, 
    ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
    ["gpt-4o-mini", "gpt-4o", "deepseek-coder", "sonnet-3-5"],
    ["basic_prompting225", "basic_prompting_cot225", "simple_idea225", "combo_observation_no"],
    temps=[0.9]
)

In [None]:
# Display names and colours for the four methods compared in the CoT figures.
label_to_str_cot = {
    "basic_prompting225": "Repeated Sampling",
    "basic_prompting_cot225": "Chain-of-Thought",
    "simple_idea225": "IdeaSearch",
    "combo_observation_no": "PlanSearch",
}
color_scheme_cot = {
    'basic_prompting225': '#4DA6FF',  # Slightly darker blue
    'simple_idea225': '#A64DFF',      # Slightly darker purple
    'basic_prompting_cot225': '#FFB84D',  # Slightly darker yellow
    'combo_observation_no': '#FF704D', # Slightly darker orange
}

use_public = False
# The x-axis stops at k=20 when public filtering is shown, k=200 otherwise.
MAX_X = 200 if not use_public else 20

# savefig does not create missing directories.
os.makedirs("plots", exist_ok=True)

# One 2x2 figure per dataset, one panel per model, one curve per method.
for dataset in datasets:
    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))[dataset]
    all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

    num_models = len(result_series.models)
    num_rows = 2
    num_cols = 2
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(14, 14), squeeze=False)
    if use_public:
        fig.suptitle(f'Pass@k vs k with CoT (Public Filtering) on {datasets[dataset]}', fontsize=24, fontweight='medium')
    else:
        fig.suptitle(f'Pass@k vs k with CoT on {datasets[dataset]}', fontsize=24, fontweight='medium')

    # Find the global y-axis limits so every panel shares the same scale.
    y_min, y_max = float('inf'), float('-inf')
    for model in result_series.models:
        final_data = all_pass_ks_by_model[model]
        for values in final_data.values():
            y_min = min(y_min, np.min(values[:MAX_X]))
            y_max = max(y_max, np.max(values[:MAX_X]))

    # Extend y_max slightly upward, but cap at 1
    y_max = min(y_max + 0.015, 1.0)

    for idx, model in enumerate(result_series.models):
        row = idx // num_cols
        col = idx % num_cols
        ax = axs[row, col]
        final_data = all_pass_ks_by_model[model]
        for label, values in final_data.items():
            to_plot = values[:MAX_X]
            # Size the x values from the data: a curve with fewer than MAX_X
            # points would otherwise make ax.plot fail on a shape mismatch.
            ks = np.arange(1, len(to_plot) + 1)

            is_public_filtered = 'public_filtered' in label[0]
            if is_public_filtered:
                use_label = label[0].split("public_filtered_")[1]
            else:
                use_label = label[0]

            str_label = use_label if use_label not in label_to_str_cot else label_to_str_cot[use_label]
            str_label = str_label if not is_public_filtered else "Public Filtered " + str_label

            # Determine color and linestyle
            color = color_scheme_cot.get(use_label, 'black')

            if use_public:
                # Filtered curves are emphasised; unfiltered ones are dashed and faded.
                linestyle = '-' if is_public_filtered else '--'
                lw = 2.3 if is_public_filtered else 1.8
                a = 0.95 if is_public_filtered else 0.5
            else:
                linestyle = '-'
                lw = 2.3
                a = 0.95

            ax.plot(ks, to_plot, label=str_label, linestyle=linestyle, linewidth=lw, color=color, alpha=a)

        # Only the bottom row gets an x label and only the left column a y label.
        if row == num_rows - 1:
            ax.set_xlabel('k', fontsize=18)
        ax.set_xscale('log')
        ax.set_xlim(1, MAX_X)
        if col == 0:
            ax.set_ylabel('Pass@k', fontsize=18)
        ax.set_title(f'{model_to_str[model]}', fontsize=21)
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.set_ylim(y_min, y_max)

        # Improve tick labels
        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

        # Add minor gridlines
        ax.xaxis.grid(True, which='minor', linestyle=':', alpha=0.35)
        ax.yaxis.grid(True, which='minor', linestyle=':', alpha=0.35)

    # Create a legend for the colors (drawn on the current axes).
    color_handles = [plt.Line2D([0], [0], color=color, lw=4, label=label_to_str_cot.get(label, label)) for label, color in color_scheme_cot.items()]
    first_legend = plt.legend(handles=color_handles, fontsize=16, loc='lower right', frameon=True)
    plt.gca().add_artist(first_legend)  # Keep the first legend when a second one is added

    if use_public:
        # Create a legend for the line styles
        line_handles = [
            plt.Line2D([0], [0], linestyle='-', color='black', linewidth=2.5, label='Public Filtering'),
            plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2, label='No Public Filtering', alpha=0.7)
        ]
        plt.legend(handles=line_handles, fontsize=16, loc='upper left', frameon=True)

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    if use_public:
        plt.savefig(f"plots/public_cot_pass_at_k_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    else:
        plt.savefig(f"plots/cot_pass_at_k_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    plt.show()

# Avg

In [None]:
# Results for the main comparison (no chain-of-thought method): three
# datasets, four models and three methods at temperature 0.9. This rebinds
# `result_series`, so the cells below use this collection.
result_series = ResultSeries(BASE_DIR, DIVER_DIR, 
    ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
    ["gpt-4o-mini", "gpt-4o", "deepseek-coder", "sonnet-3-5"],
    ["basic_prompting225", "simple_idea225", "combo_observation_no"],
    temps=[0.9]
)

In [None]:
use_public = True
# The x-axis stops at k=20 when public filtering is shown, k=200 otherwise.
MAX_X = 200 if not use_public else 20

# savefig does not create missing directories.
os.makedirs("plots", exist_ok=True)

# One figure per dataset: relative gain over Repeated Sampling pass@1,
# averaged across models, one curve per method.
for dataset in datasets:
    plt.figure(figsize=(13, 10))
    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))[dataset]
    all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

    # Method label -> list of per-model relative-gain curves (in percent).
    avg = {}
    for model in result_series.models:
        final_data = all_pass_ks_by_model[model]
        # Baseline for this model: pass@1 of unfiltered repeated sampling.
        starting_val = final_data[("basic_prompting225",)][0]
        for label, values in final_data.items():
            to_plot = values[:MAX_X]
            pass_k = (to_plot - starting_val) / starting_val * 100
            avg.setdefault(label[0], []).append(pass_k)

    for label, values in avg.items():
        is_public_filtered = 'public_filtered' in label
        if is_public_filtered:
            use_label = label.split("public_filtered_")[1]
        else:
            use_label = label

        # Average the per-model curves point-wise.
        pass_k = np.stack(values, axis=0).mean(axis=0)
        ks = np.arange(len(pass_k)) + 1

        str_label = use_label if use_label not in label_to_str else label_to_str[use_label]
        str_label = str_label if not is_public_filtered else "Public Filtered " + str_label

        color = color_scheme.get(use_label, 'black')

        if use_public:
            # Filtered curves are emphasised; unfiltered ones are dashed and faded.
            linestyle = '-' if is_public_filtered else '--'
            lw = 2.5 if is_public_filtered else 2
            a = 1 if is_public_filtered else 0.55
        else:
            linestyle = '-'
            lw = 2.5
            a = 1

        plt.plot(ks[:MAX_X], pass_k[:MAX_X], label=str_label, linestyle=linestyle, color=color, linewidth=lw, alpha=a)

    plt.xlabel('k', fontsize=AXIS_FONT_SIZE)
    plt.xscale('log')
    plt.ylabel('Relative Gain (%)', fontsize=AXIS_FONT_SIZE)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    if use_public:
        plt.title(f'Average Gains with Public Filtering on {datasets[dataset]}', fontsize=TITLE_FONT_SIZE)
    else:
        plt.title(f'Average Gains on {datasets[dataset]}', fontsize=TITLE_FONT_SIZE)

    # Create a legend for the colors
    color_handles = [plt.Line2D([0], [0], color=color, lw=4, label=label_to_str.get(label, label)) for label, color in color_scheme.items()]
    first_legend = plt.legend(handles=color_handles, fontsize=LEGEND_FONT_SIZE, loc='lower right', frameon=True,)
    plt.gca().add_artist(first_legend)  # Keep the first legend when a second one is added

    if use_public:
        # Create a legend for the line styles
        line_handles = [
            plt.Line2D([0], [0], linestyle='-', color='black', linewidth=2.5, label='Public Filtering'),
            plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2, label='No Public Filtering', alpha=0.7)
        ]
        plt.legend(handles=line_handles, fontsize=LEGEND_FONT_SIZE, loc='upper left', frameon=True)

    plt.grid(True, linestyle='--', alpha=0.7)
    plt.grid(which='minor', linestyle='--', linewidth=0.5, color='gray', alpha=0.25)
    plt.minorticks_on()
    plt.xlim(1, MAX_X)
    plt.tight_layout()
    if use_public:
        plt.savefig(f"plots/public_avg_perfimprovement_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    else:
        plt.savefig(f"plots/avg_perfimprovement_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    plt.show()

# Bar Chart

In [None]:
# Re-shape the flat {(dataset, model, method): curve} mapping into
# {dataset: {model: {method: curve}}}.
pass_ks_split = {}
for (dataset, model, method), value in result_series.get_pass_ks().items():
    pass_ks_split.setdefault(dataset, {}).setdefault(model, {})[method] = value
pass_ks = pass_ks_split

# The k at which the non-baseline bars are read off the curves.
K = 200
for dataset, models in pass_ks.items():
    for model, methods in models.items():
        # Reduce every curve to the scalar(s) shown in the bar chart.
        di = {}
        for k1, v1 in methods.items():
            if "simple_idea" in k1:
                # IdeaSearch is not shown in the bar chart.
                continue
            if "basic_prompting" not in k1:
                di[k1] = v1[K-1]
                continue
            # Repeated sampling contributes two bars: pass@1 and pass@K.
            baseline_curve = methods["basic_prompting225"]
            di["basic_prompting1"] = baseline_curve[0]
            di[f"basic_prompting{K}"] = baseline_curve[K-1]
        models[model] = di

def method_to_str(method: str, default_k: Optional[int] = None) -> str:
    """Return the display label for a method key.

    Args:
        method: Method key, e.g. ``"basic_prompting200"``.
        default_k: When given, the label gets an ``@k`` suffix. Keys starting
            with ``"basic_prompting"`` take ``k`` from the text following
            that prefix; other keys use ``default_k``.

    Returns:
        ``label_to_str[method]`` without a suffix when ``default_k`` is
        ``None``; otherwise ``"Repeated Sampling@<k>"`` or
        ``label_to_str[method] + "@<default_k>"``.
    """
    if default_k is not None and method.startswith("basic_prompting"):
        suffix = method.split("basic_prompting")[1]
        return f"Repeated Sampling@{suffix}"

    display = label_to_str[method]
    if default_k is None:
        return display
    return display + f"@{default_k}"

def method_to_color(method: str):
    """Return the bar colour for a display label such as ``"PlanSearch@200"``.

    Labels without a dedicated entry fall back to grey (``"#808080"``).
    """
    fallback = "#808080"
    palette = {
        "Repeated Sampling@1": '#4CAF50',  # Slightly darker green color
        "Repeated Sampling@200": color_scheme["basic_prompting225"],
        "PlanSearch@200": color_scheme["combo_observation_no"],
    }
    if method in palette:
        return palette[method]
    return fallback

# savefig does not create missing directories.
os.makedirs("plots", exist_ok=True)

# One grouped bar chart per dataset: a group per model, a bar per method.
for dataset, models in pass_ks.items():
    model_names = list(models.keys())
    # The first model's methods define the bars drawn for every model.
    methods = list(models[model_names[0]].keys())
    x = np.arange(len(model_names))
    width = 0.25

    fig, ax = plt.subplots(figsize=(12, 9))

    for position, method in enumerate(methods):
        values = [models[model][method] for model in model_names]
        bar_label = method_to_str(method, K)
        ax.bar(x + width * position, values, width, label=bar_label,
               color=method_to_color(bar_label), edgecolor='black', linewidth=0.4, alpha=0.81)

    ax.set_ylabel('Pass@k', fontsize=AXIS_FONT_SIZE, fontweight='medium')
    ax.set_title(f'Pass@k Scores by Method on {datasets[dataset]}', fontsize=TITLE_FONT_SIZE, fontweight='medium')
    # Centre each tick under its group regardless of how many bars it has
    # (for three bars this is x + width).
    ax.set_xticks(x + width * (len(methods) - 1) / 2)
    ax.set_xticklabels([model_to_str[model] for model in model_names], rotation=45, ha='right', fontsize=TICK_FONT_SIZE)
    ax.legend(loc='lower right', fontsize=LEGEND_FONT_SIZE*0.9, frameon=True, edgecolor='black', fancybox=True, framealpha=1.0)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.25)
    ax.spines['bottom'].set_linewidth(1.25)

    ax.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE, width=1.5, length=6)
    ax.tick_params(axis='both', which='minor', width=1, length=4)

    # Annotate every bar with its value.
    for rect in ax.patches:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., height,
                f'{height:.3f}',
                ha='center', va='bottom', fontsize=13, fontweight='bold')

    ax.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)
    plt.tight_layout()
    plt.savefig(f"plots/pass_k_scores_by_model_method_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    plt.show()

# Diversity

In [None]:
import matplotlib.patches as mpatches
# The k whose pass@k is compared against pass@1.
SELECTED_K = 200
marker_styles = ['d', 's', 'o', 'P', 'v', 'X', 'H', '8', 'd']  # Define different marker styles with uniform area
# Markers are assigned by position in model_to_str, cycling if there are more
# models than styles.
model_to_marker = {model: marker_styles[i % len(marker_styles)] for i, model in enumerate(model_to_str.keys())}

# savefig does not create missing directories.
os.makedirs("plots", exist_ok=True)

# One scatter plot per dataset: a point per (model, method) run.
for dataset in datasets:
    diversities = split_dict_by_datasets(result_series.get_diversities())[dataset]
    # Plot 1 - mean of the loaded array as the diversity score.
    diversities = {k: 1 - v.mean() for k, v in diversities.items()}

    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks())[dataset]
    # Relative gain from pass@1 to pass@SELECTED_K.
    all_pass_0s = {k: (v[SELECTED_K-1] - v[0]) / v[0] for k, v in all_pass_ks.items()}

    plt.figure(figsize=(12, 8), dpi=300)

    # Pair each diversity with its gain by (model, method) rather than by
    # position: get_diversities() omits runs without a diversity file, so a
    # positional zip would silently attach gains to the wrong runs.
    plotted_models = {}
    for key, diversity in diversities.items():
        model, method = key[0], key[1]
        if (model, method) not in all_pass_0s:
            continue
        plt.scatter(diversity, all_pass_0s[(model, method)], c=color_scheme[method], edgecolor='k', alpha=0.81, s=300, marker=model_to_marker[model], zorder=3)
        plotted_models[model] = None  # dict used as an ordered set

    # Create a legend for the colors (one entry per distinct method, in order).
    color_handles = [mpatches.Patch(color=color_scheme[label], label=label_to_str[label], alpha=0.7) for label in dict.fromkeys(result_series.methods)]

    # Create a legend for the marker styles
    marker_handles = [plt.Line2D([0], [0], marker=model_to_marker[model], color='w', markerfacecolor='w', markersize=14, label=model_to_str[model], markeredgecolor='k', markeredgewidth=1.5, alpha=0.7) for model in plotted_models]

    first_legend = plt.legend(handles=color_handles, fontsize=LEGEND_FONT_SIZE, loc='upper left', frameon=True)
    plt.gca().add_artist(first_legend)  # Keep the first legend when the second one is added
    plt.legend(handles=marker_handles, fontsize=LEGEND_FONT_SIZE, loc='lower right', frameon=True)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    plt.ylabel(f'Relative Gains (Pass@1 to Pass@{SELECTED_K})', fontsize=AXIS_FONT_SIZE, fontweight='medium')
    plt.xlabel('Idea Diversity', fontsize=AXIS_FONT_SIZE, fontweight='medium')
    plt.title(f'Idea Diversity vs Relative Gains from Search (on {datasets[dataset]})', fontsize=TITLE_FONT_SIZE, fontweight='medium')
    plt.grid(True, linestyle='--', alpha=0.6, zorder=0)

    plt.tight_layout()
    plt.savefig(f"plots/diversity_vs_improvement_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    plt.show()

# Offset (skip for now)

In [None]:
def split_into_public_private(d: dict[tuple[str, str], np.ndarray]) -> dict[tuple[str, str], np.ndarray]:
    """Pair every unfiltered curve with its public-filtered counterpart.

    Args:
        d: Mapping keyed by 2-tuples whose second element is a method name.
            For every key ``(a, method)`` without ``"public_filtered"`` in
            the method name, ``(a, "public_filtered_" + method)`` must also
            be present.

    Returns:
        Mapping from each unfiltered key to ``np.stack([unfiltered,
        filtered], axis=0)``.

    Raises:
        KeyError: If a public-filtered counterpart is missing.
    """
    CONSTANT = "public_filtered"

    # Collect all pairs first, then stack, so a missing counterpart is
    # reported before any stacking happens.
    pairs = {
        key: [curve, d[(key[0], CONSTANT + "_" + key[1])]]
        for key, curve in d.items()
        if CONSTANT not in key[1]
    }
    return {key: np.stack(pair, axis=0) for key, pair in pairs.items()}

# pp_data = split_into_public_private(split_dict_by_datasets(result_series.get_pass_ks(with_public=True))["human_eval_plus"])

# MAX_LEN = 10
# OFFSET = 2

# public_idx = np.arange(MAX_LEN) + 1
# private_idx = public_idx * OFFSET


# plt.figure(figsize=(9, 9))
# for label, values in pp_data.items():
#     linestyle = '-'
#     plt.plot(values[0, private_idx-1], values[1, public_idx-1], label=f'|'.join(label), linestyle=linestyle)
# x = np.linspace(0.3, 1, 40)
# y = x
# plt.plot(x, y, label='x = y', linestyle='--', color='red')


# plt.xlabel('private score')
# plt.ylabel('public score')
# plt.title(f'public vs private score')
# plt.legend(fontsize='small', loc='lower center')
# plt.grid(True)
# plt.show()

# 2x2

In [None]:
use_public = False
# The x-axis stops at k=20 when public filtering is shown, k=200 otherwise.
MAX_X = 200 if not use_public else 20

# plt.style.use('seaborn-whitegrid')
# colors = plt.cm.Set2(np.linspace(0, 1, 10))

# savefig does not create missing directories.
os.makedirs("plots", exist_ok=True)

# One 2x2 figure per dataset, one panel per model, one curve per method.
for dataset in datasets:
    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))[dataset]
    all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

    num_models = len(result_series.models)
    num_rows = 2
    num_cols = 2
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(14, 14), squeeze=False)
    if use_public:
        fig.suptitle(f'Pass@k vs k for Methods with Public Filtering on {datasets[dataset]}', fontsize=24, fontweight='medium')
    else:
        fig.suptitle(f'Pass@k vs k for Methods on {datasets[dataset]}', fontsize=24, fontweight='medium')

    # Find the global y-axis limits so every panel shares the same scale.
    y_min, y_max = float('inf'), float('-inf')
    for model in result_series.models:
        final_data = all_pass_ks_by_model[model]
        for values in final_data.values():
            y_min = min(y_min, np.min(values[:MAX_X]))
            y_max = max(y_max, np.max(values[:MAX_X]))

    # Extend y_max slightly upward, but cap at 1
    y_max = min(y_max + 0.015, 1.0)

    for idx, model in enumerate(result_series.models):
        row = idx // num_cols
        col = idx % num_cols
        ax = axs[row, col]
        final_data = all_pass_ks_by_model[model]
        for label, values in final_data.items():
            to_plot = values[:MAX_X]
            # Size the x values from the data: a curve with fewer than MAX_X
            # points would otherwise make ax.plot fail on a shape mismatch.
            ks = np.arange(1, len(to_plot) + 1)

            is_public_filtered = 'public_filtered' in label[0]
            if is_public_filtered:
                use_label = label[0].split("public_filtered_")[1]
            else:
                use_label = label[0]

            str_label = use_label if use_label not in label_to_str else label_to_str[use_label]
            str_label = str_label if not is_public_filtered else "Public Filtered " + str_label

            # Determine color and linestyle
            color = color_scheme.get(use_label, 'black')

            if use_public:
                # Filtered curves are emphasised; unfiltered ones are dashed and faded.
                linestyle = '-' if is_public_filtered else '--'
                lw = 2.3 if is_public_filtered else 1.8
                a = 0.95 if is_public_filtered else 0.5
            else:
                linestyle = '-'
                lw = 2.3
                a = 0.95

            ax.plot(ks, to_plot, label=str_label, linestyle=linestyle, linewidth=lw, color=color, alpha=a)

        # Only the bottom row gets an x label and only the left column a y label.
        if row == num_rows - 1:
            ax.set_xlabel('k', fontsize=18)
        ax.set_xscale('log')
        ax.set_xlim(1, MAX_X)
        if col == 0:
            ax.set_ylabel('Pass@k', fontsize=18)
        ax.set_title(f'{model_to_str[model]}', fontsize=21)
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.set_ylim(y_min, y_max)

        # Improve tick labels
        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

        # Add minor gridlines
        ax.xaxis.grid(True, which='minor', linestyle=':', alpha=0.35)
        ax.yaxis.grid(True, which='minor', linestyle=':', alpha=0.35)

    # Create a legend for the colors (drawn on the current axes).
    color_handles = [plt.Line2D([0], [0], color=color, lw=4, label=label_to_str.get(label, label)) for label, color in color_scheme.items()]
    first_legend = plt.legend(handles=color_handles, fontsize=16, loc='lower right', frameon=True)
    plt.gca().add_artist(first_legend)  # Keep the first legend when a second one is added

    if use_public:
        # Create a legend for the line styles
        line_handles = [
            plt.Line2D([0], [0], linestyle='-', color='black', linewidth=2.5, label='Public Filtering'),
            plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2, label='No Public Filtering', alpha=0.7)
        ]
        plt.legend(handles=line_handles, fontsize=16, loc='upper left', frameon=True)

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    if use_public:
        plt.savefig(f"plots/public_pass_at_k_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    else:
        plt.savefig(f"plots/pass_at_k_{dataset}.pdf", format="pdf", dpi=300, bbox_inches='tight')
    plt.show()

# Base v Instruct

In [None]:
# Results for the base-vs-instruct comparison: one dataset, four models and a
# single method, read from the "base_v_instruct" sub-directories. No temps are
# given, so ResultSeries falls back to its default of [0.9].
result_series = ResultSeries("../../final_results/base_v_instruct", "../../other_logs/similar_logs/final_logs/base_v_instruct",
    ["livecodebench_lite_v3"],
    ["baby-deepseek-b_sgl", "baby-deepseek-i_sgl", "llama318b_sgl", "llama318bi_sgl"],
    ["basic_prompting10000", ]
)

In [None]:
use_public = True  # Set this to True or False based on your requirement

# Display name of each model family -> name prefixes of the models plotted together.
model_groups = {
    "DeepSeek-Coder-V2-Lite": ["baby-deepseek-b", "baby-deepseek-i"],
    "Llama-3.1-8B": ["llama318b", "llama318bi"],
}
# Largest k shown on the x-axis (curves are truncated to this many points).
MAX_X = 10000 if not use_public else 1000
use_datasets = ['livecodebench_lite_v3']


for dataset in use_datasets:
    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))[dataset]
    all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

    # One figure per model group.
    for group_name, group_models in model_groups.items():
        plt.figure(figsize=(14, 9))
        plt.title(f'Pass@k vs k for {group_name} Models on {datasets[dataset]}', fontsize=TITLE_FONT_SIZE, fontweight='medium')

        # Models whose name starts with one of this group's prefixes, in result_series order.
        group_members = [
            model for model in result_series.models
            if any(model.startswith(group_model) for group_model in group_models)
        ]

        # Find the y-axis limits for the current group
        y_min, y_max = float('inf'), float('-inf')
        for model in group_members:
            for values in all_pass_ks_by_model[model].values():
                y_min = min(y_min, np.min(values[:MAX_X]))
                y_max = max(y_max, np.max(values[:MAX_X]))

        # Extend y_max slightly upward, but cap at 1
        y_max = min(y_max + 0.015, 1.0)

        # model -> colour index, assigned in first-plotted order.
        use_models = {}
        for model in group_members:
            final_data = all_pass_ks_by_model[model]
            for label, values in final_data.items():
                to_plot = values[:MAX_X]
                # Size the x values from the curve itself: a curve with fewer
                # than MAX_X points would otherwise make plt.plot raise on
                # mismatched x/y lengths.
                ks = np.arange(1, len(to_plot) + 1)

                is_public_filtered = 'public_filtered' in label[0]

                if model not in use_models:
                    use_models[model] = len(use_models)

                color = f"C{use_models[model]}"

                if use_public:
                    # Solid/bold for public-filtered curves, dashed/faint otherwise.
                    linestyle = '-' if is_public_filtered else '--'
                    lw = 3.5 if is_public_filtered else 2.4
                    a = 0.95 if is_public_filtered else 0.5
                else:
                    linestyle = '-'
                    lw = 3.5
                    a = 0.95

                plt.plot(ks, to_plot, label=f'{model_to_str[model]}', linestyle=linestyle, linewidth=lw, color=color, alpha=a)

        plt.xlabel('k', fontsize=AXIS_FONT_SIZE)
        plt.xscale('log')
        plt.xlim(1, MAX_X)
        plt.ylabel('Pass@k', fontsize=AXIS_FONT_SIZE)
        plt.grid(True, linestyle='--', alpha=0.7)
        # The limits stay infinite when no model matched the group; matplotlib
        # rejects non-finite limits, so keep its automatic range in that case.
        if np.isfinite(y_min) and np.isfinite(y_max):
            plt.ylim(y_min, y_max)

        # Improve tick labels
        plt.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE)
        plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

        # Add minor gridlines
        plt.gca().xaxis.grid(True, which='minor', linestyle=':', alpha=0.35)
        plt.gca().yaxis.grid(True, which='minor', linestyle=':', alpha=0.35)

        # Create a legend for the colors
        color_handles = [plt.Line2D([0], [0], color=f"C{use_models[model]}", lw=4, label=model_to_str[model]) for model in use_models]
        first_legend = plt.legend(handles=color_handles, fontsize=16, loc='lower right', frameon=True)
        # A second plt.legend call would replace this one unless it is re-added as an artist.
        plt.gca().add_artist(first_legend)

        if use_public:
            # Create a legend for the line styles
            line_handles = [
                plt.Line2D([0], [0], linestyle='-', color='black', linewidth=3.5, label='Public Filtering'),
                plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2.4, label='No Public Filtering', alpha=0.7)
            ]
            plt.legend(handles=line_handles, fontsize=16, loc='upper left', frameon=True)

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        if use_public:
            plt.savefig(f"plots/public_basevinstruct_big_{dataset}_{group_name}.pdf", format="pdf", dpi=300, bbox_inches='tight')
        else:
            plt.savefig(f"plots/basevinstruct_big_{dataset}_{group_name}.pdf", format="pdf", dpi=300, bbox_inches='tight')
        plt.show()

In [None]:
# Load the base-vs-instruct runs with 225 samples per problem across three datasets.
# NOTE(review): the positional arguments are presumably (results dir, logs dir,
# datasets, models, methods) -- confirm against the ResultSeries constructor.
result_series = ResultSeries(
    "../../final_results/base_v_instruct",
    "../../other_logs/similar_logs/final_logs/base_v_instruct",
    ["human_eval_plus", "mbpp_plus", "livecodebench_lite_v3"],
    [
        "baby-deepseek-b_sgl",
        "baby-deepseek-i_sgl",
        "llama318b_sgl",
        "llama318bi_sgl",
        "llama3170b_sgl",
        "llama3170bi_sgl",
    ],
    ["basic_prompting225"],
)

In [None]:
use_public = False  # Set this to True or False based on your requirement

# Display name of each model family -> name prefixes of the models plotted together.
model_groups = {
    "DeepSeek-Coder-V2-Lite": ["baby-deepseek-b", "baby-deepseek-i"],
    "Llama-3.1-8B": ["llama318b", "llama318bi"],
    "Llama-3.1-70B": ["llama3170b", "llama3170bi"]
}
use_datasets = datasets
# Largest k shown on the x-axis (curves are truncated to this many points).
MAX_X = 200 if not use_public else 20

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so axes[row_idx, col_idx] below is always valid.
fig, axes = plt.subplots(len(use_datasets), len(model_groups), figsize=(17, 17), squeeze=False)
fig.suptitle('Pass@k of Base and Instruct Models', fontsize=TITLE_FONT_SIZE, fontweight='medium')

# The pass@k table does not depend on the grid position, so compute it once
# instead of once per subplot.
pass_ks_by_dataset = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))

# Grid layout: one column per model group, one row per dataset.
for col_idx, (group_name, group_models) in enumerate(model_groups.items()):
    for row_idx, dataset in enumerate(use_datasets):
        ax = axes[row_idx, col_idx]
        all_pass_ks = pass_ks_by_dataset[dataset]
        all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

        # Find the y-axis limits for this dataset. The scan covers every model,
        # not only the current group, so all panels in a row share one y-range.
        y_min, y_max = float('inf'), float('-inf')
        for model in result_series.models:
            final_data = all_pass_ks_by_model[model]
            for values in final_data.values():
                y_min = min(y_min, np.min(values[:MAX_X]))
                y_max = max(y_max, np.max(values[:MAX_X]))

        # Extend y_max slightly upward, but cap at 1
        y_max = min(y_max + 0.015, 1.0)

        # model -> colour index, assigned in first-plotted order within the group.
        use_models = {}
        for model in result_series.models:
            if not any(model.startswith(group_model) for group_model in group_models):
                continue
            final_data = all_pass_ks_by_model[model]
            for label, values in final_data.items():
                to_plot = values[:MAX_X]
                # Size the x values from the curve itself: a curve with fewer
                # than MAX_X points would otherwise make ax.plot raise on
                # mismatched x/y lengths.
                ks = np.arange(1, len(to_plot) + 1)

                is_public_filtered = 'public_filtered' in label[0]

                if model not in use_models:
                    use_models[model] = len(use_models)

                color = f"C{use_models[model]}"

                if use_public:
                    # Solid/bold for public-filtered curves, dashed/faint otherwise.
                    linestyle = '-' if is_public_filtered else '--'
                    lw = 3.5 if is_public_filtered else 2.4
                    a = 0.95 if is_public_filtered else 0.5
                else:
                    linestyle = '-'
                    lw = 3.5
                    a = 0.95

                ax.plot(ks, to_plot, label=f'{model_to_str[model]}', linestyle=linestyle, linewidth=lw, color=color, alpha=a)

        ax.set_xscale('log')
        ax.set_xlim(1, MAX_X)
        ax.grid(True, linestyle='--', alpha=0.7)
        # matplotlib rejects non-finite limits; keep its automatic range when
        # no data was found.
        if np.isfinite(y_min) and np.isfinite(y_max):
            ax.set_ylim(y_min, y_max)

        # Improve tick labels
        ax.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE)
        ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

        # Add minor gridlines
        ax.xaxis.grid(True, which='minor', linestyle=':', alpha=0.35)
        ax.yaxis.grid(True, which='minor', linestyle=':', alpha=0.35)

        # Only the outer panels carry labels: dataset names down the left edge,
        # model-group names along the bottom edge.
        if col_idx == 0:
            ax.set_ylabel(f'{datasets[dataset]}\n\nPass@k', fontsize=AXIS_FONT_SIZE)
        if row_idx == len(use_datasets) - 1:
            ax.set_xlabel(f'{group_name}\n\nk', fontsize=AXIS_FONT_SIZE)

        # The legends are drawn once, in the bottom-right panel.
        if row_idx == len(use_datasets) - 1 and col_idx == len(model_groups) - 1:
            # Create a legend for the colors. Colour index 0 is labelled "Base"
            # and 1 "Instruct", which relies on the base model being listed
            # first within each group.
            BASE_INSTRUCT = ["Base Model", "Instruct Model"]
            color_handles = [plt.Line2D([0], [0], color=f"C{use_models[model]}", lw=4, label=BASE_INSTRUCT[use_models[model]]) for model in use_models]
            first_legend = ax.legend(handles=color_handles, fontsize=LEGEND_FONT_SIZE, loc='lower right', frameon=True)
            # A second ax.legend call would replace this one unless it is re-added as an artist.
            ax.add_artist(first_legend)

            if use_public:
                # Create a legend for the line styles
                line_handles = [
                    plt.Line2D([0], [0], linestyle='-', color='black', linewidth=3.5, label='Public Filtering'),
                    plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2.4, label='No Public Filtering', alpha=0.7)
                ]
                ax.legend(handles=line_handles, fontsize=LEGEND_FONT_SIZE, loc='upper left', frameon=True)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
if use_public:
    plt.savefig("plots/public_basevinstruct_combined.pdf", format="pdf", dpi=300, bbox_inches='tight')
else:
    plt.savefig("plots/basevinstruct_combined.pdf", format="pdf", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
use_public = False  # Set this to True or False based on your requirement

# Display name of each model family -> name prefixes of the models plotted together.
model_groups = {
    "DeepSeek-Coder-V2-Lite": ["baby-deepseek-b", "baby-deepseek-i"]
}
use_datasets = ["mbpp_plus"]  # Only use the mbpp+ dataset
# Largest k shown on the x-axis (curves are truncated to this many points).
MAX_X = 200 if not use_public else 20

for dataset in use_datasets:
    all_pass_ks = split_dict_by_datasets(result_series.get_pass_ks(with_public=use_public))[dataset]
    all_pass_ks_by_model = split_dict_by_datasets(all_pass_ks)

    # One figure per model group.
    for group_name, group_models in model_groups.items():
        plt.figure(figsize=(14, 9))
        plt.title(f'Pass@k vs k for {group_name} Models on {datasets[dataset]}', fontsize=TITLE_FONT_SIZE, fontweight='medium')

        # Models whose name starts with one of this group's prefixes, in result_series order.
        group_members = [
            model for model in result_series.models
            if any(model.startswith(group_model) for group_model in group_models)
        ]

        # Find the y-axis limits for the current group
        y_min, y_max = float('inf'), float('-inf')
        for model in group_members:
            for values in all_pass_ks_by_model[model].values():
                y_min = min(y_min, np.min(values[:MAX_X]))
                y_max = max(y_max, np.max(values[:MAX_X]))

        # Extend y_max slightly upward, but cap at 1
        y_max = min(y_max + 0.015, 1.0)

        # model -> colour index, assigned in first-plotted order.
        use_models = {}
        for model in group_members:
            final_data = all_pass_ks_by_model[model]
            for label, values in final_data.items():
                to_plot = values[:MAX_X]
                # Size the x values from the curve itself: a curve with fewer
                # than MAX_X points would otherwise make plt.plot raise on
                # mismatched x/y lengths.
                ks = np.arange(1, len(to_plot) + 1)

                is_public_filtered = 'public_filtered' in label[0]

                if model not in use_models:
                    use_models[model] = len(use_models)

                color = f"C{use_models[model]}"

                if use_public:
                    # Solid/bold for public-filtered curves, dashed/faint otherwise.
                    linestyle = '-' if is_public_filtered else '--'
                    lw = 3.5 if is_public_filtered else 2.4
                    a = 0.95 if is_public_filtered else 0.5
                else:
                    linestyle = '-'
                    lw = 3.5
                    a = 0.95

                plt.plot(ks, to_plot, label=f'{model_to_str[model]}', linestyle=linestyle, linewidth=lw, color=color, alpha=a)

        plt.xlabel('k', fontsize=AXIS_FONT_SIZE)
        plt.xscale('log')
        plt.xlim(1, MAX_X)
        plt.ylabel('Pass@k', fontsize=AXIS_FONT_SIZE)
        plt.grid(True, linestyle='--', alpha=0.7)
        # The limits stay infinite when no model matched the group; matplotlib
        # rejects non-finite limits, so keep its automatic range in that case.
        if np.isfinite(y_min) and np.isfinite(y_max):
            plt.ylim(y_min, y_max)

        # Improve tick labels
        plt.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE)
        plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

        # Add minor gridlines
        plt.gca().xaxis.grid(True, which='minor', linestyle=':', alpha=0.35)
        plt.gca().yaxis.grid(True, which='minor', linestyle=':', alpha=0.35)

        # Create a legend for the colors
        color_handles = [plt.Line2D([0], [0], color=f"C{use_models[model]}", lw=4, label=model_to_str[model]) for model in use_models]
        first_legend = plt.legend(handles=color_handles, fontsize=16, loc='lower right', frameon=True)
        # A second plt.legend call would replace this one unless it is re-added as an artist.
        plt.gca().add_artist(first_legend)

        if use_public:
            # Create a legend for the line styles
            line_handles = [
                plt.Line2D([0], [0], linestyle='-', color='black', linewidth=3.5, label='Public Filtering'),
                plt.Line2D([0], [0], linestyle='--', color='black', linewidth=2.4, label='No Public Filtering', alpha=0.7)
            ]
            plt.legend(handles=line_handles, fontsize=16, loc='upper left', frameon=True)

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        # NOTE: only the unfiltered variant of this figure is written to disk;
        # the public-filtered variant is shown but deliberately not saved.
        if not use_public:
            plt.savefig(f"plots/basevinstruct_{dataset}_{group_name}.pdf", format="pdf", dpi=300, bbox_inches='tight')
        plt.show()

# Temperature

In [None]:
# Load the gpt-4o-mini temperature sweep (temperatures 0.0 through 1.2 in steps
# of 0.1) for two methods. No second directory is supplied for this series.
result_series = ResultSeries(
    "../../final_results/temp_sweep/",
    None,
    ["livecodebench_lite_v3"],
    ["gpt-4o-mini"],
    ["default", "simple_idea"],
    temps=[
        0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
        0.7, 0.8, 0.9, 1.0, 1.1, 1.2,
    ],
)

In [None]:
import matplotlib
import matplotlib.colors as mcolors
import matplotlib.lines as mlines

# Display names for the two methods in this sweep (overrides label_to_str locally).
new_label_to_str = label_to_str | {"default": "Repeated Sampling", "simple_idea": "IdeaSearch"}
pks = result_series.get_pass_ks(with_temp=True)
# Peel off the dataset and model levels, leaving method -> {temperature key: pass@k curve}.
pks = split_dict_by_datasets(split_dict_by_datasets(split_dict_by_datasets(pks)["livecodebench_lite_v3"])["gpt-4o-mini"])

# The 'cool' colormap encodes temperature; normalise over the swept range.
cmap = matplotlib.colormaps.get_cmap('cool')
norm = mcolors.Normalize(vmin=min(result_series.temps), vmax=max(result_series.temps))

plt.figure(figsize=(13, 9))
# Length of the longest curve, used for the x-axis limit below.
max_k = 0
for method, temp_pks in pks.items():
    label = new_label_to_str[method]
    # The line style distinguishes the method; the colour distinguishes the temperature.
    linestyle = '-.' if label == 'Repeated Sampling' else '-'
    for temp, values in temp_pks.items():
        color = cmap(norm(temp))
        plt.plot(np.arange(len(values))+1, values, color=color, linestyle=linestyle, linewidth=2, alpha=0.7, label=f'Temp: {temp}')
        max_k = max(max_k, len(values))

plt.xlabel('k', fontsize=AXIS_FONT_SIZE)
plt.xscale("log")
plt.ylabel('Pass@k', fontsize=AXIS_FONT_SIZE)
plt.title('Temperature Effects on Pass@k (on LiveCodeBench)', fontsize=TITLE_FONT_SIZE)
# Set xlim from 1 to the longest curve. Relying on the loop variable left over
# from the last curve would clip longer curves and fail when nothing was plotted.
if max_k:
    plt.xlim(1, max_k)
plt.grid(True, which="both", linestyle='--', linewidth=0.5)
plt.xticks(fontsize=TICK_FONT_SIZE)
plt.yticks(fontsize=TICK_FONT_SIZE)

# Add legend for line styles (the per-curve labels are not shown; the colorbar
# conveys temperature instead).
repeated_sampling_line = mlines.Line2D([], [], color='black', linestyle='-.', label='Repeated Sampling', linewidth=2)
idea_search_line = mlines.Line2D([], [], color='black', linestyle='-', label='IdeaSearch', linewidth=2)
plt.legend(handles=[repeated_sampling_line, idea_search_line], fontsize=LEGEND_FONT_SIZE, loc='best')

# Add colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=plt.gca())
cbar.set_label('Temperature', fontsize=LEGEND_FONT_SIZE)
cbar.ax.tick_params(labelsize=LEGEND_FONT_SIZE)  # Increase colorbar font size

plt.tight_layout()
plt.savefig("plots/temperature_sweep0.0-1.2.pdf", format="pdf", dpi=300, bbox_inches='tight')
plt.show()

# Compute Norm

In [None]:
# Load the runs for the compute-normalised comparison: four models, two methods,
# all at temperature 0.9 on LiveCodeBench.
result_series = ResultSeries(
    BASE_DIR,
    DIVER_DIR,
    ["livecodebench_lite_v3"],
    [
        "gpt-4o-mini",
        "gpt-4o",
        "deepseek-coder",
        "sonnet-3-5",
    ],
    ["basic_prompting1200", "combo_observation_no"],
    temps=[0.9],
)

In [None]:
# Compute-normalised comparison: plot solve-rate against an estimated token
# budget instead of raw sample count k, for PlanSearch vs. repeated sampling.
pass_k_dict = result_series.get_pass_ks()
# Drop the dataset level, leaving model -> {(method,): pass@k curve}.
pass_k_dict = split_dict_by_datasets(split_dict_by_datasets(pass_k_dict)["livecodebench_lite_v3"])

# Upper bound on k used for the right-hand x-limit (in units of COMBO_AVG).
MAX_K = 200
# NOTE(review): COMBO_AVG / BASIC_AVG are defined elsewhere; presumably the
# average tokens per sample for each method -- confirm. This ratio is not used
# anywhere else in this cell.
COMBO_TO_BASIC_MULTIPLIER = COMBO_AVG / BASIC_AVG
BASIC_COL = "basic_prompting1200"
COMBO_COL = "combo_observation_no"

# Fixed colour per model so both of a model's curves share a hue.
model_colors = {
    "gpt-4o-mini": "#1f77b4",  # muted blue
    "gpt-4o": "#ff7f0e",       # safety orange
    "deepseek-coder": "#2ca02c", # cooked asparagus green
    "sonnet-3-5": "#d62728"    # brick red
}

plt.figure(figsize=(13, 10), dpi=300)
for model in pass_k_dict:
    combo_data = pass_k_dict[model][(COMBO_COL, )]
    basic_data = pass_k_dict[model][(BASIC_COL, )]
    # Sample counts 1..len(curve) for each method.
    combo_x = np.arange(len(combo_data)) + 1
    basic_x = np.arange(len(basic_data)) + 1

    combo_y = combo_data 
    basic_y = basic_data
    
    # Models missing from model_colors fall back to black.
    color = model_colors.get(model, "black")
    # Scale the sample counts by the per-method averages so both curves share one x-axis.
    plt.plot(combo_x * COMBO_AVG, combo_y, label=f'{model_to_str[model]} - PlanSearch', linestyle='-', color=color, linewidth=2)
    plt.plot(basic_x * BASIC_AVG, basic_y, label=f'{model_to_str[model]} - Repeated Sampling', linestyle='-.', color=color, linewidth=2, alpha=0.75)

# Left edge: one repeated-sampling sample; right edge: MAX_K PlanSearch samples.
plt.xlim(BASIC_AVG, MAX_K * COMBO_AVG)

plt.xlabel('Average Tokens Used (per problem)', fontsize=AXIS_FONT_SIZE, fontweight='medium')
plt.ylabel('Solve-rate', fontsize=AXIS_FONT_SIZE, fontweight='medium')
plt.xscale('log')

plt.title('Compute-Normalized Repeated Sampling vs PlanSearch', fontsize=TITLE_FONT_SIZE, fontweight='medium')

# Create a legend for the line styles
line_handles = [
    plt.Line2D([0], [0], linestyle='-', color='black', linewidth=2.5, label='PlanSearch'),
    plt.Line2D([0], [0], linestyle='-.', color='black', linewidth=2.5, label='Repeated Sampling')
]
first_legend = plt.legend(handles=line_handles, fontsize=LEGEND_FONT_SIZE, loc='upper left', frameon=True)
plt.gca().add_artist(first_legend)  # Add the first legend to the axes

# Create a legend for the colors (lists every entry of model_colors, whether or
# not that model appears in pass_k_dict).
color_handles = [plt.Line2D([0], [0], color=color, lw=4, label=model_to_str[model]) for model, color in model_colors.items()]
plt.legend(handles=color_handles, fontsize=LEGEND_FONT_SIZE, loc='lower right', frameon=True)

plt.grid(True, linestyle='--', linewidth=0.5)
# Prepend an explicit tick at ceil(BASIC_AVG) to the automatically chosen
# ticks, then keep only the ticks that lie inside the x-limits set above.
# This reads the current pyplot tick state, so it must run after xscale/xlim.
flat_basic_avg = int(np.ceil(BASIC_AVG))
ticks = [flat_basic_avg] + plt.xticks()[0].tolist()
tick_labels = [f'{flat_basic_avg}'] + plt.xticks()[1]
good_ticks = [i for i, x in enumerate(ticks) if x >= BASIC_AVG and x <= MAX_K * COMBO_AVG]
plt.xticks([ticks[i] for i in good_ticks], [tick_labels[i] for i in good_ticks], fontsize=TICK_FONT_SIZE)
plt.yticks(fontsize=TICK_FONT_SIZE)

# Restyle the major grid (this supersedes the earlier plt.grid call) and add a faint minor grid.
plt.grid(True, linestyle='--', alpha=0.7)
plt.grid(which='minor', linestyle='--', linewidth='0.5', color='gray', alpha=0.25)
plt.minorticks_on()

plt.tight_layout()

plt.savefig("plots/compute_normalized_plansearch.pdf", format='pdf', bbox_inches='tight', dpi=300)

plt.show()

# Backtranslate

In [None]:
# Result locations for the backtranslation sweep: the un-backtranslated baseline,
# one run per word budget, and a run with no word limit ("all").
paths_to_results = [
    f"../../temp_sweeps/backtranslate_{variant}"
    for variant in (
        "base",
        "5words",
        "10words",
        "25words",
        "35words",
        "50words",
        "75words",
        "100words",
        "125words",
        "150words",
        "175words",
        "200words",
        "250words",
        "500words",
        "750words",
        "all",
    )
]

from pathlib import Path

# Fail fast, before any plotting cell runs, if a sweep directory is missing.
for p in paths_to_results:
    assert Path(p).exists(), f"Path {p} doesn't exist!"

In [None]:
# Average completion length (in tokens) of the logged solutions for each
# non-baseline run, keyed by the run's results path.
avg_solution_length = {}

for path in paths_to_results:
    # The baseline run is excluded from the length statistics.
    if "base" in path:
        continue

    # The logs mirror the results layout under ../../logs.
    log_path = os.path.join("../../logs", path.replace("../../", ""))
    query_path = os.path.join(log_path, "queries")

    # Every matching file is reported; the last one listed is the one that is read.
    solution_path = None
    for entry in os.listdir(query_path):
        if not entry.startswith("solution"):
            continue
        solution_path = os.path.join(query_path, entry)
        print(f"Found solution file: {solution_path}")

    assert solution_path is not None

    with open(solution_path, "r") as handle:
        solutions = json.load(handle)

    token_counts = [entry["completion"]["num_tokens"] for entry in solutions]
    avg_solution_length[path] = sum(token_counts) / len(token_counts)

In [None]:
# For every run: the mean pass@k for k = 1..n (all_pass_ks[path][k]) and a
# scaled standard error of that mean (all_std[path][k - 1]).
all_pass_ks = {}
all_std = {}
for r in paths_to_results:
    # Each results file is decompressed and parsed once and reused for both statistics.
    print("Reading", r)
    items = gunzip_json_read(r)["items"]
    # Number of completions per problem, taken from the first problem.
    # NOTE(review): assumes every problem has the same number of results -- confirm.
    upper_k = len(items[0]["results"])

    pass_ks = {}
    for k in range(1, upper_k + 1):
        pass_ks[k] = np.mean(get_pass_ks(items, k))
    all_pass_ks[r] = pass_ks

    # Per-problem variance of the pass@k estimator, one row per problem and one
    # column per k. The local name deliberately avoids shadowing the builtin `vars`.
    per_problem_vars = []
    for item in items:
        num_correct = sum(res["passing"] for res in item["results"])
        per_problem_vars.append(
            [calcEstVar(upper_k, k, num_correct) for k in range(1, upper_k + 1)]
        )
    per_problem_vars = np.array(per_problem_vars)

    # Variance of the mean over problems is sum(var_i) / N^2; the factor 2.5
    # widens the resulting standard error.
    # NOTE(review): presumably 2.5 is a chosen confidence multiplier -- confirm.
    all_std[r] = np.sqrt(np.sum(per_problem_vars, axis=0) / len(items) ** 2) * 2.5

In [None]:
# Pass@1 and Pass@5 versus average solution length for the backtranslation
# sweep, with the un-backtranslated baseline drawn as horizontal reference lines.
plt.figure(figsize=(12, 8), dpi=300)

select_ones = [path for path in paths_to_results if "base" not in path]
base_path = [path for path in paths_to_results if "base" in path][0]

# (k, marker, colour) for each plotted series; a series and its baseline share a colour.
series_specs = (
    (1, 'o', '#1f77b4'),
    (5, 's', '#ff7f0e'),
)
curves = {}
for k, marker, series_color in series_specs:
    # Rows are [avg length, pass@k, std]; sorting orders the points by length.
    rows = sorted(
        [avg_solution_length[label], all_pass_ks[label][k], all_std[label][k - 1]]
        for label in select_ones
    )
    curves[k] = np.array(rows)
    plt.plot(curves[k][:, 0], curves[k][:, 1], linestyle='-', marker=marker, markersize=8,
             label=f'Pass@{k}', color=series_color, linewidth=2.5)
    plt.axhline(y=all_pass_ks[base_path][k], color=series_color, linestyle='--', linewidth=2.2,
                label=f'Baseline Pass@{k}')

plot_line, plot_line_pa5 = curves[1], curves[5]
baseline_pass_at_1 = all_pass_ks[base_path][1]
baseline_pass_at_5 = all_pass_ks[base_path][5]

plt.xlabel('Average Solution Token Length', fontsize=AXIS_FONT_SIZE)
plt.xscale('log')
plt.ylabel('Pass@k', fontsize=AXIS_FONT_SIZE)
plt.title('Effects of Backtranslation on Performance', fontsize=TITLE_FONT_SIZE)
plt.legend(fontsize=LEGEND_FONT_SIZE, frameon=True, fancybox=True, )
plt.grid(True, which='both', linestyle='--', alpha=0.3)

plt.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE)

# Pad the x-range by 5% on either side of the shortest and longest runs.
shortest, longest = min(plot_line[:, 0]), max(plot_line[:, 0])
plt.xlim([shortest*0.95, longest * 1.05])

# Error bars (disabled):
# plt.errorbar(plot_line[:, 0], plot_line[:, 1], yerr=plot_line[:, 2], fmt='none', ecolor='gray', alpha=0.5, capsize=3)
# plt.errorbar(plot_line_pa5[:, 0], plot_line_pa5[:, 1], yerr=plot_line_pa5[:, 2], fmt='none', ecolor='gray', alpha=0.5, capsize=3)

# Finalise the layout and write the figure out as a PDF.
plt.tight_layout()
plt.savefig('plots/backtranslation_performance.pdf', dpi=300, bbox_inches='tight', format="pdf")
# plt.savefig('plots/backtranslation_performance.png', dpi=300, bbox_inches='tight', format="png")
plt.show()

# Idea Variance

In [None]:
DATASET_NAME = "codegenning/F_livecodebench_lite_v2"
LOG_DIRECTORY = os.path.join("../../logs", "simple_idea/codet0.8")

# Per-idea results: one entry per idea group, each holding a "results" list.
json_path = os.path.join(LOG_DIRECTORY, "results_per_code_group.json")
assert Path(json_path).exists()
with open(json_path, "r") as f:
    data_per_idea = json.load(f)

# Regroup the per-idea results by problem. Idea group i is assigned to problem
# i % NUM_PROBS.
# NOTE(review): presumably the ideas are stored round-robin over the problems -- confirm.
NUM_PROBS = 226
data_per_problem = [{"results": []} for _ in range(NUM_PROBS)]
for idea_idx, idea_group in enumerate(data_per_idea):
    problem_bucket = data_per_problem[idea_idx % NUM_PROBS]
    problem_bucket["results"].extend(idea_group["results"])

In [None]:
# Keep only the "mid-difficulty" problems: those whose overall solve fraction
# lies strictly between 1% and 99%. Problems that are (almost) always or never
# solved are dropped.
mid_problems = []
for i, data_for_problem in enumerate(data_per_problem):
    results = data_for_problem["results"]
    # A problem with no results has no defined solve fraction; skip it rather
    # than divide by zero.
    if not results:
        continue
    fraction = sum(p["passing"] for p in results) / len(results)
    if 0.01 < fraction < 0.99:
        mid_problems.append(i)

# Idea groups belonging to the kept problems (idea i maps to problem i % NUM_PROBS).
# A set makes each membership test O(1) instead of a scan over the list.
mid_problem_set = set(mid_problems)
good_ideas_idx = [
    i for i in range(len(data_per_idea)) if i % NUM_PROBS in mid_problem_set
]

filtered_pp = [data_per_problem[i] for i in mid_problems]
filtered_pi = [data_per_idea[i] for i in good_ideas_idx]
print(len(mid_problems))

In [None]:
# Overlaid histograms of pass@1 solve rates, per problem and per idea, each
# normalised so that its bar heights sum to 1.
plt.figure(figsize=(13, 8), dpi=300)

# Pass@1 scores for the filtered idea groups and the filtered problems.
data_per_idea_pass_ks = get_pass_ks(filtered_pi, 1)
data_per_problem_pass_ks = get_pass_ks(filtered_pp, 1)

# Shared bin edges for both histograms: 15 edges, i.e. 14 equal-width bins on [0, 1].
bins = np.linspace(0, 1, 15)

# (scores, legend label, fill colour, edge colour), drawn in this order so the
# per-idea bars are overlaid on the per-problem bars.
histograms = (
    (data_per_problem_pass_ks, 'Per Problem', '#FF9999', '#CC0000'),
    (data_per_idea_pass_ks, 'Per Idea', '#66B2FF', '#004080'),
)
for solve_rates, legend_label, face_color, edge_color in histograms:
    # Equal weights of 1/N turn the counts into fractions of the sample.
    fractions = np.ones_like(solve_rates) / len(solve_rates)
    plt.hist(solve_rates, bins=bins, color=face_color, edgecolor=edge_color,
             alpha=0.7, label=legend_label, weights=fractions)

# Titles, axis labels, legend and tick sizes.
plt.title('Distribution of Solve Rates Conditioned on Idea', fontsize=TITLE_FONT_SIZE,)
plt.xlabel('Solve Rate', fontsize=AXIS_FONT_SIZE)
plt.ylabel('Frequency', fontsize=AXIS_FONT_SIZE)
plt.legend(loc='upper right', fontsize=LEGEND_FONT_SIZE)
plt.tick_params(axis='both', which='major', labelsize=TICK_FONT_SIZE)
# Leave a small margin outside [0, 1] so the outermost bars are not clipped.
plt.xlim(-0.021, 1.021)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# Optional statistics text box (disabled):
# stats_text = f"N(problems) = {len(data_per_problem_pass_ks)}\n"
# stats_text += f"N(ideas) = {len(data_per_idea_pass_ks)}"
# plt.text(0.05, 0.95, stats_text, transform=plt.gca().transAxes, 
#          verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Finalise the layout and write the figure out as a PDF.
plt.tight_layout()
plt.savefig('plots/idea_solve_rates_distribution.pdf', format='pdf', bbox_inches='tight')
plt.show()