In [3]:
import pandas as pd
from typing import List, Union
import itertools

import numpy as np
from collections import defaultdict
import os
import pandas as pd
import json

In [2]:
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    Args:
        num_samples: Number of generated samples per problem. Either a single
            integer (a Python ``int`` or a NumPy integer scalar) that applies
            to every problem, or a sequence with one entry per problem.
        num_correct: Number of passing samples per problem.
        k: The ``k`` in pass@k.

    Returns:
        A 1-D array holding one pass@k estimate per entry of ``num_correct``.

    Raises:
        AssertionError: If ``num_samples`` is a sequence whose length differs
            from that of ``num_correct``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        # With no passing sample, no draw of k samples can contain a pass.
        # Without this guard the `n - c < k` shortcut below would report 1.0
        # for c == 0 whenever n < k.
        if c <= 0:
            return 0.0
        # Fewer than k failing samples: every draw of k contains a pass.
        if n - c < k:
            return 1.0
        # Product form of comb(n - c, k) / comb(n, k); avoids huge factorials.
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # Treat NumPy integer scalars (e.g. np.int64) like a plain int; they are
    # not instances of `int` and have no len().
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(int(num_samples), len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

In [4]:
# Load the evaluation records: one JSON object per non-empty line of the file.
# The explicit encoding keeps decoding independent of the platform default, and
# blank lines (such as a trailing newline at the end of the file) are skipped
# because json.loads would raise on them.
with open('./Output/GPT3.5_Output_Py_1.0_result.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]



379

In [8]:
# Group the loaded records by the variant label embedded in their task id.
# A task id without any underscore goes under 'original'; otherwise the second
# underscore-separated piece of the id is used as the group key.
problem_mapping = defaultdict(list)
for record in data:
    # Deliberately not named `id`, which would shadow the builtin for every
    # later cell running in the same kernel.
    task_id_parts = record['task_id'].split('_')
    variant = 'original' if len(task_id_parts) == 1 else task_id_parts[1]
    problem_mapping[variant].append(record)

problem_mapping.keys()

dict_keys(['original', 'P8', 'P1', 'all', 'P6', 'P15', 'P4'])

In [12]:
# Number of per-record outcomes read below, from the keys 'result_0' up to
# 'result_{NUM_SAMPLES_PER_TASK - 1}'.
NUM_SAMPLES_PER_TASK = 20

# Report the mean pass@1 / pass@3 / pass@10 for every problem group.
for key, problems in problem_mapping.items():
    # Collect the boolean 'passed' flags per task id.
    results = defaultdict(list)
    for d in problems:
        for i in range(NUM_SAMPLES_PER_TASK):
            results[d['task_id']].append(d[f'result_{i}']['passed'])
    # Per task: how many samples were recorded and how many of them passed.
    total = [len(outcomes) for outcomes in results.values()]
    correct = [sum(outcomes) for outcomes in results.values()]
    pass_at_1 = estimate_pass_at_k(total, correct, 1)
    pass_at_3 = estimate_pass_at_k(total, correct, 3)
    pass_at_10 = estimate_pass_at_k(total, correct, 10)
    print(f'Problem type: {key}')
    print(f'len: {len(total)}')
    print(f'Pass@1: {np.mean(pass_at_1)}')
    print(f'Pass@3: {np.mean(pass_at_3)}')
    print(f'Pass@10: {np.mean(pass_at_10)}')

Problem type: original
len: 164
Pass@1: 0.4621951219512195
Pass@3: 0.7015350877192983
Pass@10: 0.8425229201441163
Problem type: P8
len: 147
Pass@1: 0.4571428571428572
Pass@3: 0.6786788399570355
Pass@10: 0.814716280328841
Problem type: P1
len: 27
Pass@1: 0.38888888888888895
Pass@3: 0.691812865497076
Pass@10: 0.9095812454945581
Problem type: all
len: 32
Pass@1: 0.2921875
Pass@3: 0.5006853070175439
Pass@10: 0.6566924754811752
Problem type: P6
len: 7
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 0.0
Problem type: P15
len: 1
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 0.0
Problem type: P4
len: 1
Pass@1: 0.15000000000000002
Pass@3: 0.4035087719298246
Pass@10: 0.8947368421052632
