In [36]:
import pandas as pd
from typing import List, Union
import itertools

import numpy as np
from collections import defaultdict
import os
import pandas as pd
import json

In [37]:
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    Args:
        num_samples: Number of generated samples per problem. Either a single
            integer (a Python ``int`` or a NumPy integer scalar such as
            ``np.int64``) that applies to every problem, or a sequence with
            one entry per problem.
        num_correct: Number of passing samples per problem.
        k: The ``k`` in pass@k.

    Returns:
        A 1-D array holding one estimate per entry of ``num_correct``.

    Raises:
        AssertionError: If ``num_samples`` is a sequence whose length differs
            from that of ``num_correct``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        # Fewer than k failing samples: every k-subset contains a passing one.
        # Note this branch also returns 1.0 whenever k > n, even if c == 0.
        if n - c < k:
            return 1.0
        # Product form of comb(n - c, k) / comb(n, k); avoids large binomials.
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # NumPy integer scalars are not instances of `int`; without the explicit
    # np.integer check they would fall through to len() and raise TypeError.
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

In [38]:
# Load the evaluation results: each line of the JSONL file is parsed as one JSON document.
with open('./Output/CodeGen_Output_Py_0.6_result.jsonl') as f:
    data = list(map(json.loads, f))

In [39]:
# Group the evaluation records by problem variant.
# Each task_id is split on '_': the first piece is the base problem id and the
# second piece, when present, is used as the variant key.
problem_mapping = defaultdict(list)
problem_mapping_count = defaultdict(int)

# Pass 1: count how many records share each base problem id.
for d in data:
    problem_mapping_count[d['task_id'].split('_')[0]] += 1

# Pass 2: bucket every record (the counts from pass 1 must be complete first).
for d in data:
    splitted = d['task_id'].split('_')
    if len(splitted) == 1:
        # No '_' in the task_id: filed under 'original'.
        problem_mapping['original'].append(d)
    else:
        problem_mapping[splitted[1]].append(d)
        # 'all' collects the variant records whose base id occurs exactly twice
        # in `data` (presumably the original plus one variant -- TODO confirm).
        if problem_mapping_count[splitted[0]] == 2:
            problem_mapping['all'].append(d)

print(problem_mapping.keys())

dict_keys(['original', 'P8', 'all', 'P1', 'P6', 'P15', 'P4'])


In [40]:
# Each record stores its samples under the keys result_0 ... result_{N-1}.
NUM_SAMPLES_PER_PROBLEM = 20
# The k values reported for pass@k, in the order they are printed and stored.
PASS_AT_K_VALUES = (1, 3, 10)


def _passed_flags_by_task(records, num_samples=NUM_SAMPLES_PER_PROBLEM):
    """Return a dict mapping task_id to the list of its per-sample `passed` values."""
    flags_by_task = defaultdict(list)
    for record in records:
        for i in range(num_samples):
            flags_by_task[record['task_id']].append(record[f'result_{i}']['passed'])
    return flags_by_task


def _mean_pass_at_k(flags_by_task, ks=PASS_AT_K_VALUES):
    """Return the mean pass@k over all tasks, one value per k in `ks`."""
    total = [len(flags) for flags in flags_by_task.values()]
    correct = [sum(flags) for flags in flags_by_task.values()]
    return [np.mean(estimate_pass_at_k(total, correct, k)) for k in ks]


def _print_pass_at_k(means, ks=PASS_AT_K_VALUES):
    """Print one 'Pass@k: value' line per k."""
    for k_value, mean_value in zip(ks, means):
        print(f'Pass@{k_value}: {mean_value}')


# Index the original records by task_id once, so that pairing a variant with
# its original does not rescan the whole list for every task.
originals_by_task = defaultdict(list)
for original_record in problem_mapping.get('original', []):
    originals_by_task[original_record['task_id']].append(original_record)

# One row per problem group:
# [model, language, group key, number of problems, pass@1, pass@3, pass@10,
#  pair pass@1, pair pass@3, pair pass@10]
result_data = []
for key, problems in problem_mapping.items():
    results = _passed_flags_by_task(problems)
    group_means = _mean_pass_at_k(results)

    print(f'Problem type: {key}')
    print(f'len: {len(results)}')
    _print_pass_at_k(group_means)

    if key != 'original':
        # Collect the original record(s) whose task_id equals the part of each
        # variant task_id before the first '_', in the order of `results`.
        paired_originals = [
            original_record
            for task_id in results
            for original_record in originals_by_task.get(task_id.split('_')[0], ())
        ]
        pair_results = _passed_flags_by_task(paired_originals)
        print(f'pair len: {len(pair_results)}')

        pair_means = _mean_pass_at_k(pair_results)
        _print_pass_at_k(pair_means)
    else:
        # The original group has no counterpart to compare against.
        pair_means = [0, 0, 0]

    # The stored count is the size of this group itself (len(results)), not the
    # number of matched originals; the two differ when an original is missing.
    result_data.append(['CodeGen', 'Py', key, len(results), *group_means, *pair_means])
                


Problem type: original
len: 164
Pass@1: 0.10091463414634147
Pass@3: 0.22483953786906288
Pass@10: 0.4229945137924826
Problem type: P8
len: 147
Pass@1: 0.06904761904761907
Pass@3: 0.1548514142499105
Pass@10: 0.2967162573531437
pair len: 147
Pass@1: 0.09421768707482994
Pass@3: 0.20944026733500418
Pass@10: 0.3943917648030873
Problem type: all
len: 151
Pass@1: 0.06887417218543049
Pass@3: 0.1582084349947717
Pass@10: 0.30931836498440973
pair len: 151
Pass@1: 0.09304635761589405
Pass@3: 0.2075578017892413
Pass@10: 0.3931809686633052
Problem type: P1
len: 27
Pass@1: 0.07222222222222222
Pass@3: 0.17192982456140357
Pass@10: 0.3405131733305108
pair len: 27
Pass@1: 0.09074074074074076
Pass@3: 0.190448343079922
Pass@10: 0.3626590987272102
Problem type: P6
len: 7
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 0.0
pair len: 7
Pass@1: 0.05714285714285715
Pass@3: 0.13784461152882208
Pass@10: 0.25110570544007077
Problem type: P15
len: 1
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 0.0
pair len: 1
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 

In [41]:
import pandas as pd

# Column labels follow the layout of each row in `result_data`:
# [model, language, problem-type key, problem count, pass@1, pass@3, pass@10,
#  pair pass@1, pair pass@3, pair pass@10].
# 'Problem Type' must precede 'Total'; with the two labels swapped the CSV
# header named the problem-type column 'Total' and the count column 'Problem Type'.
df = pd.DataFrame(
    result_data,
    columns=[
        'Model', 'Language', 'Problem Type', 'Total',
        'Pass@1', 'Pass@3', 'Pass@10',
        'Pair Pass@1', 'Pair Pass@3', 'Pair Pass@10',
    ],
)
df.to_csv('./Output/CodeGen_Output_Py_0.6_result.csv', index=False)

In [42]:
# 1. Pairwise comparison
# 2. Fix with 'all'
# 3. Keep the completion as it is.
# 4. Starcoder
# 5. Java
# 6. Code clone.
# 7. Change the result for rq2.