In [8]:
import pandas as pd
from typing import List, Union
import itertools

import numpy as np
from collections import defaultdict
import os
import pandas as pd
import json

In [9]:
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    Args:
        num_samples: Number of generated samples per problem. Either a single
            integer (Python ``int`` or NumPy integer scalar) applied to every
            problem, or a sequence with one entry per problem.
        num_correct: Number of passing samples per problem.
        k: The ``k`` in pass@k.

    Returns:
        A 1-D array with one pass@k estimate per entry of ``num_correct``.

    Raises:
        AssertionError: If ``num_samples`` is a sequence whose length differs
            from ``num_correct``.

    Note:
        Whenever fewer than ``k`` samples are incorrect (``n - c < k``) the
        estimate is 1.0; this includes the case ``n < k``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        # Product form of comb(n - c, k) / comb(n, k); avoids huge binomials.
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # Treat NumPy integer scalars (e.g. np.int64) like a plain int: they have
    # no len(), so they must take the "same count for every problem" path.
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(int(num_samples), len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

In [10]:
# Load the evaluation results: one JSON object per line (JSON Lines).
# UTF-8 is set explicitly so decoding does not depend on the platform's
# locale default, and whitespace-only lines are skipped because json.loads
# would raise on them.
with open('./Output/GPT3.5_Output_Java_1.0_result.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [11]:
# Pass 1: count how many records share each base task id, i.e. the part of
# task_id before the first '_'.
problem_mapping = defaultdict(list)
problem_mapping_count = defaultdict(int)
for record in data:
    base_id = record['task_id'].split('_')[0]
    problem_mapping_count[base_id] += 1

# Pass 2: bucket the records.
#   - no '_' in task_id           -> 'original'
#   - otherwise                   -> bucket named by the second '_'-separated piece
#   - additionally, if the base id occurs exactly twice in `data`, the
#     suffixed record is also placed in the 'all' bucket.
for record in data:
    base_id, *suffixes = record['task_id'].split('_')
    if not suffixes:
        problem_mapping['original'].append(record)
        continue
    problem_mapping[suffixes[0]].append(record)
    if problem_mapping_count[base_id] == 2:
        problem_mapping['all'].append(record)

print(problem_mapping.keys())

dict_keys(['original', 'P13', 'all', 'P8', 'P6', 'P1', 'P15', 'P16'])


In [12]:
# Number of sampled completions stored per record (keys result_0 .. result_19).
SAMPLES_PER_TASK = 20


def _append_outcomes(outcomes, record):
    """Append the `passed` flag of every sampled completion in `record` to outcomes[task_id]."""
    for i in range(SAMPLES_PER_TASK):
        outcomes[record['task_id']].append(record[f'result_{i}']['passed'])


def _mean_pass_at_k(outcomes):
    """Return (mean pass@1, mean pass@3, mean pass@10) over the tasks in `outcomes`."""
    total = [len(flags) for flags in outcomes.values()]
    correct = [sum(flags) for flags in outcomes.values()]
    return tuple(np.mean(estimate_pass_at_k(total, correct, k)) for k in (1, 3, 10))


# Index the 'original' records by task_id once, so pairing a variant with its
# original is a dict lookup instead of a scan over every original record.
# `.get` is used instead of `problem_mapping['original']` because indexing a
# defaultdict with a missing key inserts it, which would raise RuntimeError
# if done while iterating over problem_mapping below.
originals_by_id = defaultdict(list)
for record in problem_mapping.get('original', []):
    originals_by_id[record['task_id']].append(record)

result_data = []
for key, problems in problem_mapping.items():
    # Per-task pass/fail flags for this problem type.
    results = defaultdict(list)
    for record in problems:
        _append_outcomes(results, record)

    num_problems = len(results)
    pass_at_1, pass_at_3, pass_at_10 = _mean_pass_at_k(results)
    print(f'Problem type: {key}')
    print(f'len: {num_problems}')
    print(f'Pass@1: {pass_at_1}')
    print(f'Pass@3: {pass_at_3}')
    print(f'Pass@10: {pass_at_10}')

    if key != 'original':
        # Collect the results of the original problems that correspond to the
        # variants in this bucket (matched on the text before the first '_').
        pair_results = defaultdict(list)
        for task_id in results:
            base_id = task_id.split('_')[0]
            for record in originals_by_id.get(base_id, ()):
                _append_outcomes(pair_results, record)

        print(f'pair len: {len(pair_results)}')

        pair_pass_at_1, pair_pass_at_3, pair_pass_at_10 = _mean_pass_at_k(pair_results)
        print(f'Pass@1: {pair_pass_at_1}')
        print(f'Pass@3: {pair_pass_at_3}')
        print(f'Pass@10: {pair_pass_at_10}')
        # Row layout: model, language, problem type, number of problems of
        # this type, variant pass@1/3/10, paired-original pass@1/3/10.
        # The count is taken from this bucket's own results (num_problems),
        # not from the paired originals.
        result_data.append(['GPT-3.5', 'Java', key, num_problems, pass_at_1, pass_at_3, pass_at_10, pair_pass_at_1, pair_pass_at_3, pair_pass_at_10])
    else:
        # The original bucket has no pair to compare against; pair columns are 0.
        result_data.append(['GPT-3.5', 'Java', key, num_problems, pass_at_1, pass_at_3, pass_at_10, 0, 0, 0])
                


Problem type: original
len: 161
Pass@1: 0.8760869565217392
Pass@3: 0.9449384330391197
Pass@10: 0.9612148855317519
Problem type: P13
len: 161
Pass@1: 0.8599378881987577
Pass@3: 0.9409502015909338
Pass@10: 0.9556892831223158
pair len: 161
Pass@1: 0.8760869565217391
Pass@3: 0.9449384330391195
Pass@10: 0.9612148855317519
Problem type: all
len: 161
Pass@1: 0.7127329192546584
Pass@3: 0.7613272311212815
Pass@10: 0.769873450012096
pair len: 161
Pass@1: 0.8760869565217392
Pass@3: 0.9449384330391195
Pass@10: 0.9612148855317519
Problem type: P8
len: 67
Pass@1: 0.5152985074626867
Pass@3: 0.5469096823574434
Pass@10: 0.5521579409454277
pair len: 67
Pass@1: 0.8126865671641792
Pass@3: 0.9128829536527887
Pass@10: 0.9441142702775714
Problem type: P6
len: 6
Pass@1: 0.0
Pass@3: 0.0
Pass@10: 0.0
pair len: 6
Pass@1: 0.875
Pass@3: 0.975877192982456
Pass@10: 0.9999900770024609
Problem type: P1
len: 21
Pass@1: 0.8833333333333332
Pass@3: 0.9502506265664159
Pass@10: 0.9523809523809523
pair len: 21
Pass@1: 0.8999

In [13]:
import pandas as pd
# Column labels must follow the row layout built in result_data:
# [model, language, problem type, problem count, pass@1/3/10, pair pass@1/3/10].
# 'Problem Type' therefore comes before 'Total'; the reverse order labels the
# problem-type strings as 'Total' and the counts as 'Problem Type'.
df = pd.DataFrame(result_data, columns=['Model','Language','Problem Type', 'Total', 'Pass@1', 'Pass@3', 'Pass@10', 'Pair Pass@1', 'Pair Pass@3', 'Pair Pass@10'])
df.to_csv('./Output/GPT3.5_Output_Java_1.0_result.csv', index=False)

In [14]:
# 1. Pairwise comparison
# 2. Fix with 'all'
# 3. Keep the completion as it is.
# 4. Starcoder
# 5. Java
# 6. Code clone.
# 7. Change the result for rq2.