In [1]:
import json
import os

# Load the benchmark task records; `tasks` is read by the helpers below.
with open('data/codearena_instances.json') as f:
    tasks = json.load(f)

# Ids of all tasks, in file order. Used to recognise result folders by name.
instance_ids = [task['instance_id'] for task in tasks]

def get_task(instance_id):
    """
    Return the first entry of `tasks` whose 'instance_id' equals *instance_id*.

    Raises IndexError when no task carries that id.
    """
    matching = [task for task in tasks if task['instance_id'] == instance_id]
    return matching[0]

# Ids of the tasks whose instance id contains the substring 'youtube'.
yt_ids = [task['instance_id'] for task in tasks if 'youtube' in task['instance_id']]

def save_ids(id_list, filename):
    """
    Write the ids in *id_list* to *filename*, one per line.

    The file is overwritten; every id (including the last) is followed by a
    newline, so an empty list produces an empty file.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(f"{instance_id}\n" for instance_id in id_list)

def load_ids(filename):
    """
    Read instance ids from a text file holding one id per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped, so a blank or trailing empty line does
    not turn into a bogus '' id.

    filename: path of the id file (the format written by save_ids)

    Returns the ids as a list of strings, in file order.
    """
    with open(filename, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [instance_id for instance_id in stripped_lines if instance_id]

# Cap on the number of pylint messages kept per instance: passed as `max`
# when filtering pylint output and as `max_fixable` when scoring.
N_PYLINT_MESSAGES_SHOWN_TO_AGENT = 20

# Instance ids read from the chosen-instances file (one id per line).
chosen_ids = load_ids('data/chosen_sane_instances.txt')

In [2]:
# Pylint message ids that are always dropped by filter_pylint_output.
IGNORE_IDS = [
    'E0401',  # Unable to import
    'W0511',  # TODO
]

def filter_pylint_output(pylint_output, keep_types=('error', 'warning', 'fatal'), max=None):
    """
    Filter a pylint report down to the messages of interest.

    pylint_output: list of per-file dicts, each holding a 'messages' list
        whose entries have at least the keys 'type' and 'message-id'.
    keep_types: message types to keep (any container supporting `in`).
    max: cap on the total number of messages kept across ALL files; None or 0
        means no cap. (The name shadows the builtin inside this function; it
        is kept because callers pass it by keyword.)

    Returns a new list of per-file dicts; the input is not modified. In the
    result
      - messages whose type is not in keep_types, or whose id is listed in
        IGNORE_IDS, are removed;
      - every key ending in 'count' is removed from the per-file dict;
      - once the cap is reached, the remaining files are left out entirely.
    Files whose messages are all filtered away still appear, with an empty
    'messages' list.
    """
    output = []
    n_so_far = 0

    for file_output in pylint_output:
        # Shallow copy: 'messages' is rebound below, so the caller's dict and
        # its message list stay untouched.
        new_output = file_output.copy()
        messages = [
            message for message in file_output['messages']
            if (message['type'] in keep_types
                and message['message-id'] not in IGNORE_IDS)
        ]
        if max:
            # Only `max - n_so_far` slots are left. The previous check compared
            # `len(messages) - n_so_far` against max, which let the running
            # total overshoot the cap as soon as a second file was processed.
            messages = messages[:max - n_so_far]
        new_output['messages'] = messages

        n_so_far += len(messages)
        # get rid of the counts fields
        to_delete = [k for k in new_output if k.endswith('count')]
        for k in to_delete:
            del new_output[k]
        output.append(new_output)

        if max and n_so_far >= max:
            break

    return output

def get_filtered_pylint_output(pylint_output):
    """
    Apply the notebook's standard filter to a pylint report: keep warning,
    error and fatal messages, capped at N_PYLINT_MESSAGES_SHOWN_TO_AGENT.
    """
    shown_types = ['warning', 'error', 'fatal']
    return filter_pylint_output(
        pylint_output,
        keep_types=shown_types,
        max=N_PYLINT_MESSAGES_SHOWN_TO_AGENT,
    )

def add_pylint_feedback_to_tasks():
    """
    Attach the filtered gold pylint report to each task, in place.

    For every entry of the module-level `tasks` list, looks for that
    instance's pylint_errors.json under gc_results/sweb-style-review. When
    the file exists and parses, the filtered report is stored under
    task['style_review']. Tasks without a file are skipped silently; tasks
    whose file is not valid JSON are skipped after printing a notice.
    """
    for task in tasks:
        instance_id = task['instance_id']
        pylint_path = f'gc_results/sweb-style-review/{instance_id}/run_evaluation/style_check2/gold/{instance_id}_styleReview/pylint_errors.json'

        if not os.path.exists(pylint_path):
            continue

        try:
            with open(pylint_path, 'r') as f:
                pylint_output = json.load(f)
        except json.JSONDecodeError:
            # Referenced through the already-imported json module, so this
            # function does not rely on a separate
            # `from json.decoder import JSONDecodeError` having been executed.
            print(f'JSONDecodeError for {instance_id}')
            continue

        task['style_review'] = get_filtered_pylint_output(pylint_output)

def create_style_review_task(task, pylint_output_path):
    """
    Build the style-review variant of *task*.

    task: task dict; it is shallow-copied, so the caller's dict is not changed.
    pylint_output_path: path to the pylint_errors.json file

    Returns the copy, with 'problem_statement' replaced by the filtered pylint
    report pretty-printed as JSON between <pylint_output> tags.
    """
    sr_task = task.copy()

    with open(pylint_output_path, 'r') as report_file:
        raw_report = json.load(report_file)

    rendered_report = json.dumps(get_filtered_pylint_output(raw_report), indent=2)

    template = """<pylint_output>
{pylint_output}
</pylint_output>"""
    sr_task['problem_statement'] = template.format(pylint_output=rendered_report)

    return sr_task

from json.decoder import JSONDecodeError

def create_style_review_dataset(
    results_dir='/Users/simon/Downloads/sr2/sweb-style-review',
    output_path='data/sweagent_style_review_instances.json',
):
    """
    Build the style-review dataset and write it to disk as JSON.

    For every entry of the module-level `tasks` list whose gold
    pylint_errors.json exists under *results_dir*, a style-review task is
    created with create_style_review_task. Instances without a report are
    skipped silently; instances whose report is not valid JSON are skipped
    after printing a notice. The number of created tasks is printed.

    results_dir: root folder holding one sub-folder per instance id. The
        default is the machine-specific location this notebook was written
        against; pass your own path when running elsewhere.
    output_path: file the list of style-review tasks is written to.
    """
    sr_tasks = []
    for task in tasks:
        instance_id = task['instance_id']
        pylint_path = f'{results_dir}/{instance_id}/run_evaluation/style_check2/gold/{instance_id}_styleReview/pylint_errors.json'

        if not os.path.exists(pylint_path):
            continue

        try:
            sr_task = create_style_review_task(task, pylint_path)
        except json.JSONDecodeError:
            # Same exception class as json.decoder.JSONDecodeError, reached
            # through the module that is imported at the top of the notebook.
            print(f'JSONDecodeError for {instance_id}')
            continue
        sr_tasks.append(sr_task)

    print(len(sr_tasks), "style review tasks created")
    # write to file
    with open(output_path, 'w') as f:
        json.dump(sr_tasks, f, indent=2)

def calculate_sweagent_score(old_pylint_report, new_pylint_report, test_report=None, max_fixable=None):
    """
    Score how many pylint messages a patch resolved.

        score = tests_passed * (n_fixed / max_fixable), clamped to [0, 1]

    where n_fixed = old_n - new_n is the drop in the number of pylint messages.

    old_pylint_report, new_pylint_report: lists of per-file dicts, each with a
        'messages' list.
    test_report: optional dict with a 'resolved' entry; its value is used as
        tests_passed. When omitted (or empty), tests_passed is 1.
    max_fixable: the number of messages the model was asked to fix. When
        omitted (or 0), the number of messages in the old report is used.

    Returns (old_n, new_n, score), where old_n and new_n are the number of
    messages in the old and new reports. If there is nothing to fix (the old
    report is empty and no max_fixable is given) the score is 0.0 rather than
    a division by zero.
    """
    def n_messages(report):
        return sum(len(file['messages']) for file in report)

    old_n = n_messages(old_pylint_report)
    new_n = n_messages(new_pylint_report)

    tests_passed = test_report['resolved'] if test_report else 1
    max_fixable = max_fixable if max_fixable else old_n

    if not max_fixable:
        # Only reachable when old_n == 0, i.e. n_fixed <= 0: nothing could
        # have been resolved, so the clamped score is 0.
        return old_n, new_n, 0.

    n_resolved = old_n - new_n
    score = tests_passed * (n_resolved / max_fixable)
    score = max(0., min(1., score))

    return old_n, new_n, score

In [171]:
def _import_pylint_reports(path, run_id, subdir):
    """
    Shared loader behind import_sr_results_gold and import_sr_results_nongold.

    Scans the entries of *path*; for every entry whose name is a known
    instance id it tries to read
    {path}/{id}/run_evaluation/{run_id}/{subdir}/{id}_styleReview/pylint_errors.json.

    Returns a dict mapping instance id -> parsed JSON. Instances whose file is
    missing or is not valid JSON are left out without any message.
    """
    known_ids = set(instance_ids)  # constant-time membership for the folder filter
    sr_results = {}
    for folder in os.listdir(path):
        if folder not in known_ids:
            continue

        instance_id = folder
        results_path = f'{path}/{folder}/run_evaluation/{run_id}/{subdir}/{instance_id}_styleReview/pylint_errors.json'
        try:
            with open(results_path, 'r') as f:
                sr_results[instance_id] = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            # Best effort: an instance without a usable report is simply absent
            # from the result.
            continue

    return sr_results


def import_sr_results_gold(path, run_id):
    """
    Load the pylint reports stored under the 'gold' sub-folder of an
    evaluation run.

    path: folder containing one sub-folder per instance id
    run_id: name of the run folder inside each instance's run_evaluation folder

    Returns a dict mapping instance id -> parsed pylint_errors.json.
    """
    return _import_pylint_reports(path, run_id, 'gold')


def import_sr_results_nongold(path, run_id):
    """
    Load the pylint reports stored under the 'logs' sub-folder of an
    evaluation run.

    path: folder containing one sub-folder per instance id
    run_id: name of the run folder inside each instance's run_evaluation folder

    Returns a dict mapping instance id -> parsed pylint_errors.json.
    """
    return _import_pylint_reports(path, run_id, 'logs')

def import_sweagent_bugfixing_results(path, run_id):
    """
    Load the per-instance report.json files of an evaluation run.

    For every entry of *path* whose name is a known instance id, two candidate
    locations are tried:
        {path}/{id}/run_evaluation/{run_id}/logs/{id}/report.json
        {path}/{id}/logs/run_evaluation/{run_id}/logs/{id}/report.json
    Each report is expected to be a JSON object keyed by instance id; the
    value stored for the instance is report[id]. If both locations hold a
    usable report, the second one wins.

    Files that are missing, empty, not valid JSON, or that contain no entry
    for the instance are skipped.

    Returns a dict mapping instance id -> that instance's report entry.
    """
    known_ids = set(instance_ids)  # constant-time membership for the folder filter
    sr_results = {}
    for folder in os.listdir(path):
        if folder not in known_ids:
            continue

        instance_id = folder
        results_paths = [
            f'{path}/{folder}/run_evaluation/{run_id}/logs/{instance_id}/report.json',
            f'{path}/{folder}/logs/run_evaluation/{run_id}/logs/{instance_id}/report.json'
        ]

        for results_path in results_paths:
            if not (os.path.exists(results_path) and os.stat(results_path).st_size > 0):
                continue

            try:
                with open(results_path, 'r') as f:
                    report = json.load(f)
            except (FileNotFoundError, json.JSONDecodeError):
                continue

            if not isinstance(report, dict) or instance_id not in report:
                # A report without an entry for this instance is as unusable
                # as a missing file; skip it instead of aborting the import.
                continue

            sr_results[instance_id] = report[instance_id]

    return sr_results


def _unwrap_prediction(preds):
    """
    Return the prediction record carrying a non-null 'model_patch', or None.

    Some all_preds files nest the real record inside the outer 'model_patch'
    entry; in that case the inner dict is the record.
    """
    if not isinstance(preds, dict):
        return None

    patch = preds.get('model_patch')
    if isinstance(patch, dict):
        preds = patch
        patch = preds.get('model_patch')

    return None if patch is None else preds


def load_sweagent_results(path):
    """
    Collect the agent's prediction record for every known instance in *path*.

    For every entry of *path* whose name is a known instance id, the files
        {path}/{id}/all_preds.jsonl
        {path}/{id}/logs/all_preds.jsonl
    are tried in that order; the first one that yields a usable record wins.
    A file is skipped when it is missing, empty, not valid JSON, or when its
    'model_patch' is absent or null.

    Returns a dict mapping instance id -> prediction record (a dict with a
    non-null 'model_patch').
    """
    known_ids = set(instance_ids)  # constant-time membership for the folder filter
    results = {}
    # look at sweagent results
    for instance_id in os.listdir(path):
        if instance_id not in known_ids:
            continue

        possible_paths = [
            f'{path}/{instance_id}/all_preds.jsonl',
            f'{path}/{instance_id}/logs/all_preds.jsonl',
        ]
        for all_preds_path in possible_paths:
            if not (os.path.exists(all_preds_path) and os.stat(all_preds_path).st_size > 0):
                continue

            try:
                with open(all_preds_path, 'r') as f:
                    # Parsed as ONE JSON document: a file holding several
                    # JSONL lines fails to decode and is skipped.
                    preds = json.load(f)
            except json.JSONDecodeError:
                continue

            # The old code asserted that 'model_patch' is a dict before
            # testing it for None (making that test unreachable) and then
            # read an undefined name `pred`. The None / nesting handling now
            # lives in _unwrap_prediction.
            prediction = _unwrap_prediction(preds)
            if prediction is None:
                continue

            results[instance_id] = prediction
            break

    return results

In [137]:
# Gold pylint reports; instances that have one form `good_ids`.
sr_results = import_sr_results_gold('gc_results/sweb-style-review', run_id='style_check2')
print(len(sr_results))
good_ids = [iid for iid in instance_ids if iid in sr_results]

# Pylint reports produced after applying the agent's patches.
sweagent_sr_check_results = import_sr_results_nongold('gc_results/sweb-sweagent-sr-check', run_id='sweagent_sr_check')
print(len(sweagent_sr_check_results))

# report.json entries for the agent's patches.
sweagent_sr_bf_check_results = import_sweagent_bugfixing_results('gc_results/sweb-sweagent-sr-bf-check', run_id='sweagent_sr_bf_check')
print(len(sweagent_sr_bf_check_results))

# Agent predictions; split good_ids into "no prediction yet" and "done".
sweagent_results = load_sweagent_results('gc_results/sweb-sweagent-sr')
still_need_sr_agent_ids = [iid for iid in good_ids if iid not in sweagent_results]
sweagent_sr_done_ids = [iid for iid in good_ids if iid in sweagent_results]

print(len(still_need_sr_agent_ids), len(sweagent_sr_done_ids))
save_ids(still_need_sr_agent_ids, 'ids/still_need_sr_agent_ids.txt')
save_ids(sweagent_sr_done_ids, 'ids/sweagent_sr_done_ids.txt')

# Instances with a prediction but no post-patch pylint report yet.
still_need_sweagent_sr_check_ids = [
    iid for iid in good_ids
    if not (iid in sweagent_sr_check_results or iid in still_need_sr_agent_ids)
]
print(len(still_need_sweagent_sr_check_ids))
save_ids(still_need_sweagent_sr_check_ids, 'ids/still_need_sweagent_sr_check_ids.txt')

# Instances with a prediction but no report.json entry yet.
still_need_sweagent_sr_bf_check_ids = [
    iid for iid in good_ids
    if not (iid in sweagent_sr_bf_check_results or iid in still_need_sr_agent_ids)
]
print(len(still_need_sweagent_sr_bf_check_ids))
save_ids(still_need_sweagent_sr_bf_check_ids, 'ids/still_need_sweagent_sr_bf_check_ids.txt')

610
602
591
7 603
1
12


In [94]:
def save_sweagent_preds(sweagent_results, filename):
    """
    Write all predictions to one JSON Lines file.

    sweagent_results: mapping of instance id -> prediction record
    filename: output path (overwritten); one JSON document per line, in the
        mapping's iteration order.
    """
    with open(filename, 'w') as out_file:
        for record in sweagent_results.values():
            out_file.write(json.dumps(record) + "\n")

# save_sweagent_preds(sweagent_results, 'gc_results/sweagent_preds/sweagent_sr_all_preds.jsonl')

In [76]:
def save_overall_results():
    """
    Score every fully evaluated instance and dump the results to JSON.

    An instance is included when it is in good_ids and present in all of
    sweagent_sr_check_results, sweagent_sr_bf_check_results and
    sweagent_results. For each one the original and new pylint reports, the
    agent patch, the test report, the message counts and two scores (with and
    without the test report) are recorded under its instance id. The number of
    entries is printed and the mapping is written to
    sweagent_style_review_results.json.
    """
    required_sources = [sweagent_sr_check_results, sweagent_sr_bf_check_results, sweagent_results]
    # Variant that does not require the report.json results:
    # required_sources = [sweagent_sr_check_results, sweagent_results]
    have_all_ids = [iid for iid in good_ids if all(iid in source for source in required_sources)]

    overall_results = {}
    for instance_id in have_all_ids:
        original_style_review_report = sr_results[instance_id]
        sweagent_patch = sweagent_results[instance_id]
        new_style_review_report = sweagent_sr_check_results[instance_id]
        test_report = sweagent_sr_bf_check_results.get(instance_id)
        max_fixable = N_PYLINT_MESSAGES_SHOWN_TO_AGENT

        # Scored twice: once gated on the test report, once ignoring it.
        old_n, new_n, score = calculate_sweagent_score(
            original_style_review_report, new_style_review_report, test_report, max_fixable=max_fixable)
        old_n, new_n, score_ignore_resolved = calculate_sweagent_score(
            original_style_review_report, new_style_review_report, test_report=None, max_fixable=max_fixable)

        entry = {}
        entry['original_style_review_report'] = original_style_review_report
        entry['sweagent_patch'] = sweagent_patch
        entry['new_style_review_report'] = new_style_review_report
        entry['test_report'] = test_report
        entry['old_n_messages'] = old_n
        entry['new_n_messages'] = new_n
        entry['max_fixable'] = max_fixable
        # entry['resolved'] = test_report['resolved']
        entry['resolved'] = "N/A"
        entry['score'] = score
        entry['score_ignore_resolved'] = score_ignore_resolved
        entry['score_formula'] = f"1(resolved) * min(1, ((old_n_messages - new_n_messages) / max_fixable))"
        overall_results[instance_id] = entry

    print(len(overall_results))

    with open('sweagent_style_review_results.json', 'w') as f:
        json.dump(overall_results, f, indent=2)

In [77]:
# overall_score = sum([r['score'] for r in overall_results.values()]) / len(overall_results)
# overall_score_ignore_resolved = sum([r['score_ignore_resolved'] for r in overall_results.values()]) / len(overall_results)
# print(f"Overall score: {overall_score:.3f}")
# print(f"Overall score ignoring whether resolved: {overall_score_ignore_resolved:.3f}")

In [78]:
# with open('sweagent_style_review_results.json', 'r') as f:
#     overall_results = json.load(f)

In [79]:
# Re-read the chosen instance ids (same file as the load in the first cell).
chosen_ids = load_ids('data/chosen_sane_instances.txt')

In [140]:
def split_into_n(lst, n):
    """
    Cut *lst* into exactly *n* consecutive slices whose lengths differ by at
    most one; order is preserved and the longer slices come first.

    When n exceeds len(lst) the trailing slices are empty.
    Raises ValueError if n is not positive.
    """
    if n <= 0:
        raise ValueError("n must be positive")

    base_size, n_longer = divmod(len(lst), n)
    chunks = []
    cursor = 0
    for index in range(n):
        # the first `n_longer` chunks carry one extra element
        size = base_size + 1 if index < n_longer else base_size
        chunks.append(lst[cursor:cursor + size])
        cursor += size
    return chunks

In [141]:
def save_n_lists(id_list, n, filename):
    """
    Split *id_list* into *n* nearly equal parts and save part i to
    '{filename}{i}.txt' (i = 0 .. n-1), one id per line.
    """
    for part_index, part in enumerate(split_into_n(id_list, n)):
        save_ids(part, f'{filename}{part_index}.txt')

# 1. llama style review

In [174]:
# Load the predictions found under the llama style-review folder, print how
# many were found, and write them to a single JSONL file.
sweagent_results_llama = load_sweagent_results('gc_results/sweb-sweagent-sr-llama')
print(len(sweagent_results_llama))
save_sweagent_preds(sweagent_results_llama, 'gc_results/sweagent_preds/sweagent_sr_llama_all_preds.jsonl')

285


In [166]:
# Split the chosen ids by whether a llama prediction exists, then save both lists.
still_need_sr_agent_ids_llama = [iid for iid in chosen_ids if iid not in sweagent_results_llama]
sweagent_sr_done_ids_llama = [iid for iid in chosen_ids if iid in sweagent_results_llama]

print(len(still_need_sr_agent_ids_llama), len(sweagent_sr_done_ids_llama))
save_ids(still_need_sr_agent_ids_llama, 'ids/still_need_sr_agent_ids_llama.txt')
save_ids(sweagent_sr_done_ids_llama, 'ids/sweagent_sr_done_ids_llama.txt')

15 285


In [170]:
# Pylint reports read from the 'logs' sub-folder of the llama check run.
sweagent_sr_check_results_llama = import_sr_results_nongold('gc_results/sweb-sweagent-sr-check-llama', run_id='sweagent_sr_llama_check')
print(len(sweagent_sr_check_results_llama))

# report.json entries read from the llama bug-fix check run.
sweagent_sr_bf_check_results_llama = import_sweagent_bugfixing_results('gc_results/sweb-sweagent-sr-bf-check-llama', run_id='sweagent_sr_bf_llama_check')
print(len(sweagent_sr_bf_check_results_llama))

0
195


In [143]:
# Chosen ids that have a llama prediction but no post-patch pylint report yet.
still_need_sweagent_sr_check_ids_llama = [
    iid for iid in chosen_ids
    if not (iid in sweagent_sr_check_results_llama or iid in still_need_sr_agent_ids_llama)
]
print(len(still_need_sweagent_sr_check_ids_llama))
save_ids(still_need_sweagent_sr_check_ids_llama, 'ids/still_need_sweagent_sr_check_ids_llama.txt')

# Chosen ids that have a llama prediction but no report.json entry yet.
still_need_sweagent_sr_bf_check_ids_llama = [
    iid for iid in chosen_ids
    if not (iid in sweagent_sr_bf_check_results_llama or iid in still_need_sr_agent_ids_llama)
]
print(len(still_need_sweagent_sr_bf_check_ids_llama))
save_ids(still_need_sweagent_sr_bf_check_ids_llama, 'ids/still_need_sweagent_sr_bf_check_ids_llama.txt')

# Also save each list split into 4 numbered files.
save_n_lists(still_need_sweagent_sr_check_ids_llama, 4, 'ids/still_need_sweagent_sr_check_ids_llama')
save_n_lists(still_need_sweagent_sr_bf_check_ids_llama, 4, 'ids/still_need_sweagent_sr_bf_check_ids_llama')

285
285


# review addressal gemini

In [163]:
# Load the predictions found under the 'rf' folder, write them to a single
# JSONL file, and display how many were found.
sweagent_review_results = load_sweagent_results('gc_results/sweb-sweagent-rf')
save_sweagent_preds(sweagent_review_results, 'gc_results/sweagent_preds/sweagent_rf_all_preds.jsonl')
len(sweagent_review_results)

262

In [164]:
# Split the chosen ids by whether an 'rf' prediction exists, then save both lists.
still_need_rf_agent_ids = [iid for iid in chosen_ids if iid not in sweagent_review_results]
sweagent_rf_done_ids = [iid for iid in chosen_ids if iid in sweagent_review_results]

print(len(still_need_rf_agent_ids), len(sweagent_rf_done_ids))
save_ids(still_need_rf_agent_ids, 'ids/still_need_rf_agent_ids.txt')
save_ids(sweagent_rf_done_ids, 'ids/sweagent_rf_done_ids.txt')

# report.json entries read from the 'rf' check run.
sweagent_rf_check_results = import_sweagent_bugfixing_results('gc_results/sweb-sweagent-rf-check', run_id='sweagent_rf_check')
print(len(sweagent_rf_check_results))

# Chosen ids that have a prediction but no report.json entry yet.
still_need_sweagent_rf_check_ids = [
    iid for iid in chosen_ids
    if not (iid in sweagent_rf_check_results or iid in still_need_rf_agent_ids)
]
print(len(still_need_sweagent_rf_check_ids))
save_ids(still_need_sweagent_rf_check_ids, 'ids/still_need_sweagent_rf_check_ids.txt')

# Also save that list split into 4 numbered files.
save_n_lists(still_need_sweagent_rf_check_ids, 4, 'ids/still_need_sweagent_rf_check_ids')

38 262
251
11


# review addressal llama

In [150]:
# Load the predictions found under the llama 'rf' folder, print how many were
# found, and write them to a single JSONL file.
sweagent_review_results_llama = load_sweagent_results('gc_results/sweb-sweagent-rf-llama')
print(len(sweagent_review_results_llama))
save_sweagent_preds(sweagent_review_results_llama, 'gc_results/sweagent_preds/sweagent_rf_llama_all_preds.jsonl')

269


In [151]:
# Split the chosen ids by whether a llama 'rf' prediction exists.
still_need_rf_agent_ids_llama = [iid for iid in chosen_ids if iid not in sweagent_review_results_llama]
sweagent_rf_done_ids_llama = [iid for iid in chosen_ids if iid in sweagent_review_results_llama]

# The missing ids are saved both split into 4 numbered files and as one list.
save_n_lists(still_need_rf_agent_ids_llama, 4, 'ids/still_need_rf_agent_ids_llama')

print(len(still_need_rf_agent_ids_llama), len(sweagent_rf_done_ids_llama))
save_ids(still_need_rf_agent_ids_llama, 'ids/still_need_rf_agent_ids_llama.txt')
save_ids(sweagent_rf_done_ids_llama, 'ids/sweagent_rf_done_ids_llama.txt')

# Empty placeholder: the import of the check results is commented out below.
sweagent_rf_check_results_llama = []
# sweagent_rf_check_results_llama = import_sweagent_bugfixing_results('gc_results/sweb-sweagent-rf-check_llama', run_id='sweagent_rf_check_llama')
print(len(sweagent_rf_check_results_llama))

# Chosen ids that have a prediction but no check result yet.
still_need_sweagent_rf_check_ids_llama = [
    iid for iid in chosen_ids
    if not (iid in sweagent_rf_check_results_llama or iid in still_need_rf_agent_ids_llama)
]
print(len(still_need_sweagent_rf_check_ids_llama))
save_ids(still_need_sweagent_rf_check_ids_llama, 'ids/still_need_sweagent_rf_check_ids_llama.txt')

31 269
0
269
