In [1]:
import json
import os

from datasets import load_dataset
from swebench import (
    MAP_VERSION_TO_INSTALL,
    get_logs_eval,
)
from tqdm.auto import tqdm

## Updating SWE-bench from `conda` sweep results

In [2]:
# Load the SWE-bench test split, then index its entries by instance ID.
data = load_dataset('princeton-nlp/SWE-bench', split='test')
data_map = dict((entry['instance_id'], entry) for entry in data)

In [3]:
# Conda IDs covered by the validation sweep. Each ID has its own log
# subfolder under `folder`, named by replacing '.' and '-' with '_'.
conda_ids = [
    "py39_23.10.0-1",
    "py39_23.9.0-0",
    "py311_23.10.0-1",
    "py311_23.9.0-0",
]

* `folder` points at the location of the `test_202404` folder, which contains the `.log` files generated from running validation sweeps across different conda installation links.
* `folder_out` is an arbitrary folder for storing the validation results. The information, particularly how many instances are successfully re-created, is used to inform which conda link to use for which repo/version

In [4]:
# Location of the `test_202404` sweep logs. The default is a machine-specific
# absolute path; set the SWE_BENCH_VALIDATION_DIR environment variable to point
# at your own copy without editing the notebook.
folder = os.environ.get(
    "SWE_BENCH_VALIDATION_DIR",
    "/Users/johnbyang/Documents/Research/swe-bench/data/validation/test_202404",
)
# Directory that receives one `results_<conda id>.json` file per conda ID.
folder_out = "logs_per_conda"

<hr />

### Keep instances w/ 1+ F2P intact
* Iterate through all conda_ids
* Keep a task instance if at least one fail-to-pass (F2P) test is reproduced (i.e. it is `PASSED` in the log)
* If 1+ F2P is reproduced, keep any and all P2P that are reproduced successfully
* Save each experiment to a json, w/ `conda_id`, `new_dataset`, `ids_reproduce_fail`, `changed_f2p`, and `changed_p2p` fields

In [5]:
def survey_updated_logs(log_folder, dataset=None):
    """
    Compare each task instance's original F2P/P2P tests against the statuses
    parsed from its `<instance_id>.log` file in `log_folder`.

    Args:
        log_folder: Directory containing one `<instance_id>.log` file per instance.
        dataset: Optional iterable of dataset entries (mappings with `instance_id`,
            `version`, `FAIL_TO_PASS` and `PASS_TO_PASS` keys, the latter two being
            JSON-encoded lists). Defaults to the notebook-level `data`.

    Returns a 5-tuple:
        ids_no_log: List of (instance_id, version) for which no log exists (likely an installation error)
        ids_reproduce_fail: List of (instance_id, version) for which none of the original F2P tests passed
        new_dataset: List of updated dataset entries
        changed_f2p: List of (instance_id, version) for which the F2P tests have changed
        changed_p2p: List of (instance_id, version) for which the P2P tests have changed
    """
    entries = data if dataset is None else dataset

    ids_no_log = []
    ids_reproduce_fail = []
    new_dataset = []
    changed_f2p = []
    changed_p2p = []

    for d in tqdm(entries):
        instance_key = (d['instance_id'], d['version'])
        log_path = os.path.join(log_folder, f"{d['instance_id']}.log")
        if not os.path.exists(log_path):
            ids_no_log.append(instance_key)
            continue

        # Only the test-name -> status map is used; the second return value is ignored.
        status_map, _ = get_logs_eval(log_path)
        f2p_old = json.loads(d['FAIL_TO_PASS'])
        p2p_old = json.loads(d['PASS_TO_PASS'])
        # Set copy for O(1) membership checks; the list is kept for the sorted comparison below.
        f2p_old_set = set(f2p_old)

        # NOTE: Change to `all` to enforce f2ps must all exist
        tests_reproduced = any(status_map.get(f2p) == 'PASSED' for f2p in f2p_old)
        if not tests_reproduced:
            ids_reproduce_fail.append(instance_key)
            continue

        # Every passing test is either an original F2P test or counted as P2P.
        passed = [k for k, v in status_map.items() if v == 'PASSED']
        f2p_new = [k for k in passed if k in f2p_old_set]
        p2p_new = [k for k in passed if k not in f2p_old_set]

        if sorted(f2p_old) != sorted(f2p_new):
            changed_f2p.append(instance_key)
        if sorted(p2p_old) != sorted(p2p_new):
            changed_p2p.append(instance_key)

        # The new entry stores the test lists as plain lists (the source fields are JSON strings).
        new_dataset.append({
            **d,
            # NOTE: Comment out following line to maintain original tests
            'FAIL_TO_PASS': f2p_new,
            'PASS_TO_PASS': p2p_new,
        })
    return ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p

<hr />

### Identify Best Conda Link

Loop to determine how many instances were reproduced per conda ID. The results for each conda ID are stored to a `.json` file in the `folder_out` directory.

In [6]:
# Survey every conda ID's log folder and write the per-ID results to `folder_out`.
# Create the output directory up front so the first write cannot fail on a fresh checkout.
os.makedirs(folder_out, exist_ok=True)

map_folder_id_to_conda_id = {}
for conda_id in conda_ids:
    # Log folders and result files use the conda ID with '.' and '-' replaced by '_'.
    folder_id = conda_id.replace('.', '_').replace('-', '_')
    map_folder_id_to_conda_id[folder_id] = conda_id
    print(folder_id)

    log_folder = f"{folder}/{folder_id}"
    ids_no_log, ids_reproduce_fail, new_dataset, changed_f2p, changed_p2p = survey_updated_logs(log_folder)
    if not ids_no_log and not ids_reproduce_fail and not new_dataset:
        # Nothing was surveyed at all for this conda ID, so no results file is written.
        continue

    print(f"- # Reproduce Fail: {len(ids_reproduce_fail)}")
    print(f"- # New Dataset: {len(new_dataset)}")
    print(f"- # Changed (F2P): {len(changed_f2p)}")
    print(f"- # Changed (P2P): {len(changed_p2p)}")
    results = {
        # Stored in its folder-safe form; use `map_folder_id_to_conda_id` to recover the original ID.
        "conda_id": folder_id,
        "new_dataset": new_dataset,
        "ids_reproduce_fail": ids_reproduce_fail,
        "changed_f2p": changed_f2p,
        "changed_p2p": changed_p2p,
    }
    with open(f"{folder_out}/results_{folder_id}.json", "w") as f:
        json.dump(results, fp=f)

py39_23_10_0_1


  0%|          | 0/2294 [00:00<?, ?it/s]

- # Reproduce Fail: 4
- # New Dataset: 2290
- # Changed (F2P): 32
- # Changed (P2P): 242
py39_23_9_0_0


  0%|          | 0/2294 [00:00<?, ?it/s]

- # Reproduce Fail: 10
- # New Dataset: 230
- # Changed (F2P): 7
- # Changed (P2P): 72
py311_23_10_0_1


  0%|          | 0/2294 [00:00<?, ?it/s]

- # Reproduce Fail: 10
- # New Dataset: 232
- # Changed (F2P): 5
- # Changed (P2P): 73
py311_23_9_0_0


  0%|          | 0/2294 [00:00<?, ?it/s]

- # Reproduce Fail: 10
- # New Dataset: 229
- # Changed (F2P): 4
- # Changed (P2P): 67


<hr />

#### Determine which Miniconda installer URL successfully captures the most instances for each repo/version pair

In [7]:
def determine_which_conda_id_is_best(repo, version):
    """
    Given a repo and version, determine which conda_id recreates the most instances based on the number of instances
    from that repo/version combo in new_dataset.

    Reads `results_<folder id>.json` from `folder_out` for every entry of `conda_ids`;
    conda IDs without a results file are skipped.

    Returns:
        (max_count, conda_id_best): the highest instance count and the matching conda ID
        in its folder-safe form ('.' and '-' replaced by '_'), or (0, None) when no conda ID
        recreated any instance. On ties the earliest entry of `conda_ids` wins.
    """
    max_count, conda_id_best = 0, None
    for conda_id in conda_ids:
        folder_id = conda_id.replace('-', '_').replace('.', '_')
        results_path = os.path.join(folder_out, f"results_{folder_id}.json")
        if not os.path.exists(results_path):
            # A results file is only written when the sweep for this conda ID produced something.
            continue
        with open(results_path) as f:
            data_adjusted = json.load(f)
        count = sum(
            1
            for inst in data_adjusted['new_dataset']
            if inst['repo'] == repo and inst['version'] == version
        )
        # Strict comparison keeps the first conda ID that reaches the maximum.
        if count > max_count:
            max_count = count
            conda_id_best = folder_id
    return max_count, conda_id_best

In [8]:
# Pick, for every (repo, version) pair of the test set, the conda ID that recreates the most instances.
repo_version_to_conda_id = {}
total = 0
test_set_repo_version_pairs = {(x['repo'], x['version']) for x in data_map.values()}
# Built once here instead of on every loop iteration.
test_set_repos = set(data['repo'])

# Loop through all repo/version combos
for repo, v in MAP_VERSION_TO_INSTALL.items():
    if repo not in test_set_repos:
        # Do not proceed for repos that are not in test set
        continue

    repo_version_to_conda_id[repo] = {}
    for version in v.keys():
        if (repo, version) not in test_set_repo_version_pairs:
            # Do not proceed for (repo, version) pairs that are not in test set
            continue

        # Determine which conda_id is best for this repo/version combo
        max_count, conda_id_best = determine_which_conda_id_is_best(repo, version)
        if conda_id_best is not None:
            # Translate the folder-safe ID back to the original conda ID
            conda_id_best = map_folder_id_to_conda_id[conda_id_best]

        # Bookkeeping (log # of recreated instances and store the best conda_id)
        total += max_count
        repo_version_to_conda_id[repo][version] = conda_id_best
        print(repo, version, conda_id_best, max_count)

astropy/astropy 1.3 py39_23.10.0-1 11
astropy/astropy 3.0 py39_23.10.0-1 6
astropy/astropy 3.1 py39_23.10.0-1 5
astropy/astropy 4.2 py39_23.10.0-1 1
astropy/astropy 4.3 py39_23.10.0-1 11
astropy/astropy 5.0 py39_23.10.0-1 29
astropy/astropy 5.1 py39_23.10.0-1 23
astropy/astropy 5.2 py39_23.10.0-1 9
django/django 1.9 py39_23.10.0-1 1
django/django 1.10 py39_23.10.0-1 1
django/django 1.11 py39_23.10.0-1 4
django/django 2.0 py39_23.10.0-1 2
django/django 2.1 py39_23.10.0-1 2
django/django 2.2 py39_23.10.0-1 6
django/django 3.0 py39_23.10.0-1 137
django/django 3.1 py39_23.10.0-1 128
django/django 3.2 py39_23.10.0-1 157
django/django 4.0 py39_23.10.0-1 123
django/django 4.1 py39_23.10.0-1 124
django/django 4.2 py39_23.10.0-1 84
django/django 5.0 py39_23.10.0-1 81
matplotlib/matplotlib 3.5 py39_23.10.0-1 40
matplotlib/matplotlib 3.6 py39_23.10.0-1 40
matplotlib/matplotlib 3.7 py39_23.10.0-1 61
matplotlib/matplotlib 3.1 py39_23.10.0-1 2
matplotlib/matplotlib 3.2 py39_23.10.0-1 1
matplotlib/ma

Create dataset from the above repo/version-to-conda-ID map. Use the conda ID that recreates the most task instances for each repo/version pair.

In [9]:
# Collect, for every repo/version pair, the instances recreated by its best conda ID.
new_dataset_agg = []
# folder-safe conda ID -> its `new_dataset` list, so each results file is parsed only once
results_cache = {}
for repo, version_map in tqdm(repo_version_to_conda_id.items()):
    for version, conda_id in version_map.items():
        if conda_id is None:
            # No conda ID recreated any instance for this repo/version pair
            continue
        folder_id = conda_id.replace('.', '_').replace('-', '_')
        if folder_id not in results_cache:
            with open(f"{folder_out}/results_{folder_id}.json") as f:
                results_cache[folder_id] = json.load(f)['new_dataset']
        new_dataset_agg.extend(
            x for x in results_cache[folder_id]
            if x['repo'] == repo and x['version'] == version
        )

new_dataset_agg = sorted(new_dataset_agg, key=lambda x: x['instance_id'])
print(len(new_dataset_agg))

  0%|          | 0/12 [00:00<?, ?it/s]

2291


<hr />

#### Which task instances could not be recreated?

In [10]:
# Set subtraction between original test set and new dataset
original_keys = {(row['instance_id'], row['version']) for row in data}
recreated_keys = {(row['instance_id'], row['version']) for row in new_dataset_agg}
failing_inst_ids = sorted(original_keys - recreated_keys)

In [11]:
# Show failing instance IDs
failing_inst_ids

[('matplotlib__matplotlib-26399', '3.7'),
 ('sympy__sympy-11818', '1.0'),
 ('sympy__sympy-13865', '1.1')]