In [1]:
import pandas as pd
import os
from sklearn.model_selection import KFold

In [3]:
# load the 3 benchmark datasets
datasets = ["libraries", "cedar", "dafnyVMC"]

benchmarks = {}
for dataset in datasets:
    print("Processing", dataset)
    if os.path.exists(f"../results/training_{dataset}.csv") and os.path.exists(
        f"../results/testing_{dataset}.csv"
    ):
        print(f"Training and testing datasets already exist for {dataset}")
        benchmarks[dataset] = {
            "training": pd.read_csv(f"../results/training_{dataset}.csv"),
            "testing": pd.read_csv(f"../results/testing_{dataset}.csv"),
        }
    else:
        df = pd.read_csv(f"../results/non_verified_{dataset}.csv")

        # remove duplicates
        df.drop_duplicates(subset=["Original Method", "Assertion"], inplace=True)

        # rewrite the path of "New Method File" but keep the same file name
        df["New Method File"] = df["New Method File"].apply(
            lambda x: os.path.join("/exp/dafny_repair/results/", os.path.basename(x))
        )

        # split the dataset into training and testing
        training = df.sample(frac=0.5)
        testing = df.drop(training.index)

        # split the dataset using k fold cross validation

        # save the datasets
        training.to_csv(f"../results/training_{dataset}.csv", index=False)
        testing.to_csv(f"../results/testing_{dataset}.csv", index=False)
        benchmarks[dataset] = {"training": training, "testing": testing}

Processing libraries
Training and testing datasets already exist for libraries
Processing cedar
Training and testing datasets already exist for cedar
Processing dafnyVMC
Training and testing datasets already exist for dafnyVMC


In [6]:
k = 3
benchmarks = {}
for dataset in datasets:
    print("Processing", dataset)
    all_training_files_exist = all(
        os.path.exists(f"../results/training_{dataset}_k{k}_{i}.csv")
        for i in range(0, k)
    )
    all_testing_files_exist = all(
        os.path.exists(f"../results/testing_{dataset}_k{k}_{i}.csv")
        for i in range(0, k)
    )

    if all_testing_files_exist and all_training_files_exist:
        print(f"Training and testing datasets already exist for {dataset}")
        benchmarks[dataset] = {
            "training": [
                pd.read_csv(f"../results/training_{dataset}_k{k}_{i}.csv")
                for i in range(0, k)
            ],
            "testing": [
                pd.read_csv(f"../results/testing_{dataset}_k{k}_{i}.csv")
                for i in range(0, k)
            ],
        }
        continue
    df = pd.read_csv(f"../results/non_verified_{dataset}.csv")

    # remove duplicates
    df.drop_duplicates(subset=["Original Method", "Assertion"], inplace=True)

    # rewrite the path of "New Method File" but keep the same file name
    df["New Method File"] = df["New Method File"].apply(
        lambda x: os.path.join("/exp/dafny_repair/results/", os.path.basename(x))
    )

    kf = KFold(n_splits=k)

    training_sets = []
    testing_sets = []

    for train_index, test_index in kf.split(df):
        training = df.iloc[train_index]
        testing = df.iloc[test_index]
        training_sets.append(training)
        testing_sets.append(testing)
        # training.to_csv(f"../results/training_{dataset}_k{k}_id{train_index}.csv", index=False)
        # testing.to_csv(f"../results/testing_{dataset}_k{k}_id{test_index}.csv", index=False)
    for i, (training, testing) in enumerate(zip(training_sets, testing_sets)):
        training.to_csv(f"../results/training_{dataset}_k{k}_{i}.csv", index=False)
        testing.to_csv(f"../results/testing_{dataset}_k{k}_{i}.csv", index=False)
    benchmarks[dataset] = {
        "training": training_sets,
        "testing": testing_sets,
    }

Processing libraries
Training and testing datasets already exist for libraries
Processing cedar
Training and testing datasets already exist for cedar
Processing dafnyVMC
Training and testing datasets already exist for dafnyVMC


In [7]:
# Pick 6 random examples from the training set
for dataset in datasets:
    print("Processing", dataset)
    training = benchmarks[dataset]["training"]
    testing = benchmarks[dataset]["testing"]

    all_training_fold_files_exist = all(
        os.path.exists(f"../results/training_{dataset}_k{k}_sample_{i}.csv")
        for i in range(0, k)
    )
    if all_training_fold_files_exist:
        print(f"Training fold datasets already exist for {dataset}")
        benchmarks[dataset]["training"] = [
            pd.read_csv(f"../results/training_{dataset}_k{k}_sample_{i}.csv")
            for i in range(0, k)
        ]
        continue

    # pick 6 random examples from the training set
    for i, training_fold in enumerate(training):
        fold = training_fold.sample(n=6)
        fold.to_csv(f"../results/training_{dataset}_k{k}_sample_{i}.csv", index=False)
        benchmarks[dataset]["training"][i] = fold

Processing libraries
Training fold datasets already exist for libraries
Processing cedar
Training fold datasets already exist for cedar
Processing dafnyVMC


In [8]:
k = 3
for dataset in datasets:
    print("Processing", dataset)
    all_training_files_exist = all(
        os.path.exists(f"../results/training_{dataset}_k{k}__{i}.csv")
        for i in range(0, k)
    )
    all_testing_files_exist = all(
        os.path.exists(f"../results/testing_{dataset}_k{k}__{i}.csv")
        for i in range(0, k)
    )

    if all_testing_files_exist and all_training_files_exist:
        print(f"Training and testing datasets already exist for {dataset}")
        benchmarks[dataset] = {
            "training": [
                pd.read_csv(f"../results/training_{dataset}_k{k}__{i}.csv")
                for i in range(0, k)
            ],
            "testing": [
                pd.read_csv(f"../results/training_{dataset}_k{k}__{i}.csv")
                for i in range(0, k)
            ],
        }
        continue
    df = pd.read_csv(f"../results/non_verified_{dataset}.csv")

    # remove duplicates
    df.drop_duplicates(subset=["Original Method", "Assertion"], inplace=True)

    # rewrite the path of "New Method File" but keep the same file name
    df["New Method File"] = df["New Method File"].apply(
        lambda x: os.path.join("/exp/dafny_repair/results/", os.path.basename(x))
    )

    kf = KFold(n_splits=k)

    training_sets = []
    testing_sets = []

    for train_index, test_index in kf.split(df):
        training = df.iloc[train_index]
        testing = df.iloc[test_index]
        training_sets.append(training)
        testing_sets.append(testing)

    for i, (training, testing) in enumerate(zip(training_sets, testing_sets)):
        testing.to_csv(f"../results/training_{dataset}_k{k}__{i}.csv", index=False)
        training.to_csv(f"../results/testing_{dataset}_k{k}__{i}.csv", index=False)
        print(f"training length: {len(testing)}")
        print(f"testing length: {len(training)}")

Processing libraries
Training and testing datasets already exist for libraries
Processing cedar
Training and testing datasets already exist for cedar
Processing dafnyVMC
Training and testing datasets already exist for dafnyVMC


In [9]:
df = benchmarks["libraries"]["training"][0]

In [10]:
# Pick 6 random examples from the training set
for dataset in datasets:
    print("Processing", dataset)
    training = benchmarks[dataset]["training"]
    testing = benchmarks[dataset]["testing"]

    all_training_fold_files_exist = all(
        os.path.exists(f"../results/training_{dataset}_k{k}__sample_{i}.csv")
        for i in range(0, k)
    )
    if all_training_fold_files_exist:
        print(f"Training fold datasets already exist for {dataset}")
        benchmarks[dataset]["training"] = [
            pd.read_csv(f"../results/training_{dataset}_k{k}__sample_{i}.csv")
            for i in range(0, k)
        ]
        continue

    # pick 6 random examples from the training set
    for i, training_fold in enumerate(training):
        print(f"training length: {len(training_fold)}")
        fold = training_fold.sample(n=5)
        print(
            fold.drop(columns=["Time Difference"])
            .isin(training_fold.drop(columns=["Time Difference"]))
            .all()
            .all()
        )
        fold.to_csv(f"../results/training_{dataset}_k{k}__sample_{i}.csv", index=False)
        benchmarks[dataset]["training"][i] = fold

Processing libraries
Training fold datasets already exist for libraries
Processing cedar
Training fold datasets already exist for cedar
Processing dafnyVMC
Training fold datasets already exist for dafnyVMC


In [11]:
benchmarks["libraries"]["training"][0].isin(df)

Unnamed: 0,Index,Original File,Original Method,Original Method Time,Original Method Result,Original Method File,Assertion,Time Difference,New Method File,New Method,New Method Time,New Method Result,New Result File,Assertion Tokens,Method Tokens
0,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
5,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False


In [4]:
content = {}
for dataset in datasets:
    if dataset == "cedar":
        df = pd.read_csv(f"../results/non_verified_{dataset}.csv")

        # remove duplicates
        df.drop_duplicates(subset=["Original Method", "Assertion"], inplace=True)

        # rewrite the path of "New Method File" but keep the same file name
        df["New Method File"] = df["New Method File"].apply(
            lambda x: os.path.join("/exp/dafny_repair/results/", os.path.basename(x))
        )
        if dataset == "cedar":
            df = df.sample(frac=0.5)
        content[dataset] = df

        if not os.path.exists(f"../results/placeholder_dataset/"):
            os.makedirs(f"../results/placeholder_dataset/")

        df.to_csv(f"../results/placeholder_dataset/{dataset}.csv", index=False)

(58, 13)