In [None]:
import os
import functools

import pandas as pd
import sastvd as svd
import sastvd.linevd.run as lvdrun
import sastvd.helpers.joern as svdj
from ray import tune
from tqdm.notebook import tqdm
from multiprocessing import cpu_count, Manager, Pool, Queue

USE_CPU = cpu_count()

In [1]:
def process_joern_parallel(joern_input, queue: Queue):
    func_str, dataset, iid = joern_input
    svdj.full_run_joern_from_string(func_str, dataset, iid)
    return iid

savedir = svd.get_dir(svd.cache_dir() / "minimal_datasets")
df = pd.read_parquet(
                savedir / f"minimal_bigvul_False.pq", engine="fastparquet"
            ).dropna()

funcs = df["before"].tolist()
datasets = df["dataset"].tolist()
iids = df["id"].tolist()

func_cnt = len(funcs)

joern_inputs = [(func_str, dataset, iid) for func_str, dataset, iid in zip(funcs, datasets, iids)]

with Manager() as m:
    message_queue = m.Queue()
    pool = Pool(USE_CPU)
    
    process_func = functools.partial(process_joern_parallel, queue=message_queue)

    done_iids = [
            iid
            for iid in tqdm(
                pool.imap_unordered(process_func, joern_inputs),
                desc=f"Functions",
                total=func_cnt,
            )
        ]

    message_queue.put("finished")
    pool.close()
    pool.join()

NameError: name 'Queue' is not defined

In [10]:
%writefile test_hyperparameter_generation.py
import os

import sastvd as svd
import sastvd.linevd.run as lvdrun
from ray import tune

os.environ["SLURM_JOB_NAME"] = "bash"

config = {
    "hfeat": tune.choice([512]),
    "embtype": tune.choice(["codebert"]),
    "stmtweight": tune.choice([1, 5, 10]),
    "hdropout": tune.choice([0.25, 0.3]),
    "gatdropout": tune.choice([0.15, 0.2]),
    "modeltype": tune.choice(["gat2layer"]),
    "gnntype": tune.choice(["gat"]),
    "loss": tune.choice(["ce"]),
    "scea": tune.choice([0.4, 0.5, 0.6]),
    "gtype": tune.choice(["pdg-raw"]),
    "batch_size": tune.choice([1024]),
    "multitask": tune.choice(["linemethod"]),
    "splits": tune.choice(
        [
            "crossproject_Chrome",
            "crossproject_linux",
            "crossproject_Android",
            "crossproject_ImageMagick",
            "crossproject_php-src",
            "crossproject_tcpdump",
            "crossproject_openssl",
            "crossproject_krb5",
            "crossproject_php",
            "crossproject_qemu",
        ]
    ),
    "lr": tune.choice([1e-3, 1e-4, 3e-4, 5e-4]),
}

if __name__ == "__main__":
    samplesz = -1
    run_id = svd.get_run_id()
    sp = svd.get_dir(svd.processed_dir() / f"raytune_crossproject_{samplesz}" / run_id)
    trainable = tune.with_parameters(lvdrun.train_linevd, samplesz=samplesz, savepath=sp)

    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 4, "gpu": 0},
        metric="val_loss",
        mode="min",
        config=config,
        num_samples=1000,
        name="tune_linevd",
        local_dir=sp,
        keep_checkpoints_num=2,
        checkpoint_score_attr="min-val_loss",
    )

0,1
Current time:,2023-03-15 19:12:44
Running for:,00:00:03.87
Memory:,8.0/15.3 GiB

Trial name,status,loc,batch_size,embtype,gatdropout,gnntype,gtype,hdropout,hfeat,loss,lr,modeltype,multitask,scea,splits,stmtweight
train_linevd_0795f_00000,RUNNING,10.136.104.29:182804,1024,codebert,0.2,gat,pdg-raw,0.25,512,ce,0.0001,gat2layer,linemethod,0.6,crossproject_krb5,1
train_linevd_0795f_00001,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.25,512,ce,0.0005,gat2layer,linemethod,0.4,crossproject_linux,10
train_linevd_0795f_00002,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.25,512,ce,0.0005,gat2layer,linemethod,0.5,crossproject_Im_c0d0,1
train_linevd_0795f_00003,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.25,512,ce,0.0003,gat2layer,linemethod,0.5,crossproject_qemu,1
train_linevd_0795f_00004,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.25,512,ce,0.0001,gat2layer,linemethod,0.5,crossproject_php-src,1
train_linevd_0795f_00005,PENDING,,1024,codebert,0.15,gat,pdg-raw,0.3,512,ce,0.0001,gat2layer,linemethod,0.4,crossproject_Android,1
train_linevd_0795f_00006,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.3,512,ce,0.0001,gat2layer,linemethod,0.5,crossproject_php,10
train_linevd_0795f_00007,PENDING,,1024,codebert,0.15,gat,pdg-raw,0.3,512,ce,0.0005,gat2layer,linemethod,0.6,crossproject_openssl,1
train_linevd_0795f_00008,PENDING,,1024,codebert,0.2,gat,pdg-raw,0.3,512,ce,0.0001,gat2layer,linemethod,0.6,crossproject_krb5,1
train_linevd_0795f_00009,PENDING,,1024,codebert,0.15,gat,pdg-raw,0.3,512,ce,0.0001,gat2layer,linemethod,0.4,crossproject_php-src,5


[2m[36m(pid=182804)[0m   rank_zero_deprecation(
2023-03-15 19:12:44,671	ERROR tune.py:794 -- Trials did not complete: [train_linevd_0795f_00000, train_linevd_0795f_00001, train_linevd_0795f_00002, train_linevd_0795f_00003, train_linevd_0795f_00004, train_linevd_0795f_00005, train_linevd_0795f_00006, train_linevd_0795f_00007, train_linevd_0795f_00008, train_linevd_0795f_00009, train_linevd_0795f_00010, train_linevd_0795f_00011, train_linevd_0795f_00012, train_linevd_0795f_00013, train_linevd_0795f_00014, train_linevd_0795f_00015, train_linevd_0795f_00016]
2023-03-15 19:12:44,673	INFO tune.py:798 -- Total run time: 3.89 seconds (3.86 seconds for the tuning loop).
[2m[36m(train_linevd pid=182804)[0m *** SIGSEGV received at time=1678932764 on cpu 6 ***
[2m[36m(train_linevd pid=182804)[0m PC: @     0x7f39d7aaa404  (unknown)  (unknown)
[2m[36m(train_linevd pid=182804)[0m     @     0x7f3b55c543c0  (unknown)  (unknown)
[2m[36m(train_linevd pid=182804)[0m     @ ... and at least 1