In [None]:
%load_ext autoreload
%autoreload 2
from common import load_data, split_data
import supervised
import active
import pandas as pd
import itertools
import os.path as osp
import math

In [2]:
data_dir = "../data"
output_dir = "run_results"
dataset_sizes = [None, 1, 5, 0.01, 0.1, 0.25, 0.5, 0.75]
selection_methods = ["random", "kmeans", "submodular"]
learning_methods = supervised.methods
random_seeds = [i*100 for i in range(1, 6)]
tune_settings = [False]

In [3]:
full_data = load_data(data_dir)

In [4]:
systems = list(full_data["performance_properties"].keys())
inputs_feat_cols = full_data["feature_columns"]
data = full_data["data"]
commands_per_system = dict()
for s in systems:
    print(s)
    num_inputs = full_data["input_counts"][s]
    num_configs = len(
        pd.concat(
            (data[s, i][inputs_feat_cols[s]] for i in range(num_inputs))
        ).drop_duplicates()
    )

    avail_inputs = int(num_inputs * 0.9)
    avail_configs = int(num_configs * 0.9)

    selected_inputs = set(
        [
            math.ceil(avail_inputs * ds) if isinstance(ds, float) else ds
            for ds in dataset_sizes
        ]
    )
    selected_configs = set(
        [
            math.ceil(avail_configs * ds) if isinstance(ds, float) else ds
            for ds in dataset_sizes
        ]
    )
    print(s, "num. inputs", selected_inputs, "num. configs", selected_configs)
    # This should confirm the numbers, but adds ~2 minutes to execution
    # _, _, train_inp, train_cfg = split_data(data, s, full_data["input_counts"], inputs_feat_cols, 1)
    # print(f"{s}\t{avail_inputs}\t{avail_configs}\t{len(train_inp)}\t{len(train_cfg)}")
    performance_properties = full_data["performance_properties"][s]
    run_name = f"sl_{s}"

    commands_per_system[s] = []

    for seed, method, num_inp, num_cfg, sel_inp, sel_cfg, perf_property, tune in itertools.product(
        random_seeds,
        learning_methods,
        selected_inputs,
        selected_configs,
        selection_methods,
        selection_methods,
        performance_properties,
        tune_settings
    ):
        if method == "automl":
            continue

        if num_inp is None and sel_inp != "kmeans":
            continue

        if num_cfg is None and sel_cfg != "kmeans":
            continue

        if num_cfg == 1 and num_inp == 1:
            continue

        identifier = supervised.get_identifier(
            run_name=run_name,
            system=s,
            method=method,
            num_inputs=num_inp,
            input_selection=sel_inp,
            num_configs=num_cfg,
            config_selection=sel_cfg,
            performance_property=perf_property,
            seed=seed,
            tune=tune,
        )

        if osp.exists(osp.join("..", output_dir, identifier, "eval_metrics.json")):
            # Run already completed
            continue
        
        if num_inp is not None:
            ni = f"-ni {num_inp} -is {sel_inp}"
        else:
            ni = ""

        if num_cfg is not None:
            nc = f"-nc {num_cfg} -cs {sel_cfg}"
        else:
            nc = ""

        commands_per_system[s].append(
            f"python src/supervised.py {run_name} {s} {method} -o {output_dir} -pp {perf_property} {ni} {nc} --seed {seed}"
        )

    print(f"{s} Missing {len(commands_per_system[s])} runs")

total_runs_missing = sum([len(cmds) for cmds in commands_per_system.values()])
print(f"Missing in total {total_runs_missing} runs")

gcc
gcc num. inputs {1, 3, 5, 7, None, 14, 21} num. configs {1, 36, 5, 8, None, 18, 54}
gcc Missing 95 runs
imagemagick
imagemagick num. inputs {1, 225, 450, 675, 5, 9, None, 90} num. configs {1, 68, 5, 9, 45, None, 23}
imagemagick Missing 4096 runs
lingeling
lingeling num. inputs {32, 1, 4, 5, 237, None, 79, 158} num. configs {1, 68, 5, 9, 45, None, 23}
lingeling Missing 275 runs
nodejs
nodejs num. inputs {1, 5, 869, 174, None, 18, 435, 1304} num. configs {1, 34, 5, 12, None, 23}
nodejs Missing 1624 runs
poppler
poppler num. inputs {1, 5, 134, 999, 333, None, 14, 666} num. configs {1, 2, 4, 5, 7, 11, None}
poppler Missing 16360 runs
sqlite
sqlite num. inputs {1, 2, 34, 68, 5, 102, None, 14} num. configs {1, 2, 4, 5, 6, None}
sqlite Missing 7545 runs
x264
x264 num. inputs {1, 290, 579, 5, 869, 12, None, 116} num. configs {1, 2, 5, 135, 45, None, 18, 90}
x264 Missing 4765 runs
xz
xz num. inputs {1, 33, 5, 11, None, 22} num. configs {1, 3, 5, 7, None, 14, 21}
xz Missing 64 runs
Missing i

In [16]:
with open("../sl_run_commands.sh", "w") as f:
    for cmds in commands_per_system.values():
        f.writelines("\n".join(cmds))
        f.write("\n")

In [8]:
systems = list(full_data["performance_properties"].keys())
inputs_feat_cols = full_data["feature_columns"]
data = full_data["data"]
commands_per_system = dict()
learning_methods = active.methods
strategies = active.query_strategies
for s in systems:
    print(s)
    # This should confirm the numbers, but adds ~2 minutes to execution
    # _, _, train_inp, train_cfg = split_data(data, s, full_data["input_counts"], inputs_feat_cols, 1)
    # print(f"{s}\t{avail_inputs}\t{avail_configs}\t{len(train_inp)}\t{len(train_cfg)}")
    performance_properties = full_data["performance_properties"][s]
    run_name = f"al_{s}"

    commands_per_system[s] = []

    for seed, method, query_strat, perf_property, tune in itertools.product(
        random_seeds[:3],
        learning_methods,
        strategies,
        performance_properties,
        tune_settings
    ):
        identifier = active.get_identifier(
            run_name=run_name,
            system=s,
            method=method,
            query_strategy=query_strat,
            performance_property=perf_property,
            seed=seed,
            tune=tune,
        )

        if osp.exists(osp.join(output_dir, identifier, "eval_metrics.json")):
            # Run already completed
            continue
        

        commands_per_system[s].append(
            f"python src/active.py {run_name} {s} {method} -s {query_strat} -o {output_dir} -pp {perf_property} --seed {seed}"
        )

    print(f"{s} Missing {len(commands_per_system[s])} runs")

total_runs_missing = sum([len(cmds) for cmds in commands_per_system.values()])
print(f"Missing in total {total_runs_missing} runs")

gcc
gcc Missing 135 runs
imagemagick
imagemagick Missing 90 runs
lingeling
lingeling Missing 135 runs
nodejs
nodejs Missing 45 runs
poppler
poppler Missing 90 runs
sqlite
sqlite Missing 675 runs
x264
x264 Missing 225 runs
xz
xz Missing 90 runs
Missing in total 1485 runs


In [9]:
with open("../al_run_commands.sh", "w") as f:
    for cmds in commands_per_system.values():
        f.writelines("\n".join(cmds))
        f.write("\n")