In [1]:
import pathlib
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import torch
# Show whether PyTorch can see a CUDA device in this kernel; later cells pick
# "cuda" or "cpu" for their tensors based on the same check.
torch.cuda.is_available()

In [3]:
from autorocks.data.loader.all_models_result_aggregator import create_all_models_comparison_dataset
from autorocks.dir_struct import PackageRootDir
from sysgym.envs.rocksdb import schema

# Parameter schema used as the search space throughout the notebook.
param_space = schema.RocksDB10Params()

# Build the cross-model comparison dataset from the processed results folder.
# save_results=False is passed through unchanged.
all_model_df = create_all_models_comparison_dataset(
    PackageRootDir / "ProcessedDataNew/rocksdb/iops/zippy_workload_15min/10_params/50_iter",
    save_results=False,
)

In [4]:
import pandas as pd
import regex as re

def _clean_column_name(name: str) -> str:
    """Strip benchmark / RocksDB namespace fragments from one column name.

    The patterns are applied in the listed order, each one to the output of
    the previous substitution.

    Args:
        name: Original column label.

    Returns:
        The label with every matching fragment removed.
    """
    for pattern in (
        r"^db_bench\.mixgraph\.",
        # Raw string: the previous non-raw 'rocksdb[\_]?' relied on the invalid
        # escape sequence "\_", which newer Python versions warn about.
        r"rocksdb_?",
        r"_statistics",
        r"_stats",
        # The dot is escaped so only a literal "statistics." prefix is removed;
        # an unescaped "." would also swallow any other following character.
        r"^statistics\.",
    ):
        name = re.sub(pattern, "", name)
    return name


# Join parameters with observations on the run keys, then drop the keys and the
# benchmark-name column so that only parameter and metric columns remain.
combined_df = pd.merge(
    all_model_df.sys_params,
    all_model_df.sys_observations,
    on=["model", "iteration", "step"],
).drop(columns=["model", "iteration", "step", "db_bench.mixgraph.name"])

# Rebind instead of renaming in place so re-running the cell is idempotent.
combined_df = combined_df.rename(columns=_clean_column_name)
combined_df

In [5]:
# Columns that are neither tunable parameters nor intermediate metrics: the
# objective, the execution time and the cpu/mem usage summaries.
non_intermediate_cols = {"iops", "exe_time"} | {
    f"{resource}.{stat}"
    for resource in ("cpu_usage", "mem_usage")
    for stat in ("count", "p100", "p99", "p95", "p90", "p50", "sum")
}
intermediate_col = set(combined_df.columns) - set(param_space) - non_intermediate_cols

# Select through a boolean mask so the columns keep combined_df's own order.
# Indexing with list(<set>) follows string-hash order, which changes between
# kernel restarts and makes everything downstream harder to reproduce.
intermediate_df = combined_df.loc[:, combined_df.columns.isin(intermediate_col)]

# Remove string (object-dtype) columns. The mask is built from the dtypes
# directly, so the frame no longer has to be transposed to find the names.
non_object_mask = (intermediate_df.dtypes != object).to_numpy()
intermediate_df = intermediate_df.loc[:, non_object_mask]
intermediate_df

In [6]:
# Sorted listing of the intermediate metric columns that were kept.
sorted(intermediate_df.columns)



In [7]:
# Full feature/target tables: the parameter columns are the inputs and the
# "iops" column is the target. The train/test split is done later, inside each
# experiment loop.

X_FULL = combined_df.loc[:, list(param_space)]
Y_FULL = combined_df.loc[:, "iops"]

In [385]:


from autorocks.optimizer.bograph import preprocessor
from autorocks.optimizer.bograph.dag_preprocessor import PreprocessingPipeline
from autorocks.optimizer.bograph.bograph_dao import BoGraphDataPandas

# Bundle independent copies of parameters, objective and intermediate metrics
# so the pipeline below cannot modify the frames built in earlier cells.
data = BoGraphDataPandas(
    params=combined_df[list(param_space)].copy(),
    objs=combined_df["iops"].copy(),
    intermediate=intermediate_df.copy(),
)

# Steps run in list order when the pipeline is fitted.
pipeline_steps = [
    # Add average for count
    preprocessor.GrouperProcessor(-2, preprocessor.Compressor.COMBINER),
    # If there are any with useful statistics get it.
    preprocessor.FilterProcessor(-2),
    preprocessor.VarianceThresholdPreprocessor(),
    preprocessor.MetricsStandardizerProcessor(standardize_params=True),
    # preprocessor.ParamNormalizerProcessor(param_space.bounds().T),
    preprocessor.RankerProcessor(top_k=15),
]
dp = PreprocessingPipeline(preprocessors=pipeline_steps)

processed_data = dp.fit_transform(data)
# Show which intermediate columns survived preprocessing.
sorted(processed_data.intermediate.columns)


In [None]:
from autorocks.optimizer.bograph import dag_discovery
import time

# Time the DAG learning with perf_counter(): it is a monotonic, high-resolution
# clock intended for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted while the cell runs.
start_time = time.perf_counter()
full_G = dag_discovery.learn_dag(processed_data, dag_type=dag_discovery.DAGType.FULL)
# Elapsed seconds spent learning the full DAG.
learning_time = time.perf_counter() - start_time


In [None]:
import networkx as nx
from autorocks.optimizer.bograph import dag_postprocessor

main_targets = {"iops"}
source_nodes = set(param_space)

# Prune anything not coming out of sources
full_dag_pro = dag_postprocessor.postprocess_structure(full_G, source_nodes, set(main_targets))

# (node, in-degree) pair of the node with the most incoming edges.
max_dim = max(full_dag_pro.in_degree, key=lambda node_and_degree: node_and_degree[1])
print(f"{max_dim=}, num nodes: {len(full_dag_pro.nodes)}")

node_positions = nx.spring_layout(full_dag_pro)
nx.draw_networkx(full_dag_pro, pos=node_positions)
# nx.write_gpickle(full_dag_pro, "full_dag_pro.gpickle") # Good


In [None]:
from typing import NamedTuple


class ExperimentResult(NamedTuple):
    """One scored model fit produced by an experiment loop in this notebook.

    Fields:
        model: Label of the model that was fitted.
        restart: Index of the restart the fit belongs to.
        step: Loop step of the fit; the loops fit on the first `step`
            training observations.
        score: Score of the fit on the held-out data (the loops store the
            negative log predictive density here).
        runtime: Seconds spent fitting the model.
    """

    model: str
    restart: int
    step: int
    score: float
    runtime: float

    def __str__(self):
        """Return a compact one-line summary of the result."""
        # The bracket closes after the step; the previous format string ended
        # with an extra, unbalanced "]".
        return f"[M: {self.model}, R:{self.restart}, S:{self.step}]: score={self.score}, runtime={self.runtime}"

In [None]:
from notebooks.bobn_ch.bobn import BoBn
import networkx as nx

# Fall back to the graph saved on disk when this session has no DAG yet.
# globals().get(...) also covers the case where the name was never defined
# (e.g. the DAG-learning cells were skipped), which would otherwise raise
# NameError before the fallback could run.
if globals().get("full_dag_pro") is None:
    print("Loading graph from file")
    # Bind the loaded graph to the name; if the return value is discarded,
    # BoBn below still receives None.
    # NOTE(review): nx.read_gpickle unpickles the file, so only load files you
    # trust. It was removed in NetworkX 3.0; on newer versions use pickle.load.
    full_dag_pro = nx.read_gpickle("full_dag_pro.gpickle")


# Structured model built on the post-processed DAG, with "iops" as objective.
bobn_model = BoBn(full_dag_pro, param_space, {"iops"}, conservative_mode=True)

In [137]:

# Baseline graph without learned structure: each parameter gets a single edge
# straight to the objective node.
unstructured_dag = nx.DiGraph()
for param_name in param_space:
    unstructured_dag.add_edge(param_name, "iops")

unstructured_bobn = BoBn(unstructured_dag, param_space, {"iops"}, False)

In [None]:
# MAIN exper
import torch
import baselines
import time
from botorch.utils.transforms import normalize, standardize

from gpytorch.metrics import metrics
from sklearn.model_selection import train_test_split
# dtype/device applied to every tensor created in this cell.
tkwargs = {
    "dtype": torch.double,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}
train_size = 50
num_restarts = 1

# Parameter bounds, transposed before being handed to normalize().
# NOTE(review): presumably this yields the 2 x d layout normalize() expects.
bounds = torch.from_numpy(param_space.bounds()).T
structured_results = []

# Inputs are the unprocessed values of the columns selected by preprocessing
# (parameters plus retained intermediates); the target is the "iops" column.
data_copy = data.to_combi_pandas().copy(True)
X_FULL = data_copy[processed_data.param_intermediates_df().columns]
Y_FULL = data_copy['iops']

# X_FULL = processed_data.param_intermediates_df()
# Y_FULL = processed_data.objs

for restart, seed in enumerate([42]):
    X_train, X_test, y_train, y_test = train_test_split(
        X_FULL, Y_FULL, train_size=train_size, random_state=seed
    )
    # One table of training inputs and targets, re-indexed from 0 so that
    # observations[:step] means "the first `step` training rows".
    observations = pd.merge(X_train, y_train, left_index=True, right_index=True).reset_index(drop=True)

    # Only the parameter columns of the held-out rows are fed to the model.
    X_test = normalize(torch.tensor(X_test[list(param_space)].values), bounds).to(**tkwargs)
    y_test = standardize(torch.tensor(y_test.values).to(**tkwargs))

    for model_name in ["structured"]:

        for step in range(30, train_size):
            # perf_counter() is monotonic and meant for interval timing;
            # time.time() can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            model = baselines.run_model(bobn_model, observations=observations[:step])
            elapsed_time = time.perf_counter() - start_time

            # Scoring needs no gradients; this matches how the botorch
            # baseline loop evaluates its model.
            with torch.no_grad():
                score = float(metrics.negative_log_predictive_density(model(X_test).mvn, y_test))
            result = ExperimentResult(model=model_name, restart=restart, step=step, score=score, runtime=elapsed_time)
            structured_results.append(result)
            print(f"{result=}")


In [164]:
# Inspect the held-out targets left in the kernel by whichever experiment loop
# ran last.
y_test

In [163]:
# Mean of the `.mvn` distribution the last fitted model returns for the
# held-out inputs; `model` and `X_test` are whatever the last-run loop left
# behind.
model(X_test).mvn.mean

In [180]:
import seaborn as sns 
import matplotlib.pyplot as plt

# Figure styling.
plt.style.use("ggplot")
sns.set_theme(style="ticks", rc={"axes.spines.right": False, "axes.spines.top": False})
sns.set_context("paper")  # , font_scale=1.5, rc={"lines.linewidth": 1.5})
plt.rcParams["svg.fonttype"] = "none"
plt.rcParams["font.family"] = "Arial"
plt.rc("text", usetex=False)
plt.rc("xtick", labelsize="small")
plt.rc("ytick", labelsize="small")
plt.rc("axes", labelsize="medium")
plt.rc("pdf", use14corefonts=True)

# NOTE: `results` is filled by the botorch experiment loop, so that cell has to
# be executed before this one.
results_df = pd.DataFrame(results)

fig, ax = plt.subplots(figsize=(4, 3))
# `markers` only takes effect together with a `style` variable: map the model
# name to `style` and ask for default markers. Passing markers="model" by
# itself draws no markers. The frame is passed as `data=` explicitly.
sns.lineplot(data=results_df, x="step", y="score", hue="model", style="model", markers=True, ax=ax)
# The score stored by the experiment loops is the negative log predictive
# density on the held-out rows.
ax.set(xlabel="Training observations (step)", ylabel="Negative log predictive density");


In [148]:
# Loop that goes through the params.
import torch
import baselines
import time

from gpytorch.metrics import metrics
from botorch.utils.transforms import normalize, standardize
from sklearn.model_selection import train_test_split

# dtype/device applied to every tensor created in this cell.
tkwargs = {
    "dtype": torch.double,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

# Parameter bounds, transposed before being handed to normalize().
bounds = torch.from_numpy(param_space.bounds()).T

# The botorch baseline only sees the tunable parameters.
BOTORCH_X = X_FULL[list(param_space)].copy()

train_size = 50
num_restarts = 1
# Base seed for the train/test split. Restart 0 uses 42, the same seed as the
# structured-model experiment, so the runs are repeatable and both models are
# scored on the same split as long as X_FULL holds the same rows.
split_seed = 42
results = []
for restart in range(num_restarts):
    X_train, X_test, y_train, y_test = train_test_split(
        BOTORCH_X, Y_FULL, train_size=train_size, random_state=split_seed + restart
    )

    X_train = normalize(torch.tensor(X_train.values), bounds).to(**tkwargs)
    X_test = normalize(torch.tensor(X_test.values), bounds).to(**tkwargs)

    y_train = standardize(torch.tensor(y_train.values)).to(**tkwargs)
    y_test = standardize(torch.tensor(y_test.values).to(**tkwargs))
    for model_name in ["botorch"]:
        for step in range(30, train_size):
            # Fit on the first `step` training rows; the targets get a
            # trailing dimension via unsqueeze(1).
            x = X_train[:step]
            y = y_train[:step].unsqueeze(1)

            # perf_counter() is monotonic and meant for interval timing;
            # time.time() can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            model = baselines.botorch_model(x, y, tkwargs)
            elapsed_time = time.perf_counter() - start_time

            with torch.no_grad():
                score = float(metrics.negative_log_predictive_density(model(X_test).mvn, y_test))
            result = ExperimentResult(model=model_name, restart=restart, step=step, score=score, runtime=elapsed_time)
            results.append(result)
            print(f"{result=}")



In [19]:
# Parameter bounds on the shared dtype/device (tkwargs comes from an earlier
# experiment cell).
bounds = torch.from_numpy(param_space.bounds()).T.to(**tkwargs)

train_size = 100
num_restarts = 1
# Base seed for the train/test split so the experiment is repeatable.
split_seed = 42
unstructured_results = []
for restart in range(num_restarts):
    X_train, X_test, y_train, y_test = train_test_split(
        X_FULL, Y_FULL, train_size=train_size, random_state=split_seed + restart
    )
    # One table of training inputs and targets, re-indexed from 0 so that
    # observations[:step] means "the first `step` training rows".
    observations = pd.merge(X_train, y_train, left_index=True, right_index=True).reset_index(drop=True)

    # Move the raw inputs to the dtype/device of `bounds` BEFORE normalizing;
    # mixing a CPU tensor with device-resident bounds fails when CUDA is used.
    X_test = normalize(torch.tensor(X_test[list(param_space)].values).to(**tkwargs), bounds)
    # Same target handling as the structured and botorch loops: cast/move to
    # tkwargs and standardize, so the scores are on a comparable scale.
    # NOTE(review): assumes baselines.unstructured_model predicts on the
    # standardized scale like the other baselines - confirm.
    y_test = standardize(torch.tensor(y_test.values).to(**tkwargs))

    for model_name in ["bobn_unstructured"]:

        for step in range(5, train_size):
            # perf_counter() is monotonic and meant for interval timing;
            # time.time() can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            model = baselines.unstructured_model(observations=observations[:step])
            elapsed_time = time.perf_counter() - start_time

            # Scoring needs no gradients; this matches the botorch loop.
            with torch.no_grad():
                score = float(metrics.negative_log_predictive_density(model(X_test).mvn, y_test))
            result = ExperimentResult(model=model_name, restart=restart, step=step, score=score, runtime=elapsed_time)
            unstructured_results.append(result)
            print(f"{result=}")
