In [1]:
import botorch.posteriors
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
from sysgym.envs.rocksdb.schema import RocksDB10Params
from sysgym.envs.rocksdb.benchmarks.dbbench.established_benchmarks import DBBenchTasks
from autorocks.data.loader.all_models_result_aggregator import create_all_models_comparison_dataset
import autorocks.dir_struct  as data_dirs
import pandas as pd
import numpy as np
import torch
from botorch.models.transforms import input as i_transformer
from botorch.models.transforms import outcome as o_transformer
from botorch.optim.fit import fit_gpytorch_torch
from gpytorch import ExactMarginalLogLikelihood
from botorch import models
from autorocks.viz import viz
import gpytorch


In [3]:
# NN inside the mean function: https://github.com/cornellius-gp/gpytorch/issues/674

# Read the parameter table and discard the 'step', 'iteration' and 'model'
# columns; the remaining columns are what later cells turn into model inputs.
data_x = (
    pd.read_csv('random_analysis_params.csv', index_col=0)
    .drop(columns=['step', 'iteration', 'model'])
)
# Full table of candidate targets; a subset of its columns is selected below.
data_y_full = pd.read_csv('random_analysis_selected_targets.csv', index_col=0)

In [39]:
# Targets that are modelled. 'numfiles_in_singlecompaction' and
# 'compaction_outfile_sync' are further candidate columns that are currently
# left out of the selection.
data_y = data_y_full[['bytes_per_write', 'db_get', 'iops']]

In [40]:
# Split point between the rows used for fitting and the held-out rows.
train_points = 80
# Number of rows left over once `train_points` rows are set aside.
test_points = len(data_x) - train_points


In [41]:
# Take exactly the first `train_points` rows by position. A label-based
# `.loc[:train_points]` slice includes its end label, so on an integer index it
# returns one extra row -- the same row a `.loc[train_points:]` slice starts
# with -- which leaks a held-out point into the training set.
train_x = torch.tensor(data_x.iloc[:train_points].values)
train_y = torch.tensor(data_y.iloc[:train_points].values)
# Transposed so that dimension 1 runs over the parameters; its size is what the
# input normaliser is given as `d`.
x_bounds = torch.tensor(RocksDB10Params().bounds()).T


In [42]:
# Outcome transform: standardise each of the selected target columns.
out_standardizer = o_transformer.Standardize(m=data_y.shape[1])
# Input transform: rescale every parameter using the bounds in `x_bounds`.
input_normalizer = i_transformer.Normalize(
    d=x_bounds.shape[1],
    bounds=x_bounds,
)

In [47]:
from botorch.optim.fit import fit_gpytorch_scipy

# Free cached CUDA memory before constructing a new model.
torch.cuda.empty_cache()

# Joint GP over all selected targets, with the input/outcome transforms built
# in the previous cell attached to the model.
model = models.KroneckerMultiTaskGP(
    train_X=train_x,
    train_Y=train_y,
    input_transform=input_normalizer,
    outcome_transform=out_standardizer,
    linear=False,
)
# Single-output alternative (last target only, linear mean), kept for reference:
# model = models.SingleTaskGP(
#     train_X=train_x,
#     train_Y=train_y[:, -1].unsqueeze(-1),
#     input_transform=input_normalizer,
#     outcome_transform=o_transformer.Standardize(m=1),
#     mean_module=gpytorch.means.LinearMean(input_size=train_x.shape[-1]),
# )

model_mll = ExactMarginalLogLikelihood(model.likelihood, model)

# Settings handed to the torch-based fitting routine.
options = dict(maxiter=3000, lr=0.01, disp=True)
fit_gpytorch_torch(model_mll, options=options)
# Scipy-based alternative:
# fit_gpytorch_scipy(model_mll, options=options)

In [48]:

def confidence_region(posterior):
    """
    Compute a band of two standard deviations around the posterior mean.

    :param posterior: object exposing ``mean`` and ``variance`` tensors
        (assumed to have matching shapes).
    :rtype: (torch.Tensor, torch.Tensor)
    :return: ``(lower, upper)`` with ``lower = mean - 2 * sqrt(variance)`` and
        ``upper = mean + 2 * sqrt(variance)``.
    """
    # sqrt() yields a fresh tensor, so the posterior's variance is not modified.
    half_width = 2 * posterior.variance.sqrt()
    center = posterior.mean
    return center - half_width, center + half_width

In [49]:
import gpytorch

# Held-out rows are everything positioned after the rows that went into
# `train_x`. Slicing by position guarantees the held-out set starts exactly
# after the last training row; a label-based `.loc[train_points:]` slice can
# share its boundary row with a training slice that includes its end label.
n_train_rows = train_x.shape[0]
data_x_ = data_x.iloc[n_train_rows:].values
data_y_ = data_y.iloc[n_train_rows:].values

test_x = torch.tensor(data_x_)
test_y = torch.tensor(data_y_)

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    model.eval()
    model.likelihood.eval()

    # Held-out predictions, with observation noise included in the variance.
    posterior = model.posterior(test_x, observation_noise=True)
    ci = confidence_region(posterior)

    # Predictions on the training inputs (no observation noise requested).
    in_data_posterior = model.posterior(train_x)
    in_data_ci = confidence_region(in_data_posterior)

In [50]:

# One panel per task: held-out predictions, the two-standard-deviation band and
# the true values, all ordered by the true value of that task.
num_tasks = posterior.event_shape[1]
f, axes = plt.subplots(1, num_tasks, figsize=(18, 9))
# plt.subplots returns a bare Axes when there is a single panel; wrap it so the
# indexing below works for any number of tasks.
axes = np.atleast_1d(axes)

for task in range(num_tasks):
    arg_sorted = np.argsort(test_y, 0)[:, task]
    prediction_mean = posterior.mean[arg_sorted, task]
    truth = test_y[arg_sorted, task]

    x_axis = np.arange(0, prediction_mean.shape[0])

    axes[task].scatter(x_axis, prediction_mean.detach().numpy(), c='b', label='prediction')
    axes[task].fill_between(x_axis, ci[0][arg_sorted, task], ci[1][arg_sorted, task], color='skyblue', label='95% confidence interval', alpha=0.3)
    axes[task].scatter(x_axis, truth, c='r', label='truth')
    task_name = data_y.columns[task]
    axes[task].set(title=f"Task: {task_name}")

    # Compare each prediction with the truth for the SAME row: both sides use
    # the `arg_sorted` ordering. Subtracting the unsorted `test_y[:, task]`
    # from the sorted predictions pairs up unrelated rows and inflates the RMSE.
    rmse = torch.sqrt(torch.mean(torch.pow(prediction_mean - truth, 2)))
    print(f"Task: {task_name} RMSE: {rmse}")


In [33]:
# Single-task view: held-out predictions for one target (the last column).
num_tasks = posterior.event_shape[1]
f, ax = plt.subplots(figsize=(4, 3))

task = -1
# Row order that sorts the held-out truth of the chosen task.
arg_sorted = np.argsort(test_y, 0)[:, task]
prediction_mean = posterior.mean[arg_sorted, task]

x_axis = np.arange(prediction_mean.shape[0])

ax.scatter(x_axis, prediction_mean.detach().numpy(), c='b', label='prediction')
ax.fill_between(
    x_axis,
    ci[0][arg_sorted, task],
    ci[1][arg_sorted, task],
    color='skyblue',
    label='95% confidence interval',
    alpha=0.3,
)
ax.scatter(x_axis, test_y[arg_sorted, task], c='r', label='truth')

# Predictions and truth share the `arg_sorted` ordering, so rows line up.
squared_error = torch.pow(prediction_mean - test_y[arg_sorted, task], 2)
print("RMSE: ", torch.sqrt(torch.mean(squared_error)))


In [34]:
# In-sample check: predictions on the training data itself.
_, ax = plt.subplots(figsize=(24, 9))
task = -1

# Order the points by the true target value of the chosen task, matching how
# the held-out plots order by `test_y`. Sorting by `train_x` instead would
# order the points by the last input parameter, not by the plotted target.
arg_sorted = np.argsort(train_y, 0)[:, task]
prediction_mean = in_data_posterior.mean[arg_sorted, task]

x_axis = np.arange(0, prediction_mean.shape[0])

ax.scatter(x_axis, prediction_mean.detach().numpy(), c='b', label='prediction')
ax.fill_between(x_axis, in_data_ci[0][arg_sorted, task], in_data_ci[1][arg_sorted, task], color='skyblue', label='95% confidence interval', alpha=0.3)
ax.scatter(x_axis, train_y[arg_sorted, task], c='r', label='truth')

# RMSE does not depend on row order, so the unsorted tensors are compared.
print("RMSE: ", torch.sqrt(torch.mean(torch.pow(in_data_posterior.mean[:, task] - train_y[:, task], 2))))


In [16]:

# Multi-task scratchpad: build the long-format inputs later passed to
# MultiTaskGP -- one copy of the normalised inputs per task, each with the
# task index appended as the last feature.
from botorch.utils.transforms import normalize

normalized_x = normalize(train_x, bounds=x_bounds)
n_rows = normalized_x.shape[0]

task_train_x = []
for task in range(data_y.shape[1]):
    # Constant column holding this task's index.
    task_idx = task * torch.ones(n_rows, 1)
    task_train_x.append(torch.cat((normalized_x, task_idx), dim=-1))
# Stack the per-task copies row-wise: all rows of task 0 first, then task 1, ...
task_train_x = torch.cat(task_train_x)

task_train_x.shape

In [86]:
num_tasks = train_y.shape[-1]

# Split the targets into one column per task and stack the columns row-wise,
# mirroring the task-by-task row order of `task_train_x`.
stacked_train_y = torch.cat(torch.chunk(train_y, num_tasks, dim=-1))

# No input_transform is passed: `task_train_x` is built from inputs that were
# already scaled with `normalize`.
model = models.MultiTaskGP(
    train_X=task_train_x,
    train_Y=stacked_train_y,
    outcome_transform=o_transformer.Standardize(m=1),
    task_feature=-1,
)

# NOTE(review): `main_task` is only referenced from commented-out code in this
# notebook, and 4 lies outside the task indices 0..num_tasks-1 when three
# targets are selected -- confirm whether it is still needed.
main_task = 4 * torch.ones(test_x.shape[0], 1)

model_mll = ExactMarginalLogLikelihood(model.likelihood, model)
# Torch-based alternative:
# options = {"maxiter": 3000, "lr": 0.001, "disp": False}
# fit_gpytorch_torch(model_mll, options=options)
options = {}
fit_gpytorch_scipy(model_mll)

In [50]:
import gpytorch
from botorch.utils.transforms import normalize

# Held-out rows are everything positioned after the rows that went into
# `train_x`. Slicing by position guarantees the held-out set starts exactly
# after the last training row; a label-based `.loc[train_points:]` slice can
# share its boundary row with a training slice that includes its end label.
n_train_rows = train_x.shape[0]
data_x_ = data_x.iloc[n_train_rows:].values
data_y_ = data_y.iloc[n_train_rows:].values

test_x = torch.tensor(data_x_)
test_y = torch.tensor(data_y_)

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    model.eval()
    model.likelihood.eval()

    # The MultiTaskGP was fitted on inputs scaled with
    # `normalize(..., bounds=x_bounds)` and carries no input_transform, so
    # query points must be scaled the same way; raw parameter values would be
    # on a different scale from anything the model was trained on.
    posterior = model.posterior(normalize(test_x, bounds=x_bounds))
    ci = confidence_region(posterior)

    in_data_posterior = model.posterior(normalize(train_x, bounds=x_bounds))
    in_data_ci = confidence_region(in_data_posterior)

In [54]:
# Preview of the stacked targets: `train_y` split into `num_tasks` chunks along
# the last dimension, concatenated along the first.
torch.cat(train_y.chunk(num_tasks, dim=-1), dim=0)