In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


import matplotlib.pyplot as plt

In [2]:
from sysgym.envs.rocksdb.schema import RocksDB10Params
from sysgym.envs.rocksdb.benchmarks.dbbench.established_benchmarks import DBBenchTasks
from autorocks.data.loader.all_models_result_aggregator import create_all_models_comparison_dataset
import autorocks.dir_struct as data_dirs

# Benchmark identifier: the task's string form with every underscore removed.
bench_name = str(DBBenchTasks.READ_RANDOM_WRITE_RANDOM).replace("_", "")
obj_name = "iops"

# Parameter space under study and the set of its parameter names.
param_space = RocksDB10Params()
param_names = {param.name for param in param_space.parameters()}

# Result directory for this objective / benchmark / parameter-count combination.
relative_exp_dir = f"rocksdb/{obj_name}/{bench_name}/{len(param_space)}_params/100_iter"
exp_dir = data_dirs.LocalResultDir / relative_exp_dir
model_comparison_data = create_all_models_comparison_dataset(exp_dir)

In [None]:
import pandas as pd

# Render the parameter-space description as a LaTeX table and print its source.
param_table = pd.DataFrame(param_space.to_latex())
print(param_table.to_latex())

In [4]:
import autorocks.viz.viz as viz

# Hex colours handed to the palette helper.
palette_hex_codes = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

# Unify the model names first, then build the colour palette from the unified data.
model_comparison_data_c = viz.unify_model_name(model_comparison_data)
color_palette = viz.create_color_palette(model_comparison_data_c, palette_hex_codes)

In [5]:
# Work on a copy of the system observations so the loaded dataset is left untouched.
perf_df = model_comparison_data_c.sys_observations.copy()

In [6]:
# Keep only the non-object columns; this rebinds perf_df to the filtered frame.
perf_df = perf_df.select_dtypes(exclude=["object"])  # Remove the name column - hack around sysgym

In [136]:
# Checkpoint the dataset used in the argument.
# The file names carry the ".csv" suffix because the cell that reloads the
# checkpoint reads "random_analysis.csv" and "random_analysis_params.csv";
# writing without the suffix leaves that reload unable to find the files.
perf_df.to_csv("random_analysis.csv")
model_comparison_data_c.sys_params.to_csv("random_analysis_params.csv")

In [3]:
import pandas as pd

# Reload the checkpointed observations and parameters from the working directory.
perf_csv_path = "random_analysis.csv"
params_csv_path = "random_analysis_params.csv"

perf_df = pd.read_csv(perf_csv_path)
params = pd.read_csv(params_csv_path)

In [4]:
# Derive a per-metric average (sum / count) for every metric in perf_df that has
# a "<metric>.p50" column, then split the remaining columns into count columns,
# the IOPS target and everything else.
import re

columns_to_look = []
avgs = {}

# A metric is recognised by its "<metric>.p50" column; group 1 is the metric name.
expression = re.compile(r"(.+)\.p50")
perf_columns = perf_df.columns.get_level_values(0).tolist()
p50_columns = set(filter(lambda x: x.endswith("p50"), perf_columns))

col_to_exclude_later = set()
# Walk the columns in their original order instead of iterating the set: set
# iteration order for strings changes between kernel restarts, which would
# shuffle the column order of avgs_df from one run to the next.
for col in perf_columns:
    if col not in p50_columns:
        continue
    matched = expression.fullmatch(col)
    if matched is None:
        # Ends in "p50" but is not of the form "<metric>.p50": there is no
        # metric name to derive, so skip it instead of failing on an empty match.
        continue
    col_name = matched.group(1)
    col_count_name = f"{col_name}.count"
    col_sum_name = f"{col_name}.sum"
    avgs[col_name] = perf_df[col_sum_name] / perf_df[col_count_name]
    # The count column has been folded into the average; keep it out of the count features.
    col_to_exclude_later.add(col_count_name)

# NaN averages are replaced by 0.
avgs_df = pd.DataFrame(avgs).fillna(0)

# Raw string so that "\d" reaches the regex engine as written; in a plain string
# literal it is an invalid escape sequence that newer Python versions warn about.
kept_columns = perf_df.filter(regex=r"^(?!.*(p\d*|sum|interval|iops)$).*$").columns
# Order-preserving de-duplication and exclusion (replaces the unordered set difference).
all_other_columns = [col for col in dict.fromkeys(kept_columns) if col not in col_to_exclude_later]

counts_data_df = perf_df.loc[:, all_other_columns].filter(regex="count$")
count_columns = set(counts_data_df.columns)
rest_of_data_df = perf_df[[col for col in all_other_columns if col not in count_columns]]
target_df = perf_df.filter(regex="iops$")

In [5]:
# Keep only the count columns whose variance reaches the 0.3 threshold.
count_variances = counts_data_df.var()
has_enough_variance = count_variances >= 0.3
count_data_no_low_var = counts_data_df.loc[:, has_enough_variance]

In [6]:
import statsmodels.api as sm
from sklearn.preprocessing import scale

# Fit a zero-inflated Poisson model: the IOPS target is the response and the
# variance-filtered count columns are the regressors.
zip_model = sm.ZeroInflatedPoisson(endog=target_df, exog=count_data_no_low_var)
poisson_model = zip_model.fit()

# Show the fitted coefficients.
print(poisson_model.params)

In [7]:
# Inspect which metrics ended up with a computed average.
avgs_df.columns

In [10]:
import seaborn as sns

# Correlate every per-metric average with the IOPS target and draw the result
# as a single-column heatmap.
avgs_w_target = pd.concat([target_df, avgs_df], axis=1)

# Discard columns whose variance is below 0.3 before correlating.
no_low_var = avgs_w_target.loc[:, (avgs_w_target.var() >= 0.3)]
target_name = list(target_df.columns)
# Correlation of each remaining column with the target, strongest positive first,
# with the target's own row removed.
correlation_matrix = (
    no_low_var.corr()[target_name].sort_values(by=target_name, ascending=False).drop(target_name, axis=0)
)

# Shorter row labels for display: strip the common prefixes and the unit suffix.
col_rename = {
    col: col.replace("statistics.", "").replace("rocksdb_", "").replace("_micros", "")
    for col in correlation_matrix.index
}

correlation_matrix = correlation_matrix.rename(
    index=col_rename, columns={"db_bench.readrandomwriterandom.iops": "IOPS"}
)
# errors="ignore": the 'cpu_usage' row may already be absent (for example when the
# variance filter above removed it); an unconditional drop would raise KeyError.
correlation_matrix = correlation_matrix.drop(index="cpu_usage", errors="ignore")

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap="BrBG", ax=ax)
#
# output_location = "/Users/salabed/workspace/latex_writings/thesis/phd_dissertation/Chapters/MultiTask/Figures/"
# output_format = "pdf"
# fig.savefig(f"{output_location}/correlation.{output_format}", bbox_inches="tight", format=f"{output_format}", dpi=300)

In [18]:
# Columns kept for the regression plots: three averaged metrics plus the IOPS target.
selected_columns = [
    "statistics.rocksdb_bytes_per_write",
    # "statistics.rocksdb_numfiles_in_singlecompaction",
    "statistics.rocksdb_db_get_micros",
    "statistics.rocksdb_compaction_outfile_sync_micros",
    "db_bench.readrandomwriterandom.iops",
]
subset_df = avgs_w_target.loc[:, selected_columns]

# Reuse the display-name mapping built for the correlation heatmap and add a
# short name for the target column (this mutates the shared col_rename dict).
col_rename.update({'db_bench.readrandomwriterandom.iops': 'iops'})
subset_df = subset_df.rename(columns=col_rename)

In [16]:
# Persist subset_df as CSV in the current working directory.
subset_df.to_csv('./random_analysis_selected_targets.csv')

In [19]:
from sklearn.preprocessing import scale

# Run sklearn's scale over subset_df and write the values back into the same
# frame (in place), then display it.
scaled_values = scale(subset_df)
subset_df.loc[:] = scaled_values
subset_df

In [15]:
# Check the column names of subset_df after renaming.
subset_df.columns

In [22]:
# Plot styling for the regression figure.
# NOTE: DPI is defined here but the savefig call at the end of this cell passes dpi=300.
DPI = 600  # default dpi for most printers
plt.style.use("ggplot")
sns.set_theme(style="ticks", rc={"axes.spines.right": False, "axes.spines.top": False})
sns.set_context("paper")
plt.rcParams.update(
    {
        "svg.fonttype": "none",
        "font.family": "Arial",
        "text.usetex": False,
        "xtick.labelsize": "small",
        "ytick.labelsize": "small",
        "axes.labelsize": "medium",
        "pdf.use14corefonts": True,
    }
)

# NOTE: absolute, machine-specific output directory; it must exist for savefig to succeed.
output_location = "/Users/salabed/workspace/latex_writings/thesis/phd_dissertation/Chapters/MultiTask/Figures/"
output_format = "pdf"

# One regression panel per selected metric, each against IOPS.
regression_columns = ["bytes_per_write", "db_get", "compaction_outfile_sync"]  # "numfiles_in_singlecompaction"
plot_data = subset_df.rename(columns={"iops": "IOPS"})
f = sns.pairplot(data=plot_data, y_vars=["IOPS"], x_vars=regression_columns, kind="reg")

figure_path = f"{output_location}/regs.{output_format}"
f.savefig(figure_path, bbox_inches="tight", format=output_format, dpi=300)

In [41]:
# Correlate the count columns with the IOPS target and plot the result as a heatmap.
counts_w_target = pd.concat([target_df, counts_data_df], axis=1)

# Discard columns whose variance is below 0.3 before correlating.
variance_mask = counts_w_target.var() >= 0.3
no_low_var = counts_w_target.loc[:, variance_mask]
target_name = list(target_df.columns)

# Correlation of each remaining column with the target, strongest positive first,
# with the target's own row removed.
target_correlations = no_low_var.corr()[target_name]
ranked_correlations = target_correlations.sort_values(by=target_name, ascending=False)
correlation_matrix = ranked_correlations.drop(index=target_name)
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap="BrBG")

In [43]:
# Display the most recently computed correlation table as numbers.
correlation_matrix