In [1]:
import pickle
import pandas as pd
import numpy as np
import optuna

# Order in which the scripts/notebooks of this workflow are run:
#     optuna_hyperparameters_per_dataset.py
#  -> analysis_hyperparameters.ipynb
#  -> benchmark_best_hyperparameters.py
#  -> analysis_hyperparameters_runs.ipynb

In [2]:
# Optuna storage URL (SQLite file) holding the studies analysed in this notebook.
# This one contains the "full gate" tuning results.
storage = "sqlite:///optuna_results_parameter_tuning_full_gate.sqlite3"
# Alternative storage with the "2D gate" tuning results:
#sqlite:///optuna_results_parameter_tuning_2D_gate.sqlite3

In [3]:
# Summaries of every study stored in `storage`; only `study_name` is used
# below to load each study in full.
studies = optuna.study.get_all_study_summaries(storage=storage)

In [4]:
# Load every study found in `storage` and stack their trials into one
# DataFrame `df` (one row per trial, fresh 0..n-1 index).
if not studies:
    # Fail with an explicit message instead of an IndexError on studies[0].
    raise ValueError(f"No Optuna studies found in storage {storage!r}")

study_frames = []
for study in studies:
    loaded_study = optuna.load_study(study_name=study.study_name, storage=storage)
    study_df = loaded_study.trials_dataframe()
    # Study-level system attributes are copied onto every trial row of that
    # study, so each row stays identifiable after the studies are combined.
    for key, value in loaded_study.system_attrs.items():
        study_df[key] = value
    study_frames.append(study_df)

# One concat at the end instead of growing `df` inside the loop;
# ignore_index=True replaces the separate in-place reset_index call.
df = pd.concat(study_frames, ignore_index=True)

In [5]:
# Columns available after combining all studies (trial fields, params_*,
# user_attrs_* and the per-study attributes added above).
df.columns

Index(['number', 'value', 'datetime_start', 'datetime_complete', 'duration',
       'params_init_learning_rate', 'params_learning_rate_decay',
       'params_max_depth', 'params_n_experts', 'params_optimization_method',
       'params_use_2_dim_gate_based_on', 'params_use_posterior',
       'user_attrs_accuracy_train', 'user_attrs_accuracy_val',
       'user_attrs_std_train', 'user_attrs_std_val', 'state', 'Data X',
       'Data y', 'initialization_method', 'params_alpha', 'params_beta',
       'params_gamma', 'params_mean_precision_prior',
       'params_weight_concentration_prior',
       'params_weight_concentration_prior_type', 'params_weight_cutoff'],
      dtype='object')

In [6]:
# Peek at a single trial row to sanity-check the combined frame.
df.head(1)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_optimization_method,...,Data X,Data y,initialization_method,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_concentration_prior_type,params_weight_cutoff
0,0,0.934402,2021-06-28 08:31:42.575957,2021-06-28 08:31:44.482651,0 days 00:00:01.906694,126.893719,0.979807,2,3,lasso_regression,...,banknote_input.pd,banknote_target.pd,str,,,,,,,


In [7]:
# Number of trials recorded for each initialization method.
df["initialization_method"].value_counts()

str              1200
KDTmeans_init    1200
Kmeans_init      1200
BGM_init         1200
Name: initialization_method, dtype: int64

In [8]:
# Sorted distinct initialization methods present in the trials.
np.unique(df["initialization_method"])

array(['BGM_init', 'KDTmeans_init', 'Kmeans_init', 'str'], dtype=object)

In [9]:
# Sorted distinct values of the "Data X" column (one entry per dataset input file).
np.unique(df["Data X"])

array(['abalone_input.pd', 'bank_input.pd', 'banknote_input.pd',
       'breast_cancer_input.np', 'cars_input.pd',
       'contraceptive_input.pd', 'hrss_input.pd', 'iris_input.pd',
       'occupancy_input.pd', 'pdm6_input.pd', 'steel_input.pd',
       'students_input.pd'], dtype=object)

In [10]:
# For every dataset keep the `top` trials with the highest validation accuracy.
top = 10

# Build the per-dataset slices first and concatenate once; growing a frame
# with pd.concat inside the loop (starting from an empty DataFrame) is
# quadratic and emits a FutureWarning on recent pandas versions.
best_frames = [
    df[df["Data X"] == dataset]
    .sort_values(by=["user_attrs_accuracy_val"], ascending=False)
    .head(top)
    for dataset in np.unique(df["Data X"])  # np.unique -> datasets in sorted order
]

if best_frames:
    # ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
    df_best = pd.concat(best_frames, ignore_index=True)
else:
    # No trials at all: keep an empty frame (with df's columns) instead of failing.
    df_best = df.iloc[0:0].copy()


In [11]:
# Sanity check: expect (number of datasets * top) rows.
df_best.shape

(120, 27)

In [12]:
# First rows of the selection: the best trials of the first dataset.
df_best.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_optimization_method,...,Data X,Data y,initialization_method,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_concentration_prior_type,params_weight_cutoff
0,93,0.745514,2021-06-29 12:07:27.597629,2021-06-29 12:07:32.297630,0 days 00:00:04.700001,77.143561,0.996708,2,3,least_squares_linear_regression,...,abalone_input.pd,abalone_target.pd,str,,,,,,,
1,13,0.744554,2021-06-29 12:01:27.584103,2021-06-29 12:01:32.140103,0 days 00:00:04.556000,91.218285,0.990082,2,3,least_squares_linear_regression,...,abalone_input.pd,abalone_target.pd,str,,,,,,,


In [13]:
# Persist the best trials per dataset for the downstream benchmark step.
# The gate variant in the file name is derived from the storage that was
# actually loaded, so results from the full-gate database can no longer be
# written under the 2D-gate file name (or vice versa).
gate_variant = "2D_gate" if "2D_gate" in storage else "full_gate"
best_path = f"dataframes/df_top10_hyperparameters_per_dataset_{gate_variant}.pd"

# The context manager guarantees the file is flushed and closed after the dump.
with open(best_path, "wb") as best_file:
    pickle.dump(df_best, best_file)

In [14]:
# Per-dataset mean of the numeric columns.
# numeric_only=True is passed explicitly: `df` also holds string/datetime
# columns, and pandas >= 2.0 raises a TypeError on them instead of silently
# dropping them as older versions did.
df.groupby(["Data X"]).mean(numeric_only=True)

Unnamed: 0_level_0,number,value,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_use_posterior,user_attrs_accuracy_train,user_attrs_accuracy_val,user_attrs_std_train,user_attrs_std_val,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_cutoff
Data X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abalone_input.pd,49.5,0.718313,100.870177,0.99112,2.0,3.0,0.1025,0.738198,0.718313,0.00906,0.013313,1.762906,0.216682,0.206034,0.416497,0.653325,0.0
bank_input.pd,49.5,0.894801,84.822159,0.985161,2.0,3.0,0.415,0.90806,0.894801,0.001192,0.017108,2.980814,0.261024,0.333999,0.680125,0.660794,0.0
banknote_input.pd,49.5,0.985703,91.835222,0.988745,2.0,3.0,0.1,0.992063,0.985703,0.002498,0.005923,1.955231,0.13167,0.14204,0.437,0.597199,0.0
breast_cancer_input.np,49.5,0.923378,77.86378,0.989836,2.0,3.0,0.835,0.964506,0.923378,0.005518,0.018881,2.391755,0.315951,0.396414,0.641454,0.416285,0.0
cars_input.pd,49.5,0.830913,73.803925,0.990044,2.0,3.0,0.1625,0.841654,0.830913,0.012895,0.019184,2.311915,0.290541,0.376067,0.73433,0.572777,0.0
contraceptive_input.pd,49.5,0.482352,89.062544,0.98716,2.0,3.0,0.37,0.518784,0.482352,0.020712,0.024101,2.621254,0.186496,0.351942,0.373333,0.388996,0.0
hrss_input.pd,49.5,0.772794,84.051263,0.989719,2.0,3.0,0.2,0.77424,0.772794,0.002667,0.004664,3.289162,0.190816,0.305248,0.685352,0.422043,0.0
iris_input.pd,49.5,0.941639,72.551168,0.98468,2.0,3.0,0.32,0.989928,0.941639,0.005979,0.031358,2.18794,0.272645,0.209092,0.390111,0.785381,0.0
occupancy_input.pd,49.5,0.990486,100.005012,0.992146,2.0,3.0,0.1425,0.99233,0.990486,0.001242,0.00218,2.604595,0.25804,0.335174,0.28389,0.682469,0.0
pdm6_input.pd,49.5,0.971841,90.668083,0.985388,2.0,3.0,0.63,0.974281,0.971841,0.001198,0.002664,3.176146,0.149032,0.341776,0.622306,0.537459,0.0


In [15]:
# Value distribution of the 2-D gate parameter; dropna=False counts missing
# values as their own category so trials without this parameter show up too.
df["params_use_2_dim_gate_based_on"].value_counts(dropna=False)

NaN    4800
Name: params_use_2_dim_gate_based_on, dtype: int64