In [32]:
import pickle
import pandas as pd
import numpy as np
import optuna

# Workflow for ex1 (each step feeds the next):
#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [33]:
# Database URL handed to Optuna as `storage`; every study read in this notebook
# comes from this SQLite file. Lines marked CHANGE are the ones to edit when
# pointing the notebook at a different tuning run.
storage = "sqlite:///optuna_databases/optuna_ex1_parameter_tuning_2D_e3_d2.sqlite3" #CHANGE


In [34]:
# Fetch a summary for every study stored in `storage`; the `study_name` of each
# summary is what is used to load the corresponding study.
studies = optuna.study.get_all_study_summaries(storage=storage)

In [35]:
# Build one DataFrame `df` holding the trials of every study in `storage`.
#
# For each study, its trials are exported with `trials_dataframe()` and every
# entry of the study's `system_attrs` is written into a column of the same name
# (a scalar value is broadcast to all rows of that study), so each trial row
# carries its study-level metadata.
if not studies:
    # Fail with a clear message instead of an IndexError on `studies[0]`.
    raise ValueError(f"No Optuna studies found in storage {storage!r}")

study_frames = []
for study in studies:
    loaded_study = optuna.load_study(study_name=study.study_name, storage=storage)
    df_new = loaded_study.trials_dataframe()
    for key, value in loaded_study.system_attrs.items():
        df_new[key] = value
    study_frames.append(df_new)

# Concatenate once (instead of growing `df` inside the loop) and renumber the
# rows 0..n-1; columns missing from a study are filled with NaN.
df = pd.concat(study_frames, ignore_index=True)

In [36]:
# List the columns of the combined trials frame.
df.columns

Index(['number', 'value', 'datetime_start', 'datetime_complete', 'duration',
       'params_init_learning_rate', 'params_learning_rate_decay',
       'params_max_depth', 'params_n_experts', 'params_optimization_method',
       'params_use_2_dim_clustering', 'params_use_2_dim_gate_based_on',
       'user_attrs_accuracy_train', 'user_attrs_accuracy_val',
       'user_attrs_std_train', 'user_attrs_std_val', 'state', 'Data X',
       'Data y', 'initialization_method', 'params_alpha', 'params_beta',
       'params_gamma', 'params_mean_precision_prior',
       'params_weight_concentration_prior',
       'params_weight_concentration_prior_type', 'params_weight_cutoff'],
      dtype='object')

In [37]:
# Show the first row as an example of a single trial record.
df.head(1)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_optimization_method,...,Data X,Data y,initialization_method,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_concentration_prior_type,params_weight_cutoff
0,0,0.71582,2021-07-29 17:26:54.287672,2021-07-29 17:26:59.056671,0 days 00:00:04.768999,8.806749,0.987118,2,3,ridge_regression,...,abalone_input.pd,abalone_target.pd,str,,,,,,,


In [38]:
# Number of trials recorded for each initialization method.
df["initialization_method"].value_counts()

KDTmeans_init    12
BGM_init         12
Kmeans_init      12
str              12
Name: initialization_method, dtype: int64

In [39]:
# Sorted distinct initialization methods present in the trials.
np.unique(df["initialization_method"])

array(['BGM_init', 'KDTmeans_init', 'Kmeans_init', 'str'], dtype=object)

In [40]:
# Sorted distinct values of the "Data X" column, i.e. the dataset identifiers
# that the per-dataset selection iterates over.
np.unique(df["Data X"])

array(['abalone_input.pd', 'adult_input.pd', 'bank_input.pd',
       'banknote_input.pd', 'breast_cancer_input.np', 'cars_input.pd',
       'contraceptive_input.pd', 'generated6_input.np', 'hrss_input.pd',
       'iris_input.pd', 'steel_input.pd', 'students_input.pd'],
      dtype=object)

In [41]:
# Select, for every dataset (distinct value of "Data X"), the `top` trials with
# the highest validation accuracy and stack them into `df_best`.
top = 10

best_frames = []
for dataset in np.unique(df["Data X"]):
    df_new = (
        df[df["Data X"] == dataset]
        .sort_values(by=["user_attrs_accuracy_val"], ascending=False)
        .head(top)
    )
    best_frames.append(df_new)

# Concatenate once rather than repeatedly appending to an empty DataFrame:
# that pattern is quadratic and concatenating with an empty frame can alter
# column dtypes. With no trials at all, fall back to an empty frame that still
# has the columns of `df`.
if best_frames:
    df_best = pd.concat(best_frames, ignore_index=True)
else:
    df_best = df.iloc[:0].copy()


In [42]:
# (rows, columns) of the selection. NOTE(review): in the recorded run this has
# as many rows (48) as `df` itself, so `top` did not filter anything out.
df_best.shape

(48, 27)

In [43]:
# Preview the first two selected rows.
df_best.head(2)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_optimization_method,...,Data X,Data y,initialization_method,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_concentration_prior_type,params_weight_cutoff
0,0,0.718941,2021-07-29 17:27:15.117671,2021-07-29 17:27:20.423670,0 days 00:00:05.305999,48.850576,0.983009,2,3,ridge_regression,...,abalone_input.pd,abalone_target.pd,BGM_init,,,,0.916953,0.469967,dirichlet_process,0.0
1,0,0.71582,2021-07-29 17:26:54.287672,2021-07-29 17:26:59.056671,0 days 00:00:04.768999,8.806749,0.987118,2,3,ridge_regression,...,abalone_input.pd,abalone_target.pd,str,,,,,,,


In [44]:
# Persist the selected trials as a pickle file for later steps to load.
# The commented-out line is the alternative output file for the "FD" variant.
#pickle.dump(df_best, open("dataframes/ex1_df_top10_hyperparameters_per_dataset_FD_e3_d2.pd", "wb"))
# A context manager guarantees the file is flushed and closed even if pickling fails.
with open("dataframes/ex1_df_top10_hyperparameters_per_dataset_2D_e3_d2.pd", "wb") as f: #CHANGE
    pickle.dump(df_best, f)

In [45]:
# Per-dataset mean of the numeric columns. `numeric_only=True` is explicit
# because `df` also has string/datetime columns: older pandas dropped them
# silently, while pandas >= 2.0 raises a TypeError without this flag.
df.groupby(["Data X"]).mean(numeric_only=True)

Unnamed: 0_level_0,number,value,params_init_learning_rate,params_learning_rate_decay,params_max_depth,params_n_experts,params_use_2_dim_clustering,user_attrs_accuracy_train,user_attrs_accuracy_val,user_attrs_std_train,user_attrs_std_val,params_alpha,params_beta,params_gamma,params_mean_precision_prior,params_weight_concentration_prior,params_weight_cutoff
Data X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
abalone_input.pd,0,0.701281,49.453637,0.986537,2,3,True,0.716702,0.701281,0.005949,0.015605,3.303388,0.253093,0.498901,0.916953,0.469967,0.0
adult_input.pd,0,0.835853,62.63967,0.989032,2,3,True,0.836129,0.835853,0.001651,0.00292,3.672029,0.411965,0.487561,0.355366,0.446429,0.0
bank_input.pd,0,0.906921,89.04696,0.985136,2,3,True,0.908875,0.906921,0.001077,0.003226,0.848082,0.172538,0.168568,0.285538,0.85071,0.0
banknote_input.pd,0,0.962099,109.769798,0.988419,2,3,True,0.970603,0.962099,0.006801,0.009878,2.337756,0.412412,0.353022,0.72615,0.481413,0.0
breast_cancer_input.np,0,0.917844,114.862085,0.983118,2,3,True,0.963971,0.917844,0.005387,0.018805,2.676592,0.019525,0.25637,0.397008,0.571719,0.0
cars_input.pd,0,0.789207,56.057145,0.987953,2,3,True,0.798418,0.789207,0.013844,0.014993,2.239653,0.152237,0.144669,0.669483,0.116355,0.0
contraceptive_input.pd,0,0.517471,93.286799,0.988437,2,3,True,0.540621,0.517471,0.007191,0.030801,0.792159,0.194415,0.292783,0.691058,0.338249,0.0
generated6_input.np,0,0.8431,31.328753,0.98788,2,3,True,0.851867,0.8431,0.01391,0.017502,1.032056,0.316308,0.255826,0.586819,0.466054,0.0
hrss_input.pd,0,0.767964,121.542232,0.983631,2,3,True,0.768894,0.767964,0.001652,0.00395,1.577939,0.460574,0.227964,0.538856,0.486627,0.0
iris_input.pd,0,0.941812,111.151958,0.986752,2,3,True,0.989447,0.941812,0.006275,0.034498,3.336038,0.467318,0.308037,0.923832,0.573598,0.0


In [46]:
# Frequency of each `params_use_2_dim_gate_based_on` value; dropna=False makes
# missing values show up as their own row instead of being ignored.
df["params_use_2_dim_gate_based_on"].value_counts(dropna=False)

feature_importance_lr              8
feature_importance_lr_max          8
feature_importance_lda_max         7
feature_importance                 6
PCA                                6
feature_importance_pca_loadings    6
feature_importance_xgb             4
feature_importance_lda             3
Name: params_use_2_dim_gate_based_on, dtype: int64

In [48]:
# Show the `params_use_2_dim_clustering` flag of every trial.
# NOTE(review): this prints the whole column; `.value_counts(dropna=False)`
# would give a more compact summary.
df["params_use_2_dim_clustering"]

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
29    True
30    True
31    True
32    True
33    True
34    True
35    True
36    True
37    True
38    True
39    True
40    True
41    True
42    True
43    True
44    True
45    True
46    True
47    True
Name: params_use_2_dim_clustering, dtype: bool