In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import RepeatedKFold

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

# Order in which the ex1 scripts/notebooks are run (each step feeds the next):
#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [2]:
# Pickled DataFrame with one row per dataset (columns used below: "dataset",
# "acc_train", "acc_val", "std_train", "std_val"). The path is relative to the
# notebook's working directory and must be adapted per experiment.
runs_pickle_path = "dataframes/ex1_df_runs_with_hyperparameters_per_dataset_2D_e3_d2.pd"  #CHANGE
df = pd.read_pickle(runs_pickle_path)

FileNotFoundError: [Errno 2] No such file or directory: 'dataframes/ex1_df_runs_with_hyperparameters_per_dataset_2D_e3_d2.pd'

In [None]:
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.753475,0.733716,0.008918,0.015058
1,bank_input.pd,0.908656,0.888749,0.001659,0.053686
2,banknote_input.pd,0.999271,0.995457,0.001218,0.004434
3,breast_cancer_input.np,0.96817,0.927191,0.007123,0.020998
4,cars_input.pd,0.91603,0.898071,0.013059,0.017307
5,contraceptive_input.pd,0.587267,0.517557,0.021712,0.03005
6,hrss_input.pd,0.776416,0.77476,0.004226,0.005406
7,iris_input.pd,0.990372,0.947807,0.009699,0.034951
8,occupancy_input.pd,0.993049,0.991134,0.001849,0.002877
9,pdm6_input.pd,0.983866,0.980423,0.002195,0.003455


# Comparison with plain scikit-learn decision trees (DTs)

In [None]:
def _take_rows(data, row_idx):
    """Return the rows of ``data`` at the integer positions ``row_idx``.

    pandas objects are indexed positionally via ``.iloc`` (and get a fresh
    0..n-1 index); anything else is indexed directly, which for NumPy arrays
    is positional as well.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[row_idx].reset_index(drop=True)
    return data[row_idx]


def k_fold(data_input, data_target, depth, n_repeats, n_splits=4, random_state=None):
    """Score a depth-limited scikit-learn decision tree with repeated k-fold CV.

    Parameters
    ----------
    data_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one sample per row.
    data_target : pandas.Series, pandas.DataFrame or numpy.ndarray
        Labels, row-aligned with ``data_input``. Its container type is
        handled independently of ``data_input``'s, so mixed pandas/NumPy
        inputs are selected positionally in both cases.
    depth : int
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of times the k-fold split is repeated.
    n_splits : int, default 4
        Number of folds per repetition.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to both ``RepeatedKFold`` and ``DecisionTreeClassifier``.
        ``None`` keeps the previous unseeded (non-reproducible) behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one accuracy
        per fold, i.e. ``n_splits * n_repeats`` entries in each list.
    """
    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    for train_idx, val_idx in rkf.split(data_input):
        X_train = _take_rows(data_input, train_idx)
        y_train = _take_rows(data_target, train_idx)
        X_val = _take_rows(data_input, val_idx)
        y_val = _take_rows(data_target, val_idx)

        # A fresh tree per fold; it only ever sees the training rows.
        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_val, y_val))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [None]:
# Benchmark plain decision trees of several depths on every dataset listed in
# `df`, producing one wide row per dataset with mean/std of the train and
# validation accuracies for each depth.
datasets = np.unique(df["dataset"])  # unique dataset file names, sorted
runs = 100  # number of k-fold repetitions per (dataset, depth)
results_rows = []
depth_row = {}
depths = [2,3,4]
for dataset in datasets:
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # Context managers make sure the file handles are closed again.
    with open("../datasets/" + dataset, "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset.replace("input","target"), "rb") as target_file:
        data_target = pickle.load(target_file)

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    # Start from an empty row for each dataset so no value can leak over from
    # the previously processed dataset.
    depth_row = {}
    for depth in depths:
        dict_results = k_fold(modt.X, modt.y, depth=depth, n_repeats=runs)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        row = {
            "dataset" : dataset,
            "acc_trainDT" + str(depth) : np.mean(accuracies_training),
            "acc_valDT" + str(depth) : np.mean(accuracies_validation),
            "std_trainDT" + str(depth) : np.std(accuracies_training),
            "std_valDT" + str(depth) : np.std(accuracies_validation),
        }
        # Column names carry the depth, so each depth adds its own four columns.
        depth_row.update(row)
    results_rows.append(depth_row)

# Build the frame once from the collected records.
df_results = pd.DataFrame(results_rows)

In [None]:
df_results

Unnamed: 0,dataset,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.681689,0.671994,0.008574,0.013721,0.705253,0.695774,0.004428,0.013415,0.725229,0.707058,0.009092,0.014555
1,bank_input.pd,0.903723,0.903246,0.000961,0.002654,0.908889,0.908193,0.000934,0.002486,0.914374,0.912159,0.000956,0.002462
2,banknote_input.pd,0.915073,0.902784,0.007441,0.016093,0.940717,0.931778,0.006844,0.015613,0.967459,0.952996,0.009602,0.015956
3,breast_cancer_input.np,0.936597,0.914462,0.008071,0.019735,0.959111,0.923657,0.006014,0.018257,0.973726,0.923425,0.006358,0.02012
4,cars_input.pd,0.777778,0.777778,0.005818,0.017455,0.807656,0.798843,0.005448,0.015655,0.819688,0.803981,0.00759,0.017461
5,contraceptive_input.pd,0.494291,0.473292,0.017608,0.026573,0.532181,0.518656,0.01221,0.024783,0.574856,0.556532,0.010642,0.025391
6,hrss_input.pd,0.763388,0.763196,0.001664,0.004641,0.768245,0.767476,0.001981,0.004635,0.77177,0.770711,0.002678,0.004858
7,iris_input.pd,0.963711,0.937624,0.009519,0.033191,0.976357,0.944881,0.009508,0.035988,0.992246,0.944603,0.007197,0.033418
8,occupancy_input.pd,0.98879,0.987614,0.000881,0.002128,0.989837,0.988128,0.000934,0.002177,0.992655,0.989881,0.001378,0.002393
9,pdm6_input.pd,0.971702,0.971006,0.001132,0.003183,0.972372,0.971208,0.000989,0.002995,0.973784,0.972011,0.001183,0.00286


In [None]:
# Attach the decision-tree benchmark columns to the MoDT results.
# Join on the "dataset" key instead of concatenating side by side: a
# column-wise concat aligns purely on the index, so it would silently pair
# metrics with the wrong dataset whenever the two frames are ordered differently.
# Columns already present in `df` are taken from `df` only (as before, the
# first occurrence wins), which also avoids merge suffixes.
dt_columns = [
    column for column in df_results.columns
    if column == "dataset" or column not in df.columns
]
# validate= raises if `df_results` ever holds more than one row per dataset.
df_c = df.merge(df_results[dt_columns], on="dataset", how="left", validate="many_to_one")

In [None]:
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.753475,0.733716,0.008918,0.015058,0.681689,0.671994,0.008574,0.013721,0.705253,0.695774,0.004428,0.013415,0.725229,0.707058,0.009092,0.014555
1,bank_input.pd,0.908656,0.888749,0.001659,0.053686,0.903723,0.903246,0.000961,0.002654,0.908889,0.908193,0.000934,0.002486,0.914374,0.912159,0.000956,0.002462
2,banknote_input.pd,0.999271,0.995457,0.001218,0.004434,0.915073,0.902784,0.007441,0.016093,0.940717,0.931778,0.006844,0.015613,0.967459,0.952996,0.009602,0.015956
3,breast_cancer_input.np,0.96817,0.927191,0.007123,0.020998,0.936597,0.914462,0.008071,0.019735,0.959111,0.923657,0.006014,0.018257,0.973726,0.923425,0.006358,0.02012
4,cars_input.pd,0.91603,0.898071,0.013059,0.017307,0.777778,0.777778,0.005818,0.017455,0.807656,0.798843,0.005448,0.015655,0.819688,0.803981,0.00759,0.017461
5,contraceptive_input.pd,0.587267,0.517557,0.021712,0.03005,0.494291,0.473292,0.017608,0.026573,0.532181,0.518656,0.01221,0.024783,0.574856,0.556532,0.010642,0.025391
6,hrss_input.pd,0.776416,0.77476,0.004226,0.005406,0.763388,0.763196,0.001664,0.004641,0.768245,0.767476,0.001981,0.004635,0.77177,0.770711,0.002678,0.004858
7,iris_input.pd,0.990372,0.947807,0.009699,0.034951,0.963711,0.937624,0.009519,0.033191,0.976357,0.944881,0.009508,0.035988,0.992246,0.944603,0.007197,0.033418
8,occupancy_input.pd,0.993049,0.991134,0.001849,0.002877,0.98879,0.987614,0.000881,0.002128,0.989837,0.988128,0.000934,0.002177,0.992655,0.989881,0.001378,0.002393
9,pdm6_input.pd,0.983866,0.980423,0.002195,0.003455,0.971702,0.971006,0.001132,0.003183,0.972372,0.971208,0.000989,0.002995,0.973784,0.972011,0.001183,0.00286


In [None]:
# Print one LaTeX table row per dataset:
#   name & train mean±std & val mean±std & DT2 val & DT3 val & DT4 val \\
# Numbers are shown with two decimals and without the leading zero.
for _, record in df_c.iterrows():
    # Turn the file name into a readable label, e.g. "breast_cancer_input.np"
    # becomes "breast cancer".
    label = record["dataset"]
    for token, substitute in (("_input", ""), (".pd", ""), (".np", ""), ("_", " ")):
        label = label.replace(token, substitute)

    cells = [label]
    for mean_column, std_column in (
        ("acc_train", "std_train"),
        ("acc_val", "std_val"),
        ("acc_valDT2", "std_valDT2"),
        ("acc_valDT3", "std_valDT3"),
        ("acc_valDT4", "std_valDT4"),
    ):
        mean_text = ("%.2f" % round(record[mean_column], 2)).lstrip('0')
        std_text = ("%.2f" % round(record[std_column], 2)).lstrip('0')
        cells.extend(["&", mean_text, "$\\pm$", std_text])

    print(*cells, "\\\\")

abalone & .75 $\pm$ .01 & .73 $\pm$ .02 & .67 $\pm$ .01 & .70 $\pm$ .01 & .71 $\pm$ .01 \\
bank & .91 $\pm$ .00 & .89 $\pm$ .05 & .90 $\pm$ .00 & .91 $\pm$ .00 & .91 $\pm$ .00 \\
banknote & 1.00 $\pm$ .00 & 1.00 $\pm$ .00 & .90 $\pm$ .02 & .93 $\pm$ .02 & .95 $\pm$ .02 \\
breast cancer & .97 $\pm$ .01 & .93 $\pm$ .02 & .91 $\pm$ .02 & .92 $\pm$ .02 & .92 $\pm$ .02 \\
cars & .92 $\pm$ .01 & .90 $\pm$ .02 & .78 $\pm$ .02 & .80 $\pm$ .02 & .80 $\pm$ .02 \\
contraceptive & .59 $\pm$ .02 & .52 $\pm$ .03 & .47 $\pm$ .03 & .52 $\pm$ .02 & .56 $\pm$ .03 \\
hrss & .78 $\pm$ .00 & .77 $\pm$ .01 & .76 $\pm$ .00 & .77 $\pm$ .00 & .77 $\pm$ .00 \\
iris & .99 $\pm$ .01 & .95 $\pm$ .03 & .94 $\pm$ .03 & .94 $\pm$ .04 & .94 $\pm$ .03 \\
occupancy & .99 $\pm$ .00 & .99 $\pm$ .00 & .99 $\pm$ .00 & .99 $\pm$ .00 & .99 $\pm$ .00 \\
pdm6 & .98 $\pm$ .00 & .98 $\pm$ .00 & .97 $\pm$ .00 & .97 $\pm$ .00 & .97 $\pm$ .00 \\
steel & .71 $\pm$ .02 & .67 $\pm$ .03 & .53 $\pm$ .02 & .54 $\pm$ .02 & .61 $\pm$ .02 \\

In [None]:
df_c[["acc_train","acc_trainDT2"]]

Unnamed: 0,acc_train,acc_trainDT2
0,0.753475,0.681689
1,0.908656,0.903723
2,0.999271,0.915073
3,0.96817,0.936597
4,0.91603,0.777778
5,0.587267,0.494291
6,0.776416,0.763388
7,0.990372,0.963711
8,0.993049,0.98879
9,0.983866,0.971702
