In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import RepeatedKFold

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [2]:
# Which experiment setup's results to analyse; selects the pickle file below.
SETUP = "2D"  # "FG" or "2D"

# NOTE: read_pickle unpickles the file, so only load result files you trust.
df = pd.read_pickle(
    "dataframes/ex1_df_runs_with_hyperparameters_per_dataset_{}_e3_d2.pd".format(SETUP)
)

In [3]:
# Show the results table loaded from the pickle in the previous cell.
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.706897,0.689234,0.015887,0.017984
1,banknote_input.pd,0.981535,0.967201,0.010756,0.005205
2,breast_cancer_input.np,0.961268,0.928322,0.004232,0.01514
3,cars_input.pd,0.816358,0.782407,0.001336,0.012028
4,contraceptive_input.pd,0.542572,0.523713,0.014134,0.013868
5,generated6_input.np,0.854667,0.8566,0.00604,0.004094
6,iris_input.pd,0.986607,0.953947,0.004464,0.02182
7,steel_input.pd,0.583849,0.549897,0.017985,0.026685
8,students_input.pd,0.543587,0.413174,0.032356,0.052545


# Comparison with plain scikit-learn DTs

In [4]:
def _select_rows(data, idx):
    """Return the rows of ``data`` at integer positions ``idx``.

    pandas objects are indexed positionally (``.iloc``) and get a fresh
    0..n-1 index; anything else is assumed to support NumPy-style fancy
    indexing with an integer array.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[idx].reset_index(drop=True)
    return data[idx]


def k_fold(data_input, data_target, depth, n_repeats, n_splits=4, random_state=None):
    """Evaluate a plain scikit-learn decision tree with repeated k-fold CV.

    Parameters
    ----------
    data_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    data_target : pandas.Series, pandas.DataFrame or numpy.ndarray
        Targets aligned row-by-row with ``data_input``.
    depth : int or None
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of times the k-fold split is repeated.
    n_splits : int, default 4
        Number of folds per repetition.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to both the splitter and the classifier. ``None``
        keeps the previous (non-deterministic) behaviour; pass an int to
        make the benchmark reproducible.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one
        accuracy per fold (``n_splits * n_repeats`` entries each).
    """
    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    for train_idx, val_idx in rkf.split(data_input):
        # Input and target are selected independently so that mixed
        # pandas/NumPy pairs are always indexed by position, never by label.
        X_train = _select_rows(data_input, train_idx)
        y_train = _select_rows(data_target, train_idx)
        X_val = _select_rows(data_input, val_idx)
        y_val = _select_rows(data_target, val_idx)

        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        clf = clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_val, y_val))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [5]:
# Benchmark plain decision trees of several depths on every dataset that
# appears in `df`, collecting one summary row (mean/std accuracy) per dataset.
datasets = np.unique(df["dataset"])
runs = 100  # number of k-fold repetitions per (dataset, depth)
depths = [2,3,4]
results_rows = []
for dataset in datasets:
    # NOTE: pickle.load can execute arbitrary code; only load trusted dataset files.
    # Context managers make sure the file handles are closed again.
    with open("../datasets/" + dataset, "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset.replace("input","target"), "rb") as target_file:
        data_target = pickle.load(target_file)

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    # Start a fresh row for every dataset so no value can leak over from the
    # previous dataset's results.
    depth_row = {"dataset" : dataset}
    for depth in depths:
        dict_results = k_fold(modt.X, modt.y, depth=depth, n_repeats=runs)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        depth_row.update({
            "acc_trainDT" + str(depth) : np.mean(accuracies_training),
            "acc_valDT" + str(depth) : np.mean(accuracies_validation),
            "std_trainDT" + str(depth) : np.std(accuracies_training),
            "std_valDT" + str(depth) : np.std(accuracies_validation),
        })
    results_rows.append(depth_row)

# Build the frame once from the collected records.
df_results = pd.DataFrame(results_rows)

In [6]:
# Show the decision-tree results: one row per dataset, mean/std accuracy per depth.
df_results

Unnamed: 0,dataset,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.681753,0.67237,0.008829,0.013847,0.705242,0.695059,0.004428,0.012526,0.724608,0.706577,0.009579,0.013639
1,adult_input.pd,0.82423,0.824126,0.001304,0.003916,0.840315,0.840177,0.001283,0.003884,0.841327,0.840537,0.002212,0.003887
2,bank_input.pd,0.90368,0.903195,0.000951,0.002412,0.908897,0.908124,0.000917,0.002345,0.91433,0.912044,0.000873,0.002308
3,banknote_input.pd,0.915381,0.903652,0.006966,0.015847,0.94138,0.932529,0.006524,0.014289,0.967556,0.95344,0.009147,0.015077
4,breast_cancer_input.np,0.936275,0.913361,0.008751,0.021181,0.958841,0.925939,0.006331,0.019116,0.973926,0.923048,0.006557,0.020423
5,cars_input.pd,0.777778,0.777778,0.006007,0.018021,0.807658,0.798692,0.005823,0.016101,0.819375,0.804537,0.008242,0.015945
6,contraceptive_input.pd,0.49422,0.473566,0.016963,0.026048,0.531174,0.518791,0.01256,0.026625,0.574245,0.55304,0.010516,0.026296
7,generated6_input.np,0.740953,0.73768,0.003817,0.010722,0.749883,0.742566,0.007898,0.013133,0.823634,0.815186,0.007732,0.011502
8,hrss_input.pd,0.763458,0.763236,0.001717,0.00473,0.768203,0.767458,0.002155,0.004994,0.771911,0.770814,0.002924,0.005021
9,iris_input.pd,0.964219,0.936495,0.009371,0.032762,0.976422,0.944781,0.009967,0.034991,0.992268,0.941846,0.00693,0.030844


In [7]:
# Combine the loaded results with the decision-tree baselines.
# Join on the dataset name rather than on row position: a positional
# concat silently pairs the wrong rows whenever `df` is not in the same
# (sorted) order as `df_results`.
df_c = df.merge(df_results, on="dataset", how="left")

In [8]:
# Show the combined table (columns of `df` followed by the decision-tree columns).
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.751729,0.732281,0.010941,0.014406,0.681753,0.67237,0.008829,0.013847,0.705242,0.695059,0.004428,0.012526,0.724608,0.706577,0.009579,0.013639
1,adult_input.pd,0.837288,0.829456,0.003152,0.02307,0.82423,0.824126,0.001304,0.003916,0.840315,0.840177,0.001283,0.003884,0.841327,0.840537,0.002212,0.003887
2,bank_input.pd,0.909411,0.896069,0.002791,0.033133,0.90368,0.903195,0.000951,0.002412,0.908897,0.908124,0.000917,0.002345,0.91433,0.912044,0.000873,0.002308
3,banknote_input.pd,0.999461,0.996399,0.001128,0.004728,0.915381,0.903652,0.006966,0.015847,0.94138,0.932529,0.006524,0.014289,0.967556,0.95344,0.009147,0.015077
4,breast_cancer_input.np,0.981664,0.921825,0.007198,0.024731,0.936275,0.913361,0.008751,0.021181,0.958841,0.925939,0.006331,0.019116,0.973926,0.923048,0.006557,0.020423
5,cars_input.pd,0.91142,0.893611,0.020054,0.021458,0.777778,0.777778,0.006007,0.018021,0.807658,0.798692,0.005823,0.016101,0.819375,0.804537,0.008242,0.015945
6,contraceptive_input.pd,0.588496,0.520401,0.017325,0.026364,0.49422,0.473566,0.016963,0.026048,0.531174,0.518791,0.01256,0.026625,0.574245,0.55304,0.010516,0.026296
7,generated6_input.np,0.969011,0.964952,0.006259,0.006843,0.740953,0.73768,0.003817,0.010722,0.749883,0.742566,0.007898,0.013133,0.823634,0.815186,0.007732,0.011502
8,hrss_input.pd,0.775816,0.774205,0.004313,0.006172,0.763458,0.763236,0.001717,0.00473,0.768203,0.767458,0.002155,0.004994,0.771911,0.770814,0.002924,0.005021
9,iris_input.pd,0.994401,0.94596,0.006303,0.033928,0.964219,0.936495,0.009371,0.032762,0.976422,0.944781,0.009967,0.034991,0.992268,0.941846,0.00693,0.030844


In [9]:
def _latex_num(value):
    """Format ``value`` with two decimals and strip leading zeros (0.75 -> '.75')."""
    return ("%.2f" % round(value, 2)).lstrip('0')


# (mean column, std column) pairs in the order they appear in the LaTeX table.
_latex_column_pairs = [
    ("acc_train", "std_train"),
    ("acc_val", "std_val"),
    ("acc_valDT2", "std_valDT2"),
    ("acc_valDT3", "std_valDT3"),
    ("acc_valDT4", "std_valDT4"),
]

# Emit one LaTeX table row per dataset: "name & mean $\pm$ std & ... \\".
for index, row in df_c.iterrows():
    # Turn the file name into a readable label, e.g. "breast_cancer_input.np" -> "breast cancer".
    label = row["dataset"].replace("_input","").replace(".pd","").replace(".np","").replace("_"," ")

    cells = [label]
    for mean_col, std_col in _latex_column_pairs:
        cells.extend(["&", _latex_num(row[mean_col]), "$\\pm$", _latex_num(row[std_col])])
    cells.append("\\\\")

    # print joins its arguments with single spaces, matching the table layout.
    print(*cells)

abalone & .75 $\pm$ .01 & .73 $\pm$ .01 & .67 $\pm$ .01 & .70 $\pm$ .01 & .71 $\pm$ .01 \\
adult & .84 $\pm$ .00 & .83 $\pm$ .02 & .82 $\pm$ .00 & .84 $\pm$ .00 & .84 $\pm$ .00 \\
bank & .91 $\pm$ .00 & .90 $\pm$ .03 & .90 $\pm$ .00 & .91 $\pm$ .00 & .91 $\pm$ .00 \\
banknote & 1.00 $\pm$ .00 & 1.00 $\pm$ .00 & .90 $\pm$ .02 & .93 $\pm$ .01 & .95 $\pm$ .02 \\
breast cancer & .98 $\pm$ .01 & .92 $\pm$ .02 & .91 $\pm$ .02 & .93 $\pm$ .02 & .92 $\pm$ .02 \\
cars & .91 $\pm$ .02 & .89 $\pm$ .02 & .78 $\pm$ .02 & .80 $\pm$ .02 & .80 $\pm$ .02 \\
contraceptive & .59 $\pm$ .02 & .52 $\pm$ .03 & .47 $\pm$ .03 & .52 $\pm$ .03 & .55 $\pm$ .03 \\
generated6 & .97 $\pm$ .01 & .96 $\pm$ .01 & .74 $\pm$ .01 & .74 $\pm$ .01 & .82 $\pm$ .01 \\
hrss & .78 $\pm$ .00 & .77 $\pm$ .01 & .76 $\pm$ .00 & .77 $\pm$ .00 & .77 $\pm$ .01 \\
iris & .99 $\pm$ .01 & .95 $\pm$ .03 & .94 $\pm$ .03 & .94 $\pm$ .03 & .94 $\pm$ .03 \\
steel & .71 $\pm$ .02 & .66 $\pm$ .02 & .53 $\pm$ .02 & .53 $\pm$ .02 & .61 $\pm$ .02 

In [10]:
# Side-by-side view of the training accuracy from `df` (presumably the MoDT runs; confirm)
# and the training accuracy of the depth-2 decision tree.
df_c[["acc_train","acc_trainDT2"]]

Unnamed: 0,acc_train,acc_trainDT2
0,0.751729,0.681753
1,0.837288,0.82423
2,0.909411,0.90368
3,0.999461,0.915381
4,0.981664,0.936275
5,0.91142,0.777778
6,0.588496,0.49422
7,0.969011,0.740953
8,0.775816,0.763458
9,0.994401,0.964219
