In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import RepeatedKFold

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

# Experiment pipeline, in execution order:
#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [2]:
# Pickled results frame to analyse. The path is marked #CHANGE, i.e. it is the
# value to edit when pointing the notebook at a different results file.
runs_pickle_path = "dataframes/ex1_df_runs_with_hyperparameters_per_dataset_FG_e3_d2.pd" #CHANGE
df = pd.read_pickle(runs_pickle_path)

In [3]:
# Bare last expression: renders the frame read from the pickle as the cell output.
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.737079,0.72114,0.007772,0.013922
1,adult_input.pd,0.840277,0.83561,0.0012,0.013667
2,bank_input.pd,0.911964,0.90616,0.002626,0.024989
3,banknote_input.pd,0.997075,0.992609,0.002832,0.005645
4,breast_cancer_input.np,0.966573,0.92478,0.007139,0.020943
5,cars_input.pd,0.804209,0.795498,0.01499,0.018751
6,contraceptive_input.pd,0.588065,0.559744,0.012452,0.028444
7,generated6_input.np,0.969355,0.965232,0.004528,0.006312
8,hrss_input.pd,0.774616,0.773898,0.002766,0.005019
9,iris_input.pd,0.989511,0.951888,0.007836,0.034196


# Comparison with plain scikit-learn DTs

In [4]:
def _select_rows(data, row_idx):
    """Return the rows of ``data`` at the integer positions ``row_idx``.

    pandas objects are indexed positionally and receive a fresh 0..n-1 index;
    anything else is indexed directly (numpy-style integer-array indexing).
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[row_idx].reset_index(drop=True)
    return data[row_idx]


def k_fold(data_input,data_target,depth,n_repeats,n_splits=4,random_state=None):
    """Evaluate a depth-limited scikit-learn decision tree with repeated k-fold CV.

    For every train/validation split a fresh ``DecisionTreeClassifier`` is
    fitted on the training rows and scored on both the training rows and the
    validation rows.

    Parameters
    ----------
    data_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    data_target : pandas.Series, pandas.DataFrame or numpy.ndarray
        Targets, aligned row-by-row with ``data_input``. Its container type is
        handled independently of ``data_input`` (e.g. DataFrame features with
        an ndarray target are accepted).
    depth : int
        ``max_depth`` passed to the decision tree.
    n_repeats : int
        Number of times the k-fold procedure is repeated.
    n_splits : int, default 4
        Number of folds per repetition.
    random_state : int, RandomState instance or None, default None
        Forwarded to both the splitter and the classifier. ``None`` keeps the
        unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one score per
        split, i.e. ``n_splits * n_repeats`` entries in each list.
    """
    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    for train_idx, val_idx in rkf.split(data_input):

        X_train = _select_rows(data_input, train_idx)
        y_train = _select_rows(data_target, train_idx)

        # A new, untrained tree per split so no state carries over between folds.
        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        clf = clf.fit(X_train, y_train)
        accuracies_training.append(clf.score(X_train, y_train))

        X_val = _select_rows(data_input, val_idx)
        y_val = _select_rows(data_target, val_idx)
        accuracies_validation.append(clf.score(X_val, y_val))

    dict_results = {}
    dict_results["accuracy_train"] = accuracies_training
    dict_results["accuracy_val"] = accuracies_validation

    return dict_results

In [5]:
# Benchmark plain decision trees of several depths on every dataset that appears
# in `df`, collecting one result row per dataset (mean/std of the train and
# validation accuracies returned by k_fold, with the depth encoded in the column name).
datasets = np.unique(df["dataset"])
runs = 100
depths = [2,3,4]
results_rows = []
for dataset in datasets:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted dataset files.
    # Context managers close the files deterministically instead of leaking the handles.
    with open("../datasets/" + dataset, "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset.replace("input","target"), "rb") as target_file:
        data_target = pickle.load(target_file)

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    # Start from a fresh row for every dataset so values from a previous dataset can never leak in.
    depth_row = {"dataset" : dataset}
    for depth in depths:
        dict_results = k_fold(modt.X, modt.y, depth=depth, n_repeats=runs)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        row = {
            "acc_trainDT" + str(depth) : np.mean(accuracies_training),
            "acc_valDT" + str(depth) : np.mean(accuracies_validation),
            "std_trainDT" + str(depth) : np.std(accuracies_training),
            "std_valDT" + str(depth) : np.std(accuracies_validation),
        }
        depth_row.update(row)
    results_rows.append(depth_row)

# Build the frame once from the collected records.
df_results = pd.DataFrame(results_rows)

In [6]:
# Bare last expression: renders the per-dataset decision-tree results as the cell output.
df_results

Unnamed: 0,dataset,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.681998,0.67191,0.008878,0.013857,0.705388,0.69623,0.004809,0.012943,0.724076,0.706505,0.009758,0.014635
1,adult_input.pd,0.82422,0.824163,0.001331,0.004002,0.84032,0.84016,0.001237,0.003694,0.841458,0.840737,0.002404,0.003872
2,bank_input.pd,0.903712,0.903222,0.000904,0.002412,0.908872,0.908157,0.000866,0.002229,0.914318,0.912247,0.000865,0.002419
3,banknote_input.pd,0.915897,0.903776,0.00602,0.015103,0.940938,0.932194,0.006294,0.014595,0.96723,0.954001,0.009038,0.014599
4,breast_cancer_input.np,0.936297,0.913709,0.008778,0.020847,0.958939,0.923572,0.006346,0.02057,0.974065,0.92351,0.006204,0.019213
5,cars_input.pd,0.777778,0.777778,0.005792,0.017377,0.807668,0.798385,0.00581,0.015734,0.820035,0.804115,0.007546,0.017212
6,contraceptive_input.pd,0.494637,0.474141,0.017081,0.024974,0.532077,0.519523,0.012011,0.025821,0.574852,0.555402,0.01022,0.024689
7,generated6_input.np,0.741035,0.73775,0.004026,0.011063,0.749773,0.742364,0.007727,0.01362,0.823915,0.815724,0.00766,0.011697
8,hrss_input.pd,0.76343,0.763217,0.001815,0.004984,0.768267,0.767503,0.002074,0.004918,0.771804,0.770777,0.002784,0.00503
9,iris_input.pd,0.964176,0.936198,0.009131,0.032923,0.976512,0.943195,0.009547,0.038656,0.9924,0.942601,0.007219,0.032482


In [7]:
# Combine both result tables by dataset name. A keyed merge pairs the rows
# explicitly, whereas a column-wise concat pairs them by index label and would
# silently mismatch if the two frames were ordered or indexed differently.
# validate="many_to_one" raises if df_results holds a dataset more than once.
df_c = df.merge(df_results, on="dataset", how="left", validate="many_to_one")

In [8]:
# Bare last expression: renders the combined comparison table as the cell output.
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,acc_valDT4,std_trainDT4,std_valDT4
0,abalone_input.pd,0.737079,0.72114,0.007772,0.013922,0.681998,0.67191,0.008878,0.013857,0.705388,0.69623,0.004809,0.012943,0.724076,0.706505,0.009758,0.014635
1,adult_input.pd,0.840277,0.83561,0.0012,0.013667,0.82422,0.824163,0.001331,0.004002,0.84032,0.84016,0.001237,0.003694,0.841458,0.840737,0.002404,0.003872
2,bank_input.pd,0.911964,0.90616,0.002626,0.024989,0.903712,0.903222,0.000904,0.002412,0.908872,0.908157,0.000866,0.002229,0.914318,0.912247,0.000865,0.002419
3,banknote_input.pd,0.997075,0.992609,0.002832,0.005645,0.915897,0.903776,0.00602,0.015103,0.940938,0.932194,0.006294,0.014595,0.96723,0.954001,0.009038,0.014599
4,breast_cancer_input.np,0.966573,0.92478,0.007139,0.020943,0.936297,0.913709,0.008778,0.020847,0.958939,0.923572,0.006346,0.02057,0.974065,0.92351,0.006204,0.019213
5,cars_input.pd,0.804209,0.795498,0.01499,0.018751,0.777778,0.777778,0.005792,0.017377,0.807668,0.798385,0.00581,0.015734,0.820035,0.804115,0.007546,0.017212
6,contraceptive_input.pd,0.588065,0.559744,0.012452,0.028444,0.494637,0.474141,0.017081,0.024974,0.532077,0.519523,0.012011,0.025821,0.574852,0.555402,0.01022,0.024689
7,generated6_input.np,0.969355,0.965232,0.004528,0.006312,0.741035,0.73775,0.004026,0.011063,0.749773,0.742364,0.007727,0.01362,0.823915,0.815724,0.00766,0.011697
8,hrss_input.pd,0.774616,0.773898,0.002766,0.005019,0.76343,0.763217,0.001815,0.004984,0.768267,0.767503,0.002074,0.004918,0.771804,0.770777,0.002784,0.00503
9,iris_input.pd,0.989511,0.951888,0.007836,0.034196,0.964176,0.936198,0.009131,0.032923,0.976512,0.943195,0.009547,0.038656,0.9924,0.942601,0.007219,0.032482


In [9]:
def _two_decimals(value):
    """Format ``value`` with two decimals and strip leading zeros (0.74 -> .74)."""
    return ("%.2f" % round(value, 2)).lstrip('0')


# (mean column, std column) pairs in the order they appear in the printed table row.
_table_columns = [
    ("acc_train", "std_train"),
    ("acc_val", "std_val"),
    ("acc_valDT2", "std_valDT2"),
    ("acc_valDT3", "std_valDT3"),
    ("acc_valDT4", "std_valDT4"),
]

# Emit one LaTeX table row per dataset: "name & mean $\pm$ std & ... \\".
for _, record in df_c.iterrows():
    # Turn the file name into a readable label, e.g. "breast_cancer_input.np" -> "breast cancer".
    label = record.dataset.replace("_input","").replace(".pd","").replace(".np","").replace("_"," ")
    cells = [
        _two_decimals(record[mean_col]) + " $\\pm$ " + _two_decimals(record[std_col])
        for mean_col, std_col in _table_columns
    ]
    print(" & ".join([label] + cells), "\\\\")

abalone & .74 $\pm$ .01 & .72 $\pm$ .01 & .67 $\pm$ .01 & .70 $\pm$ .01 & .71 $\pm$ .01 \\
adult & .84 $\pm$ .00 & .84 $\pm$ .01 & .82 $\pm$ .00 & .84 $\pm$ .00 & .84 $\pm$ .00 \\
bank & .91 $\pm$ .00 & .91 $\pm$ .02 & .90 $\pm$ .00 & .91 $\pm$ .00 & .91 $\pm$ .00 \\
banknote & 1.00 $\pm$ .00 & .99 $\pm$ .01 & .90 $\pm$ .02 & .93 $\pm$ .01 & .95 $\pm$ .01 \\
breast cancer & .97 $\pm$ .01 & .92 $\pm$ .02 & .91 $\pm$ .02 & .92 $\pm$ .02 & .92 $\pm$ .02 \\
cars & .80 $\pm$ .01 & .80 $\pm$ .02 & .78 $\pm$ .02 & .80 $\pm$ .02 & .80 $\pm$ .02 \\
contraceptive & .59 $\pm$ .01 & .56 $\pm$ .03 & .47 $\pm$ .02 & .52 $\pm$ .03 & .56 $\pm$ .02 \\
generated6 & .97 $\pm$ .00 & .97 $\pm$ .01 & .74 $\pm$ .01 & .74 $\pm$ .01 & .82 $\pm$ .01 \\
hrss & .77 $\pm$ .00 & .77 $\pm$ .01 & .76 $\pm$ .00 & .77 $\pm$ .00 & .77 $\pm$ .01 \\
iris & .99 $\pm$ .01 & .95 $\pm$ .03 & .94 $\pm$ .03 & .94 $\pm$ .04 & .94 $\pm$ .03 \\
steel & .68 $\pm$ .02 & .66 $\pm$ .03 & .53 $\pm$ .02 & .54 $\pm$ .02 & .61 $\pm$ .02 \

In [10]:
# Side-by-side view of two training-accuracy columns of the combined table.
df_c.loc[:, ["acc_train", "acc_trainDT2"]]

Unnamed: 0,acc_train,acc_trainDT2
0,0.737079,0.681998
1,0.840277,0.82422
2,0.911964,0.903712
3,0.997075,0.915897
4,0.966573,0.936297
5,0.804209,0.777778
6,0.588065,0.494637
7,0.969355,0.741035
8,0.774616,0.76343
9,0.989511,0.964176
