In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [2]:
SETUP = "FG"  # "FG" or "2D"
# Results table written by an earlier pipeline step; the setup name is the only variable part of the path.
# NOTE: read_pickle unpickles the file, so only point this at trusted, locally produced files.
RESULTS_PATH_TEMPLATE = "dataframes/ex1_df_runs_with_hyperparameters_per_dataset_{}_e3_d2.pd"
df = pd.read_pickle(RESULTS_PATH_TEMPLATE.format(SETUP))

In [3]:
# Display the results table loaded from the pickle above (mean/std of train and validation accuracy per dataset).
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.747837,0.726982,0.008180226,0.011599
1,banknote_input.pd,0.999786,0.995977,0.0006855274,0.002796
2,breast_cancer_input.np,0.969678,0.94935,0.008339815,0.017807
3,cars_input.pd,0.920281,0.884074,0.01306303,0.014943
4,contraceptive_input.pd,0.570316,0.532634,0.01218697,0.01865
5,generated6_input.np,0.9752,0.97416,3.330669e-16,0.000367
6,iris_input.pd,0.985259,0.959316,0.009240959,0.02297
7,steel_input.pd,0.700606,0.674677,0.01648044,0.013608
8,students_input.pd,0.530365,0.414647,0.01052493,0.025097


# Comparison with plain scikit-learn decision trees and random forests

In [4]:
def k_fold(data_input, data_target, depth, n_repeats, random_state=None):
    """Evaluate a depth-limited decision tree with repeated 4-fold cross-validation.

    Parameters
    ----------
    data_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    data_target : pandas.Series, pandas.DataFrame or numpy.ndarray
        Labels, aligned row-by-row with ``data_input``.
    depth : int or None
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of times the 4-fold split is repeated (``4 * n_repeats`` fits in total).
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to both the splitter and the classifier. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one accuracy
        per fold, in the order the folds were generated.
    """

    def _take_rows(data, positions):
        # The container type is decided per argument, so a DataFrame input
        # combined with an ndarray target (or vice versa) works as well.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.iloc[positions]
        return data[positions]

    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=4, n_repeats=n_repeats, random_state=random_state)
    for train_idx, val_idx in rkf.split(data_input):
        # No reset_index needed: scikit-learn consumes the rows positionally.
        X_train = _take_rows(data_input, train_idx)
        y_train = _take_rows(data_target, train_idx)
        X_val = _take_rows(data_input, val_idx)
        y_val = _take_rows(data_target, val_idx)

        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_val, y_val))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [5]:
def run_DT(data_complete_input, data_complete_target, depth, n_repeats, random_state=None):
    """Fit a depth-limited decision tree on repeated random 75/25 train/test splits.

    Parameters
    ----------
    data_complete_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    data_complete_target : pandas.Series or numpy.ndarray
        Labels, aligned row-by-row with ``data_complete_input``.
    depth : int or None
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of independent split/fit/score rounds.
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the splits and the classifier. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one accuracy
        per round.
    """
    # One generator shared by all rounds: a seeded run is reproducible while
    # every round still draws a different split.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):
        # train_test_split shuffles by default, so a separate shuffle() pass is
        # redundant; scikit-learn also ignores the pandas index, so the split
        # results need no reset_index.
        X_train, X_test, y_train, y_test = train_test_split(
            data_complete_input,
            data_complete_target,
            test_size=0.25,
            random_state=rng,
        )

        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=rng)
        clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_test, y_test))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [6]:
def run_forest(data_complete_input, data_complete_target, estimators, n_repeats, max_depth=None, random_state=None):
    """Fit a random forest on repeated random 75/25 train/test splits.

    Parameters
    ----------
    data_complete_input : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    data_complete_target : pandas.Series or numpy.ndarray
        Labels, aligned row-by-row with ``data_complete_input``.
    estimators : int
        ``n_estimators`` passed to ``RandomForestClassifier``.
    n_repeats : int
        Number of independent split/fit/score rounds.
    max_depth : int or None, optional
        ``max_depth`` passed to ``RandomForestClassifier``; ``None`` leaves the
        trees unrestricted.
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the splits and the forest. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one accuracy
        per round.
    """
    # One generator shared by all rounds: a seeded run is reproducible while
    # every round still draws a different split.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):
        # train_test_split shuffles by default, so a separate shuffle() pass is
        # redundant; scikit-learn also ignores the pandas index, so the split
        # results need no reset_index.
        X_train, X_test, y_train, y_test = train_test_split(
            data_complete_input,
            data_complete_target,
            test_size=0.25,
            random_state=rng,
        )

        clf = RandomForestClassifier(n_estimators=estimators, max_depth=max_depth, random_state=rng)
        clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_test, y_test))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [7]:
# Baseline benchmark: for every dataset listed in `df`, score plain decision
# trees (depths 2-4), unrestricted random forests (3 and 100 trees) and a
# 3-tree forest limited to depth 2. Each configuration contributes four
# columns (mean/std of train and validation accuracy) to one row per dataset.
datasets = np.unique(df["dataset"])
runs = 100
depths = [2,3,4]
results_rows = []


def _summarise_runs(tag, dict_results):
    """Return mean/std of the train and validation accuracies, using `tag` as the column suffix."""
    accuracies_training = dict_results["accuracy_train"]
    accuracies_validation = dict_results["accuracy_val"]
    return {
        "acc_train" + tag : np.mean(accuracies_training),
        "acc_val" + tag : np.mean(accuracies_validation),
        "std_train" + tag : np.std(accuracies_training),
        "std_val" + tag : np.std(accuracies_validation),
    }


for dataset in datasets:
    # NOTE: unpickling executes arbitrary code; only load trusted dataset files.
    # Context managers close the file handles instead of leaking one per dataset.
    with open("../datasets/" + dataset, "rb") as file_input:
        data_input = pickle.load(file_input)
    with open("../datasets/" + dataset.replace("input","target"), "rb") as file_target:
        data_target = pickle.load(file_target)

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    # Start a fresh row per dataset so no value can carry over from the previous one.
    depth_row = {"dataset" : dataset}

    for depth in depths:
        dict_results = run_DT(modt.X, modt.y, depth=depth, n_repeats=runs)
        depth_row.update(_summarise_runs("DT" + str(depth), dict_results))

    for estimators in [3,100]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs)
        depth_row.update(_summarise_runs("RF" + str(estimators), dict_results))

    for estimators in [3]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs, max_depth=2)
        depth_row.update(_summarise_runs("RF" + str(estimators) + "_d2", dict_results))

    results_rows.append(depth_row)

df_results = pd.DataFrame(results_rows)

In [8]:
# Display the baseline table built above (one row per dataset, four columns per DT/RF configuration).
df_results

Unnamed: 0,dataset,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,...,std_trainRF3,std_valRF3,acc_trainRF100,acc_valRF100,std_trainRF100,std_valRF100,acc_trainRF3_d2,acc_valRF3_d2,std_trainRF3_d2,std_valRF3_d2
0,abalone_input.pd,0.680961,0.673866,0.008947,0.014059,0.705453,0.694794,0.00464,0.012224,0.724457,...,0.004067,0.013867,0.999987,0.726641,6.3e-05,0.010764,0.669508,0.664153,0.007526,0.013991
1,banknote_input.pd,0.914704,0.903265,0.006765,0.018199,0.940933,0.932624,0.007122,0.014157,0.969096,...,0.001763,0.007917,1.0,0.991516,0.0,0.004774,0.902391,0.895685,0.037409,0.041308
2,breast_cancer_input.np,0.935986,0.915385,0.00887,0.02576,0.959836,0.922937,0.006445,0.01953,0.973427,...,0.005802,0.019082,0.999977,0.94049,0.000234,0.018541,0.927183,0.911818,0.013436,0.024423
3,cars_input.pd,0.778094,0.776829,0.005621,0.016862,0.808511,0.795579,0.00596,0.016679,0.819668,...,0.004369,0.019449,1.0,0.956736,0.0,0.010438,0.709745,0.711458,0.01859,0.028725
4,contraceptive_input.pd,0.490924,0.475881,0.016777,0.023075,0.532473,0.521463,0.011705,0.023969,0.572926,...,0.008559,0.021677,0.964013,0.509566,0.003288,0.022513,0.476277,0.459593,0.028695,0.035287
5,generated6_input.np,0.740931,0.7368,0.003997,0.01109,0.749157,0.74096,0.008112,0.013093,0.825373,...,0.001341,0.004595,0.999989,0.971472,5.2e-05,0.004807,0.733531,0.729848,0.030087,0.03425
6,iris_input.pd,0.963304,0.941053,0.007975,0.028615,0.975982,0.946053,0.009998,0.034588,0.992232,...,0.009868,0.035246,1.0,0.948684,0.0,0.032524,0.945982,0.926579,0.032546,0.058525
7,steel_input.pd,0.537718,0.530041,0.007593,0.021793,0.553842,0.539712,0.010324,0.019371,0.626405,...,0.005797,0.021813,0.999993,0.77537,6.8e-05,0.016417,0.53167,0.529547,0.012442,0.019358
8,students_input.pd,0.49503,0.479102,0.011125,0.040021,0.533046,0.510479,0.011356,0.029909,0.558056,...,0.012507,0.037371,0.941944,0.481557,0.005728,0.035041,0.467034,0.432455,0.051782,0.056221


In [9]:
# Combine the loaded results with the baseline results, matching rows on the
# dataset name. A positional concat(axis=1) would silently pair the wrong rows
# whenever the two frames are ordered differently; the merge keeps a single
# "dataset" column and fails loudly if df_results has duplicate dataset rows.
df_c = df.merge(df_results, on="dataset", how="left", validate="many_to_one")

In [10]:
# Display the combined table: columns of `df` followed by the DT/RF baseline columns.
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,...,std_trainRF3,std_valRF3,acc_trainRF100,acc_valRF100,std_trainRF100,std_valRF100,acc_trainRF3_d2,acc_valRF3_d2,std_trainRF3_d2,std_valRF3_d2
0,abalone_input.pd,0.747837,0.726982,0.008180226,0.011599,0.680961,0.673866,0.008947,0.014059,0.705453,...,0.004067,0.013867,0.999987,0.726641,6.3e-05,0.010764,0.669508,0.664153,0.007526,0.013991
1,banknote_input.pd,0.999786,0.995977,0.0006855274,0.002796,0.914704,0.903265,0.006765,0.018199,0.940933,...,0.001763,0.007917,1.0,0.991516,0.0,0.004774,0.902391,0.895685,0.037409,0.041308
2,breast_cancer_input.np,0.969678,0.94935,0.008339815,0.017807,0.935986,0.915385,0.00887,0.02576,0.959836,...,0.005802,0.019082,0.999977,0.94049,0.000234,0.018541,0.927183,0.911818,0.013436,0.024423
3,cars_input.pd,0.920281,0.884074,0.01306303,0.014943,0.778094,0.776829,0.005621,0.016862,0.808511,...,0.004369,0.019449,1.0,0.956736,0.0,0.010438,0.709745,0.711458,0.01859,0.028725
4,contraceptive_input.pd,0.570316,0.532634,0.01218697,0.01865,0.490924,0.475881,0.016777,0.023075,0.532473,...,0.008559,0.021677,0.964013,0.509566,0.003288,0.022513,0.476277,0.459593,0.028695,0.035287
5,generated6_input.np,0.9752,0.97416,3.330669e-16,0.000367,0.740931,0.7368,0.003997,0.01109,0.749157,...,0.001341,0.004595,0.999989,0.971472,5.2e-05,0.004807,0.733531,0.729848,0.030087,0.03425
6,iris_input.pd,0.985259,0.959316,0.009240959,0.02297,0.963304,0.941053,0.007975,0.028615,0.975982,...,0.009868,0.035246,1.0,0.948684,0.0,0.032524,0.945982,0.926579,0.032546,0.058525
7,steel_input.pd,0.700606,0.674677,0.01648044,0.013608,0.537718,0.530041,0.007593,0.021793,0.553842,...,0.005797,0.021813,0.999993,0.77537,6.8e-05,0.016417,0.53167,0.529547,0.012442,0.019358
8,students_input.pd,0.530365,0.414647,0.01052493,0.025097,0.49503,0.479102,0.011125,0.040021,0.533046,...,0.012507,0.037371,0.941944,0.481557,0.005728,0.035041,0.467034,0.432455,0.051782,0.056221


In [11]:
# Print one LaTeX table row per dataset: "<name> & acc $\pm$ std & ... \\".
# Columns, in order: train, val, DT depth 2/3/4 (val), RF3 depth 2 (val), RF100 (val).
# The unrestricted RF3 columns (acc_valRF3 / std_valRF3) are intentionally left out.
latex_column_suffixes = ("train", "val", "valDT2", "valDT3", "valDT4", "valRF3_d2", "valRF100")
for _idx, row in df_c.iterrows():
    # Turn the file name into a readable label, e.g. "breast_cancer_input.np" -> "breast cancer".
    label = row["dataset"]
    for token, replacement in (("_input", ""), (".pd", ""), (".np", ""), ("_", " ")):
        label = label.replace(token, replacement)

    cells = []
    for suffix in latex_column_suffixes:
        # Two decimals with the leading zero dropped (".75"); "1.00" is left unchanged.
        acc_text = ("%.2f" % round(row["acc_" + suffix], 2)).lstrip('0')
        std_text = ("%.2f" % round(row["std_" + suffix], 2)).lstrip('0')
        cells.append(acc_text + " $\\pm$ " + std_text)

    print(label, "&", " & ".join(cells), "\\\\")

abalone & .75 $\pm$ .01 & .73 $\pm$ .01 & .67 $\pm$ .01 & .69 $\pm$ .01 & .71 $\pm$ .02 & .66 $\pm$ .01 & .73 $\pm$ .01 \\
banknote & 1.00 $\pm$ .00 & 1.00 $\pm$ .00 & .90 $\pm$ .02 & .93 $\pm$ .01 & .96 $\pm$ .02 & .90 $\pm$ .04 & .99 $\pm$ .00 \\
breast cancer & .97 $\pm$ .01 & .95 $\pm$ .02 & .92 $\pm$ .03 & .92 $\pm$ .02 & .92 $\pm$ .02 & .91 $\pm$ .02 & .94 $\pm$ .02 \\
cars & .92 $\pm$ .01 & .88 $\pm$ .01 & .78 $\pm$ .02 & .80 $\pm$ .02 & .81 $\pm$ .02 & .71 $\pm$ .03 & .96 $\pm$ .01 \\
contraceptive & .57 $\pm$ .01 & .53 $\pm$ .02 & .48 $\pm$ .02 & .52 $\pm$ .02 & .55 $\pm$ .03 & .46 $\pm$ .04 & .51 $\pm$ .02 \\
generated6 & .98 $\pm$ .00 & .97 $\pm$ .00 & .74 $\pm$ .01 & .74 $\pm$ .01 & .82 $\pm$ .01 & .73 $\pm$ .03 & .97 $\pm$ .00 \\
iris & .99 $\pm$ .01 & .96 $\pm$ .02 & .94 $\pm$ .03 & .95 $\pm$ .03 & .95 $\pm$ .03 & .93 $\pm$ .06 & .95 $\pm$ .03 \\
steel & .70 $\pm$ .02 & .67 $\pm$ .01 & .53 $\pm$ .02 & .54 $\pm$ .02 & .61 $\pm$ .02 & .53 $\pm$ .02 & .78 $\pm$ .02 \\
studen

In [12]:
# Side-by-side view of the loaded training accuracy (presumably MoDT - confirm against the upstream script)
# and the depth-2 decision tree training accuracy.
df_c[["acc_train","acc_trainDT2"]]

Unnamed: 0,acc_train,acc_trainDT2
0,0.747837,0.680961
1,0.999786,0.914704
2,0.969678,0.935986
3,0.920281,0.778094
4,0.570316,0.490924
5,0.9752,0.740931
6,0.985259,0.963304
7,0.700606,0.537718
8,0.530365,0.49503


In [13]:
# Print only the training-accuracy cell ("acc $\pm$ std &") for each dataset.
for _idx, row in df_c.iterrows():
    acc_text = ("%.2f" % round(row["acc_train"], 2)).lstrip('0')
    std_text = ("%.2f" % round(row["std_train"], 2)).lstrip('0')
    print(acc_text, "$\\pm$", std_text, "&")

.75 $\pm$ .01 &
1.00 $\pm$ .00 &
.97 $\pm$ .01 &
.92 $\pm$ .01 &
.57 $\pm$ .01 &
.98 $\pm$ .00 &
.99 $\pm$ .01 &
.70 $\pm$ .02 &
.53 $\pm$ .01 &


In [15]:
# Print only the validation-accuracy cell ("acc $\pm$ std \\") for each dataset.
for _idx, row in df_c.iterrows():
    acc_text = ("%.2f" % round(row["acc_val"], 2)).lstrip('0')
    std_text = ("%.2f" % round(row["std_val"], 2)).lstrip('0')
    print(acc_text, "$\\pm$", std_text, "\\\\")

.73 $\pm$ .01 \\
1.00 $\pm$ .00 \\
.95 $\pm$ .02 \\
.88 $\pm$ .01 \\
.53 $\pm$ .02 \\
.97 $\pm$ .00 \\
.96 $\pm$ .02 \\
.67 $\pm$ .01 \\
.41 $\pm$ .03 \\
