In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [2]:
# Which experiment setup's results to analyse.
SETUP = "FG"  # "FG" or "2D"

# Load the per-dataset run summary that was pickled for the chosen setup.
runs_pickle_path = "dataframes/ex1_df_runs_with_hyperparameters_per_dataset_{}_e3_d2.pd".format(SETUP)
df = pd.read_pickle(runs_pickle_path)

In [3]:
# Display the results table loaded from the pickle in the previous cell.
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.740544,0.714638,0.004586,0.008545
1,banknote_input.pd,0.999944,0.988236,0.000227,0.001404
2,breast_cancer_input.np,0.961615,0.942867,0.003741,0.013413
3,cars_input.pd,0.816618,0.775979,0.011827,0.010095
4,contraceptive_input.pd,0.594323,0.581661,0.005829,0.005286
5,generated6_input.np,0.9752,0.97416,3.330669e-16,0.000367
6,iris_input.pd,0.986732,0.946921,0.007783,0.021871
7,steel_input.pd,0.680449,0.680965,0.014563,0.013425
8,students_input.pd,0.5769,0.449228,0.020305,0.009204


# Comparison with plain scikit-learn DTs and random forests

In [4]:
def k_fold(data_input,data_target,depth,n_repeats,n_splits=4,random_state=None):
    """Evaluate a depth-limited decision tree with repeated k-fold cross-validation.

    Parameters
    ----------
    data_input : pandas.DataFrame or array-like
        Feature matrix. pandas objects are indexed positionally (``.iloc``),
        anything else with plain ``data[idx]`` indexing.
    data_target : pandas.Series, pandas.DataFrame or array-like
        Target labels, indexed the same way as ``data_input``.
    depth : int or None
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of times the k-fold split is repeated.
    n_splits : int, default 4
        Number of folds per repetition.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the splitter and the trees. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one entry
        per fold (``n_splits * n_repeats`` entries each).
    """
    # One shared RandomState so that successive folds draw different, but
    # reproducible, random numbers. None is forwarded unchanged.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def take_rows(data, idx):
        # Decide per object: input and target need not be the same container type.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.iloc[idx].reset_index(drop=True)
        return data[idx]

    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=rng)
    for train_idx, val_idx in rkf.split(data_input):
        X_train = take_rows(data_input, train_idx)
        y_train = take_rows(data_target, train_idx)
        X_val = take_rows(data_input, val_idx)
        y_val = take_rows(data_target, val_idx)

        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=rng)
        clf = clf.fit(X_train, y_train)

        accuracies_training.append(clf.score(X_train, y_train))
        accuracies_validation.append(clf.score(X_val, y_val))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [5]:
def run_DT(data_complete_input,data_complete_target,depth,n_repeats,random_state=None):
    """Repeatedly fit a depth-limited decision tree on random 75/25 splits.

    Parameters
    ----------
    data_complete_input : pandas.DataFrame or array-like
        Full feature matrix.
    data_complete_target : pandas.Series, pandas.DataFrame or array-like
        Full target vector.
    depth : int or None
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    n_repeats : int
        Number of independent shuffle/split/fit rounds.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for shuffling, splitting and the trees. ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one entry
        per repetition.
    """
    # One shared RandomState so each repetition gets a different, but
    # reproducible, split. None is forwarded unchanged.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):

        # train_test_split shuffles by default as well; the explicit shuffle is
        # kept so the sequence of random draws matches the original procedure.
        shuffled_X, shuffled_y = shuffle(data_complete_input, data_complete_target, random_state=rng)
        splits = train_test_split(shuffled_X, shuffled_y, test_size=0.25, random_state=rng)

        # Reset the index of every pandas object on its own, so a DataFrame
        # input combined with an array target no longer raises.
        data_input_train, data_input_test, data_target_train, data_target_test = [
            part.reset_index(drop=True) if isinstance(part, (pd.DataFrame, pd.Series)) else part
            for part in splits
        ]

        clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=rng)
        clf = clf.fit(data_input_train, data_target_train)
        accuracies_training.append(clf.score(data_input_train,data_target_train))
        accuracies_validation.append(clf.score(data_input_test,data_target_test))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [6]:
def run_forest(data_complete_input,data_complete_target,estimators,n_repeats,max_depth=None,random_state=None):
    """Repeatedly fit a random forest on random 75/25 splits.

    Parameters
    ----------
    data_complete_input : pandas.DataFrame or array-like
        Full feature matrix.
    data_complete_target : pandas.Series, pandas.DataFrame or array-like
        Full target vector.
    estimators : int
        ``n_estimators`` passed to ``RandomForestClassifier``.
    n_repeats : int
        Number of independent shuffle/split/fit rounds.
    max_depth : int or None, default None
        ``max_depth`` passed to ``RandomForestClassifier``.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for shuffling, splitting and the forests. ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    dict
        ``{"accuracy_train": [...], "accuracy_val": [...]}`` with one entry
        per repetition.
    """
    # One shared RandomState so each repetition gets a different, but
    # reproducible, split. None is forwarded unchanged.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):
        # train_test_split shuffles by default as well; the explicit shuffle is
        # kept so the sequence of random draws matches the original procedure.
        shuffled_X, shuffled_y = shuffle(data_complete_input, data_complete_target, random_state=rng)
        splits = train_test_split(shuffled_X, shuffled_y, test_size=0.25, random_state=rng)

        # Reset the index of every pandas object on its own, so a DataFrame
        # input combined with an array target no longer raises.
        data_input_train, data_input_test, data_target_train, data_target_test = [
            part.reset_index(drop=True) if isinstance(part, (pd.DataFrame, pd.Series)) else part
            for part in splits
        ]

        clf = RandomForestClassifier(n_estimators=estimators,max_depth=max_depth,random_state=rng)
        clf = clf.fit(data_input_train, data_target_train)
        accuracies_training.append(clf.score(data_input_train,data_target_train))
        accuracies_validation.append(clf.score(data_input_test,data_target_test))

    return {
        "accuracy_train": accuracies_training,
        "accuracy_val": accuracies_validation,
    }

In [7]:
# Benchmark plain decision trees and random forests on every dataset listed in
# `df`, producing one summary row (mean/std of train and validation accuracy
# per model variant) per dataset.

def _summary_columns(dict_results, suffix):
    """Return mean/std of the train and validation accuracies, keyed with `suffix`."""
    acc_train = dict_results["accuracy_train"]
    acc_val = dict_results["accuracy_val"]
    return {
        "acc_train" + suffix : np.mean(acc_train),
        "acc_val" + suffix : np.mean(acc_val),
        "std_train" + suffix : np.std(acc_train),
        "std_val" + suffix : np.std(acc_val),
    }

# scikit-learn falls back to NumPy's global RNG when no random_state is given,
# so seeding it here makes a fresh "Restart & Run All" reproducible.
np.random.seed(0)

datasets = np.unique(df["dataset"])
runs = 100
depths = [2,3,4]
results_rows = []
for dataset in datasets:
    # NOTE: pickle.load executes arbitrary code; only use with trusted dataset files.
    with open("../datasets/" + dataset, "rb") as input_file:
        data_input = pickle.load(input_file)
    with open("../datasets/" + dataset.replace("input","target"), "rb") as target_file:
        data_target = pickle.load(target_file)

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    # Start a fresh row for every dataset so nothing can leak over from the previous one.
    depth_row = {"dataset" : dataset}

    # Single decision trees of increasing depth.
    for depth in depths:
        dict_results = run_DT(modt.X, modt.y, depth=depth, n_repeats=runs)
        depth_row.update(_summary_columns(dict_results, "DT" + str(depth)))

    # Random forests without a depth limit.
    for estimators in [3,100]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs)
        depth_row.update(_summary_columns(dict_results, "RF" + str(estimators)))

    # Small random forest restricted to depth 2.
    for estimators in [3]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs, max_depth=2)
        depth_row.update(_summary_columns(dict_results, "RF" + str(estimators) + "_d2"))

    results_rows.append(depth_row)

df_results = pd.DataFrame(results_rows)

In [8]:
# Display the decision-tree / random-forest summary, one row per dataset.
df_results

Unnamed: 0,dataset,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,acc_valDT3,std_trainDT3,std_valDT3,acc_trainDT4,...,std_trainRF3,std_valRF3,acc_trainRF100,acc_valRF100,std_trainRF100,std_valRF100,acc_trainRF3_d2,acc_valRF3_d2,std_trainRF3_d2,std_valRF3_d2
0,abalone_input.pd,0.680996,0.670459,0.00904,0.014046,0.705731,0.694373,0.00434,0.011914,0.726213,...,0.004138,0.014694,0.999987,0.731148,6.3e-05,0.011213,0.668183,0.665254,0.007797,0.013788
1,banknote_input.pd,0.913926,0.905831,0.007651,0.01764,0.940845,0.932507,0.00593,0.013209,0.967609,...,0.001534,0.007482,1.0,0.992828,0.0,0.004562,0.903878,0.893615,0.034442,0.037548
2,breast_cancer_input.np,0.935493,0.90972,0.009749,0.020794,0.95885,0.924196,0.006062,0.018521,0.974554,...,0.005615,0.022484,1.0,0.937273,0.0,0.018113,0.925962,0.908881,0.012821,0.024832
3,cars_input.pd,0.778611,0.775278,0.005698,0.017094,0.809128,0.794838,0.004757,0.014272,0.819761,...,0.005002,0.021097,1.0,0.959028,0.0,0.011065,0.708032,0.709537,0.016608,0.024444
4,contraceptive_input.pd,0.497717,0.476829,0.018118,0.024355,0.532373,0.519485,0.012721,0.028792,0.576322,...,0.008476,0.02498,0.964139,0.515095,0.003571,0.020411,0.473533,0.4571,0.025356,0.035595
5,generated6_input.np,0.740771,0.73696,0.003443,0.010113,0.749824,0.742728,0.007546,0.014518,0.825435,...,0.0013,0.004805,0.999992,0.970912,4.5e-05,0.004091,0.729003,0.72796,0.037685,0.037991
6,iris_input.pd,0.963214,0.941579,0.008682,0.03082,0.976607,0.938684,0.010241,0.03512,0.991518,...,0.009805,0.036498,1.0,0.947105,0.0,0.032973,0.940625,0.921579,0.035544,0.056069
7,steel_input.pd,0.535546,0.534198,0.005581,0.016874,0.554536,0.533457,0.006528,0.015539,0.626983,...,0.006212,0.024534,1.0,0.776975,0.0,0.016769,0.531141,0.528539,0.012785,0.024615
8,students_input.pd,0.495491,0.483653,0.01242,0.039674,0.532064,0.508263,0.014279,0.032172,0.556212,...,0.013867,0.037258,0.942124,0.487365,0.005865,0.033414,0.467695,0.43521,0.051231,0.06239


In [9]:
# Combine the results in `df` with the baseline results in `df_results`.
# Join on the dataset name instead of concatenating side by side: a positional
# concat silently pairs rows of different datasets whenever the two frames
# are not in the same order.
overlapping_columns = [
    column for column in df_results.columns
    if column in df.columns and column != "dataset"
]
# For any column present in both frames keep the one from `df`,
# as the previous duplicate-column filter did.
df_c = df.merge(
    df_results.drop(columns=overlapping_columns),
    on="dataset",
    how="left",
    validate="many_to_one",  # fail loudly if df_results has duplicate datasets
)

In [10]:
# Display the combined table built in the previous cell.
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,...,std_trainRF3,std_valRF3,acc_trainRF100,acc_valRF100,std_trainRF100,std_valRF100,acc_trainRF3_d2,acc_valRF3_d2,std_trainRF3_d2,std_valRF3_d2
0,abalone_input.pd,0.740544,0.714638,0.004586,0.008545,0.680996,0.670459,0.00904,0.014046,0.705731,...,0.004138,0.014694,0.999987,0.731148,6.3e-05,0.011213,0.668183,0.665254,0.007797,0.013788
1,banknote_input.pd,0.999944,0.988236,0.000227,0.001404,0.913926,0.905831,0.007651,0.01764,0.940845,...,0.001534,0.007482,1.0,0.992828,0.0,0.004562,0.903878,0.893615,0.034442,0.037548
2,breast_cancer_input.np,0.961615,0.942867,0.003741,0.013413,0.935493,0.90972,0.009749,0.020794,0.95885,...,0.005615,0.022484,1.0,0.937273,0.0,0.018113,0.925962,0.908881,0.012821,0.024832
3,cars_input.pd,0.816618,0.775979,0.011827,0.010095,0.778611,0.775278,0.005698,0.017094,0.809128,...,0.005002,0.021097,1.0,0.959028,0.0,0.011065,0.708032,0.709537,0.016608,0.024444
4,contraceptive_input.pd,0.594323,0.581661,0.005829,0.005286,0.497717,0.476829,0.018118,0.024355,0.532373,...,0.008476,0.02498,0.964139,0.515095,0.003571,0.020411,0.473533,0.4571,0.025356,0.035595
5,generated6_input.np,0.971227,0.97016,0.01192,0.012456,0.740771,0.73696,0.003443,0.010113,0.749824,...,0.0013,0.004805,0.999992,0.970912,4.5e-05,0.004091,0.729003,0.72796,0.037685,0.037991
6,iris_input.pd,0.986732,0.946921,0.007783,0.021871,0.963214,0.941579,0.008682,0.03082,0.976607,...,0.009805,0.036498,1.0,0.947105,0.0,0.032973,0.940625,0.921579,0.035544,0.056069
7,steel_input.pd,0.680449,0.680965,0.014563,0.013425,0.535546,0.534198,0.005581,0.016874,0.554536,...,0.006212,0.024534,1.0,0.776975,0.0,0.016769,0.531141,0.528539,0.012785,0.024615
8,students_input.pd,0.5769,0.449228,0.020305,0.009204,0.495491,0.483653,0.01242,0.039674,0.532064,...,0.013867,0.037258,0.942124,0.487365,0.005865,0.033414,0.467695,0.43521,0.051231,0.06239


In [17]:
# Print one LaTeX table row per dataset: the dataset label followed by
# "mean $\pm$ std" cells for each (mean column, std column) pair below.
# The unrestricted 3-tree forest (acc_valRF3 / std_valRF3) is left out of the table.
reported_columns = [
    ("acc_train", "std_train"),
    ("acc_val", "std_val"),
    ("acc_valDT2", "std_valDT2"),
    ("acc_valDT3", "std_valDT3"),
    ("acc_valDT4", "std_valDT4"),
    ("acc_valRF3_d2", "std_valRF3_d2"),
    ("acc_valRF100", "std_valRF100"),
]
for _, record in df_c.iterrows():
    # Turn e.g. "breast_cancer_input.np" into "breast cancer".
    label = record["dataset"].replace("_input","").replace(".pd","").replace(".np","").replace("_"," ")
    cells = []
    for mean_column, std_column in reported_columns:
        # Two decimals with the leading zero dropped (".74"; "1.00" stays as is).
        mean_text = ("%.2f" % round(record[mean_column],2)).lstrip('0')
        std_text = ("%.2f" % round(record[std_column],2)).lstrip('0')
        cells.append(mean_text + " $\\pm$ " + std_text)
    print(label + " & " + " & ".join(cells) + " \\\\")

abalone & .74 $\pm$ .00 & .71 $\pm$ .01 & .67 $\pm$ .01 & .69 $\pm$ .01 & .70 $\pm$ .01 & .67 $\pm$ .01 & .73 $\pm$ .01 \\
banknote & 1.00 $\pm$ .00 & .99 $\pm$ .00 & .91 $\pm$ .02 & .93 $\pm$ .01 & .95 $\pm$ .02 & .89 $\pm$ .04 & .99 $\pm$ .00 \\
breast cancer & .96 $\pm$ .00 & .94 $\pm$ .01 & .91 $\pm$ .02 & .92 $\pm$ .02 & .92 $\pm$ .02 & .91 $\pm$ .02 & .94 $\pm$ .02 \\
cars & .82 $\pm$ .01 & .78 $\pm$ .01 & .78 $\pm$ .02 & .79 $\pm$ .01 & .81 $\pm$ .02 & .71 $\pm$ .02 & .96 $\pm$ .01 \\
contraceptive & .59 $\pm$ .01 & .58 $\pm$ .01 & .48 $\pm$ .02 & .52 $\pm$ .03 & .55 $\pm$ .02 & .46 $\pm$ .04 & .52 $\pm$ .02 \\
generated6 & .97 $\pm$ .01 & .97 $\pm$ .01 & .74 $\pm$ .01 & .74 $\pm$ .01 & .82 $\pm$ .01 & .73 $\pm$ .04 & .97 $\pm$ .00 \\
iris & .99 $\pm$ .01 & .95 $\pm$ .02 & .94 $\pm$ .03 & .94 $\pm$ .04 & .94 $\pm$ .03 & .92 $\pm$ .06 & .95 $\pm$ .03 \\
steel & .68 $\pm$ .01 & .68 $\pm$ .01 & .53 $\pm$ .02 & .53 $\pm$ .02 & .61 $\pm$ .02 & .53 $\pm$ .02 & .78 $\pm$ .02 \\
student

In [16]:
# Side-by-side view of the train/validation accuracy from `df` and the
# training accuracy of the depth-2 decision tree.
df_c[["dataset","acc_train","acc_val","acc_trainDT2"]]

Unnamed: 0,dataset,acc_train,acc_val,acc_trainDT2
0,abalone_input.pd,0.740544,0.714638,0.680996
1,banknote_input.pd,0.999944,0.988236,0.913926
2,breast_cancer_input.np,0.961615,0.942867,0.935493
3,cars_input.pd,0.816618,0.775979,0.778611
4,contraceptive_input.pd,0.594323,0.581661,0.497717
5,generated6_input.np,0.971227,0.97016,0.740771
6,iris_input.pd,0.986732,0.946921,0.963214
7,steel_input.pd,0.680449,0.680965,0.535546
8,students_input.pd,0.5769,0.449228,0.495491


In [20]:
# Print only the leading part of each LaTeX row: dataset label and the
# training accuracy as "mean $\pm$ std", each followed by a column separator.
for _, record in df_c.iterrows():
    label = record["dataset"]
    # Turn e.g. "breast_cancer_input.np" into "breast cancer".
    for old, new in (("_input",""), (".pd",""), (".np",""), ("_"," ")):
        label = label.replace(old, new)
    # Two decimals with the leading zero dropped (".74"; "1.00" stays as is).
    mean_text = ("%.2f" % round(record["acc_train"],2)).lstrip('0')
    std_text = ("%.2f" % round(record["std_train"],2)).lstrip('0')
    print(" ".join([label, "&", mean_text, "$\\pm$", std_text, "&"]))

abalone & .74 $\pm$ .00 &
banknote & 1.00 $\pm$ .00 &
breast cancer & .96 $\pm$ .00 &
cars & .82 $\pm$ .01 &
contraceptive & .59 $\pm$ .01 &
generated6 & .97 $\pm$ .01 &
iris & .99 $\pm$ .01 &
steel & .68 $\pm$ .01 &
students & .58 $\pm$ .02 &


In [21]:
# Print only the validation-accuracy LaTeX cell ("mean $\pm$ std &") for each dataset.
for _, record in df_c.iterrows():
    # Two decimals with the leading zero dropped (".71"; "1.00" stays as is).
    mean_text = ("%.2f" % round(record["acc_val"],2)).lstrip('0')
    std_text = ("%.2f" % round(record["std_val"],2)).lstrip('0')
    print(" ".join([mean_text, "$\\pm$", std_text, "&"]))

.71 $\pm$ .01 &
.99 $\pm$ .00 &
.94 $\pm$ .01 &
.78 $\pm$ .01 &
.58 $\pm$ .01 &
.97 $\pm$ .01 &
.95 $\pm$ .02 &
.68 $\pm$ .01 &
.45 $\pm$ .01 &
