In [17]:
import pickle
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from modt.modt import MoDT

#     optuna_ex1_hyperparameters_per_dataset.py
#  -> analysis_ex1_hyperparameters.ipynb
#  -> benchmark_ex1_best_hyperparameters.py
#  -> analysis_ex1_hyperparameters_best.ipynb

In [18]:
SETUP = "2D"  # "FG" or "2D"
df = pd.read_pickle("dataframes/ex1_df_runs_with_hyperparameters_per_dataset_{}_e3_d2.pd".format(SETUP)) 

In [10]:
df

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val
0,abalone_input.pd,0.715437,0.692344,0.003587,0.007489
1,banknote_input.pd,0.952867,0.949708,0.026919,0.020447
2,breast_cancer_input.np,0.964789,0.933566,0.007423,0.018168
3,cars_input.pd,0.805748,0.762153,0.018481,0.013535
4,contraceptive_input.pd,0.547328,0.52981,0.025195,0.032322
5,generated6_input.np,0.895533,0.8978,0.061187,0.056981
6,iris_input.pd,0.988839,0.960526,0.007403,0.013158
7,steel_input.pd,0.608763,0.601337,0.013637,0.015696
8,students_input.pd,0.564128,0.450599,0.022871,0.012964


# Comparison with plain scikit-learn DTs

In [11]:
def k_fold(data_input,data_target,depth,n_repeats):

    use_dataframe = False
    if isinstance(data_input, pd.core.frame.DataFrame):
        use_dataframe = True

    accuracies_training = []
    accuracies_validation = []

    rkf = RepeatedKFold(n_splits=4, n_repeats=n_repeats)
    for train_idx, val_idx in rkf.split(data_input):
        
        if use_dataframe:
            X_temp = data_input.iloc[train_idx].reset_index(inplace=False, drop=True)
            y_temp = data_target.iloc[train_idx].reset_index(inplace=False, drop=True)
        else:
            X_temp = data_input[train_idx]
            y_temp = data_target[train_idx]

        clf = tree.DecisionTreeClassifier(max_depth=depth)
        clf = clf.fit(X_temp, y_temp)
        accuracies_training.append(clf.score(X_temp,y_temp))

        if use_dataframe:
            X_temp = data_input.iloc[val_idx].reset_index(inplace=False, drop=True)
            y_temp = data_target.iloc[val_idx].reset_index(inplace=False, drop=True)
        else:
            X_temp = data_input[val_idx]
            y_temp = data_target[val_idx]
        accuracies_validation.append(clf.score(X_temp,y_temp))

    dict_results = {}
    dict_results["accuracy_train"] = accuracies_training
    dict_results["accuracy_val"] = accuracies_validation

    return dict_results

In [27]:
def run_DT(data_complete_input,data_complete_target,depth,n_repeats):
    
    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):
        
        shuffled_X, shuffled_y = shuffle(data_complete_input, data_complete_target)
        data_input_train, data_input_test, data_target_train, data_target_test = train_test_split(shuffled_X, shuffled_y, test_size=0.25)

        if isinstance(data_input_train, pd.core.frame.DataFrame):
            data_input_train.reset_index(inplace=True, drop=True)
            data_input_test.reset_index(inplace=True, drop=True)        
            data_target_train.reset_index(inplace=True, drop=True)
            data_target_test.reset_index(inplace=True, drop=True)
                
        clf = tree.DecisionTreeClassifier(max_depth=depth)
        clf = clf.fit(data_input_train, data_target_train)
        accuracies_training.append(clf.score(data_input_train,data_target_train))
        accuracies_validation.append(clf.score(data_input_test,data_target_test))

    dict_results = {}
    dict_results["accuracy_train"] = accuracies_training
    dict_results["accuracy_val"] = accuracies_validation

    return dict_results

In [28]:
def run_forest(data_complete_input,data_complete_target,estimators,n_repeats,max_depth=None):
    accuracies_training = []
    accuracies_validation = []

    for _ in range(n_repeats):       
        shuffled_X, shuffled_y = shuffle(data_complete_input, data_complete_target)
        data_input_train, data_input_test, data_target_train, data_target_test = train_test_split(shuffled_X, shuffled_y, test_size=0.25)

        if isinstance(data_input_train, pd.core.frame.DataFrame):
            data_input_train.reset_index(inplace=True, drop=True)
            data_input_test.reset_index(inplace=True, drop=True)        
            data_target_train.reset_index(inplace=True, drop=True)
            data_target_test.reset_index(inplace=True, drop=True)
        
        clf = RandomForestClassifier(n_estimators=estimators,max_depth=max_depth)
        clf = clf.fit(data_input_train, data_target_train)
        accuracies_training.append(clf.score(data_input_train,data_target_train))
        accuracies_validation.append(clf.score(data_input_test,data_target_test))

    dict_results = {}
    dict_results["accuracy_train"] = accuracies_training
    dict_results["accuracy_val"] = accuracies_validation

    return dict_results

In [36]:
datasets = np.unique(df["dataset"])
runs = 5
results_rows = []
depth_row = {}
depths = [2,3,4]
df_results = pd.DataFrame(results_rows)
for dataset in datasets:
    data_input = pickle.load(open("../datasets/" + dataset, "rb"))
    data_target = pickle.load(open("../datasets/" + dataset.replace("input","target"), "rb"))

    modt = MoDT(data_input,data_target,n_experts=2,iterations=1,max_depth=1)  # Params do not matter, used for equivalent data pre-processing

    for depth in depths:
        dict_results = run_DT(modt.X, modt.y, depth=depth, n_repeats=runs)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        row = {
            "dataset" : dataset,
            "acc_trainDT" + str(depth) : np.mean(accuracies_training),
            "acc_valDT" + str(depth) : np.mean(accuracies_validation),
            "std_trainDT" + str(depth) : np.std(accuracies_training),
            "std_valDT" + str(depth) : np.std(accuracies_validation),
        }
        depth_row = {**depth_row, **row}
        
    for estimators in [3,100]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        row = {
            "dataset" : dataset,
            "acc_trainRF" + str(estimators) : np.mean(accuracies_training),
            "acc_valRF" + str(estimators) : np.mean(accuracies_validation),
            "std_trainRF" + str(estimators) : np.std(accuracies_training),
            "std_valRF" + str(estimators) : np.std(accuracies_validation),
        }
        depth_row = {**depth_row, **row}
 
    for estimators in [3]:
        dict_results = run_forest(modt.X, modt.y, estimators=estimators, n_repeats=runs, max_depth=2)
        accuracies_training = dict_results["accuracy_train"]
        accuracies_validation = dict_results["accuracy_val"]

        row = {
            "dataset" : dataset,
            "acc_trainRF" + str(estimators) + "_d" + str(max_depth) : np.mean(accuracies_training),
            "acc_valRF" + str(estimators) + "_d" + str(max_depth) : np.mean(accuracies_validation),
            "std_trainRF" + str(estimators) + "_d" + str(max_depth) : np.std(accuracies_training),
            "std_valRF" + str(estimators) + "_d" + str(max_depth) : np.std(accuracies_validation),
        }
        depth_row = {**depth_row, **row}

    results_rows.append(depth_row)
        
df_results = pd.DataFrame(results_rows)

TypeError: run_forest() got an unexpected keyword argument 'max_depth'

In [None]:
df_results

In [31]:
df_c = pd.concat([df, df_results], axis=1)
df_c = df_c.loc[:,~df_c.columns.duplicated()]

In [32]:
df_c

Unnamed: 0,dataset,acc_train,acc_val,std_train,std_val,acc_trainDT2,acc_valDT2,std_trainDT2,std_valDT2,acc_trainDT3,...,std_trainDT4,std_valDT4,acc_trainRF3,acc_valRF3,std_trainRF3,std_valRF3,acc_trainRF100,acc_valRF100,std_trainRF100,std_valRF100
0,abalone_input.pd,0.715437,0.692344,0.003587,0.007489,0.682375,0.667368,0.00632,0.009427,0.708174,...,0.012684,0.014581,0.935888,0.656459,0.002324,0.012695,1.0,0.737416,0.0,0.007125
1,banknote_input.pd,0.952867,0.949708,0.026919,0.020447,0.913703,0.909038,0.008346,0.018568,0.934888,...,0.003048,0.014235,0.996501,0.984257,0.001579,0.008369,1.0,0.990671,0.0,0.005016
2,breast_cancer_input.np,0.964789,0.933566,0.007423,0.018168,0.940376,0.917483,0.006228,0.012031,0.96338,...,0.004552,0.005594,0.982629,0.918881,0.004093,0.016899,1.0,0.935664,0.0,0.01356
3,cars_input.pd,0.805748,0.762153,0.018481,0.013535,0.778086,0.776852,0.005132,0.015397,0.811265,...,0.005104,0.015313,0.980247,0.886574,0.004381,0.009143,1.0,0.960185,0.0,0.003704
4,contraceptive_input.pd,0.547328,0.52981,0.025195,0.032322,0.493116,0.456369,0.013708,0.018348,0.529167,...,0.005613,0.024057,0.869203,0.474797,0.006186,0.025618,0.963768,0.515447,0.002497,0.014784
5,generated6_input.np,0.895533,0.8978,0.061187,0.056981,0.73872,0.744,0.002189,0.008,0.749173,...,0.006404,0.011234,0.9912,0.96096,0.001106,0.002968,1.0,0.9688,0.0,0.002817
6,iris_input.pd,0.988839,0.960526,0.007403,0.013158,0.967857,0.926316,0.007143,0.034912,0.975,...,0.0,0.026837,0.991071,0.947368,0.005647,0.033287,1.0,0.952632,0.0,0.030689
7,steel_input.pd,0.608763,0.601337,0.013637,0.015696,0.537595,0.536214,0.008218,0.020477,0.552852,...,0.008027,0.025221,0.938832,0.663374,0.006831,0.023518,1.0,0.754321,0.0,0.008189
8,students_input.pd,0.564128,0.450599,0.022871,0.012964,0.499399,0.456287,0.00985,0.021557,0.539078,...,0.014617,0.044263,0.849699,0.422754,0.011547,0.026129,0.947495,0.504192,0.006634,0.033958


In [33]:
for index, row in df_c.iterrows():
    print(row.dataset.replace("_input","").replace(".pd","").replace(".np","").replace("_"," "), "&",
     ("%.2f" % round(row.acc_train,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_train,2)).lstrip('0'), "&",
     ("%.2f" % round(row.acc_val,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_val,2)).lstrip('0'), "&",
     ("%.2f" % round(row.acc_valDT2	,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_valDT2,2)).lstrip('0'), "&",
     ("%.2f" % round(row.acc_valDT3	,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_valDT3,2)).lstrip('0'), "&",
     ("%.2f" % round(row.acc_valDT4	,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_valDT4,2)).lstrip('0'), "&",
     ("%.2f" % round(row.acc_valRF3	,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_valRF3,2)).lstrip('0'), "&",  
     ("%.2f" % round(row.acc_valRF100,2)).lstrip('0'), "$\\pm$", ("%.2f" % round(row.std_valRF100,2)).lstrip('0'),           
      "\\\\")

abalone & .72 $\pm$ .00 & .69 $\pm$ .01 & .67 $\pm$ .01 & .69 $\pm$ .00 & .70 $\pm$ .01 & .66 $\pm$ .01 & .74 $\pm$ .01 \\
banknote & .95 $\pm$ .03 & .95 $\pm$ .02 & .91 $\pm$ .02 & .93 $\pm$ .01 & .96 $\pm$ .01 & .98 $\pm$ .01 & .99 $\pm$ .01 \\
breast cancer & .96 $\pm$ .01 & .93 $\pm$ .02 & .92 $\pm$ .01 & .91 $\pm$ .02 & .93 $\pm$ .01 & .92 $\pm$ .02 & .94 $\pm$ .01 \\
cars & .81 $\pm$ .02 & .76 $\pm$ .01 & .78 $\pm$ .02 & .79 $\pm$ .01 & .81 $\pm$ .02 & .89 $\pm$ .01 & .96 $\pm$ .00 \\
contraceptive & .55 $\pm$ .03 & .53 $\pm$ .03 & .46 $\pm$ .02 & .52 $\pm$ .02 & .55 $\pm$ .02 & .47 $\pm$ .03 & .52 $\pm$ .01 \\
generated6 & .90 $\pm$ .06 & .90 $\pm$ .06 & .74 $\pm$ .01 & .74 $\pm$ .01 & .81 $\pm$ .01 & .96 $\pm$ .00 & .97 $\pm$ .00 \\
iris & .99 $\pm$ .01 & .96 $\pm$ .01 & .93 $\pm$ .03 & .96 $\pm$ .02 & .96 $\pm$ .03 & .95 $\pm$ .03 & .95 $\pm$ .03 \\
steel & .61 $\pm$ .01 & .60 $\pm$ .02 & .54 $\pm$ .02 & .53 $\pm$ .02 & .58 $\pm$ .03 & .66 $\pm$ .02 & .75 $\pm$ .01 \\
students

In [34]:
df_c[["acc_train","acc_trainDT2"]]

Unnamed: 0,acc_train,acc_trainDT2
0,0.715437,0.682375
1,0.952867,0.913703
2,0.964789,0.940376
3,0.805748,0.778086
4,0.547328,0.493116
5,0.895533,0.73872
6,0.988839,0.967857
7,0.608763,0.537595
8,0.564128,0.499399
