<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [19]:
import sys

import numpy as np
import pandas as pd

from wildwood import ForestClassifier
from wildwood.datasets import load_adult, load_bank, load_car, load_churn

%config Completer.use_jedi = False

In [22]:
from sklearn.metrics import average_precision_score, roc_auc_score, log_loss, accuracy_score


from sklearn.preprocessing import LabelBinarizer


In [21]:
import matplotlib.pyplot as plt

%matplotlib inline

def dynamic_print(stri):
    """Write ``stri`` to standard output prefixed with a carriage return.

    No newline is appended and the stream is flushed right away, so the text
    reaches the output immediately.

    Parameters
    ----------
    stri : str
        Text to write; it is concatenated to ``'\\r'``, so it must be a string.
    """
    stream = sys.stdout
    message = '\r' + stri
    stream.write(message)
    stream.flush()


In [27]:
# Benchmark datasets. `load_churn` has to be imported from `wildwood.datasets`
# together with the other loaders for this cell to run on a fresh kernel.
datasets = [load_adult(), load_car(), load_churn(), load_bank()]

# Hyper-parameter grid. A new forest is fitted for every `min_samples_leaf`;
# `dirichlet` and `step` are then changed on the already-fitted forest and only
# `predict_proba` is re-run for each combination.
min_samples_leaf_grid = [1, 5, 10]
dirichlet_grid = [0.5, 0.1, 2.5]
step_grid = [1.0, 0.1, 10.0]

# The only task names for which ranking metrics are computed below.
supported_tasks = ("binary-classification", "multiclass-classification")

random_state = 0

# One dict per (dataset, min_samples_leaf, dirichlet, step) combination. Building
# whole records keeps all columns aligned by construction, unlike separate
# per-column lists that are appended to in different branches.
records = []

# NOTE: `dataset`, `X_test`, `y_test`, `y_test_binary`, `task` and `clf` are
# deliberately left bound at top level after the loop: later cells reuse the
# values from the last iteration (last dataset, last `min_samples_leaf`).
for dataset in datasets:

    # Hand the raw features to the forest: no one-hot encoding, no scaling.
    dataset.one_hot_encode = False
    dataset.standardize = False
    X_train, X_test, y_train, y_test = dataset.extract(random_state=random_state)
    task = dataset.task

    # Fail before any forest is fitted instead of ending up with missing
    # roc_auc / average_precision entries after all the computation is done.
    if task not in supported_tasks:
        raise ValueError(
            "Unsupported task {!r} for dataset {!r}".format(task, dataset.name)
        )

    # Indicator encoding of the test labels; only the multiclass average
    # precision below consumes it.
    y_test_binary = LabelBinarizer().fit_transform(y_test)

    for min_samples_leaf in min_samples_leaf_grid:

        clf = ForestClassifier(
            max_features=None,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=2 * min_samples_leaf,
            categorical_features=dataset.categorical_features_,
            random_state=random_state,
        )
        clf.fit(X_train, y_train)

        for dirichlet in dirichlet_grid:
            clf.dirichlet = dirichlet
            for step in step_grid:
                clf.step = step

                y_scores = clf.predict_proba(X_test)
                # NOTE(review): comparing argmax column indices with y_test assumes
                # the labels are encoded as 0..K-1 in column order - confirm.
                y_pred = np.argmax(y_scores, axis=1)

                if task == "binary-classification":
                    # Column 1 is used as the score of the positive class.
                    roc_auc = roc_auc_score(y_test, y_scores[:, 1])
                    avg_prec = average_precision_score(y_test, y_scores[:, 1])
                else:  # "multiclass-classification", guaranteed by the check above
                    roc_auc = roc_auc_score(
                        y_test, y_scores, multi_class="ovr", average="macro"
                    )
                    avg_prec = average_precision_score(y_test_binary, y_scores)

                records.append(
                    {
                        "dataset": dataset.name,
                        "min_samples_leaf": min_samples_leaf,
                        "dirichlet": dirichlet,
                        "step": step,
                        "roc_auc": roc_auc,
                        "average_precision": avg_prec,
                        "logloss": log_loss(y_test, y_scores),
                        "accuracy": accuracy_score(y_test, y_pred),
                    }
                )

# Explicit column list: fixes the column order and keeps the schema even when
# `records` is empty.
df = pd.DataFrame(
    records,
    columns=[
        "dataset",
        "min_samples_leaf",
        "dirichlet",
        "step",
        "roc_auc",
        "average_precision",
        "logloss",
        "accuracy",
    ],
)


In [29]:
# Raise pandas' row-display cap to 999 rows for the rest of the session, then
# render the results table as the cell output.
pd.options.display.max_rows = 999
df

Unnamed: 0,dataset,min_samples_leaf,dirichlet,step,roc_auc,average_precision,logloss,accuracy
0,adult,1,0.5,1.0,0.916634,0.807712,0.296982,0.864738
1,adult,1,0.5,0.1,0.912887,0.80119,0.304586,0.860779
2,adult,1,0.5,10.0,0.918725,0.81146,0.293576,0.866239
3,adult,1,0.1,1.0,0.916452,0.807432,0.297314,0.864465
4,adult,1,0.1,0.1,0.912539,0.80063,0.305303,0.861189
5,adult,1,0.1,10.0,0.918407,0.810995,0.294133,0.865625
6,adult,1,2.5,1.0,0.916887,0.8085,0.296727,0.864806
7,adult,1,2.5,0.1,0.913209,0.80186,0.304147,0.860575
8,adult,1,2.5,10.0,0.919196,0.812878,0.293069,0.866375
9,adult,5,0.5,1.0,0.916322,0.806767,0.297273,0.864055


In [30]:
import pickle as pkl

In [31]:

# Persist the results table: it is pickled inside a dict under the "results" key.
# The path is relative, so the file is created in the current working directory.
filename = "table_hyperparameters.pickle"
payload = {"results": df}
with open(filename, mode="wb") as out_file:
    pkl.dump(payload, out_file)


In [32]:
# Bare `pwd` is presumably resolved by IPython automagic to `%pwd`; the recorded
# output is the working directory, i.e. where the relative pickle path resolves.
pwd

'/Users/merad/Documents/wildwood'

In [None]:
# Fine sweep of `dirichlet` and `step` on powers-of-two grids.
# This cell reuses `clf`, `X_test`, `y_test`, `y_test_binary` and `task` exactly as
# the benchmark loop left them, i.e. the forest fitted in its last iteration.
dirichlet_values = np.array([2 ** exponent for exponent in np.linspace(-7, 0, 27)])
step_values = np.array([2 ** exponent for exponent in np.linspace(-2, 4, 29)])

n_dirichlet = len(dirichlet_values)
n_step = len(step_values)

# Row index of each metric along the first axis of `values`.
ROC_AUC, AVG_PRECISION, LOG_LOSS, ACCURACY = range(4)

# values[m, i, j] holds metric m for dirichlet_values[i] and step_values[j].
values = np.zeros((4, n_dirichlet, n_step))

for d_idx, dirichlet in enumerate(dirichlet_values):
    clf.dirichlet = dirichlet
    for s_idx, step in enumerate(step_values):
        clf.step = step
        # Progress indicator: "<dirichlet index>/<total>\t<step index>/<total>".
        dynamic_print("{}/{}\t{}/{}".format(d_idx + 1, n_dirichlet, s_idx + 1, n_step))

        y_scores = clf.predict_proba(X_test)
        y_pred = np.argmax(y_scores, axis=1)

        # The ranking metrics are only filled in for the two known task names;
        # for any other task the corresponding entries stay at zero.
        if task == "binary-classification":
            positive_scores = y_scores[:, 1]
            values[ROC_AUC, d_idx, s_idx] = roc_auc_score(y_test, positive_scores)
            values[AVG_PRECISION, d_idx, s_idx] = average_precision_score(y_test, positive_scores)
        elif task == "multiclass-classification":
            values[ROC_AUC, d_idx, s_idx] = roc_auc_score(
                y_test, y_scores, multi_class="ovr", average="macro"
            )
            values[AVG_PRECISION, d_idx, s_idx] = average_precision_score(y_test_binary, y_scores)

        values[LOG_LOSS, d_idx, s_idx] = log_loss(y_test, y_scores)
        values[ACCURACY, d_idx, s_idx] = accuracy_score(y_test, y_pred)


In [None]:
import seaborn as sns
import matplotlib.pylab as plb

def skip_ticks(ticks, skip=1):
    """Blank out all but every ``skip``-th tick label.

    Parameters
    ----------
    ticks : iterable
        Tick labels, in display order.
    skip : int, default=1
        Labels at positions 0, ``skip``, ``2 * skip``, ... are kept; every
        other position is replaced by an empty string.

    Returns
    -------
    list
        A list with one entry per input tick: the original label or ``''``.
    """
    labels = []
    for position, tick in enumerate(ticks):
        if position % skip == 0:
            labels.append(tick)
        else:
            labels.append('')
    return labels
# Only every `st`-th tick label is shown on the heatmap axes.
st = 5
# Display names of the metrics, in the order of the first axis of `values`.
metrics_names = ["roc auc", "average precision", "log loss", "accuracy"]
# Metric plotted by this cell: 0 -> "roc auc".
metric_index = 0

# Rows of the heatmap follow `dirichlet_values`, columns follow `step_values`.
step_labels = skip_ticks(np.around(step_values, decimals=2), st)
dirichlet_labels = skip_ticks(np.around(dirichlet_values, decimals=2), st)

ax = sns.heatmap(
    values[metric_index],
    xticklabels=step_labels,
    yticklabels=dirichlet_labels,
)
# `dataset` is whatever the benchmark loop left bound in its last iteration.
heatmap_title = metrics_names[metric_index] + " for " + dataset.name
ax.set(xlabel="step", ylabel="dirichlet", title=heatmap_title)
plb.show()

In [None]:
# Metric plotted by this cell: 1 -> "average precision" in `metrics_names`.
metric_index = 1

# Rows of the heatmap follow `dirichlet_values`, columns follow `step_values`;
# only every `st`-th tick label is kept.
step_labels = skip_ticks(np.around(step_values, decimals=2), st)
dirichlet_labels = skip_ticks(np.around(dirichlet_values, decimals=2), st)

ax = sns.heatmap(
    values[metric_index],
    xticklabels=step_labels,
    yticklabels=dirichlet_labels,
)
heatmap_title = metrics_names[metric_index] + " for " + dataset.name
ax.set(xlabel="step", ylabel="dirichlet", title=heatmap_title)
plb.show()

In [None]:
# Metric plotted by this cell: 2 -> "log loss" in `metrics_names`.
metric_index = 2

# Rows of the heatmap follow `dirichlet_values`, columns follow `step_values`;
# only every `st`-th tick label is kept.
step_labels = skip_ticks(np.around(step_values, decimals=2), st)
dirichlet_labels = skip_ticks(np.around(dirichlet_values, decimals=2), st)

ax = sns.heatmap(
    values[metric_index],
    xticklabels=step_labels,
    yticklabels=dirichlet_labels,
)
heatmap_title = metrics_names[metric_index] + " for " + dataset.name
ax.set(xlabel="step", ylabel="dirichlet", title=heatmap_title)
plb.show()

In [None]:
# Metric plotted by this cell: 3 -> "accuracy" in `metrics_names`.
metric_index = 3

# Rows of the heatmap follow `dirichlet_values`, columns follow `step_values`;
# only every `st`-th tick label is kept.
step_labels = skip_ticks(np.around(step_values, decimals=2), st)
dirichlet_labels = skip_ticks(np.around(dirichlet_values, decimals=2), st)

ax = sns.heatmap(
    values[metric_index],
    xticklabels=step_labels,
    yticklabels=dirichlet_labels,
)
heatmap_title = metrics_names[metric_index] + " for " + dataset.name
ax.set(xlabel="step", ylabel="dirichlet", title=heatmap_title)
plb.show()