In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc

In [29]:
df = pd.read_csv('divorce.csv', delimiter=';')
y = df.Class
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [34]:
# Random forest
start_time = time.time()

rf = RandomForestClassifier(max_depth=1) # Stumps only, feature bagging with sqrt(# features), bootstrap samples
rf_clf = GridSearchCV(rf, {'n_estimators':[3,4,5,6,7,8,9,10,11,12,13,14,15]})
rf_clf.fit(X_train, y_train)

# Best number of trees
print(rf_clf.best_params_)

# Test/train set AUC
y_train_pred_rf = rf_clf.predict_proba(X_train).T[1]
y_test_pred_rf = rf_clf.predict_proba(X_test).T[1]
rf_train_score = roc_auc_score(y_train, y_train_pred_rf)
rf_test_score = roc_auc_score(y_test, y_test_pred_rf)
print(rf_train_score, rf_test_score)

# Print runtime
print("Runtime: %s sec" % (time.time() - start_time))

{'n_estimators': 12}
1.0 0.9978354978354979
Runtime: 0.9109563827514648 sec


In [35]:
# See where we start to get diminishing returns
rf_clf.cv_results_

{'mean_fit_time': array([0.0074398 , 0.00663929, 0.00717387, 0.00795078, 0.00869312,
        0.00964475, 0.01054444, 0.01141162, 0.01246724, 0.01319175,
        0.01419678, 0.01515222, 0.01656451]),
 'std_fit_time': array([0.00114986, 0.00051637, 0.00024644, 0.00015004, 0.0002457 ,
        0.00014758, 0.00011206, 0.0002477 , 0.00012206, 0.00019933,
        0.00013203, 0.00024432, 0.000487  ]),
 'mean_score_time': array([0.00267515, 0.00211077, 0.00212426, 0.00197773, 0.0021431 ,
        0.0021112 , 0.0023025 , 0.00232444, 0.00244908, 0.00263314,
        0.00268579, 0.00247378, 0.00264745]),
 'std_score_time': array([1.11893941e-04, 1.52367746e-04, 1.94140394e-04, 1.03286429e-04,
        2.42704933e-04, 8.66207215e-05, 1.42260480e-04, 1.50332895e-04,
        9.27765579e-05, 1.47889751e-04, 1.81461945e-04, 8.76870214e-05,
        1.53651953e-04]),
 'param_n_estimators': masked_array(data=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              mask=[False, False, False, False, False,

In [26]:
# Random forest
start_time = time.time()

rf = RandomForestClassifier(max_depth=1, n_estimators=100000) # Stumps only, feature bagging with sqrt(# features), bootstrap samples
rf.fit(X,y)

feat_imp = rf.feature_importances_
for j in range(len(feat_imp)):
    print('Atr{}'.format(j+1), feat_imp[j])

# Print runtime
print("Runtime: %s sec" % (time.time() - start_time))

Atr1 0.004540000000000001
Atr2 0.0036300000000000004
Atr3 7.000000000000001e-05
Atr4 0.010680000000000002
Atr5 0.012650000000000002
Atr6 0.0
Atr7 0.0
Atr8 0.012790000000000001
Atr9 0.06355000000000001
Atr10 0.0031900000000000006
Atr11 0.08361000000000002
Atr12 0.025630000000000003
Atr13 0.0016500000000000002
Atr14 0.026650000000000004
Atr15 0.027320000000000004
Atr16 0.056620000000000004
Atr17 0.07939000000000002
Atr18 0.10887000000000001
Atr19 0.08167000000000002
Atr20 0.06883000000000002
Atr21 0.011670000000000002
Atr22 0.0017800000000000001
Atr23 0.0010200000000000003
Atr24 0.0022300000000000006
Atr25 0.028450000000000003
Atr26 0.048380000000000006
Atr27 0.012270000000000001
Atr28 0.013390000000000003
Atr29 0.021870000000000004
Atr30 0.026840000000000003
Atr31 0.0016900000000000003
Atr32 0.0007400000000000001
Atr33 0.0014400000000000003
Atr34 0.0007100000000000001
Atr35 0.0020400000000000006
Atr36 0.021010000000000004
Atr37 0.006850000000000001
Atr38 0.011050000000000003
Atr39 0.022