### Optimize the decision threshold of the RF classifier exploiting the out-of-bag probabilities

Import the ghostml library

In [None]:
import ghostml

**Function to calculate classification metrics**

In [2]:
from sklearn import metrics
import numpy as np

def calc_metrics(labels_test, test_probs, threshold = 0.5):
    scores = [1 if x>=threshold else 0 for x in test_probs]
    auc = metrics.roc_auc_score(labels_test, test_probs)
    kappa = metrics.cohen_kappa_score(labels_test,scores)
    confusion = metrics.confusion_matrix(labels_test,scores, labels=list(set(labels_test)))
    print('thresh: %.2f, kappa: %.3f, AUC test-set: %.3f'%(threshold, kappa, auc))
    print(confusion)
    print(metrics.classification_report(labels_test,scores))
    return 

**Example**

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a binary imbalanced classification problem, with 80% zeros and 20% ones.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=14, n_redundant=0,
                           random_state=0, shuffle=False, weights = [0.8, 0.2])

# Train - test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state=0)

# Train a RF classifier
cls = RandomForestClassifier(max_depth=6, oob_score=True)
cls.fit(X_train, y_train)

# Get prediction probabilities for the test set
test_probs = cls.predict_proba(X_test)[:,1] 

# Print confusion matrix and classification metrics
calc_metrics(y_test, test_probs, threshold = 0.5)

thresh: 0.50, kappa: 0.202, AUC test-set: 0.829
[[158   1]
 [ 35   6]]
              precision    recall  f1-score   support

           0       0.82      0.99      0.90       159
           1       0.86      0.15      0.25        41

    accuracy                           0.82       200
   macro avg       0.84      0.57      0.57       200
weighted avg       0.83      0.82      0.76       200



**Optimize the decision threshold:**

Use the Cohen's Kappa as optimization metric:

In [4]:
# extract oob prediction probabilities from the trained RF model
oob_probs = cls.oob_decision_function_
oob_probs = [x[1] for x in oob_probs]

# optmize the threshold 
thresholds = np.round(np.arange(0.05,0.55,0.05),2)
threshold1 = ghostml.optimize_threshold_from_oob_predictions(y_train, oob_probs,
                                                             thresholds=thresholds, ThOpt_metrics='mcc')

# Print confusion matrix and classification metrics
calc_metrics(y_test, test_probs, threshold = threshold1)

thresh: 0.30, kappa: 0.559, AUC test-set: 0.829
[[144  15]
 [ 14  27]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       159
           1       0.64      0.66      0.65        41

    accuracy                           0.85       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.86      0.85      0.86       200

