### Optimize the decision threshold using GHOST

Import the ghostml library

In [1]:
import ghostml

**Function to calculate classification metrics**

In [2]:
from sklearn import metrics
import numpy as np

def calc_metrics(labels_test, test_probs, threshold = 0.5):
    """Print classification metrics for predictions binarized at ``threshold``.

    Probabilities greater than or equal to ``threshold`` are mapped to class 1
    and all others to class 0, so the true labels are presumably encoded as
    0/1 -- NOTE(review): confirm against callers.

    Parameters
    ----------
    labels_test : array-like
        True class labels, one per sample.
    test_probs : array-like
        Predicted score/probability of the positive class, one per sample.
    threshold : float, default 0.5
        Decision threshold applied to ``test_probs``.

    Returns
    -------
    None
        Everything is printed: a summary line (threshold, Cohen's kappa,
        ROC AUC), the confusion matrix and the classification report.
    """
    # Vectorized thresholding: 1 where prob >= threshold, 0 otherwise.
    scores = (np.asarray(test_probs) >= threshold).astype(int)

    # np.unique returns the distinct labels in sorted order, so the rows and
    # columns of the confusion matrix always come out in a deterministic order
    # (iteration order of a plain set() is not guaranteed).
    class_labels = np.unique(labels_test)

    # AUC is computed from the raw probabilities and therefore does not
    # depend on the threshold; kappa and the confusion matrix do.
    auc = metrics.roc_auc_score(labels_test, test_probs)
    kappa = metrics.cohen_kappa_score(labels_test, scores)
    confusion = metrics.confusion_matrix(labels_test, scores, labels=class_labels)

    print('thresh: %.2f, kappa: %.3f, AUC test-set: %.3f'%(threshold, kappa, auc))
    print(confusion)
    print(metrics.classification_report(labels_test, scores))

**Example**

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a binary imbalanced classification problem, with 80% zeros and 20% ones.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=14, n_redundant=0,
                           random_state=0, shuffle=False, weights = [0.8, 0.2])

# Train - test split (stratified on y, so both splits keep the class imbalance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state=0)

# Train a RF classifier.
# The data generation and the split are seeded; seed the forest as well so that
# re-running the notebook reproduces the same model and the same metrics.
cls = RandomForestClassifier(max_depth=6, oob_score=True, random_state=0)
cls.fit(X_train, y_train)

# Get prediction probabilities for the test set
# (column 1 of predict_proba is the probability of the second class, i.e. label 1)
test_probs = cls.predict_proba(X_test)[:,1]

# Print confusion matrix and classification metrics at the default 0.5 threshold
calc_metrics(y_test, test_probs, threshold = 0.5)

thresh: 0.50, kappa: 0.247, AUC test-set: 0.836
[[159   0]
 [ 34   7]]
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       159
           1       1.00      0.17      0.29        41

    accuracy                           0.83       200
   macro avg       0.91      0.59      0.60       200
weighted avg       0.86      0.83      0.78       200



**Optimize the decision threshold using GHOST**

Use Cohen's kappa as the optimization metric:

In [4]:
# Extract the positive-class prediction probabilities for the training set from the trained RF model
train_probs = cls.predict_proba(X_train)[:,1]

# Optimize the threshold.
# Candidate thresholds: 0.05, 0.10, ..., 0.50 (rounded to remove floating-point noise from arange).
thresholds = np.round(np.arange(0.05,0.55,0.05),2)
# GHOST evaluates the candidates on random subsets of the training set; pass a
# fixed seed so that the selected threshold is reproducible across runs.
threshold1 = ghostml.optimize_threshold_from_predictions(y_train, train_probs, thresholds,
                                                         ThOpt_metrics = 'Kappa', random_seed = 0)

# Print confusion matrix and classification metrics on the test set at the optimized threshold
calc_metrics(y_test, test_probs, threshold = threshold1)

thresh: 0.25, kappa: 0.455, AUC test-set: 0.836
[[132  27]
 [ 13  28]]
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       159
           1       0.51      0.68      0.58        41

    accuracy                           0.80       200
   macro avg       0.71      0.76      0.73       200
weighted avg       0.83      0.80      0.81       200

