### Optimize the decision threshold of binary classifiers using GHOST

Import the ghostml library

In [None]:
import ghostml

**Function to calculate classification metrics**

In [None]:
from sklearn import metrics
import numpy as np

def calc_metrics(labels_test, test_preds = None, test_probs = None, threshold = 0.5):
    """Print kappa, ROC AUC, the confusion matrix and a classification report.

    Either hard class predictions (``test_preds``) or positive-class scores
    (``test_probs``) must be supplied.

    Args:
        labels_test: True class labels of the evaluated samples.
        test_preds: Hard class predictions. Used as-is, and the AUC is
            computed from these hard predictions.
        test_probs: Scores/probabilities of the positive class. They are
            binarized with ``threshold`` (``>= threshold`` -> 1, else 0) and
            the AUC is computed from the raw values. Takes precedence over
            ``test_preds`` when both are given.
        threshold: Decision threshold applied to ``test_probs``. With
            ``test_preds`` it is only echoed in the printed summary.

    Returns:
        None. All results are printed; if neither ``test_preds`` nor
        ``test_probs`` is given, an error message is printed instead.
    """
    if test_preds is None and test_probs is None:
        print("ERROR: specify test_preds or test_probs")
        return None

    if test_probs is not None:
        scores = [1 if x >= threshold else 0 for x in test_probs]
        auc = metrics.roc_auc_score(labels_test, test_probs)
    else:
        scores = list(test_preds)
        auc = metrics.roc_auc_score(labels_test, scores)

    kappa = metrics.cohen_kappa_score(labels_test, scores)
    # Sort the class labels: a bare set has no guaranteed order, which would
    # make the row/column order of the confusion matrix unpredictable.
    class_labels = sorted(set(labels_test))
    confusion = metrics.confusion_matrix(labels_test, scores, labels=class_labels)
    print('thresh: %.2f, kappa: %.3f, AUC test-set: %.3f'%(threshold, kappa, auc))
    print(confusion)
    print(metrics.classification_report(labels_test, scores))
    return None

#### Example 1: Optimize the classification threshold of binary classifiers returning probability estimates

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate an imbalanced binary classification problem, with roughly 80% zeros and 20% ones.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=14, n_redundant=0,
                           random_state=0, shuffle=False, weights = [0.8, 0.2])

# Stratified train - test split (20% of the samples are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state=0)

# Train a RF classifier.
# random_state is fixed here as well (the data generation and the split are already seeded),
# otherwise the metrics and the optimized threshold change from run to run.
cls = RandomForestClassifier(max_depth=6, oob_score=True, random_state=0)
cls.fit(X_train, y_train)

# Get the predicted probabilities of the positive class (column 1) for the test set
test_probs = cls.predict_proba(X_test)[:,1]

# Print confusion matrix and classification metrics for the default threshold of 0.5
calc_metrics(y_test, test_probs = test_probs, threshold = 0.5)

**Optimize the decision threshold using GHOST**

Use Cohen's kappa as the optimization metric:

In [None]:
# Positive-class probabilities (column 1) predicted by the trained RF model for the training set
train_probs = cls.predict_proba(X_train)[:, 1]

# Candidate thresholds to scan, rounded to two decimals
thresholds = np.round(np.arange(0.05, 0.55, 0.05), 2)

# Optimize the decision threshold on the training-set probabilities, using Cohen's kappa
threshold1 = ghostml.optimize_threshold_from_predictions(
    y_train,
    train_probs,
    thresholds,
    ThOpt_metrics='Kappa',
)

# Print confusion matrix and classification metrics for the test set at the optimized threshold
calc_metrics(y_test, test_probs=test_probs, threshold=threshold1)

#### Example 2: Optimize the hyperplane position of hyperplane-based classifiers (e.g. SVM, Ridge)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate an imbalanced binary classification problem, with roughly 90% zeros and 10% ones.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=14,
    n_redundant=0,
    random_state=0,
    shuffle=False,
    weights=[0.9, 0.1],
)

# Stratified train - test split (20% of the samples are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0,
)

# Build and fit a pipeline that standardizes the features and then trains a linear SVC
cls = make_pipeline(StandardScaler(), LinearSVC()).fit(X_train, y_train)
# Keep a handle on the fitted LinearSVC step of the pipeline
cls_model = cls.named_steps['linearsvc']

# Get class predictions for the test set (default hyperplane position)
test_preds = cls.predict(X_test)

# Print confusion matrix and classification metrics
calc_metrics(y_test, test_preds=test_preds)

**Optimize the hyperplane position using GHOST**

Use the Matthews correlation coefficient (MCC) as the optimization metric.

With `average='curve'`, GHOST calculates, for each candidate hyperplane shift, the median of the classification metric over the `N_subsets` subsets
and returns the hyperplane shift that optimizes this median optimization curve.

In [None]:
# Extract the decision function values for the training set.
# For a linear SVC these values are proportional to the signed distance of the samples to the separating hyperplane.
# If the exact distances are required, divide the function values by the norm of the weight vector (coef_).
# NOTE: decision_function is called on the whole pipeline, not on the bare LinearSVC step:
# the SVC was fitted on standardized features, so the raw X_train has to go through the StandardScaler first.
train_distances = cls.decision_function(X_train)

# optimize the hyperplane shift using the Matthews correlation coefficient as optimization metric
hyperplane_shift = ghostml.svm_othr_from_predictions(y_train, train_distances, ThOpt_metrics = 'mcc', average='curve',
                                                     plot_optimization_curve=True, figure_folder='.', figure_basename='curve_average', N_subsets=300
                                                     )

# shift the hyperplane and recalculate the test predictions
# (again through the pipeline, so that the test features are scaled like the training features)
scores_test = cls.decision_function(X_test) + hyperplane_shift
test_preds_new = [1 if x>=0 else 0 for x in scores_test]

# Print confusion matrix and classification metrics
calc_metrics(y_test, test_preds = test_preds_new)

With `average='threshold'`, GHOST calculates the optimal hyperplane shift for every subset
and returns the median of these optimal hyperplane shifts over the `N_subsets` subsets.

In [None]:
# Decision function values for the training set, computed through the whole pipeline:
# the LinearSVC was fitted on standardized features, so the raw X_train must be scaled first.
train_distances = cls.decision_function(X_train)

# optimize the hyperplane shift using the Matthews correlation coefficient as optimization metric
hyperplane_shift = ghostml.svm_othr_from_predictions(y_train, train_distances, ThOpt_metrics = 'mcc', average='threshold',
                                                     plot_optimization_curve=True, figure_folder='.', figure_basename='threshold_average', N_subsets=300
                                                     )

# shift the hyperplane and recalculate the test predictions
# (through the pipeline, so that the test features are scaled like the training features)
scores_test = cls.decision_function(X_test) + hyperplane_shift
test_preds_new = [1 if x>=0 else 0 for x in scores_test]

# Print confusion matrix and classification metrics
calc_metrics(y_test, test_preds = test_preds_new)