**Function to optimize the decision threshold using the prediction probabilities of N training subsets**

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


def optimize_threshold_train_subset(cls, fps_train, labels_train, thresholds,
                                    ThOpt_metrics = 'Kappa', N_subsets = 100,
                                    subsets_size = 0.2, with_replacement = False, random_seed = None):

    """Optimize the decision threshold based on subsets of the training set.

    The classifier's positive-class probabilities are computed once for the whole
    training set. ``N_subsets`` stratified subsets are then drawn, every candidate
    threshold is scored on each subset, and the threshold with the best median
    score across subsets is returned.

    Parameters
    ----------
    cls : obj
        Trained machine learning classifier built using scikit-learn
        (must expose ``predict_proba``)
    fps_train: list
        Molecular descriptors for the training set
    labels_train: list of int
        True labels for the training set. Predictions are encoded as 0/1, so the
        labels are expected to be 0 (negative) and 1 (positive).
    thresholds: list of floats
        List of decision thresholds to screen for classification
    ThOpt_metrics: str
        Optimization metric. Choose between "Kappa" and "ROC".
        "Kappa" maximizes the median Cohen's kappa coefficient. "ROC" maximizes the
        harmonic mean of the median sensitivity and the median specificity, as
        described here https://doi.org/10.1007/s11548-013-0913-8
    N_subsets: int
        Number of training subsets to use in the optimization
    subsets_size: float or int
        Size of the subsets. If float, represents the proportion of the dataset to include in the subsets.
        If integer, it represents the actual number of instances to include in the subsets.
    with_replacement: bool
        The subsets are drawn randomly. True to draw the subsets with replacement
    random_seed: int
        Random number to seed the drawing of the subsets. The seed is used with a
        private random generator, so the global NumPy random state is not modified.

    Returns
    -------
    thresh: float
        Optimal decision threshold for classification (one element of ``thresholds``)

    Raises
    ------
    ValueError
        If ``ThOpt_metrics`` is not "Kappa" or "ROC", if ``thresholds`` is empty,
        or if ``N_subsets`` is smaller than 1.
    TypeError
        If ``with_replacement`` is True and ``subsets_size`` is neither a float nor an int.
    """

    # Fail fast on bad arguments instead of failing late with an unrelated error
    # (an unknown metric used to surface as an UnboundLocalError at the return).
    if ThOpt_metrics not in ('Kappa', 'ROC'):
        raise ValueError('ThOpt_metrics must be "Kappa" or "ROC", got %r' % (ThOpt_metrics,))
    if len(thresholds) == 0:
        raise ValueError('thresholds must contain at least one value')
    if N_subsets < 1:
        raise ValueError('N_subsets must be at least 1, got %r' % (N_subsets,))

    labels = np.asarray(labels_train)

    # resample() needs an absolute number of samples; train_test_split() accepts
    # both a fraction and a count and validates subsets_size on its own.
    n_samples = None
    if with_replacement:
        if isinstance(subsets_size, (float, np.floating)):
            n_samples = int(len(labels) * subsets_size)
        elif isinstance(subsets_size, (int, np.integer)):
            n_samples = int(subsets_size)
        else:
            raise TypeError('subsets_size must be a float or an int, got %r' % (subsets_size,))

    # One seed per subset, drawn from a private generator. RandomState(seed).randint
    # yields the same values as np.random.seed(seed) + np.random.randint, without
    # reseeding the process-wide generator as a side effect.
    rng = np.random.RandomState(random_seed)
    subset_seeds = rng.randint(N_subsets*10, size=N_subsets)

    # calculate prediction probability (positive class) for the training set
    probs_train = np.asarray(cls.predict_proba(fps_train))[:,1]

    def draw_subset(seed):
        # Return (probabilities, labels) of one stratified random subset.
        if with_replacement:
            probs_subset, labels_subset = resample(probs_train, labels, replace = True,
                                                   n_samples = n_samples, stratify = labels,
                                                   random_state = seed)
        else:
            _, probs_subset, _, labels_subset = train_test_split(probs_train, labels,
                                                                 test_size = subsets_size,
                                                                 stratify = labels,
                                                                 random_state = seed)
        return np.asarray(probs_subset), labels_subset

    def predict_at(probs_subset, thresh):
        # Binary predictions: 1 when the probability reaches the threshold, else 0.
        return (probs_subset >= thresh).astype(int)

    # Optimize the decision threshold based on the Cohen's Kappa coefficient
    if ThOpt_metrics == 'Kappa':
        # rows: subsets, columns: thresholds
        kappa_accum = []
        for seed in subset_seeds:
            probs_subset, labels_subset = draw_subset(int(seed))
            kappa_accum.append([metrics.cohen_kappa_score(labels_subset, predict_at(probs_subset, thresh))
                                for thresh in thresholds])
        objective = np.median(np.array(kappa_accum), axis=0)
    # Optimize the decision threshold based on the ROC-curve
    else:
        sensitivity_accum = []
        specificity_accum = []
        for seed in subset_seeds:
            probs_subset, labels_subset = draw_subset(int(seed))
            sensitivity = []
            specificity = []
            for thresh in thresholds:
                # labels are pinned to [0, 1] so that ravel() always yields
                # tn, fp, fn, tp in this order, independent of set ordering
                tn, fp, fn, tp = metrics.confusion_matrix(labels_subset, predict_at(probs_subset, thresh),
                                                          labels=[0, 1]).ravel()
                sensitivity.append(tp/(tp+fn))
                specificity.append(tn/(tn+fp))
            sensitivity_accum.append(sensitivity)
            specificity_accum.append(specificity)
        median_sensitivity = np.median(np.array(sensitivity_accum), axis=0)
        median_specificity = np.median(np.array(specificity_accum), axis=0)
        # Harmonic mean of sensitivity and specificity. Where both are zero the
        # score is set to 0 instead of 0/0 = NaN, which np.argmax would otherwise
        # treat as the maximum.
        denominator = median_sensitivity + median_specificity
        objective = np.divide(2*median_sensitivity*median_specificity, denominator,
                              out=np.zeros_like(denominator, dtype=float),
                              where=denominator > 0)

    # the threshold with the best median score over the training subsets
    return thresholds[int(np.argmax(objective))]








**Function to calculate classification metrics**

In [2]:
from sklearn import metrics
import numpy as np

def calc_metrics(labels_test, test_probs, threshold = 0.5):
    """Print classification metrics for the given decision threshold.

    Probabilities greater than or equal to ``threshold`` are turned into the
    prediction 1, all others into 0. The function prints one summary line
    (threshold, Cohen's kappa, ROC AUC), the confusion matrix and the
    scikit-learn classification report.

    Parameters
    ----------
    labels_test: list of int
        True labels
    test_probs: list of floats
        Predicted probabilities of the positive class
    threshold: float
        Decision threshold used to binarize ``test_probs``

    Returns
    -------
    None
        The metrics are only printed.
    """
    predicted = [int(prob >= threshold) for prob in test_probs]
    class_labels = list(set(labels_test))

    # AUC is threshold-independent and is computed on the raw probabilities
    auc_value = metrics.roc_auc_score(labels_test, test_probs)
    kappa_value = metrics.cohen_kappa_score(labels_test, predicted)
    conf_matrix = metrics.confusion_matrix(labels_test, predicted, labels=class_labels)

    summary_values = (threshold, kappa_value, auc_value)
    print('thresh: %.2f, kappa: %.3f, AUC test-set: %.3f' % summary_values)
    print(conf_matrix)
    report = metrics.classification_report(labels_test, predicted)
    print(report)

**Example**

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a binary imbalanced classification problem, with 80% zeros and 20% ones.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=14, n_redundant=0,
                           random_state=0, shuffle=False, weights = [0.8, 0.2])

# Train - test split (stratified, so both parts keep the class imbalance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state=0)

# Train a RF classifier.
# random_state is fixed so that the forest, and therefore every metric printed
# from it, is reproducible across runs.
# NOTE: numbers from a re-run may differ from outputs saved by an unseeded run.
cls = RandomForestClassifier(max_depth=6, oob_score=True, random_state=0)
cls.fit(X_train, y_train)

# Get prediction probabilities (positive class) for the test set
test_probs = cls.predict_proba(X_test)[:,1]

# Print confusion matrix and classification metrics at the default threshold
calc_metrics(y_test, test_probs, threshold = 0.5)

thresh: 0.50, kappa: 0.147, AUC test-set: 0.832
[[159   0]
 [ 37   4]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90       159
           1       1.00      0.10      0.18        41

    accuracy                           0.81       200
   macro avg       0.91      0.55      0.54       200
weighted avg       0.85      0.81      0.75       200



**Optimize the decision threshold:**

Use the Cohen's Kappa as optimization metric:

In [4]:
# Optimize the threshold on stratified subsets of the training set.
# Candidate thresholds: 0.05, 0.10, ..., 0.50 (rounded to avoid float noise from arange).
thresholds = np.round(np.arange(0.05,0.55,0.05),2)
# random_seed is fixed so that the same subsets are drawn on every run.
opt_threshold = optimize_threshold_train_subset(cls, X_train, y_train, thresholds,
                                    ThOpt_metrics = 'Kappa', with_replacement = False, N_subsets = 100,
                                    subsets_size = 0.2, random_seed = 0)

# Print confusion matrix and classification metrics at the optimized threshold
calc_metrics(y_test, test_probs, threshold = opt_threshold)

thresh: 0.30, kappa: 0.537, AUC test-set: 0.832
[[149  10]
 [ 18  23]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       159
           1       0.70      0.56      0.62        41

    accuracy                           0.86       200
   macro avg       0.79      0.75      0.77       200
weighted avg       0.85      0.86      0.85       200

