In [None]:
# default_exp bcml_model

# bcml_model

> Core class representing a binary classification (BC) machine learning (ML) model

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import numpy as np
import sklearn.metrics as skm
import scipy.interpolate
import scipy.optimize
import pandas as pd
import joblib

In [None]:
# export
class bcml_model:
    """
    Represents a machine learning (ML) binary classification (BC) model.
    """
    
    def __init__(self, model, train, test, num_bgs=3):
        self.model = model
        self.train = train
        self.test = test
        self.num_bgs = num_bgs
        self.train_preds = train[:,:-1]
        self.train_labels = np.where(train[:,-1]==1, np.ones_like(train[:,-1]), np.zeros_like(train[:,-1]))
        self.test_preds = test[:,:-1]
        self.test_labels = np.where(test[:,-1]==1, np.ones_like(test[:,-1]), np.zeros_like(test[:,-1]))
        self.test_bgs = [self.test[self.test[:,-1] == -i] for i in range(1,self.num_bgs+1)]
        self.test_bgs_preds = [data[:,:-1] for data in self.test_bgs]
        self.test_sigs = self.test[self.test[:,-1] == 1]
        self.test_sigs_preds = self.test_sigs[:,:-1]
    
    def fit(self, preds=None, labels=None):
        """ 
        Fits `model` to data.
        
        If predictors `preds` and labels `labels` aren't provided, 
        `self.train_preds` and `self.train_labels` are used, respectively.
        """
        preds = preds if preds is not None else self.train_preds
        labels = labels if labels is not None else self.train_labels
        self.model.fit(preds, labels)
        
    def predict_proba(self, preds=None):
        r"""
        Predicts signal probability for each element of a dataset ($? \times M$ `numpy` array).
        
        Returns `numpy` array of length $M$ with values in $[0,1]$ giving predicted signal probabilities.
        
        If predictors `preds` aren't provided, `self.test_preds` is used. 
        Uses the `predict_proba` method built into `scikit-learn` models.
        """
        preds = preds if preds is not None else self.test_preds
        try:
            return self.model.predict_proba(preds)[:,1]
        except:
            print("self.model doesn't have a predict_proba function")
            
    def predict(self, preds=None, threshold=None):
        """
        Predicts signal ($1$) or background ($2$) for each element of a dataset ($? \times M$ `numpy` array).
        
        Returns `numpy` array of length $M$ with values in $\{0,1\}$ giving predicted classifications. 
        
        If predictors `preds` aren't provided, `self.test_preds` is used.
        Uses the `predict` method built into `scikit-learn` models.
        """
        preds = preds if preds is not None else self.test_preds
        if threshold is not None:
            probs = self.model.predict_proba(preds)[:,1]
            return np.where(probs > threshold, np.ones_like(probs), np.zeros_like(probs))
        else:
            try:
                return self.model.predict(preds)
            except:
                print("self.model doesn't have a predict function")
                
    def predict_hist(self, preds=None, labels=None, num_bins=100, sepbg=False, sig_norm=1, bg_norm=1, dataframe=False):
        r"""
        Constructs a histogram of predicted signal probabilities for signal and background constituents of 
        a dataset ($? \times M$ `numpy` array).
        
        If `sepbg` is `False` (the default), background is combined and a list of $3$ $?_i \times M$ `numpy` arrays are returned,
        containing bin edges (partitioning $[0,1]$), signal bin contents, and background bin contents.
        
        If `sepbg` is `True`, backgrounds are differentiatedlist of $2 +$ `num_bgs` $?_i \times M$ `numpy` arrays are returned,
        containing bin edges (partitioning $[0,1]$), signal bin contents, and `self.num_bgs` background bin contents.
        
        If predictors `preds` aren't provided, `self.test_preds` is used. 
        If `labels` aren't provided, `self.test_labels` is used.
        """
        
        preds = preds if preds is not None else self.test_preds
        labels = labels if labels is not None else self.test_labels
        predictions = self.predict_proba(preds)
        sig_bins, bin_edges = np.histogram(predictions[labels==1], bins=num_bins, density=True)
        sig_bins *= sig_norm
        if sepbg:
            bg_norms = bg_norm
            bg_binss = [
                bg_norm * np.histogram(predictions[labels==-i], bins=num_bins, density=True)[0] 
                for i, bg_norm in enumerate(bg_norms)]
            if dataframe:
                return pd.DataFrame(data=[bin_edges, sig_bins] + bg_binss, columns=['Bin Edges', 'Signal'] + ['Background {}'.format(i) for i in range(1, self.num_bgs+1)])
            else:
                return [bin_edges, sig_bins] + bg_binss
        else:
            bg_bins = np.histogram(predictions[labels!=1], bins=num_bins, density=True)
            if dataframe:
                return pd.DataFrame(data=[bin_edges, sig_bins, bg_bins], columns=['Bin Edges', 'Signal', 'Background'])
            else:
                return [bin_edges, sig_bins, bg_bins]
            
    def feature_importance(model):
        """
        Returns the importance of the $M$ features used to train the `model` argument.
        """
        return model.feature_importances_
    
    def sorted_feature_importance(features, importances):
        """
        Returns list of features sorted by importance.
        
        Given arguments `features` and `importances`, lists of length $M$, returns list of size $M \times 2$ where
        the first column gives features and the second their associated importances, sorted by importance.
        """
        ranked_indices = np.argsort(-np.abs(importances))
        return [[features[i], importances[i]] for i in ranked_indices]
        
    def accuracy(self, preds=None, labels=None, threshold=None):
        r"""
        Computes model accuracy on a dataset ($? x M$ predictors, length $?$ labels).
        
        Returns value in $[0,1]$ giving model accuracy on the provided predictors and labels.
        
        If predictors `preds` aren't provided, `self.test_preds` is used. 
        If `labels` aren't provided, `self.test_labels` is used.
        """
        preds = preds if preds is not None else self.test_preds
        labels = labels if labels is not None else self.test_labels
        predictions = self.predict(preds=preds, threshold=threshold)
        return len(preds) - np.sum(np.abs(predictions - labels)) / len(preds)
        
    def conf_matrix(self, predictions=None, labels=None):
        r"""
        Computes the confusion matrix of the trained model from predictions and labels
        (both length $?$ arrays with values in $\{0,1\}$).

        Returns the $2 \times 2$ confusion matrix produced by `sklearn.metrics.confusion_matrix`
        with class order `[0, 1]` (rows are true labels, columns are predictions).

        If `predictions` aren't provided, `self.predict(self.test_preds)` is used.
        If `labels` aren't provided, `self.test_labels` is used.
        """
        predictions = predictions if predictions is not None else self.predict(self.test_preds)
        # explicit None check, consistent with tpr()/fpr(); all-zero label arrays are valid input
        labels = labels if labels is not None else self.test_labels
        return skm.confusion_matrix(labels, predictions, labels=[0, 1])
    
    def tpr_cm(self, conf_matrix):
        """
        Computes the true positive rate (tpr; correctly identified signal/total signal) 
        of a trained model given a confusion matrix.
        
        Returns value in $[0,1]$.
        """
        return conf_matrix[1,1]/np.sum(conf_matrix[1])
        
    def fpr_cm(self, conf_matrix):
        """
        Computes the false positive rate (fpr; misidentified background/total background) 
        of a trained model given a confusion matrix.
        
        Returns value in $[0,1]$.
        """
        return conf_matrix[0,1]/np.sum(conf_matrix[0])
    
    def tpr(self, predictions=None, labels=None):
        r"""
        True positive rate (tpr; correctly identified signal/total signal) of a trained model,
        computed from predictions and labels (both `numpy` arrays of length $?$ with values in $\{0,1\}$).

        The confusion matrix is built with `self.conf_matrix` and reduced with `self.tpr_cm`.

        Returns value in $[0,1]$.

        If `predictions` aren't provided, `self.predict(self.test_preds)` is used.
        If `labels` aren't provided, `self.test_labels` is used.
        """
        if predictions is None:
            predictions = self.predict(self.test_preds)
        if labels is None:
            labels = self.test_labels
        matrix = self.conf_matrix(predictions, labels)
        return self.tpr_cm(matrix)
    
    def fpr(self, predictions=None, labels=None):
        r"""
        False positive rate (fpr; misidentified background/total background) of a trained model,
        computed from predictions and labels (both `numpy` arrays of length $?$ with values in $\{0,1\}$).

        The confusion matrix is built with `self.conf_matrix` and reduced with `self.fpr_cm`.

        Returns value in $[0,1]$.

        If `predictions` aren't provided, `self.predict(self.test_preds)` is used.
        If `labels` aren't provided, `self.test_labels` is used.
        """
        if predictions is None:
            predictions = self.predict(self.test_preds)
        if labels is None:
            labels = self.test_labels
        matrix = self.conf_matrix(predictions, labels)
        return self.fpr_cm(matrix)
    
    def significance(self, signal, background, tpr=None, fpr=None, sepbg=False):
        r"""
        Computes signal significance of a trained model given signal and background yield.
        
        Returns a positive real number computed by 
        $$\frac{S \cdot TPR}{\sqrt{S \cdot TPR + B \cdot FPR}}$$
        which corresponds to signal significance after selecting only datapoints the model identifies as signal.
        
        If `sepbg` is `False`, `background` should be a single real number and is multiplied by `fpr`. If `sepbg` is `True`,
        `background` should be a list of length `self.num_bgs` where the $i$th element contains background yield of the $i$th
        background type. `fpr`, if passed, is then also a list of length `self.num_bgs` giving false positive rates for each
        of the background types.
        
        If `tpr` isn't provided, `self.tpr()` is used. 
        If `fpr` isn't provided, `self.fpr()` or a list of false positive rates coming from `self.test_bgs`
        are used, depending on the value of `sepbg`. 
        """
        tpr = tpr if tpr is not None else self.tpr()
        if sepbg:
            fprs = fpr if fpr is not None else [
                self.fpr(self.predict(pred), label) for pred, label in zip(
                    self.test_bgs_preds, np.zeros(np.sum([len(x) for x in self.test_bgs_preds])))]
            fprXbackground = np.sum(np.multiply(fprs, background), axis=-1)
            return signal * tpr / np.sqrt(signal * tpr + fprXbackground + 1e-10)
        else:
            fpr = fpr if fpr is not None else self.fpr()
            return signal * tpr / np.sqrt(signal * tpr + background * fpr + 1e-10)
    
    def newvar2thresh(self, newvar):
        r"""
        Helper method for `bcml.max_allowable_threshold()`, `bcml.get_tprs_fprs()`, and `bcml.best_threshold()`, 
        performing change of variables from `newvar` to `threshold`
        
        In particular, threshold $= 1 - 10^{\text{newvar}}$
        """
        
        return 1 - np.power(10, newvar)
    
    def thresh2newvar(self, thresh):
        r"""
        Helper method for `bcml.max_allowable_threshold()`, `bcml.get_tprs_fprs()`, and `bcml.best_threshold()`, 
        performing change of variables from `threshold` to `newvar`
        
        In particular, newvar $= \log_{10}(1 - \text{threhold})$
        """
        
        return np.log10(1 - thresh)
    
    def max_allowable_threshold(self, preds=None, labels=None):
        """
        Returns the highest threshold such that only labelling elements of `self.test_pred` with predicted
        probabilities higher than that threshold as signal still yields 25 signal.
        
        To achieve a discovery potential of $5\sigma$, even in the best case scenario ($TPR = 1, FPR = 0$) we still
        require $5^2 = 25$ signal events, hence we cannot chose a threshold so high that we do not keep at least
        25 signal events.
        
        If predictors `preds` aren't provided, `self.test_preds` is used. 
        If `labels` aren't provided, `self.test_labels` is used.
        
        NOTE: this function frequently returns an extrapolation error as $25$ frequently falls outside of the 
        function interpolation range (i.e., to achieve less than 25 signal events requires coming extremely close to $1$).
        """
        
        preds = preds if preds is not None else self.test_preds
        labels = labels if labels is not None else self.test_labels
        newvars = np.concatenate((np.linspace(-8, -2, 10, endpoint=False), np.linspace(-2, 0, 51, endpoint=False)))
        probs = self.model.predict_proba(preds)[:,1]
        predicts = np.array(
            [np.where(probs > self.newvar2thresh(newvar),
                      np.ones_like(probs), np.zeros_like(probs)) for newvar in newvars])
        num_sig = [np.sum(predict[(predict == 1) & (labels == 1)]) for predict in predicts]
        f = scipy.interpolate.interp1d(num_sig, newvars, kind='cubic')
        return self.newvar2thresh(f(25))
    
    def get_tprs_fprs(self, preds=None, labels=None, sepbg=False):
        r"""
        Produces (true positive rate, false positive rate) pairs for various thresholds
        for the trained model on data sets.

        If `sepbg` is `False`, background is combined and a list of length $4$ is returned containing an array of $L$
        sampled newvars (a convenient change of variable to approach arbitrarily close to 1: related to thresholds by
        `bcml_model.newvar2thresh()`), a length $L$ array of tprs associated to those thresholds, a length $L$ array of
        fprs related to those thresholds, and a length $?$ `numpy` array giving the predicted signal probabilities
        for the given data set.

        If `sepbg` is `True`, background is split and a list of length $4$ is returned containing an array of $L$
        sampled newvars, a length $L$ array of tprs associated to those thresholds, an $L \times$ `num_bgs` array
        containing fprs for each background type for each threshold, and a list of `numpy` arrays giving the
        predicted signal probabilities for the signal data set followed by each background data set.

        If `sepbg` is `False`, `preds` and `labels` default to `self.test_preds` and `self.test_labels`.
        If `sepbg` is `True`, `preds` is a list holding the signal predictors followed by the predictors of each
        background (default: `self.test_sigs_preds` followed by `self.test_bgs_preds`), and `labels` is a matching
        list of label arrays (default: ones for the first data set and zeros for the others).
        """

        # setting up variables
        if preds is None:
            preds = [self.test_sigs_preds] + self.test_bgs_preds if sepbg else self.test_preds
        if labels is None:
            if sepbg:
                labels = [np.ones_like(preds[0][:, 0])] + [np.zeros_like(bg_preds[:, 0]) for bg_preds in preds[1:]]
            else:
                labels = self.test_labels
        min_newvar, max_newvar = -8, 0
        newvars = np.concatenate((
            np.linspace(min_newvar, -2, 10, endpoint=False),
            np.linspace(-2, max_newvar, 51, endpoint=False)))
        thresholds = [self.newvar2thresh(newvar) for newvar in newvars]

        def classify(probs, threshold):
            return np.where(probs > threshold, np.ones_like(probs), np.zeros_like(probs))

        # computing tpr and fpr(s) as a function of threshold
        if sepbg:
            probss = [self.predict_proba(dataset) for dataset in preds]
            sig_probs, bg_probss = probss[0], probss[1:]
            # the data sets generally differ in length, so predictions are handled one data set
            # at a time rather than being stacked into a single (ragged) array
            tprs = np.array([
                self.tpr_cm(self.conf_matrix(predictions=classify(sig_probs, threshold), labels=labels[0]))
                for threshold in thresholds])
            fprss = np.array([
                [self.fpr_cm(self.conf_matrix(predictions=classify(bg_probs, threshold), labels=labels[i + 1]))
                 for i, bg_probs in enumerate(bg_probss)]
                for threshold in thresholds])
            return [newvars, tprs, fprss, probss]

        probs = self.predict_proba(preds)
        conf_matrices = [
            self.conf_matrix(predictions=classify(probs, threshold), labels=labels) for threshold in thresholds]
        # arrays (as in the sepbg branch) so that the rates can be used directly in arithmetic
        tprs = np.array([self.tpr_cm(conf_matrix) for conf_matrix in conf_matrices])
        fprs = np.array([self.fpr_cm(conf_matrix) for conf_matrix in conf_matrices])
        return [newvars, tprs, fprs, probs]
        
    def best_threshold(self, signal, background, preds=None, labels=None, sepbg=False):
        r"""
        Optimizes the threshold on a given data set ($? \times M$ predictors, length $?$ labels)
        by maximizing the signal significance.

        The significance is evaluated on the thresholds sampled by `self.get_tprs_fprs`, interpolated as a function
        of newvar (see `self.newvar2thresh`), and maximized with `scipy.optimize.minimize` within the sampled range.

        Returns a list `[best_threshold, best_sig, tpr, fpr, tprs, fprs]`: the optimized threshold, the significance,
        true positive rate and false positive rate at that threshold, and the sampled tprs and fprs. If `sepbg` is
        `True`, `fpr` is a list with one false positive rate per background and `fprs` holds the sampled per-background
        fprs.

        `signal` and `background` are the yields passed on to `self.significance`. `preds`, `labels` and `sepbg`
        are interpreted (and defaulted) as in `self.get_tprs_fprs`.
        """

        if preds is None:
            preds = [self.test_sigs_preds] + self.test_bgs_preds if sepbg else self.test_preds
        if labels is None:
            if sepbg:
                labels = [np.ones_like(preds[0][:, 0])] + [np.zeros_like(bg_preds[:, 0]) for bg_preds in preds[1:]]
            else:
                labels = self.test_labels
        newvars, tprs, fprs, probs = self.get_tprs_fprs(preds, labels, sepbg)
        # arrays so that the significance is computed elementwise even if lists were returned
        sampled_tprs, sampled_fprs = np.asarray(tprs), np.asarray(fprs)
        neg_significances = -self.significance(signal, background, sampled_tprs, sampled_fprs, sepbg=sepbg)

        # interpolating (negative) significance as a function of newvar, then minimizing
        f = scipy.interpolate.interp1d(newvars, neg_significances, kind='cubic')

        def objective(x):
            # the optimizer passes a length-1 array and expects a scalar back
            return float(f(x[0]))

        res = scipy.optimize.minimize(objective, [-2], bounds=[(newvars[0] + 1e-1, newvars[-1] - 1e-1)])

        # computing significance, tpr, fpr for optimized threshold
        best_threshold = self.newvar2thresh(res.x[0])

        def classify(values):
            return np.where(values > best_threshold, np.ones_like(values), np.zeros_like(values))

        if sepbg:
            best_predictss = [classify(dataset_probs) for dataset_probs in probs]
            sig_conf_matrix = self.conf_matrix(predictions=best_predictss[0], labels=labels[0])
            bg_conf_matrices = [
                self.conf_matrix(predictions=best_predicts, labels=labels[i + 1])
                for i, best_predicts in enumerate(best_predictss[1:])]
            tpr = self.tpr_cm(sig_conf_matrix)
            best_fprs = [self.fpr_cm(conf_matrix) for conf_matrix in bg_conf_matrices]
            best_sig = self.significance(signal, background, tpr, best_fprs, sepbg=sepbg)
            return [best_threshold, best_sig, tpr, best_fprs, tprs, fprs]

        best_predict = classify(probs)
        # evaluate against the labels of the data set that was optimized on
        conf_matrix = self.conf_matrix(predictions=best_predict, labels=labels)
        tpr = self.tpr_cm(conf_matrix)
        fpr = self.fpr_cm(conf_matrix)
        best_sig = self.significance(signal, background, tpr, fpr, sepbg=sepbg)
        return [best_threshold, best_sig, tpr, fpr, tprs, fprs]
    
    def req_sig_cs(self, lumi, bg_cs, tpr, fpr, sig=5, sepbg=False):
        """
        Given a luminosity (in fb$^{-1}$), a background cross section (in pb), a true positive rate, a false positive rate,
        and a signal significance, computes the signal cross section required for the signal significance to be achieved.
        
        If `sepbg` is False, background is combined and a single FPR is used; if `sepbg` is True, it is assumed that
        `bg_cs`, `fpr` are each lists of length `self.num_bgs` and their vector dot product is used for background yield.
        """
        
        conv = 10**15 / 10**12
        if sepbg:
            bg = np.sum(np.multiply(bg_cs, fpr))
            coef = [-tpr**2 * lumi**2 * conv**2, sig**2 * tpr * lumi * conv, sig**2 * bg * lumi * conv]
        else:
            coef = [-tpr**2 * lumi**2 * conv**2, sig**2 * tpr * lumi * conv, sig**2 * fpr * bg_cs * lumi * conv]
        return np.amax(np.roots(coef))
        
    def save_model(self, filename):
        """
        Writes the wrapped `self.model` to disk with `joblib.dump`.

        The extension `.joblib` is appended to `filename` to form the output path.
        """
        target = filename + '.joblib'
        joblib.dump(self.model, target)
        
    def refresh_model(model):
        """
        If this class gets updated, run this function on your already trained model to have it reflect the updated
        class without retraining being necessary.
        """
        return sigbg_model(model.model, model.train, model.test)

### Getting up and running

#### Creating a `bcml_model` instance

To create a new `bcml_model` instance, three things are required. 

First, you need `model`, a machine learning binary classification model: this library was designed to work with `scikit-learn` classifiers, such as `sklearn.linear_model.LogisticRegression`, `sklearn.ensemble.RandomForestClassifier`, or `sklearn.ensemble.GradientBoostingClassifier`.

Second, you need `train`, a training data set. This should be a `numpy` array of shape $N \times (M + 1)$ if you have $N$ training data points and $M$ features. The first $M$ columns of `train` should contain your features, while the final column is your label: $1$ denotes signal, while $0$ or a strictly negative integer denotes background. Negative integers can be used to denote different kinds of background: i.e., $-2$ would denote the background of type $2$.

Third, you need `test`, a testing data set. This should be a `numpy` array of shape $N' \times (M + 1)$ formatted the same as `train`. Typically, $N' \approx \frac{1}{3}N$.

If you utilize the negative integer labelling of background data points, you should pass `num_bgs` as well, denoting the number of different backgrounds you're using (i.e., your background labels $\ell_i$ satisfy $-$ `num_bgs` $\leq \ell_i \leq -1$).

#### Attributes of `bcml_model`

In addition to the arguments passed, a `bcml_model` instance has the following attributes.

`train_preds`: training predictors, $N \times M$ `numpy` array, the first $M$ columns of train.

`train_labels`: training labels, `numpy` array of length $N$, final column of train (with negative integers mapped to $0$)

`test_preds`: test predictors, $N' \times M$ `numpy` array, first $M$ columns of test

`test_labels`: test labels, `numpy` array of length $N'$, final column of test (with negative integers mapped to $0$)

`test_bgs`: background test data split by background, list of $N_i \times (M + 1)$ `numpy` arrays such that the $i$th element contains all test data points of background type $i$ (i.e., $1 \leq i \leq$ `num_bgs` and $\sum_i N_i = N' - $ # of signal data points)

`test_bgs_preds`: background test predictors split by background, list of $N_i \times M$ `numpy` arrays

`test_sigs`: signal test data, $\overline{N} \times (M + 1)$ `numpy` array (i.e., $\overline{N} + \sum_i N_i = N'$)

`test_sigs_preds`: signal test data predictors, $\overline{N} \times M$ `numpy` array

### Basic Functionality

In [None]:
show_doc(bcml_model.fit)

<h4 id="bcml_model.fit" class="doc_header"><code>bcml_model.fit</code><a href="__main__.py#L21" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.fit</code>(**`preds`**=*`None`*, **`labels`**=*`None`*)

Fits `model` to data.

If predictors `preds` and labels `labels` aren't provided, 
`self.train_preds` and `self.train_labels` are used, respectively.

In [None]:
show_doc(bcml_model.predict_proba)

<h4 id="bcml_model.predict_proba" class="doc_header"><code>bcml_model.predict_proba</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.predict_proba</code>(**`preds`**=*`None`*)

Predicts signal probability for each element of a dataset ($? \times M$ `numpy` array).

Returns `numpy` array of length $M$ with values in $[0,1]$ giving predicted signal probabilities.

If predictors `preds` aren't provided, `self.test_preds` is used. 
Uses the `predict_proba` method built into `scikit-learn` models.

In [None]:
show_doc(bcml_model.predict)

<h4 id="bcml_model.predict" class="doc_header"><code>bcml_model.predict</code><a href="__main__.py#L47" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.predict</code>(**`preds`**=*`None`*, **`threshold`**=*`None`*)

Predicts signal ($1$) or background ($0$) for each element of a dataset ($? \times M$ `numpy` array).

Returns `numpy` array of length $M$ with values in $\{0,1\}$ giving predicted classifications. 

If predictors `preds` aren't provided, `self.test_preds` is used.
Uses the `predict` method built into `scikit-learn` models.

In [None]:
show_doc(bcml_model.predict_hist)

<h4 id="bcml_model.predict_hist" class="doc_header"><code>bcml_model.predict_hist</code><a href="__main__.py#L66" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.predict_hist</code>(**`preds`**=*`None`*, **`labels`**=*`None`*, **`num_bins`**=*`100`*, **`sepbg`**=*`False`*, **`sig_norm`**=*`1`*, **`bg_norm`**=*`1`*, **`dataframe`**=*`False`*)

Constructs a histogram of predicted signal probabilities for signal and background constituents of 
a dataset ($? \times M$ `numpy` array).

If `sepbg` is `False` (the default), background is combined and a list of $3$ $?_i \times M$ `numpy` arrays are returned,
containing bin edges (partitioning $[0,1]$), signal bin contents, and background bin contents.

If `sepbg` is `True`, backgrounds are differentiated and a list of $2 +$ `num_bgs` $?_i \times M$ `numpy` arrays is returned,
containing bin edges (partitioning $[0,1]$), signal bin contents, and `self.num_bgs` background bin contents.

If predictors `preds` aren't provided, `self.test_preds` is used. 
If `labels` aren't provided, `self.test_labels` is used.

In [None]:
show_doc(bcml_model.feature_importance)

<h4 id="bcml_model.feature_importance" class="doc_header"><code>bcml_model.feature_importance</code><a href="__main__.py#L102" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.feature_importance</code>(**`model`**)

Returns the importance of the $M$ features used to train the `model` argument.

In [None]:
show_doc(bcml_model.sorted_feature_importance)

<h4 id="bcml_model.sorted_feature_importance" class="doc_header"><code>bcml_model.sorted_feature_importance</code><a href="__main__.py#L108" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.sorted_feature_importance</code>(**`features`**, **`importances`**)

Returns list of features sorted by importance.

Given arguments `features` and `importances`, lists of length $M$, returns list of size $M \times 2$ where
the first column gives features and the second their associated importances, sorted by importance.

In [None]:
show_doc(bcml_model.accuracy)

<h4 id="bcml_model.accuracy" class="doc_header"><code>bcml_model.accuracy</code><a href="__main__.py#L118" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.accuracy</code>(**`preds`**=*`None`*, **`labels`**=*`None`*, **`threshold`**=*`None`*)

Computes model accuracy on a dataset ($? x M$ predictors, length $?$ labels).

Returns value in $[0,1]$ giving model accuracy on the provided predictors and labels.

If predictors `preds` aren't provided, `self.test_preds` is used. 
If `labels` aren't provided, `self.test_labels` is used.

In [None]:
show_doc(bcml_model.conf_matrix)

<h4 id="bcml_model.conf_matrix" class="doc_header"><code>bcml_model.conf_matrix</code><a href="__main__.py#L132" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.conf_matrix</code>(**`predictions`**=*`None`*, **`labels`**=*`None`*)

Computes the confusion matrix of the trained model on a dataset ($? x M$ predictors, length $?$ labels).

Returns $2 \times 2$ confusion matrix using `sklearn.metrics.confusion_matrix`.

If predictors `preds` aren't provided, `self.test_preds` is used. 
If `labels` aren't provided, `self.test_labels` is used.

In [None]:
show_doc(bcml_model.tpr_cm)

<h4 id="bcml_model.tpr_cm" class="doc_header"><code>bcml_model.tpr_cm</code><a href="__main__.py#L145" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.tpr_cm</code>(**`conf_matrix`**)

Computes the true positive rate (tpr; correctly identified signal/total signal) 
of a trained model given a confusion matrix.

Returns value in $[0,1]$.

In [None]:
show_doc(bcml_model.fpr_cm)

<h4 id="bcml_model.fpr_cm" class="doc_header"><code>bcml_model.fpr_cm</code><a href="__main__.py#L154" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.fpr_cm</code>(**`conf_matrix`**)

Computes the false positive rate (fpr; misidentified background/total background) 
of a trained model given a confusion matrix.

Returns value in $[0,1]$.

In [None]:
show_doc(bcml_model.tpr)

<h4 id="bcml_model.tpr" class="doc_header"><code>bcml_model.tpr</code><a href="__main__.py#L163" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.tpr</code>(**`predictions`**=*`None`*, **`labels`**=*`None`*)

Computes the true positive rate (tpr; correctly identified signal/total signal) 
of a trained model given predictions and labels (both `numpy` array of length $?$ with values in $\{0,1\}$)

Returns value in $[0,1]$.

If `predictions` aren't provided, `self.predict(self.test_preds)` is used. 
If `labels` aren't provided, `self.test_labels` is used.

In [None]:
show_doc(bcml_model.fpr)

<h4 id="bcml_model.fpr" class="doc_header"><code>bcml_model.fpr</code><a href="__main__.py#L177" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.fpr</code>(**`predictions`**=*`None`*, **`labels`**=*`None`*)

Computes the false positive rate (fpr; misidentified background/total background) 
of a trained model given predictions and labels (both `numpy` array of length $?$ with values in $\{0,1\}$)

Returns value in $[0,1]$.

If `predictions` aren't provided, `self.predict(self.test_preds)` is used. 
If `labels` aren't provided, `self.test_labels` is used.

### Phenomenology

In [None]:
show_doc(bcml_model.significance)

<h4 id="bcml_model.significance" class="doc_header"><code>bcml_model.significance</code><a href="__main__.py#L191" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.significance</code>(**`signal`**, **`background`**, **`tpr`**=*`None`*, **`fpr`**=*`None`*, **`sepbg`**=*`False`*)

Computes signal significance of a trained model given signal and background yield.

Returns a positive real number computed by 
$$\frac{S \cdot TPR}{\sqrt{S \cdot TPR + B \cdot FPR}}$$
which corresponds to signal significance after selecting only datapoints the model identifies as signal.

If `sepbg` is `False`, `background` should be a single real number and is multiplied by `fpr`. If `sepbg` is `True`,
`background` should be a list of length `self.num_bgs` where the $i$th element contains background yield of the $i$th
background type. `fpr`, if passed, is then also a list of length `self.num_bgs` giving false positive rates for each
of the background types.

If `tpr` isn't provided, `self.tpr()` is used. 
If `fpr` isn't provided, `self.fpr()` or a list of false positive rates coming from `self.test_bgs`
are used, depending on the value of `sepbg`. 

In [None]:
show_doc(bcml_model.newvar2thresh)

<h4 id="bcml_model.newvar2thresh" class="doc_header"><code>bcml_model.newvar2thresh</code><a href="__main__.py#L219" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.newvar2thresh</code>(**`newvar`**)

Helper method for `bcml.max_allowable_threshold()`, `bcml.get_tprs_fprs()`, and `bcml.best_threshold()`, 
performing change of variables from `newvar` to `threshold`

In particular, threshold $= 1 - 10^{\text{newvar}}$

In [None]:
show_doc(bcml_model.thresh2newvar)

<h4 id="bcml_model.thresh2newvar" class="doc_header"><code>bcml_model.thresh2newvar</code><a href="__main__.py#L229" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.thresh2newvar</code>(**`thresh`**)

Helper method for `bcml.max_allowable_threshold()`, `bcml.get_tprs_fprs()`, and `bcml.best_threshold()`, 
performing change of variables from `threshold` to `newvar`

In particular, newvar $= \log_{10}(1 - \text{threshold})$

In [None]:
show_doc(bcml_model.max_allowable_threshold)

<h4 id="bcml_model.max_allowable_threshold" class="doc_header"><code>bcml_model.max_allowable_threshold</code><a href="__main__.py#L239" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.max_allowable_threshold</code>(**`preds`**=*`None`*, **`labels`**=*`None`*)

Returns the highest threshold such that only labelling elements of `self.test_pred` with predicted
probabilities higher than that threshold as signal still yields 25 signal.

To achieve a discovery potential of $5\sigma$, even in the best case scenario ($TPR = 1, FPR = 0$) we still
require $5^2 = 25$ signal events, hence we cannot choose a threshold so high that we do not keep at least
25 signal events.

If predictors `preds` aren't provided, `self.test_preds` is used. 
If `labels` aren't provided, `self.test_labels` is used.

NOTE: this function frequently fails with an extrapolation error, because $25$ often falls outside of the 
function interpolation range (i.e., keeping fewer than 25 signal events requires a threshold extremely close to $1$).

In [None]:
show_doc(bcml_model.get_tprs_fprs)

<h4 id="bcml_model.get_tprs_fprs" class="doc_header"><code>bcml_model.get_tprs_fprs</code><a href="__main__.py#L266" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.get_tprs_fprs</code>(**`preds`**=*`None`*, **`labels`**=*`None`*, **`sepbg`**=*`False`*)

Produces (true positive rate, false positive rate) pairs for various thresholds 
for the trained model on data sets.

If `sepbg` is `False`, background is combined and a list of length $4$ is returned containing a list of $L$ sampled
newvars (a convenient change of variable to approach arbitrarily close to 1: related to thresholds by 
[`bcml_model.newvar2thresh()`](/bcml4pheno/bcml_model.html#bcml_model.newvar2thresh())), an $L$-list of tprs associated to those thresholds, an $L$-list of fprs 
related to those thresholds, and an $L$-list of length $?$ `numpy` arrays giving the predicted signal probabilities
for the given data set.

If `sepbg` is `True`, background is split and a list of length $4$ is returned containing a 
list of $L$ sampled newvars, an $L$-list of tprs associated to those thresholds, an $L$-list of lists of length
`self.num_bgs` containing fprs for each background type for each threshold, and an $L$-list of length $?$ 
`numpy` arrays giving the predicted signal probabilities for the given data set.

If predictors `preds` aren't provided, `self.test_preds` is used. 
If `labels` aren't provided, `self.test_labels` is used.
Changes are made to both of the above depending on the value of `sepbg`.

In [None]:
show_doc(bcml_model.best_threshold)

<h4 id="bcml_model.best_threshold" class="doc_header"><code>bcml_model.best_threshold</code><a href="__main__.py#L318" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.best_threshold</code>(**`signal`**, **`background`**, **`preds`**=*`None`*, **`labels`**=*`None`*, **`sepbg`**=*`False`*)

Optimizes the threshold on a given data set ($? \times M$ predictors, length $?$ labels).

In [None]:
show_doc(bcml_model.req_sig_cs)

<h4 id="bcml_model.req_sig_cs" class="doc_header"><code>bcml_model.req_sig_cs</code><a href="__main__.py#L353" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.req_sig_cs</code>(**`lumi`**, **`bg_cs`**, **`tpr`**, **`fpr`**, **`sig`**=*`5`*, **`sepbg`**=*`False`*)

Given a luminosity (in fb$^{-1}$), a background cross section (in pb), a true positive rate, a false positive rate,
and a signal significance, computes the signal cross section required for the signal significance to be achieved.

If `sepbg` is False, background is combined and a single FPR is used; if `sepbg` is True, it is assumed that
`bg_cs`, `fpr` are each lists of length `self.num_bgs` and their vector dot product is used for background yield.

### Other utilities

In [None]:
show_doc(bcml_model.save_model)

<h4 id="bcml_model.save_model" class="doc_header"><code>bcml_model.save_model</code><a href="__main__.py#L370" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.save_model</code>(**`filename`**)

Saves the model to `filename.joblib`

In [None]:
show_doc(bcml_model.refresh_model)

<h4 id="bcml_model.refresh_model" class="doc_header"><code>bcml_model.refresh_model</code><a href="__main__.py#L376" class="source_link" style="float:right">[source]</a></h4>

> <code>bcml_model.refresh_model</code>(**`model`**)

If this class gets updated, run this function on your already trained model to have it reflect the updated
class without retraining being necessary.