 - Fix accuracy $\checkmark$
 - Feature importance access
 - Interpretable feature importance access
 - Predict probabilities $\checkmark$
 - Prediction histograms (consolidated background, separate background, different normalizations)
 - TPR-FPR space heatmap helper function

In [392]:
import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
import scipy.optimize
import sklearn.metrics as skm

class sigbg_model:
    def __init__(self, model, train, test, num_bgs=3):
        self.model = model
        self.train = train
        self.test = test
        self.num_bgs = num_bgs
        self.train_preds = train[:,:-1]
        self.train_labels = np.where(train[:,-1]==1, np.ones_like(train[:,-1]), np.zeros_like(train[:,-1]))
        self.test_preds = test[:,:-1]
        self.test_labels = np.where(test[:,-1]==1, np.ones_like(test[:,-1]), np.zeros_like(test[:,-1]))
        self.test_bgs = [self.test[self.test[:,-1] == -i] for i in range(1,self.num_bgs+1)]
        self.test_bgs_preds = [data[:,:-1] for data in self.test_bgs]
        self.test_sigs = self.test[self.test[:,-1] == 1]
        self.test_sigs_preds = self.test_sigs[:,:-1]
    
    def fit(self, preds=None, labels=None):
        preds = preds if preds is not None else self.train_preds
        labels = labels if labels is not None else self.train_labels
        self.model.fit(preds, labels)
        
    def predict_proba(self, preds=None):
        preds = preds if preds is not None else self.test_preds
        try:
            return self.model.predict_proba(preds)[:,1]
        except:
            print("self.model doesn't have a predict_proba function")
            
    def predict(self, preds=None, threshold=None):
        preds = preds if preds is not None else self.test_pred
        if threshold is not None:
            probs = self.model.predict_proba(preds)[:,1]
            return np.where(probs > threshold, np.ones_like(probs), np.zeros_like(probs))
        else:
            try:
                return self.model.predict(preds)
            except:
                print("self.model doesn't have a predict function")
                
    def feature_importance(model):
        return model.feature_importances_
    
    def feature_importance_summary(names, importances):
        return 
        
    def accuracy(self, preds=None, labels=None, threshold=None):
        preds = preds if preds is not None else self.test_preds
        labels = labels if labels is not None else self.test_labels
        predictions = self.predict(preds=preds, threshold=threshold)
        return len(preds) - np.sum(np.abs(predictions - labels)) / len(preds)
        
    def conf_matrix(self, predictions=None, labels=None):
        predictions = predictions if predictions is not None else self.predict(self.test_preds)
        labels = labels if np.any(labels) is not None else self.test_labels
        return skm.confusion_matrix(labels, predictions, labels=[0,1])
    
    def tpr_cm(self, conf_matrix):
        return conf_matrix[1,1]/np.sum(conf_matrix[1])
        
    def fpr_cm(self, conf_matrix):
        return conf_matrix[0,1]/np.sum(conf_matrix[0])
    
    def tpr(self, predictions=None, labels=None):
        """true positive rate: correctly identified signal / all signal"""
        predictions = predictions if predictions is not None else self.predict(self.test_preds)
        labels = labels if labels is not None else self.test_labels
        return self.tpr_cm(self.conf_matrix(predictions, labels))
    
    def fpr(self, predictions=None, labels=None):
        """false positive rate: misidentified signal / all background"""
        predictions = predictions if predictions is not None else self.predict(self.test_preds)
        labels = labels if labels is not None else self.test_labels
        return self.fpr_cm(self.conf_matrix(predictions, test_label))
    
    def significance(self, signal, background, tpr=None, fpr=None, sepbg=False):
        tpr = tpr if tpr is not None else self.tpr()
        if sepbg:
            fprs = fpr if fpr is not None else [
                self.fpr(self.predict(pred), label) for pred, label in zip(self.test_bgs_preds, self.test_bgs_labels)]
            fprXbackground = np.sum(np.multiply(fprs, background), axis=-1)
            return signal * tpr / np.sqrt(signal * tpr + fprXbackground + 1e-10)
        else:
            fpr = fpr if fpr is not None else self.fpr()
            return signal * tpr / np.sqrt(signal * tpr + background * fpr + 1e-10)
        
#     def significance2(self, signal, backgrounds, tpr=None, fprs=None):
#         tpr = tpr if tpr is not None else self.tpr()
#         fprs = fprs if fprs is not None else [
#             self.fpr(self.predict(preds), labels) for preds, labels in zip(self.test_bgs_preds, self.test_bgs_labels)]
#         fprXbackground = sum([fpr * bg for fpr, bg in zip(fprs, backgrounds)])
#         return signal * self.tpr() / np.sqrt(signal * self.tpr() + fprXbackground + 1e-10)
    
    def newvar2thresh(self, newvar):
        return 1 - np.power(10, newvar)
    
    def thresh2newvar(self, thresh):
        return np.log10(1 - thresh)
    
    def max_allowable_threshold(self):
        newvars = np.concatenate((np.linspace(-8, -2, 10, endpoint=False), np.linspace(-2, 0, 51, endpoint=False)))
        probs = self.model.predict_proba(self.test_pred)[:,1]
        predicts = np.array(
            [np.where(probs > self.newvar2thresh(newvar),
                      np.ones_like(probs), np.zeros_like(probs)) for newvar in newvars])
        num_sig = [np.sum(predict[(predict == 1) & (self.test_label == 1)]) for predict in predicts]
        f = scipy.interpolate.interp1d(num_sig, newvars, kind='cubic')
        return self.newvar2thresh(f(25))
    
    def best_threshold(self, signal, background, preds=None, labels=None, sepbg=False, orig_labels=None):
        """
        newvar = log_10(1 - threshold)
        threshold = 1 - 10**newvar
        0 < threshold < 1
        ??? < new_var < 0
        """
        
        print(str(datetime.datetime.now().time())[:-7])
        # setting up variables
        preds = preds if preds is not None else ([self.test_sigs_preds] + self.test_bgs_preds if sepbg else self.test_preds)
        labels = labels if labels is not None else ([np.ones_like(preds[0][:,0])] + [np.zeros_like(bg_preds[:,0]) for bg_preds in preds[1:]] if sepbg else self.test_labels)
        min_newvar, max_newvar = [-8, 0]
        newvars = np.concatenate((np.linspace(min_newvar, -2, 10, endpoint=False), np.linspace(-2, max_newvar, 51, endpoint=False)))
        
        print(str(datetime.datetime.now().time())[:-7])
        # computing significance as a function of threshold
        if sepbg:
            predss = preds
            labelss = labels
            probss = [self.predict_proba(preds) for preds in predss]
            print(str(datetime.datetime.now().time())[:-7])
            predictionsss = np.array(
                [[np.where(probs > self.newvar2thresh(newvar), 
                          np.ones_like(probs), np.zeros_like(probs)) for probs in probss] for newvar in newvars])
            print(str(datetime.datetime.now().time())[:-7])
#             print(predictionsss[0][0][:5])
#             print(labels[0][:5])
            sig_conf_matrices = [
                self.conf_matrix(predictions=predictionss[0], labels=labelss[0]) for predictionss in predictionsss]
            print(str(datetime.datetime.now().time())[:-7])
            bg_conf_matricess = [
                [self.conf_matrix(predictions=predictions, labels=labelss[i+1]) for i, predictions in enumerate(predictionss[1:])] for predictionss in predictionsss]
            print(str(datetime.datetime.now().time())[:-7])
            tprs = np.array([self.tpr_cm(conf_matrix) for conf_matrix in sig_conf_matrices])
            print(str(datetime.datetime.now().time())[:-7])
            fprss = np.array([[self.fpr_cm(conf_matrix) for conf_matrix in conf_matrices] for conf_matrices in bg_conf_matricess])
            print(str(datetime.datetime.now().time())[:-7])
#             significances = [-self.significance(signal, background, tpr, fprs, sepbg=sepbg) for tpr, fprs in zip(tprs, fprss)]
            significances = -self.significance(signal, background, tprs, fprss, sepbg=sepbg)
        else:
            probs = self.predict_proba(preds)
            print(str(datetime.datetime.now().time())[:-7])
            predictionss = np.array(
                [np.where(probs > self.newvar2thresh(newvar), 
                          np.ones_like(probs), np.zeros_like(probs)) for newvar in newvars])
            conf_matrices = [self.conf_matrix(predictions=predictions, labels=labels) for predictions in predictionss]
            tprs = [self.tpr_cm(conf_matrix) for conf_matrix in conf_matrices]
            fprs = [self.fpr_cm(conf_matrix) for conf_matrix in conf_matrices]
            significances = [-self.significance(signal, background, tpr, fpr, sepbg=sepbg) for tpr, fpr in zip(tprs, fprs)]
        
        print(str(datetime.datetime.now().time())[:-7])
        # interpolating significance as a function of threshold, then maximizing
        f = scipy.interpolate.interp1d(newvars, significances, kind='cubic')
        res = scipy.optimize.minimize(f, [-2], bounds=[(min_newvar + 1e-1, max_newvar - 1e-1)])
        
        print(str(datetime.datetime.now().time())[:-7])
        # computing significance, tpr, fpr for optimized threshold
        best_threshold = self.newvar2thresh(res.x[0])
        if sepbg:
            best_predictss = [np.where(probs > best_threshold, np.ones_like(probs), np.zeros_like(probs)) for probs in probss]
            sig_conf_matrix = self.conf_matrix(predictions=best_predictss[0], labels=labelss[0])
            bg_conf_matrices = [self.conf_matrix(predictions=best_predicts, labels=labels[i+1]) for i, best_predicts in enumerate(best_predictss[1:])]
            tpr = self.tpr_cm(sig_conf_matrix)
            fprs = [self.fpr_cm(conf_matrix) for conf_matrix in bg_conf_matrices]
            best_sig = self.significance(signal, background, tpr, fprs, sepbg=sepbg)
            return [best_threshold, best_sig, tpr, fprs, tprs, fprss]
        else:
            best_predictss = np.where(probs > best_threshold, np.ones_like(probs), np.zeros_like(probs))
            conf_matrix = self.conf_matrix(predictions=best_predict)
            tpr = self.tpr_cm(conf_matrix)
            fpr = self.fpr_cm(conf_matrix)
            best_sig = self.significance(signal, background, tpr, fpr, sepbg=sepbg)
            return [best_threshold, best_sig, tpr, fpr, tprs, fprs]
    
    def best_threshold2(self, signal, backgrounds):
        print(str(datetime.datetime.now().time())[:-7])
        newvars = np.concatenate((np.linspace(-8, -2, 10, endpoint=False), np.linspace(-2, 0, 51, endpoint=False)))
        probs = self.model.predict_proba(self.test_preds)[:,1]
        print(str(datetime.datetime.now().time())[:-7])
        predicts = np.array(
            [np.where(probs > self.newvar2thresh(newvar), 
                      np.ones_like(probs), np.zeros_like(probs)) for newvar in newvars])
        print(str(datetime.datetime.now().time())[:-7])
        conf_matrices = [[self.conf_matrix(
            predictions=predictions[(self.test[:,-1] == -i) | (self.test[:,-1] == 1)], 
            labels=self.test_labels[(self.test[:,-1] == -i) | (self.test[:,-1] == 1)]) 
                          for i in range(1,4)] for predictions in predicts]
        print(str(datetime.datetime.now().time())[:-7])
        tprs = [self.tpr_cm(conf_matrix_row[0]) for conf_matrix_row in conf_matrices]
        print(str(datetime.datetime.now().time())[:-7])
        fprss = [[self.fpr_cm(conf_matrix) for conf_matrix in conf_matrix_row] for conf_matrix_row in conf_matrices]
        print(str(datetime.datetime.now().time())[:-7])
        significances = [-self.significance(signal, backgrounds, tpr, fprs, sepbg=True) for tpr, fprs in zip(tprs, fprss)]
        print(str(datetime.datetime.now().time())[:-7])
        f = scipy.interpolate.interp1d(newvars, significances, kind='cubic')
        print(str(datetime.datetime.now().time())[:-7])
        res = scipy.optimize.minimize(f, [-2], bounds=[(-8 + 1e-2, -1e-1)])
        print(str(datetime.datetime.now().time())[:-7])
        
        best_threshold = self.newvar2thresh(res.x[0])
        best_predict = np.where(probs > best_threshold, np.ones_like(probs), np.zeros_like(probs))
        conf_matrices = [
            self.conf_matrix(
                predictions=best_predict[(self.test[:,-1] == -i) | (self.test[:,-1] == 1)], 
                labels=self.test_labels[(self.test[:,-1] == -i) | (self.test[:,-1] == 1)]) for i in range(1,4)]
        tpr = self.tpr_cm(conf_matrices[0])
        fprs = [self.fpr_cm(conf_matrix) for conf_matrix in conf_matrices]
        best_sig = self.significance(signal, backgrounds, tpr, fprs, sepbg=True)
        print(str(datetime.datetime.now().time())[:-7])
        
        return [best_threshold, best_sig, tpr, fprs, tprs, fprss]
    
    def req_sig_cs(self, lumi, bg_cs, tpr, fpr, sig=5):
        conv = 10**15 / 10**12
        coef = [-tpr**2 * lumi**2 * conv**2, sig**2 * tpr * lumi * conv, sig**2 * fpr * bg_cs * lumi * conv]
        return np.amax(np.roots(coef))
        
    def save_model(self, filename):
        joblib.dump(self.model, filename + '.joblib')

In [134]:
def refresh_model(model):
    """Rebuild a ``sigbg_model`` from an existing wrapper.

    The new instance reuses the wrapped estimator, the train/test arrays and
    the number of backgrounds of ``model``; all derived attributes are
    recomputed by the current class definition.
    """
    # Carry num_bgs over explicitly; otherwise it would fall back to the default.
    return sigbg_model(model.model, model.train, model.test, num_bgs=model.num_bgs)

In [280]:
# Scratch check: joining two lists yields one flat list.
lista = list(range(1, 4))
listb = list(range(4, 7))
listc = [*lista, *listb]
print(listc)

[1, 2, 3, 4, 5, 6]


In [375]:
# Scratch check: element-wise product followed by a sum over the last axis
# behaves the same for a stack of rows (2-D) and for a single row (1-D).
test1 = [[1, 2, 3] for _ in range(4)]
test2 = [[1, 2, 3] for _ in range(4)]
test3 = [1, 2, 3]
test4 = [1, 2, 3]
print(np.asarray(test1) * np.asarray(test2))
print(np.asarray(test3) * np.asarray(test4))
print((np.asarray(test1) * np.asarray(test2)).sum(axis=-1))
print((np.asarray(test3) * np.asarray(test4)).sum(axis=-1))

[[1 4 9]
 [1 4 9]
 [1 4 9]
 [1 4 9]]
[1 4 9]
[14 14 14 14]
14
