In [None]:
import gc
import warnings

import h5py
from keras import backend as K
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import nbimporter
import numpy as np
from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve
import tensorflow.compat.v1 as tf

In [None]:
# Ignore tensorflow deprecation warnings: only messages logged at ERROR level
# are emitted after this call (`tf` is the tensorflow.compat.v1 API).
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Prepare GPU environment and define amount of memory to use.
# The two environment variables are set before the session is created below.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# NOTE(review): the device index "3" is specific to the original machine.
os.environ["CUDA_VISIBLE_DEVICES"]="3"  # specify which GPU(s) to be used
# Request a per-process GPU memory fraction of 0.42 and open a TF1 session
# configured with it; the session is kept in `sess`.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.42)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

In [None]:
# Define path to load input data
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = '/home/pablo/Documentos/lbl/olympics/final_runs_safety_data/input_features/'

# Every file is opened in a `with` block so the HDF5 handle is closed even if
# reading the dataset raises. `[:]` reads the whole dataset into an in-memory
# array before the file is closed.

# First signal sample -> `signal`
features_filename = path + "all_input_features.hdf5"
with h5py.File(features_filename, 'r') as h5_results:
    signal = h5_results['signal'][:]

# Second signal sample (file tagged "m300") -> `signal2`
features_filename = path + "signal_features_m300.hdf5"
with h5py.File(features_filename, 'r') as h5_results:
    signal2 = h5_results['signal'][:]

# Extra QCD background sample -> `background2`
filename = path + "all_input_features_extra_QCD.hdf5"
with h5py.File(filename, 'r') as h5_results:
    background2 = h5_results['background'][:]

In [None]:
# Report the shapes of the loaded arrays. The shape is wrapped in a one-element
# tuple so that `%` formats the whole shape tuple as a single %s argument.
print('Signal features shape: %s' % (signal.shape,))
print('Background features shape: %s' % (background2.shape,))

In [None]:
# Check memory used by the loaded arrays.
# `nbytes` is a byte count; one (binary) megabyte is 1024**2 bytes. The previous
# divisor, 10**3 * 1024, mixed decimal and binary units and so matched neither
# the 10**6 nor the 1024**2 definition of a megabyte.
BYTES_PER_MB = 1024**2
print('Memory used by signal array: %.2f MB' % (signal.nbytes / BYTES_PER_MB))
print('Memory used by background array: %.2f MB' % (background2.nbytes / BYTES_PER_MB))

In [None]:
# Show the last event (row) of each array as a quick sanity check of the content.
print('Signal sample event:\n %s' % signal[-1])
print('Background sample event:\n %s' % background2[-1])

In [None]:
# Study each feature of jet 1 before preprocessing data.
# Columns 0-8 are printed in order with the labels below, one template per column.
# eta and phi are formatted with %f: with the former %i the (min, max) of these
# features was truncated to whole numbers, hiding the actual range.
jet1_range_templates = (
    'mj: (min, max) = (%f, %f)',
    't2: (min, max) = (%f, %f)',
    'tau21: (min, max) = (%f, %f)',
    'tau32: (min, max) = (%f, %f)',
    'tau43: (min, max) = (%f, %f)',
    'ntrk: (min, max) = (%i, %i)',
    'pt: (min, max) = (%i, %i)',
    'eta: (min, max) = (%f, %f)',
    'phi: (min, max) = (%f, %f)',
)

# Same report for the background and the signal sample, background first.
for sample_title, sample_events in (('background2:', background2), ('SIGNAL:', signal)):
    print(sample_title)
    for column, template in enumerate(jet1_range_templates):
        feature_values = sample_events[:, column]
        print(template % (feature_values.min(), feature_values.max()))

In [None]:
# Study each feature of jet 2 before preprocessing data.
# Columns 9-17 are printed in order with the labels below, one template per column.
# eta and phi are formatted with %f: with the former %i the (min, max) of these
# features was truncated to whole numbers, hiding the actual range.
jet2_range_templates = (
    'mj: (min, max) = (%f, %f)',
    't2: (min, max) = (%f, %f)',
    'tau21: (min, max) = (%f, %f)',
    'tau32: (min, max) = (%f, %f)',
    'tau43: (min, max) = (%f, %f)',
    'ntrk: (min, max) = (%i, %i)',
    'pt: (min, max) = (%i, %i)',
    'eta: (min, max) = (%f, %f)',
    'phi: (min, max) = (%f, %f)',
)

# Same report for the background and the signal sample, background first.
for sample_title, sample_events in (('background2:', background2), ('SIGNAL:', signal)):
    print(sample_title)
    for column, template in enumerate(jet2_range_templates, start=9):
        feature_values = sample_events[:, column]
        print(template % (feature_values.min(), feature_values.max()))

In [None]:
# Define function to take the input features of the jets

def make_data(jet1_features,jet2_features):
    """Build standardized feature matrices for the background and both signal samples.

    Reads the module-level arrays ``background2``, ``signal`` and ``signal2``.

    Parameters
    ----------
    jet1_features, jet2_features
        Column selectors (e.g. lists of column indices) for the first and the
        second jet; the selected jet-1 columns are placed before the jet-2 ones.

    Returns
    -------
    tuple
        ``(bgdata_final, sigdata_final, sigdata2_final, meanvals, stdvals)``.
        Each ``*_final`` array holds the standardized selected columns followed
        by one extra, unscaled column copied from column 18 of the corresponding
        input array (named mjj in this notebook). ``meanvals`` and ``stdvals``
        are the per-column mean and standard deviation of the selected
        background columns.

    Notes
    -----
    No mjj window is applied here; a 2800-5200 GeV restriction and alternative
    scalings by a fixed vector existed only as disabled code.
    """
    mjj_column = 18

    def select_jet_columns(events):
        # jet-1 columns first, jet-2 columns appended to their right
        return np.hstack((events[:, jet1_features], events[:, jet2_features]))

    background_features = select_jet_columns(background2)

    # Standardization constants are computed on the background only and then
    # applied unchanged to both signal samples.
    meanvals = np.mean(background_features, axis=0)
    stdvals = np.std(background_features, axis=0)

    def standardize_and_append_mjj(events, features):
        scaled = (features - meanvals) / stdvals
        mjj_as_column = events[:, mjj_column].reshape((-1, 1))
        return np.hstack((scaled, mjj_as_column))

    bgdata_final = standardize_and_append_mjj(background2, background_features)
    sigdata_final = standardize_and_append_mjj(signal, select_jet_columns(signal))
    sigdata2_final = standardize_and_append_mjj(signal2, select_jet_columns(signal2))

    return bgdata_final, sigdata_final, sigdata2_final, meanvals, stdvals

In [None]:
# Create a dictionary to store the input features
# One dictionary per sample; the key is the number of input features per jet
bgdata, sigdata, sigdata2 = {}, {}, {}

# Columns used as inputs for each jet
jet1_features = [0, 2, 3, 5, 6]  # mj,tau21,tau32,ntrk,pt
jet2_features = [9, 11, 12, 14, 15]

# make_data returns the three processed samples followed by the background
# mean and standard deviation used for the standardization
(bgdata[5], sigdata[5], sigdata2[5],
 bg_mean, bg_std) = make_data(jet1_features, jet2_features)

In [None]:
# Boolean signal-region (SR) masks, one entry per event: True where the last
# column (mJJ) lies strictly between 3500-200 and 3500+200.
# NOTE(review): this window (3300-3700) is wider than the 3350-3650 window used
# by the cut scans later in this notebook — confirm that is intended.
def _sr_mask(sample):
    mjj_values = sample[:, -1]
    above_lower_edge = mjj_values > 3500-200
    below_upper_edge = mjj_values < 3500+200
    return above_lower_edge & below_upper_edge

SRmask_bg = _sr_mask(bgdata[5])
SRmask_sig = _sr_mask(sigdata[5])
SRmask_sig2 = _sr_mask(sigdata2[5])

In [None]:
# Load the saved per-model output arrays and calculate the per-event mse.
# (The arrays are compared element-wise with the standardized inputs, so they
# are presumably the autoencoder reconstructions — TODO confirm.)
#
# The loaded arrays are consumed one model at a time instead of being collected
# in dictionaries: previously all 50 models' outputs for the three samples were
# held in memory at once, only to be deleted right after the loop.
mse_bg_final={}
mse_sig_final={}
mse_sig2_final={}

n_models = 50
saved_prefix = 'saved_weights_4/AEmodel_feat_5_lat_2_hid_512_mod_'

# Standardized inputs without the trailing mJJ column (loop invariants)
bg_inputs = bgdata[5][:,:-1]
sig_inputs = sigdata[5][:,:-1]
sig2_inputs = sigdata2[5][:,:-1]

for itrain in range(n_models):
    model_prefix = saved_prefix + str(itrain) + '_nsig_0_0.dict_'
    bg_out = np.load(model_prefix + 'bg.npy')
    sig_out = np.load(model_prefix + 'sig.npy')
    sig2_out = np.load(model_prefix + 'sig2.npy')

    # Mean squared difference over the feature axis -> one value per event
    mse_bg_final[itrain] = np.mean((bg_out - bg_inputs)**2, axis=1)
    mse_sig_final[itrain] = np.mean((sig_out - sig_inputs)**2, axis=1)
    mse_sig2_final[itrain] = np.mean((sig2_out - sig2_inputs)**2, axis=1)

# Release the arrays of the last model
del bg_out, sig_out, sig2_out

# Define averaged mse distributions: per-event average over the models
mse_bg_final_avg = np.mean(np.array([mse_bg_final[i] for i in range(n_models)]), axis=0)
mse_sig_final_avg = np.mean(np.array([mse_sig_final[i] for i in range(n_models)]), axis=0)
mse_sig2_final_avg = np.mean(np.array([mse_sig2_final[i] for i in range(n_models)]), axis=0)

In [None]:
# ROC curve of the model-averaged mse in the SR for the first signal sample.
# Labels: background = 0, signal = 1; the score is the averaged mse.
sr_labels = np.concatenate((np.zeros(len(mse_bg_final_avg[SRmask_bg])),
                            np.ones(len(mse_sig_final_avg[SRmask_sig]))))
sr_scores = np.concatenate((mse_bg_final_avg[SRmask_bg],
                            mse_sig_final_avg[SRmask_sig]))

# Calculate fpr, tpr and thresholds in the SR
fpr,tpr,thresholds=roc_curve(sr_labels, sr_scores)

# Calculate AUC
auc_m500 = roc_auc_score(sr_labels, sr_scores)

# SIC = tpr/sqrt(fpr). ROC points with fpr == 0 give nan or inf; those are
# expected, so the floating-point warnings are silenced and the points masked.
with np.errstate(divide='ignore', invalid='ignore'):
    sic = tpr/np.sqrt(fpr)
finite_sic = np.isfinite(sic)

# SIC values and thresholds restricted to the finite points (index-aligned)
SIC_m500 = sic[finite_sic]
thresh_m500 = thresholds[finite_sic]

# Threshold that maximizes the SIC curve. `best` is an index into the
# finite-only arrays, NOT into the full fpr/tpr/thresholds arrays.
best = np.argmax(SIC_m500)
maxsiccut_sig = thresh_m500[best]

# Print max of SIC curve and the corresponding tpr.
# The tpr must be taken from the finite-only selection as well: indexing the
# full `tpr` with `best` (as was done before) picks a shifted ROC point.
print(np.max(SIC_m500))
print(tpr[finite_sic][best])
print('AUC = %.3f' % auc_m500)

# Keep the full (unfiltered) ROC arrays of this signal for later cells
fpr500=fpr
tpr500=tpr


In [None]:
# ROC curve of the model-averaged mse in the SR for the second signal sample.
# Labels: background = 0, signal = 1; the score is the averaged mse.
sr_labels = np.concatenate((np.zeros(len(mse_bg_final_avg[SRmask_bg])),
                            np.ones(len(mse_sig2_final_avg[SRmask_sig2]))))
sr_scores = np.concatenate((mse_bg_final_avg[SRmask_bg],
                            mse_sig2_final_avg[SRmask_sig2]))

# Calculate fpr, tpr and thresholds in the SR
fpr,tpr,thresholds=roc_curve(sr_labels, sr_scores)

# Calculate AUC
auc_m300 = roc_auc_score(sr_labels, sr_scores)

# SIC = tpr/sqrt(fpr). ROC points with fpr == 0 give nan or inf; those are
# expected, so the floating-point warnings are silenced and the points masked.
with np.errstate(divide='ignore', invalid='ignore'):
    sic = tpr/np.sqrt(fpr)
finite_sic = np.isfinite(sic)

# SIC values and thresholds restricted to the finite points (index-aligned)
SIC_m300 = sic[finite_sic]
thresh_m300 = thresholds[finite_sic]

# Threshold that maximizes the SIC curve. `best` is an index into the
# finite-only arrays, NOT into the full fpr/tpr/thresholds arrays.
best = np.argmax(SIC_m300)
maxsiccut_sig2 = thresh_m300[best]

# Print max of SIC curve and the corresponding tpr.
# The tpr must be taken from the finite-only selection as well: indexing the
# full `tpr` with `best` (as was done before) picks a shifted ROC point.
print(np.max(SIC_m300))
print(tpr[finite_sic][best])
print('AUC = %.3f' % auc_m300)

# Keep the full (unfiltered) ROC arrays of this signal for later cells
fpr300=fpr
tpr300=tpr

In [None]:
# Save preprocessing information for test data: row 0 is the background mean,
# row 1 the background standard deviation returned by make_data.
pre_info = np.array([bg_mean, bg_std])

In [None]:
#######################################################################################################
##########################################   Save to file   ###########################################
#######################################################################################################    

#n_background = len(bgdata[5])
#n_signal = len(sigdata[5])
#n_features = 10 + 1

#save_path = '/home/pablo/Documentos/lbl/olympics/anomaly_detection_analysis/AE_new_data/'

# Create dataset and classes to store important information
#file_m500 = h5py.File(save_path + 'test_info_m500.hdf5', 'w')

# Save mse distribution and AUC
#bg_data_tofile = file_m500.create_dataset('bg_data', (n_background, n_features), dtype='f8')
#sg_data_tofile = file_m500.create_dataset('sg_data', (n_signal, n_features), dtype='f8')
#mse_bg_tofile = file_m500.create_dataset('mse_bg', (n_background, ), dtype='f8')
#mse_sg_tofile = file_m500.create_dataset('mse_sg', (n_signal, ), dtype='f8')
#pre_info_tofile = file_m500.create_dataset('preprocessing', (2, n_features-1), dtype='f8')

# Fill dataset
#bg_data_tofile[:] = bgdata[5]
#sg_data_tofile[:] = sigdata[5]
#mse_bg_tofile[:] = mse_bg_final_avg
#mse_sg_tofile[:] = mse_sig_final_avg
#pre_info_tofile[:] = pre_info

#file_m500.close()


# Create dataset and classes to store important information
#file_m300 = h5py.File(save_path + 'test_info_m300.hdf5', 'w')

# Save mse distribution and AUC
#bg_data_tofile = file_m300.create_dataset('bg_data', (n_background, n_features), dtype='f8')
#sg_data_tofile = file_m300.create_dataset('sg_data', (n_signal, n_features), dtype='f8')
#mse_bg_tofile = file_m300.create_dataset('mse_bg', (n_background, ), dtype='f8')
#mse_sg_tofile = file_m300.create_dataset('mse_sg', (n_signal, ), dtype='f8')
#pre_info_tofile = file_m300.create_dataset('preprocessing', (2, n_features-1), dtype='f8')

# Fill dataset
#bg_data_tofile[:] = bgdata[5]
#sg_data_tofile[:] = sigdata2[5]
#mse_bg_tofile[:] = mse_bg_final_avg
#mse_sg_tofile[:] = mse_sig2_final_avg
#pre_info_tofile[:] = pre_info

#file_m300.close()

In [None]:
# Load input features from file
#h5_results = h5py.File(save_path + 'test_info_m500.hdf5', 'r')

#bg_evs = h5_results['bg_data'][:]
#sg_evs = h5_results['sg_data'][:]
#bg_mse = h5_results['mse_bg'][:]
#sg_mse = h5_results['mse_sg'][:]
#pre = h5_results['preprocessing'][:]

#h5_results.close()

In [None]:
# Define a function that returns a significance level
from fit_utils import perform_fit, perform_fit2
from scipy.stats import poisson, norm, kstest, skewnorm


def Zscorefunc(obs,pred,sigma):
    """Return ``sqrt(2*LLR)`` for an observed value, a prediction and its uncertainty.

    Parameters
    ----------
    obs : observed value; it appears inside a logarithm as ``2*obs`` in the
        denominator, so it is expected to be non-zero.
    pred : predicted value.
    sigma : uncertainty on the prediction; enters only through ``sigma**2``,
        which is used as a divisor.

    Returns
    -------
    ``np.sqrt(2*LLR)`` where ``LLR`` is the closed-form expression below.
    Scalars and NumPy arrays are both accepted because only NumPy
    element-wise operations are used.
    """
    variance = sigma**2
    shifted_pred = pred - variance
    # Square-root term that appears twice in the expression
    root = np.sqrt(shifted_pred**2 + 4*obs*variance)

    bracket = (
        -2*obs
        + shifted_pred**2/variance
        + 2*variance
        + shifted_pred*(4 - root/variance)
        - 4*obs*np.log((shifted_pred + root)/(2*obs))
    )
    LLR = 0.25*bracket
    return np.sqrt(2*LLR)


In [None]:
# Define mJJ
# mJJ is the last column of each processed sample (entry 5 of the dictionaries)
mjj_bg, mjj_sig, mjj_sig2 = (
    sample[5][:, -1] for sample in (bgdata, sigdata, sigdata2)
)

In [None]:
###################################################################################################################
###################################################################################################################
##################################################               ##################################################
##################################################     Cut 1     ##################################################
##################################################               ##################################################
###################################################################################################################
###################################################################################################################

In [None]:
# Fit with full dataset, no cuts
#mask_sr=[3,4]
#def fit_func(y,p0,p1,p2,p3,p4,p5,p6,p7):
#    return np.exp(p0+p1*y+p2*y**2+p3*y**3+p4*y**4+p5*y**5+p6*y**6+p7*y**7)
#def fit_func(x,p0,p1,p2,p3,p4,p5,p6):
#    return (p3+p4*x+p5*x**2+p6*x**3)*skewnorm.pdf((x-p0)/p1,p2)\
#def fit_func(x,p0,p1,p2,p3,p4):
#    return p0*(1-x)**(p1+p4*(np.log(1-x)))/(x**(p2+p3*np.log(x)))

def fit_func(x,p0,p1):
    """First-degree polynomial ``p0 + p1*x``, used as the fit model in this cell.

    ``x`` may be a scalar or a NumPy array; ``p0`` is the constant term and
    ``p1`` the coefficient of ``x``.
    """
    linear_term = p1*x
    return p0 + linear_term

# Cut 1 scan. For each injected signal size `nsig` and for each signal sample:
#   1. histogram mJJ of background + first `nsig` signal events (no cut),
#   2. find the mse value above which 1% of the SR events remain,
#   3. histogram mJJ again after requiring mse > that value,
#   4. fit the sideband bins with `fit_func` through `perform_fit`,
#   5. record p-values, S/B, S/sqrt(B) and the ROC point closest to the cut.
# All plt.hist / plt.axvline calls draw on the current figure.
pvallist_cut_sig=[]
pvallist_cut_sig2=[]
info_m500_cut_1 = []
info_m300_cut_1 = []
SIC_info_m500_cut_1 = []
SIC_info_m300_cut_1 = []

# 20 bins over [3000, 4000] are 50 wide, so bins 7-12 cover 3350-3650 (the SR);
# every other bin is sideband.
nbins=20
mask_sr=[7,8,9,10,11,12]
mask_sb=[x for x in range(nbins) if x not in mask_sr]

nback = len(mjj_bg)

# thresh_m500/SIC_m500 (and the m300 equivalents) only hold the ROC points with
# a finite SIC, whereas tpr500/tpr300 hold every ROC point. An index found in
# the filtered thresholds must therefore be applied to an equally filtered tpr;
# indexing the unfiltered tpr (as was done before) returns a shifted ROC point.
with np.errstate(divide='ignore', invalid='ignore'):
    tpr500_finite = tpr500[np.isfinite(tpr500/np.sqrt(fpr500))]
    tpr300_finite = tpr300[np.isfinite(tpr300/np.sqrt(fpr300))]

for nsig in range(10,895,15):
    print(nsig)

    ##########  m = 500 GeV  ##########

    # Before cut: define bins and count number of signal and background events in each bin
    counts_bg_raw,bins,_=plt.hist(mjj_bg,range=[3000,4000],bins=nbins,alpha=0.3)
    counts_sig_raw,bins,_=plt.hist(mjj_sig[:nsig],range=[3000,4000],bins=nbins,alpha=0.3)
    # Before cut: count number of signal and background events in the SR
    Sinit=np.sum(counts_sig_raw[mask_sr])
    Binit=np.sum(counts_bg_raw[mask_sr])

    # Find the mse threshold above which only 1% of the SR events survive:
    # the value at position int(0.99 * N_SR) of the sorted SR mse values.
    n_cut = int((Sinit + Binit)*0.99)
    mse_bg_SR = mse_bg_final_avg[(mjj_bg > 3350) & (mjj_bg < 3650)]
    mse_sg_SR = mse_sig_final_avg[:nsig][(mjj_sig[:nsig] > 3350) & (mjj_sig[:nsig] < 3650)]
    mse_sorted = np.sort(np.concatenate((mse_bg_SR, mse_sg_SR), axis=0))
    cut_1_mse_m500 = mse_sorted[n_cut]

    # Signal efficiency and SIC at the ROC point whose threshold is closest to
    # this cut (index is into the finite-SIC selection of the ROC curve).
    roc_idx_m500 = np.abs(thresh_m500 - cut_1_mse_m500).argmin()
    eff_m500 = tpr500_finite[roc_idx_m500]
    SIC_1_percent_m500 = SIC_m500[roc_idx_m500]

    print('mse threshold: %.5f' % cut_1_mse_m500)
    print(Sinit)
    print(Binit)

    # After cut: define bins and count number of signal and background events in each bin
    counts_bg,bins,_=plt.hist(mjj_bg[mse_bg_final_avg>cut_1_mse_m500],range=[3000,4000],bins=nbins,alpha=0.3)
    counts_sig,bins,_=plt.hist(mjj_sig[:nsig][mse_sig_final_avg[:nsig]>cut_1_mse_m500],range=[3000,4000],bins=nbins,alpha=0.3)
    # After cut: count number of signal and background events in the SR
    Safter=np.sum(counts_sig[mask_sr])
    Bafter=np.sum(counts_bg[mask_sr])

    # After cut: total number of events (s+b) in each bin, its statistical
    # error, and the bin centres divided by 14000 as the fit variable.
    ydata=(counts_bg+counts_sig).astype('float64')
    yerr=np.sqrt(counts_bg+counts_sig).astype('float64')
    xdata=(0.5*(bins[:-1]+bins[1:])/14000).astype('float64')

    # Mark the SR edges on the figure
    plt.axvline(bins[mask_sr[0]])
    plt.axvline(bins[mask_sr[-1]+1])

    # Make the fit
    popt,pcov,ydata_fit,pval = perform_fit(ydata,yerr,xdata,fit_func,mask_sb,mask_sr)

    # Check that the function is a good fit to the sideband
    residuals = (ydata - ydata_fit)/yerr
    redchisq=np.sum((residuals[mask_sb])**2/(len(mask_sb)-len(popt)))
    print("GOODNESS OF FIT ",redchisq)
    # norm.isf(p) is the inverse survival function: same quantity as
    # norm.ppf(1-p) but without losing precision when p is tiny.
    pvallist_cut_sig.append([nsig,Sinit/Binit,Sinit/np.sqrt(Binit),pval,norm.isf(pval), Safter/np.sqrt(Bafter)])

    # Save useful information
    S_B = Sinit/Binit
    S_sqrt_B_initial = Sinit/np.sqrt(Binit)
    S_sqrt_B_after = Safter/np.sqrt(Bafter)
    # norm.sf(z) is the upper-tail probability 1 - cdf(z), evaluated without the
    # cancellation that makes 1-norm.cdf(z) collapse to 0 for large z.
    pval_expected_initial = norm.sf(S_sqrt_B_initial)
    pval_expected_after = norm.sf(S_sqrt_B_after)
    selection_m500 = (Safter + Bafter)/(Sinit + Binit)*100

    info_m500_cut_1.append([S_B, S_sqrt_B_initial, S_sqrt_B_after, pval, pval_expected_initial, pval_expected_after, selection_m500])
    SIC_info_m500_cut_1.append([S_B, eff_m500, SIC_1_percent_m500])


    ##########  m = 300 GeV  ##########

    # Before cut: define bins and count number of signal and background events in each bin
    counts_bg_raw,bins,_=plt.hist(mjj_bg,range=[3000,4000],bins=nbins,alpha=0.3)
    counts_sig2_raw,bins,_=plt.hist(mjj_sig2[:nsig],range=[3000,4000],bins=nbins,alpha=0.3)
    # Before cut: count number of signal and background events in the SR
    Sinit=np.sum(counts_sig2_raw[mask_sr])
    Binit=np.sum(counts_bg_raw[mask_sr])

    # Find the mse threshold above which only 1% of the SR events survive
    n_cut = int((Sinit + Binit)*0.99)
    mse_bg_SR = mse_bg_final_avg[(mjj_bg > 3350) & (mjj_bg < 3650)]
    mse_sg_SR = mse_sig2_final_avg[:nsig][(mjj_sig2[:nsig] > 3350) & (mjj_sig2[:nsig] < 3650)]
    mse_sorted = np.sort(np.concatenate((mse_bg_SR, mse_sg_SR), axis=0))
    cut_1_mse_m300 = mse_sorted[n_cut]

    # Signal efficiency and SIC at the ROC point whose threshold is closest to
    # this cut (index is into the finite-SIC selection of the ROC curve).
    roc_idx_m300 = np.abs(thresh_m300 - cut_1_mse_m300).argmin()
    eff_m300 = tpr300_finite[roc_idx_m300]
    SIC_1_percent_m300 = SIC_m300[roc_idx_m300]

    # After cut: define bins and count number of signal and background events in each bin
    counts_bg,bins,_=plt.hist(mjj_bg[mse_bg_final_avg>cut_1_mse_m300],range=[3000,4000],bins=nbins,alpha=0.3)
    counts_sig2,bins,_=plt.hist(mjj_sig2[:nsig][mse_sig2_final_avg[:nsig]>cut_1_mse_m300],range=[3000,4000],bins=nbins,alpha=0.3)
    # After cut: count number of signal and background events in the SR
    Safter=np.sum(counts_sig2[mask_sr])
    Bafter=np.sum(counts_bg[mask_sr])

    # After cut: define the total number of events (s+b) in each bin, and the statistical error in each bin
    ydata=(counts_bg+counts_sig2).astype('float64')
    yerr=np.sqrt(counts_bg+counts_sig2).astype('float64')
    xdata=(0.5*(bins[:-1]+bins[1:])/14000).astype('float64')
    plt.axvline(bins[mask_sr[0]])
    plt.axvline(bins[mask_sr[-1]+1])

    # Make the fit
    popt,pcov,ydata_fit,pval = perform_fit(ydata,yerr,xdata,fit_func,mask_sb,mask_sr)

    # Check that the function is a good fit to the sideband
    residuals = (ydata - ydata_fit)/yerr
    redchisq=np.sum((residuals[mask_sb])**2/(len(mask_sb)-len(popt)))
    print("GOODNESS OF FIT ",redchisq)
    pvallist_cut_sig2.append([nsig,Sinit/Binit,Sinit/np.sqrt(Binit),pval,norm.isf(pval), Safter/np.sqrt(Bafter)])

    # Save useful information
    S_B = Sinit/Binit
    S_sqrt_B_initial = Sinit/np.sqrt(Binit)
    S_sqrt_B_after = Safter/np.sqrt(Bafter)
    pval_expected_initial = norm.sf(S_sqrt_B_initial)
    pval_expected_after = norm.sf(S_sqrt_B_after)
    selection_m300 = (Safter + Bafter)/(Sinit + Binit)*100

    info_m300_cut_1.append([S_B, S_sqrt_B_initial, S_sqrt_B_after, pval, pval_expected_initial, pval_expected_after, selection_m300])
    SIC_info_m300_cut_1.append([S_B, eff_m300, SIC_1_percent_m300])

# Transform lists to arrays (one row per nsig value)
info_m500_cut_1 = np.array(info_m500_cut_1)
info_m300_cut_1 = np.array(info_m300_cut_1)
SIC_info_m500_cut_1 = np.array(SIC_info_m500_cut_1)
SIC_info_m300_cut_1 = np.array(SIC_info_m300_cut_1)

In [None]:
# Percentage of SR events kept by the cut. selection_m500/selection_m300 are
# overwritten on every pass of the scan loop, so these are the values of the
# last iteration (nsig = 880) only.
print('Event selection for m500: %.2f%%' % selection_m500)
print('Event selection for m300: %.2f%%' % selection_m300)

In [None]:
# Column 1 of the SIC_info arrays is the signal efficiency (eff_m500 / eff_m300)
# recorded for every nsig value of the scan.
print('Signal efficiency for this cut on m500:\n%s\n' % SIC_info_m500_cut_1[:,1])
print('Signal efficiency for this cut on m300:\n%s' % SIC_info_m300_cut_1[:,1])

In [None]:
# Column 2 of the SIC_info arrays is the SIC value at the ROC point closest to
# the cut, recorded for every nsig value of the scan.
print('SIC value for this cut on m500:\n%s\n' % SIC_info_m500_cut_1[:,2])
print('SIC value for this cut on m300:\n%s' % SIC_info_m300_cut_1[:,2])

In [None]:
# mse thresholds of the last scan iteration (nsig = 880): first line is the
# m500 value, second line the m300 value.
# NOTE(review): the message says "based on the SIC curve", but cut_1_mse_* is
# computed in the scan as the mse value at the 99% position of the sorted SR
# events, and the two lines carry no m500/m300 label — consider rewording.
print('Threshold based on the SIC curve: mse = %.3f' % cut_1_mse_m500)
print('Threshold based on the SIC curve: mse = %.3f' % cut_1_mse_m300)

In [None]:
# p-value versus S/B for the m500 signal.
# Columns of info_m500_cut_1: 0 = S/B, 3 = p-value from the fit,
# 4 = expected p-value from S/sqrt(B) before the cut, 5 = same after the cut.
# NOTE(review): the orange and blue curves plot identical columns (3 and 5)
# under different labels, so blue is drawn on top of orange — confirm which
# pair of labels is intended and drop the other.
plt.plot(info_m500_cut_1[:,0], info_m500_cut_1[:,3], label='1% overall',color='orange')
plt.plot(info_m500_cut_1[:,0], info_m500_cut_1[:,5], label='1%, S/sqrtB',color='orange',ls='dashed')

plt.plot(info_m500_cut_1[:,0], info_m500_cut_1[:,3], label='optimal cut, fit',color='blue')
plt.plot(info_m500_cut_1[:,0], info_m500_cut_1[:,5], label='optimal cut, S/sqrtB',color='blue',ls='dashed')
plt.plot(info_m500_cut_1[:,0], info_m500_cut_1[:,4], label='no cut, S/sqrtB',color='red',ls='dashed')

# Horizontal guide lines at the one-sided Gaussian tail probability of 1..8 sigma
for sigma in range(1,9):
    # norm.sf is the upper-tail probability; unlike 1-norm.cdf(sigma) it does
    # not suffer from cancellation for the large-sigma lines.
    p_sigma = norm.sf(sigma)
    plt.axhline(p_sigma,linestyle='dashed',color='black')
    # Raw string for the LaTeX command: '\s' in a normal string literal is an
    # invalid escape sequence.
    sigmastring = r'$' + str(sigma) + r'\sigma$'
    plt.text(0.0015,p_sigma*1.1,sigmastring,va='bottom',ha='center',fontsize=12)

plt.yscale('log')
plt.ylim(1e-15,1)
plt.title('m=500 GeV')
plt.legend()
plt.xlabel('S/B in SR')
plt.ylabel('p value')

In [None]:
# p-value versus S/B for the m300 signal.
# Columns of info_m300_cut_1: 0 = S/B, 3 = p-value from the fit,
# 4 = expected p-value from S/sqrt(B) before the cut, 5 = same after the cut.
plt.plot(info_m300_cut_1[:,0], info_m300_cut_1[:,3], label='optimal cut, fit',color='blue')
plt.plot(info_m300_cut_1[:,0], info_m300_cut_1[:,5], label='optimal cut, S/sqrtB',color='blue',ls='dashed')
plt.plot(info_m300_cut_1[:,0], info_m300_cut_1[:,4], label='no cut, S/sqrtB',color='red',ls='dashed')

# Horizontal guide lines at the one-sided Gaussian tail probability of 1..8 sigma
for sigma in range(1,9):
    # norm.sf is the upper-tail probability; unlike 1-norm.cdf(sigma) it does
    # not suffer from cancellation for the large-sigma lines.
    p_sigma = norm.sf(sigma)
    plt.axhline(p_sigma,linestyle='dashed',color='black')
    # Raw string for the LaTeX command: '\s' in a normal string literal is an
    # invalid escape sequence.
    sigmastring = r'$' + str(sigma) + r'\sigma$'
    plt.text(0.0015,p_sigma*1.1,sigmastring,va='bottom',ha='center',fontsize=12)

plt.yscale('log')
plt.ylim(1e-15,1)
plt.title('m=300 GeV')
plt.legend()
plt.xlabel('S/B in SR')
plt.ylabel('p value')

In [None]:
# Check the fraction of selected events in the SR for each benchmark of the
# m500 signal: the last column of info_m500_cut_1 is selection_m500, i.e.
# (Safter + Bafter)/(Sinit + Binit) expressed in percent, one value per nsig.
info_m500_cut_1[:,-1]

In [None]:
# Check the fraction of selected events in the SR for each benchmark of the
# m300 signal: the last column of info_m300_cut_1 is selection_m300, i.e.
# (Safter + Bafter)/(Sinit + Binit) expressed in percent, one value per nsig.
info_m300_cut_1[:,-1]

In [None]:
###################################################################################################################
###################################################################################################################
##################################################               ##################################################
##################################################     Cut 2     ##################################################
##################################################               ##################################################
###################################################################################################################
###################################################################################################################

In [None]:
# Fit with full dataset, no cuts
#mask_sr=[3,4]
#def fit_func(y,p0,p1,p2,p3,p4,p5,p6,p7):
#    return np.exp(p0+p1*y+p2*y**2+p3*y**3+p4*y**4+p5*y**5+p6*y**6+p7*y**7)
#def fit_func(x,p0,p1,p2,p3,p4,p5,p6):
#    return (p3+p4*x+p5*x**2+p6*x**3)*skewnorm.pdf((x-p0)/p1,p2)\
#def fit_func(x,p0,p1,p2,p3,p4):
#    return p0*(1-x)**(p1+p4*(np.log(1-x)))/(x**(p2+p3*np.log(x)))

def fit_func(x,p0,p1):
    """First-degree polynomial ``p0 + p1*x``.

    ``x`` may be a scalar or a NumPy array; ``p0`` is the constant term and
    ``p1`` the coefficient of ``x``.
    """
    # NOTE(review): an identical fit_func is already defined in the "Cut 1"
    # cell earlier in this notebook; this definition shadows it.
    linear_term = p1*x
    return p0 + linear_term

#mask_sr=[4,5]
#mask_sr=[11,12,13,14,15,16]
pvallist_cut_sig=[]
pvallist_cut_sig2=[]
info_m500_cut_2 = []
info_m300_cut_2 = []
SIC_info_m500_cut_2 = []
SIC_info_m300_cut_2 = []

nbins=32
#mask_sr=[7,8,9,10,11,12]
#mask_sr=[9,10,11,12,13,14]
mask_sr=[11,12,13,14,15,16]
mask_sb=[x for x in range(nbins) if x not in mask_sr] 

nback = len(mjj_bg)

for nsig in range(10,895,15):
    print(nsig)
    
    ##########  m = 500 GeV  ##########

    # Before cut: define bins and count number of signal and background events in each bin
    counts_bg_raw,bins,_=plt.hist(mjj_bg,range=[2800,4400],bins=nbins,alpha=0.3)
    counts_sig_raw,bins,_=plt.hist(mjj_sig[:nsig],range=[2800,4400],bins=nbins,alpha=0.3)
    # Before cut: count number of signal and background events in the SR
    Sinit=np.sum(counts_sig_raw[mask_sr])
    Binit=np.sum(counts_bg_raw[mask_sr])
    
    # Find the mse threshold above which only 0.1% of the SR events survive
    n_cut = int((Sinit + Binit)*0.999)
    mse_bg_SR = mse_bg_final_avg[(mjj_bg > 3350) & (mjj_bg < 3650)]
    mse_sg_SR = mse_sig_final_avg[:nsig][(mjj_sig[:nsig] > 3350) & (mjj_sig[:nsig] < 3650)]
    mse_sorted = np.sort(np.concatenate((mse_bg_SR, mse_sg_SR), axis=0))
    cut_2_mse_m500 = mse_sorted[n_cut]
    
    # Calculate signal efficiency that corresponds to this threshold
    #up_bound_m500 = np.abs(SIC_m500 - maxsiccut_sig).argmin()
    #eff_m500 = tpr500[:up_bound_m500][np.abs(SIC_m500[:up_bound_m500] - cut_2_mse_m500).argmin()]
    
    eff_m500 = tpr500[np.abs(thresh_m500 - cut_2_mse_m500).argmin()]
    SIC_01_percent_m500 = SIC_m500[np.abs(thresh_m500 - cut_2_mse_m500).argmin()]
    
    #print('mse threshold: %.5f' % cut_2_mse_m500)
    
    # After cut: define bins and count number of signal and background events in each bin
    counts_bg,bins,_=plt.hist(mjj_bg[mse_bg_final_avg>cut_2_mse_m500],range=[2800,4400],bins=nbins,alpha=0.3)
    counts_sig,bins,_=plt.hist(mjj_sig[:nsig][mse_sig_final_avg[:nsig]>cut_2_mse_m500],range=[2800,4400],bins=nbins,alpha=0.3)
    # After cut: count number of signal and background events in the SR
    Safter=np.sum(counts_sig[mask_sr])
    Bafter=np.sum(counts_bg[mask_sr])

    # After cut: define the total number of events (s+b) in each SR bin, and the statistical error in each SR bin
    ydata=(counts_bg+counts_sig).astype('float64')
    yerr=np.sqrt(counts_bg+counts_sig).astype('float64')
    xdata=(0.5*(bins[:-1]+bins[1:])/14000).astype('float64')

    plt.axvline(bins[mask_sr[0]])
    plt.axvline(bins[mask_sr[-1]+1])

    # Make the fit
    popt,pcov,ydata_fit,pval = perform_fit(ydata,yerr,xdata,fit_func,mask_sb,mask_sr)
    
    # Check that the function is a good fit to the sideband
    residuals = (ydata - ydata_fit)/yerr
    redchisq=np.sum((residuals[mask_sb])**2/(len(mask_sb)-len(popt)))
    print("GOODNESS OF FIT ",redchisq)
    pvallist_cut_sig.append([nsig,Sinit/Binit,Sinit/np.sqrt(Binit),pval,norm.ppf(1-pval), Safter/np.sqrt(Bafter)])
    
    # Save useful information
    S_B = Sinit/Binit
    S_sqrt_B_initial = Sinit/np.sqrt(Binit)
    S_sqrt_B_after = Safter/np.sqrt(Bafter)
    pval_expected_initial = 1-norm.cdf(S_sqrt_B_initial)
    pval_expected_after = 1-norm.cdf(S_sqrt_B_after)
    selection_m500 = (Safter + Bafter)/(Sinit + Binit)*100
    
    print(cut_2_mse_m500)
    
    info_m500_cut_2.append([S_B, S_sqrt_B_initial, S_sqrt_B_after, pval, pval_expected_initial, pval_expected_after, selection_m500])
    SIC_info_m500_cut_2.append([S_B, eff_m500, SIC_01_percent_m500])
    
    
    ##########  m = 300 GeV  ##########

    # Before cut: define bins and count number of signal and background events in each bin
    counts_bg_raw,bins,_=plt.hist(mjj_bg,range=[2800,4400],bins=nbins,alpha=0.3)
    counts_sig2_raw,bins,_=plt.hist(mjj_sig2[:nsig],range=[2800,4400],bins=nbins,alpha=0.3)
    # Before cut: count number of signal and background events in the SR
    Sinit=np.sum(counts_sig2_raw[mask_sr])
    Binit=np.sum(counts_bg_raw[mask_sr])
    
    # Find the mse threshold above which only 0.1% of the SR events survive
    n_cut = int((Sinit + Binit)*0.999)
    mse_bg_SR = mse_bg_final_avg[(mjj_bg > 3350) & (mjj_bg < 3650)]
    mse_sg_SR = mse_sig2_final_avg[:nsig][(mjj_sig2[:nsig] > 3350) & (mjj_sig2[:nsig] < 3650)]
    mse_sorted = np.sort(np.concatenate((mse_bg_SR, mse_sg_SR), axis=0))
    cut_2_mse_m300 = mse_sorted[n_cut]
    
    # Calculate signal efficiency that corresponds to this threshold
    #up_bound_m300 = np.abs(SIC_m300 - maxsiccut_sig).argmin()
    #eff_m300 = tpr300[:up_bound_m300][np.abs(SIC_m300[:up_bound_m300] - cut_2_mse_m300).argmin()]
    
    eff_m300 = tpr300[np.abs(thresh_m300 - cut_2_mse_m300).argmin()]
    SIC_01_percent_m300 = SIC_m300[np.abs(thresh_m300 - cut_2_mse_m300).argmin()]

    # After cut: define bins and count number of signal and background events in each bin
    counts_bg,bins,_=plt.hist(mjj_bg[mse_bg_final_avg>cut_2_mse_m300],range=[2800,4400],bins=nbins,alpha=0.3)
    counts_sig2,bins,_=plt.hist(mjj_sig2[:nsig][mse_sig2_final_avg[:nsig]>cut_2_mse_m300],range=[2800,4400],bins=nbins,alpha=0.3)
    # After cut: count number of signal and background events in the SR
    Safter=np.sum(counts_sig2[mask_sr])
    Bafter=np.sum(counts_bg[mask_sr])
    
    # After cut: define the total number of events (s+b) in each bin, and the statistical error in each bin
    ydata=(counts_bg+counts_sig2).astype('float64')
    yerr=np.sqrt(counts_bg+counts_sig2).astype('float64')
    xdata=(0.5*(bins[:-1]+bins[1:])/14000).astype('float64')
    plt.axvline(bins[mask_sr[0]])
    plt.axvline(bins[mask_sr[-1]+1])

    # Make the fit
    popt,pcov,ydata_fit,pval = perform_fit(ydata,yerr,xdata,fit_func,mask_sb,mask_sr)
    
    #Check that the function is a good fit to the sideband
    residuals = (ydata - ydata_fit)/yerr
    redchisq=np.sum((residuals[mask_sb])**2/(len(mask_sb)-len(popt)))
    print("GOODNESS OF FIT ",redchisq)
    pvallist_cut_sig2.append([nsig,Sinit/Binit,Sinit/np.sqrt(Binit),pval,norm.ppf(1-pval), Safter/np.sqrt(Bafter)])
    
    # Save useful information
    S_B = Sinit/Binit
    S_sqrt_B_initial = Sinit/np.sqrt(Binit)
    S_sqrt_B_after = Safter/np.sqrt(Bafter)
    pval_expected_initial = 1-norm.cdf(S_sqrt_B_initial)
    pval_expected_after = 1-norm.cdf(S_sqrt_B_after)
    selection_m300 = (Safter + Bafter)/(Sinit + Binit)*100
    
    info_m300_cut_2.append([S_B, S_sqrt_B_initial, S_sqrt_B_after, pval, pval_expected_initial, pval_expected_after, selection_m300])
    SIC_info_m300_cut_2.append([S_B, eff_m300, SIC_01_percent_m300])
    
# Transform lists to arrays
info_m500_cut_2 = np.array(info_m500_cut_2)
info_m300_cut_2 = np.array(info_m300_cut_2)
SIC_info_m500_cut_2 = np.array(SIC_info_m500_cut_2)
SIC_info_m300_cut_2 = np.array(SIC_info_m300_cut_2)

In [None]:
# Selection percentages left over from the last scan point of the Cut 2 loop
print(
    'Event selection for m500: %.2f%%' % selection_m500,
    'Event selection for m300: %.2f%%' % selection_m300,
    sep='\n',
)

In [None]:
# Column 1 of the SIC info arrays: signal efficiency at the cut, one entry per scan point
print(
    'Signal efficiency for this cut on m500:\n%s\n' % SIC_info_m500_cut_2[:,1],
    'Signal efficiency for this cut on m300:\n%s' % SIC_info_m300_cut_2[:,1],
    sep='\n',
)

In [None]:
# Column 2 of the SIC info arrays: SIC value at the cut, one entry per scan point
print(
    'SIC value for this cut on m500:\n%s\n' % SIC_info_m500_cut_2[:,2],
    'SIC value for this cut on m300:\n%s' % SIC_info_m300_cut_2[:,2],
    sep='\n',
)

In [None]:
# mse thresholds of the 0.1% SR cut. cut_2_mse_m500 / cut_2_mse_m300 are overwritten at every
# scan point of the Cut 2 loop, so these are the values of the last (largest nsig) point.
# The labels now name the signal sample: previously both lines printed the same text and
# attributed the threshold to the SIC curve, which is not how it is derived.
print('Threshold of the 0.1%% SR cut for m500: mse = %.3f' % cut_2_mse_m500)
print('Threshold of the 0.1%% SR cut for m300: mse = %.3f' % cut_2_mse_m300)

In [None]:
# p-value versus S/B in the SR for the m500 signal.
# Columns of info_m500_cut_2: 0 = S/B, 3 = p-value from the fit, 4 = expected p-value from
# S/sqrt(B) before the cut, 5 = expected p-value from S/sqrt(B) after the cut.

# NOTE(review): info_m500_cut_4 is not filled by the Cut 2 scan, is absent from the summary of
# scenarios and has no counterpart in the m300 plot — confirm it is still defined on a fresh
# kernel, otherwise drop these two curves.
plt.plot(info_m500_cut_4[:,0], info_m500_cut_4[:,3], label='0.1% small range',color='orange')
plt.plot(info_m500_cut_4[:,0], info_m500_cut_4[:,5], label='0.1%, S/sqrtB',color='orange',ls='dashed')

plt.plot(info_m500_cut_2[:,0], info_m500_cut_2[:,3], label='optimal cut, fit',color='blue')
plt.plot(info_m500_cut_2[:,0], info_m500_cut_2[:,5], label='optimal cut, S/sqrtB',color='blue',ls='dashed')
plt.plot(info_m500_cut_2[:,0], info_m500_cut_2[:,4], label='no cut, S/sqrtB',color='red',ls='dashed')

# Reference lines at the one-sided Gaussian tail probability of 1..8 sigma
for sigma in range(1,9):
    # Survival function instead of 1-cdf: no cancellation error in the far tail
    p_sigma = norm.sf(sigma)
    plt.axhline(p_sigma,linestyle='dashed',color='black')
    # One raw string for the whole label: '\s' in a plain literal is an invalid escape sequence
    sigmastring = r'$%d\sigma$' % sigma
    plt.text(0.0015,p_sigma*1.1,sigmastring,va='bottom',ha='center',fontsize=12)

plt.yscale('log')
plt.ylim(1e-15,1)
plt.title('m=500 GeV')
plt.legend()
plt.xlabel('S/B in SR')
plt.ylabel('p value');

In [None]:
# p-value versus S/B in the SR for the m300 signal.
# Columns of info_m300_cut_2: 0 = S/B, 3 = p-value from the fit, 4 = expected p-value from
# S/sqrt(B) before the cut, 5 = expected p-value from S/sqrt(B) after the cut.
plt.plot(info_m300_cut_2[:,0], info_m300_cut_2[:,3], label='optimal cut, fit',color='blue')
plt.plot(info_m300_cut_2[:,0], info_m300_cut_2[:,5], label='optimal cut, S/sqrtB',color='blue',ls='dashed')
plt.plot(info_m300_cut_2[:,0], info_m300_cut_2[:,4], label='no cut, S/sqrtB',color='red',ls='dashed')

# Reference lines at the one-sided Gaussian tail probability of 1..8 sigma
for sigma in range(1,9):
    # Survival function instead of 1-cdf: no cancellation error in the far tail
    p_sigma = norm.sf(sigma)
    plt.axhline(p_sigma,linestyle='dashed',color='black')
    # One raw string for the whole label: '\s' in a plain literal is an invalid escape sequence
    sigmastring = r'$%d\sigma$' % sigma
    plt.text(0.0015,p_sigma*1.1,sigmastring,va='bottom',ha='center',fontsize=12)

plt.yscale('log')
plt.ylim(1e-15,1)
plt.title('m=300 GeV')
plt.legend()
plt.xlabel('S/B in SR')
plt.ylabel('p value');

In [None]:
# Percentage of SR events (signal + background) that survive cut 2, one entry per benchmark
# point of the m500 scan (the last column of info_m500_cut_2 holds selection_m500).
info_m500_cut_2[:,-1]

In [None]:
# Percentage of SR events (signal + background) that survive cut 2, one entry per benchmark
# point of the m300 scan (the last column of info_m300_cut_2 holds selection_m300).
info_m300_cut_2[:,-1]

In [None]:
# Summary of scenarios:
# 1) Cut 1: 1% cut on SR events, fit range [3000, 4000]         |  Best one   |  sigma = 0.56 for S/B = 0
# 2) Cut 2: 0.1% cut on SR events, fit range [2800, 4400]       |  Best one   |  sigma = 1.06 for S/B = 0

In [None]:
#######################################################################################################
##########################################   Save to file   ###########################################
#######################################################################################################    

save_path = '/home/pablo/Documentos/lbl/olympics/anomaly_detection_analysis/AE_new_data/'

# One output file per signal mass, each holding the SIC info of cut 1 (dataset 'cut_1_percent')
# and cut 2 (dataset 'cut_01_percent'). For cut 2 the columns are
# [S/B, signal efficiency at the cut, SIC at the cut]; cut 1 presumably follows the same layout.
sic_outputs = (
    ('AE_cuts_fit_SIC_info_m500.hdf5', SIC_info_m500_cut_1, SIC_info_m500_cut_2),
    ('AE_cuts_fit_SIC_info_m300.hdf5', SIC_info_m300_cut_1, SIC_info_m300_cut_2),
)

for file_name, sic_cut_1, sic_cut_2 in sic_outputs:
    # The context manager closes the file even if a write fails
    with h5py.File(save_path + file_name, 'w') as out_file:
        # data= sizes each dataset from the array actually written. The previous version took the
        # shapes from SIC_info_*_cut_3 / SIC_info_*_cut_6, names left over from an older numbering.
        out_file.create_dataset('cut_1_percent', data=sic_cut_1, dtype='f8')
        out_file.create_dataset('cut_01_percent', data=sic_cut_2, dtype='f8')

In [None]:
#######################################################################################################
##########################################   Save to file   ###########################################
#######################################################################################################    

save_path = '/home/pablo/Documentos/lbl/olympics/anomaly_detection_analysis/AE_new_data/'

# One output file per signal mass, each holding the per-benchmark fit summary of cut 1 and cut 2.
# For cut 2 the columns are [S/B, S/sqrt(B) before cut, S/sqrt(B) after cut, fit p-value,
# expected p-value before cut, expected p-value after cut, selection in %]; cut 1 presumably
# follows the same layout.
fit_outputs = (
    ('m500', info_m500_cut_1, info_m500_cut_2),
    ('m300', info_m300_cut_1, info_m300_cut_2),
)

for mass_tag, fit_cut_1, fit_cut_2 in fit_outputs:
    # The context manager closes the file even if a write fails
    with h5py.File(save_path + 'AE_pvalues_from_fit_info_%s.hdf5' % mass_tag, 'w') as out_file:
        # data= sizes each dataset from the array that is written into it
        out_file.create_dataset('fit_%s_cut_1' % mass_tag, data=fit_cut_1, dtype='f8')
        out_file.create_dataset('fit_%s_cut_2' % mass_tag, data=fit_cut_2, dtype='f8')