In [None]:
import pandas as pd
import numpy as np
from bff_processor.data_tools import regex_select, get_files, make_df
from bff_processor.utils import time_func, hist2unc, linear, parabola 
from bff_processor.bff_meta import preselection, band_cut, isin, identity
from bff_processor.plotting_utils import produce_bff_hists, boost_plot, boost_plot2d, unc_plot
import boost_histogram as bh
import matplotlib.pyplot as plt
import mplhep as hep
# Select the CMS style object provided by mplhep for the plots in this notebook.
hep.set_style(hep.style.CMS)

In [None]:
# Era tag. In this notebook it is only used to build the file name of the
# saved significance plots.
# NOTE(review): 9999 is presumably a placeholder / combined-era tag; the
# per-era pattern from regex_select(era) is not used here — confirm.
era = 9999
# Raw string: `\.` must reach the regex engine as an escaped literal dot.
# In a normal string literal it is an invalid escape sequence, which newer
# Python versions warn about.
file_re = r'tw_.+\.csv'
file_dict = get_files(file_re)

In [None]:
# Concatenate the file lists of all background groups, load them into one
# frame and run the preselection on it.
backgrounds = (
    file_dict['DY']
    + file_dict['TT']
    + file_dict['ST']
    + file_dict['VB']
)
background_df = preselection()(make_df(backgrounds))

In [None]:
# Build one preselected frame per signal point.
import re
bff_dict = {}
for fname in file_dict['BFF']:
    # Raw pattern (so `\d` is a regex class, not an invalid string escape):
    # captures the mass and the two digit groups around the "p" of the
    # "dbs" value, e.g. "_M_125_dbs0p04" -> ("125", "0", "04").
    name = re.findall(r'_M_([0-9]+)_dbs(\d)p(\d+)', fname)[0]
    # Sample key of the form "<mass> <int>.<frac>", e.g. "125 0.04".
    name = "{} {}.{}".format(*name)
    bff_dict.setdefault(name, []).append(fname)
for name, item in bff_dict.items():
    # Only signal points with exactly three files are turned into frames.
    # Every other entry stays a plain list of file names, so bff_dict can
    # hold both lists and frames after this loop.
    if len(item) != 3: continue
    print(item)
    # Replacing the value of an existing key is safe while iterating.
    bff_dict[name] = preselection()(make_df(item))
# Signal points used by the optimisation and plotting functions below; each
# name is used as a key into bff_dict.
bff_samples = ['125 0.04', '200 0.04','350 0.04','500 0.04']
bff_dict.keys()

In [None]:
# Histogram specification handed to produce_bff_hists: one
# [column name, axis spec] pair per variable.
# NOTE(review): each axis spec looks like [number of bins, lower edge,
# upper edge] — confirm against produce_bff_hists.
columns = [
    ['DiLepMass',  [139, 105, 800]],
    ['TMB_nom',    [80, 0, 800]],
    ['HTLT_nom',   [100, -500, 500]],
    ['RelMET_nom', [100, 0, 1]],
]

In [None]:
def significance(sig,bck):
    """Return ``sig / sqrt(sig + bck)`` with a 1e-12 offset under the root.

    The offset keeps the denominator away from zero when both yields vanish.
    The power operator is used (rather than a sqrt function) so the inputs
    only need to support ``+``, ``/`` and ``**``.
    """
    total = sig + bck + 1e-12
    return sig / total ** .5

In [None]:
def column_1d_sig(sig_df, bck_df, mass_band=1, filter_func=lambda x: x):
    """Histogram signal and background inside a DiLepMass window around the signal.

    Both frames are first passed through ``filter_func``.  They are then
    restricted to ``mean +/- mass_band * std`` of ``DiLepMass``, where mean
    and std are taken from the *unfiltered* ``sig_df`` (plain, unweighted
    ``.mean()`` / ``.std()``).  The surviving rows are histogrammed with
    ``produce_bff_hists`` using the module-level ``columns`` specification
    and the ``'Weight'`` column.

    Returns
    -------
    tuple
        ``(bck_1d_hist, sig_1d_hist, bck_2d_hist, sig_2d_hist)``.
    """
    # Region selection.
    region_sig = filter_func(sig_df)
    region_bck = filter_func(bck_df)

    # Window definition from the signal mass distribution.
    centre = sig_df['DiLepMass'].mean()
    spread = sig_df['DiLepMass'].std()
    half_width = spread * mass_band
    low_edge, high_edge = centre - half_width, centre + half_width

    # Mass-window cut on both samples.
    window_sig = band_cut('DiLepMass', low_edge, high_edge)(region_sig)
    window_bck = band_cut('DiLepMass', low_edge, high_edge)(region_bck)

    bck_1d, bck_2d = produce_bff_hists(window_bck, "", columns, weight='Weight')
    sig_1d, sig_2d = produce_bff_hists(window_sig, "", columns, weight='Weight')
    return bck_1d, sig_1d, bck_2d, sig_2d

In [None]:
def calc_sig_cut(s,b, direction=1):
    """Significance of the cumulative signal and background yields per bin.

    With ``direction=1`` entry ``i`` holds the sum of bins ``0..i``; with
    ``direction=-1`` it holds the sum of bins ``i..end``.  The two running
    totals are passed to ``significance``.
    """
    from itertools import accumulate

    def running_total(values):
        # Walk the bins in the requested direction, accumulate, then restore
        # the original bin order.
        ordered = values[::direction]
        return np.array(list(accumulate(ordered)))[::direction]

    return significance(running_total(s), running_total(b))

In [None]:
from scipy.stats import norm
def map_signf(value, signf, centers):
    """Gaussian-smoothed lookup of a significance curve at ``value``.

    Every bin is weighted with a normal pdf of its distance to ``value``;
    the pdf width is the spacing of the first two entries of ``centers``.
    The weighted average of ``signf`` is returned.

    Parameters
    ----------
    value : position at which the curve is evaluated.
    signf : significance per bin, either plain numbers or objects exposing
        ``nominal_value`` (as the ``uncertainties`` package does).
    centers : array of bin centers, at least two entries.
        NOTE(review): the width is taken from ``centers[1] - centers[0]``
        only, so uniform spacing is assumed — confirm.

    Returns
    -------
    The nominal value of the weighted average.
    """
    distance = np.abs(centers - value)
    bin_width = centers[1] - centers[0]
    weight = norm.pdf(distance, loc=0, scale=bin_width)
    # The 1e-12 per bin guards against a vanishing weight sum when `value`
    # lies far away from every center.
    wval = np.dot(signf, weight) / np.sum(weight + 1e-12)
    # Values with uncertainties carry `nominal_value`; plain floats do not
    # and are returned unchanged instead of raising AttributeError.
    return getattr(wval, "nominal_value", wval)


In [None]:
def linear_sig(masses, sigs, centers, *popt):
    """Objective for the linear-cut fit: the negative summed significance.

    For every mass point the cut position ``linear(mass, *popt)`` is
    evaluated, the matching significance curve is sampled there with
    ``map_signf`` and offset by 1e-12.  The negated sum over all mass
    points is returned, so minimising it maximises the total significance.
    """
    signfs = [
        map_signf(linear(mass, *popt), curve, centers) + 1e-12
        for mass, curve in zip(masses, sigs)
    ]
    # Exponent 0 turns the masses into a vector of ones: every mass point
    # enters the sum with the same weight.
    unit_weights = np.power(masses, 0)
    return np.dot(-np.array(signfs), unit_weights)

In [None]:
from scipy.optimize import curve_fit, minimize
from bff_processor.utils import vunc2nom, vunc2std
import uncertainties

In [None]:
def plot_opt_sig(column,bff_dict, background_df, fit=False, filter_func=lambda x: x, postfix="",direction=1, p0=[0,.5]):
    """Plot the cut significance of ``column`` per signal sample and fit a linear cut.

    For every entry of the module-level ``bff_samples`` list the function

    * builds signal and background histograms with ``column_1d_sig``
      (``mass_band=2``) after applying ``filter_func``,
    * draws both ``column`` histograms in the top row of the figure,
    * converts them with ``hist2unc`` and evaluates ``calc_sig_cut`` to get
      the significance per bin, drawn in the bottom row,
    * records the significance curve and the sample mass (the first
      integer found in the sample name).

    Afterwards ``scipy.optimize.minimize`` is started from ``p0`` to find
    the parameters of ``linear(mass, *popt)`` that minimise ``linear_sig``.
    The resulting cut position is marked on each bottom panel and the
    figure is written to ``output/sig_tuning/<era>_<column>_<postfix>.pdf``
    (``era`` is read from module scope).

    Parameters
    ----------
    column : key of the histogram to optimise, as produced by
        ``column_1d_sig``.
    bff_dict : mapping from the names in ``bff_samples`` to signal frames.
    background_df : background frame.
    fit : if true, a ``parabola`` is fitted to the bins around the
        significance maximum; its first two parameters replace the peak
        position and height that are shown in the panel title.
    filter_func : callable applied to signal and background frames.
    postfix : last component of the output file name.
    direction : forwarded to ``calc_sig_cut``.
    p0 : start values for the minimisation (not modified here).

    Returns
    -------
    The ``x`` attribute of the minimisation result.
    """
    import os

    # One column of panels per signal sample; squeeze=False keeps `ax` 2-D
    # for any number of samples.
    fig, ax = plt.subplots(2, len(bff_samples), figsize=[30,20], squeeze=False)
    sigs = []
    masses = []
    for j, m in enumerate(bff_samples):
        mass = int(re.findall('([0-9]+)',m)[0])
        bck_1d_hist, sig_1d_hist, _, _ = column_1d_sig(bff_dict[m], background_df, mass_band=2,filter_func=filter_func)

        top_ax = ax[0,j]
        boost_plot(top_ax, bck_1d_hist[column] ,label='background')
        boost_plot(top_ax, sig_1d_hist[column] ,label='Signal {} GeV'.format(m))
        # The labels were previously swapped.
        # NOTE(review): assumes boost_plot draws the histogrammed variable
        # along x and the bin contents along y — confirm.
        top_ax.set_xlabel(column)
        top_ax.set_ylabel('Counts')
        top_ax.legend()

        s,b = hist2unc(sig_1d_hist[column]), hist2unc(bck_1d_hist[column])
        signf_val = calc_sig_cut(s,b, direction=direction)

        centers = sig_1d_hist[column].axes[0].centers
        unc_plot(ax[1][j], signf_val,centers, zorder=1)
        peak_index = np.argmax(signf_val)
        peak_center = centers[peak_index]
        peak_height = signf_val[peak_index]

        sigs.append(signf_val)
        masses.append(mass)

        if fit:
            # Clamp the lower edge at 0: a negative slice start would wrap
            # around to the end of the array and give an empty or wrong
            # fit window when the peak is within 5 bins of the left edge.
            band = slice(max(peak_index - 5, 0), peak_index + 5)
            c_band = centers[band]
            nom_band = vunc2nom(signf_val)[band]
            std_band = vunc2std(signf_val)[band]
            fit_popt, _ = curve_fit(parabola, c_band, nom_band, sigma=std_band, absolute_sigma=True)
            ax[1][j].plot(c_band, parabola(c_band, *fit_popt), zorder=2)
            # NOTE(review): assumes the first two parabola parameters are
            # the vertex position and height — confirm against `parabola`.
            peak_center = fit_popt[0]
            peak_height = fit_popt[1]

        # Uncertainty of the peak position: the largest distance from the
        # peak to a bin whose (nominal + std) value still exceeds the peak
        # height.  It is only used in the panel title.
        upper_limit = vunc2nom(signf_val)+vunc2std(signf_val)
        in_range = centers[upper_limit > peak_height]
        sigma_range = np.asarray((in_range[0], in_range[-1]))
        unc = np.max(np.abs(sigma_range-peak_center))

        ax[1][j].set_title("max: {:.2f}+/-{:.2f}".format(peak_center, unc), zorder=2)

    # `centers` is taken from the last sample; every sample is histogrammed
    # with the same `columns` specification.
    def fit_func(params):
        return linear_sig(masses, sigs, centers, *params)

    min_val = minimize(fit_func, p0)
    popt = min_val.x
    cut_points = linear(np.array(masses), *popt)
    for j, (c, s) in enumerate(zip(cut_points, sigs)):
        sig_at_cut = map_signf(c, s, centers)
        print(sig_at_cut, c)
        ax[1][j].scatter([c], [sig_at_cut], zorder=1,s=150, c='red', marker=(5, 1), label='cut location')
        ax[1][j].legend()

    out_path = "output/sig_tuning/{}_{}_{}.pdf".format(era, column, postfix)
    # Create the target directory so that a fresh checkout does not fail
    # only at the very end of the scan.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, transparent=False)
    return popt

In [None]:
# Fit the mass-dependent RelMET_nom cut separately for the SR1_nom and
# SR2_nom selections.
popt_relmet_1 = plot_opt_sig(
    'RelMET_nom', bff_dict, background_df,
    filter_func=isin('SR1_nom'),
    postfix='_one_jet',
)
popt_relmet_2 = plot_opt_sig(
    'RelMET_nom', bff_dict, background_df,
    filter_func=isin('SR2_nom'),
    postfix='_two_jet',
)

In [None]:
def band_cut2d(column1, column2, low=[0,-np.inf], high=[0,np.inf]):
    """Build a row filter that keeps ``column2`` strictly between two curves in ``column1``.

    The bounds are ``linear(df[column1], *low)`` and
    ``linear(df[column1], *high)``; rows are kept when ``column2`` is
    strictly above the first and strictly below the second.

    Parameters
    ----------
    column1 : column the bounds are evaluated on.
    column2 : column that is compared against the bounds.
    low, high : parameter sequences unpacked into ``linear``.  They are
        only read, never modified.
        NOTE(review): the defaults presumably make the respective bound
        non-restrictive, assuming ``linear(x, a, b) = a*x + b`` — confirm.

    Returns
    -------
    A function taking a frame and returning the selected rows.
    """
    def cut_df(df):
        x_values = df[column1]
        y_values = df[column2]
        above_low = linear(x_values, *low) < y_values
        below_high = linear(x_values, *high) > y_values
        return df[above_low & below_high]
    return cut_df

In [None]:
def boost_plot2d(ax, h, lock_aspect=0, log=0,**kwargs):
    """Draw a 2-D histogram on ``ax`` as a colour mesh.

    NOTE: this definition rebinds the name ``boost_plot2d`` that the import
    cell takes from ``bff_processor.plotting_utils``.

    Parameters
    ----------
    ax : axes object to draw on.
    h : histogram whose ``to_numpy()`` returns ``(values, x, y)`` and whose
        two axes carry a ``metadata`` attribute used as axis label.
    lock_aspect : if truthy, the axes aspect is set to ``"equal"``.
    log : if truthy, a logarithmic colour scale (``'PuBu_r'``) is used, with
        the lower colour limit floored at 0.1.
    **kwargs : accepted for call compatibility; not used.
    """
    w, x, y = h.to_numpy()
    counts = w.T
    if not log:
        ax.pcolormesh(x, y, counts)
    else:
        import matplotlib.colors as colors
        # A log scale needs a positive lower limit: floor it at 0.1.
        vmin = max(counts.min(), .1)
        vmax = counts.max()
        if vmax <= vmin:
            # No bin exceeds the floor (e.g. an empty selection): widen the
            # range so that LogNorm does not receive vmin > vmax, which
            # raises when the figure is drawn or saved.
            vmax = vmin * 10
        ax.pcolor(x, y, counts,
                  norm=colors.LogNorm(vmin=vmin, vmax=vmax),
                  cmap='PuBu_r', shading='auto')
    ax.set_xlabel(h.axes[0].metadata)
    ax.set_ylabel(h.axes[1].metadata)
    if lock_aspect:
        ax.set_aspect("equal")

In [None]:
def visualize_2d(col2, bff_dict, background_df, name='', filter_func=lambda x: x, **kwargs):
    """Plot DiLepMass versus ``col2`` for the background and every signal sample.

    ``filter_func`` is applied to the background frame and to each frame
    ``bff_dict[m]`` for ``m`` in the module-level ``bff_samples``.  The
    filtered frames are histogrammed with ``produce_bff_hists`` using the
    module-level ``columns`` specification.  The 2-D histograms are drawn
    side by side with a logarithmic colour scale and the figure is saved
    as ``output/sig_tuning/<name>_DiLepMass_<col2>_2d.pdf``.

    This only works if the second value returned by ``produce_bff_hists``
    is a mapping with a ``'DiLepMass_<col2>'`` entry.

    ``**kwargs`` is accepted for call compatibility and not used.
    """
    import os

    col1 = 'DiLepMass'
    hist_key = '{}_{}'.format(col1, col2)

    test_cut_bck = filter_func(background_df)
    _, test_2d = produce_bff_hists(test_cut_bck, name, columns, weight='Weight')
    sig_plts = {}
    for m in bff_samples:
        test_cut_sig = filter_func(bff_dict[m])
        _, test_2d_sig = produce_bff_hists(test_cut_sig, name, columns, weight='Weight')
        sig_plts[m] = test_2d_sig

    # First panel: background; one further panel per signal sample.
    fig, ax = plt.subplots(1, 1+len(sig_plts), figsize=[35,6])
    boost_plot2d(ax[0], test_2d[hist_key], log=1)
    ax[0].set_title('background')
    for i, (m, tdict) in enumerate(sig_plts.items()):
        boost_plot2d(ax[i+1], tdict[hist_key], log=1)
        ax[i+1].set_title(m)

    out_path = 'output/sig_tuning/{}_{}_{}_2d.pdf'.format(name,col1,col2)
    # Create the target directory so saving does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path)

In [None]:
# DiLepMass vs RelMET_nom maps with only the SR1_nom / SR2_nom selection applied.
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='oneJet_nocut',
    filter_func=isin('SR1_nom'),
)
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='twoJet_nocut',
    filter_func=isin('SR2_nom'),
)

In [None]:
# Bare name lookups: only the final expression of a cell is echoed, so
# these two lines display nothing.
popt_relmet_1
popt_relmet_2


def RelMET_ff1(x):
    """SR1_nom selection followed by the fitted upper RelMET_nom bound."""
    in_region = isin('SR1_nom')(x)
    return band_cut2d('DiLepMass','RelMET_nom', high=popt_relmet_1)(in_region)


def RelMET_ff2(x):
    """SR2_nom selection followed by the fitted upper RelMET_nom bound."""
    in_region = isin('SR2_nom')(x)
    return band_cut2d('DiLepMass','RelMET_nom', high=popt_relmet_2)(in_region)


# Same maps as before, now with the RelMET_nom cut applied.
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='oneJet_cut',
    filter_func=RelMET_ff1,
)
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='twoJet_cut',
    filter_func=RelMET_ff2,
)

In [None]:
# Fit the HTLT_nom cut on top of the RelMET_nom selection.  The first call
# overrides the default start values; the second uses them.
popt_htlt_1 = plot_opt_sig(
    'HTLT_nom', bff_dict, background_df,
    filter_func=RelMET_ff1,
    postfix='_one_jet',
    p0=[-1,300],
)
popt_htlt_2 = plot_opt_sig(
    'HTLT_nom', bff_dict, background_df,
    filter_func=RelMET_ff2,
    postfix='_two_jet',
)

In [None]:
# DiLepMass vs HTLT_nom maps before the HTLT_nom cut (RelMET_nom cut applied).
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='oneJet_nocut',
    filter_func=RelMET_ff1,
)
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='twoJet_nocut',
    filter_func=RelMET_ff2,
)

In [None]:
def HTLT_ff1(x):
    """RelMET_ff1 selection followed by the fitted upper HTLT_nom bound."""
    after_relmet = RelMET_ff1(x)
    return band_cut2d('DiLepMass','HTLT_nom', high=popt_htlt_1)(after_relmet)


def HTLT_ff2(x):
    """RelMET_ff2 selection followed by the fitted upper HTLT_nom bound."""
    after_relmet = RelMET_ff2(x)
    return band_cut2d('DiLepMass','HTLT_nom', high=popt_htlt_2)(after_relmet)


# Same maps as before, now with the HTLT_nom cut applied.
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='oneJet_cut',
    filter_func=HTLT_ff1,
)
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='twoJet_cut',
    filter_func=HTLT_ff2,
)

In [None]:
# Fit the TMB_nom cut on top of the HTLT_nom selection.  direction=-1 makes
# calc_sig_cut accumulate from each bin to the last one.
popt_TMB_1 = plot_opt_sig(
    'TMB_nom', bff_dict, background_df,
    filter_func=HTLT_ff1,
    postfix='_one_jet',
    direction=-1,
)
popt_TMB_2 = plot_opt_sig(
    'TMB_nom', bff_dict, background_df,
    filter_func=HTLT_ff2,
    postfix='_two_jet',
    direction=-1,
    p0=[.5,0],
)

In [None]:
# DiLepMass vs TMB_nom maps before the TMB_nom cut (HTLT_nom cut applied).
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='oneJet_nocut',
    filter_func=HTLT_ff1,
)
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='twoJet_nocut',
    filter_func=HTLT_ff2,
)

In [None]:
def TMB_ff1(x):
    """HTLT_ff1 selection followed by the fitted lower TMB_nom bound."""
    after_htlt = HTLT_ff1(x)
    return band_cut2d('DiLepMass','TMB_nom', low=popt_TMB_1)(after_htlt)


def TMB_ff2(x):
    """HTLT_ff2 selection followed by the fitted lower TMB_nom bound."""
    after_htlt = HTLT_ff2(x)
    return band_cut2d('DiLepMass','TMB_nom', low=popt_TMB_2)(after_htlt)


# Same maps as before, now with the TMB_nom cut applied.
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='oneJet_cut',
    filter_func=TMB_ff1,
)
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='twoJet_cut',
    filter_func=TMB_ff2,
)

In [None]:
# Filter callables of every cut stage, keyed by their variable name.
cut_dict = dict(
    RelMET_ff1=RelMET_ff1,
    RelMET_ff2=RelMET_ff2,
    HTLT_ff1=HTLT_ff1,
    HTLT_ff2=HTLT_ff2,
    TMB_ff1=TMB_ff1,
    TMB_ff2=TMB_ff2,
)


In [None]:
# Fitted cut parameters as plain Python lists (via .tolist()), keyed by
# their variable name, so they can be written out below.
param_dict = {
    label: fitted.tolist()
    for label, fitted in (
        ("popt_relmet_1", popt_relmet_1),
        ("popt_relmet_2", popt_relmet_2),
        ("popt_htlt_1", popt_htlt_1),
        ("popt_htlt_2", popt_htlt_2),
        ("popt_TMB_1", popt_TMB_1),
        ("popt_TMB_2", popt_TMB_2),
    )
}

In [None]:
# Echo the mapping of cut names to filter callables.
cut_dict

In [None]:
# Echo the fitted cut parameters.
param_dict

In [None]:
import pickle
# Persist the fitted parameters as a binary pickle.
# NOTE(review): the target carries a .py suffix although its content is a
# pickle, not Python source — confirm that readers expect this name before
# renaming it.
with open('bff_cut_dict.py', mode='wb') as f:
    pickle.dump(param_dict, f)

In [None]:
import yaml
# Write the same parameters as YAML text.
with open('bff_cut.yml', mode='w') as f:
    yaml.dump(param_dict, f)