In [None]:
import pandas as pd
import numpy as np
from bff_processor.data_tools import regex_select, get_files, make_df
from bff_processor.utils import time_func, hist2unc, linear_old, parabola, significance, power_func, apply_multiple_filters
linear = linear_old
from bff_processor.bff_meta import preselection, band_cut, isin, identity
from bff_processor.plotting_utils import produce_bff_hists, boost_plot, boost_plot2d, unc_plot
import boost_histogram as bh
import matplotlib.pyplot as plt
import mplhep as hep
hep.style.use(hep.style.CMS)

In [None]:
# Route all matplotlib text rendering through LaTeX.
plt.rcParams["text.usetex"] = True

In [None]:
# set which eras you want, 9999 == all eras
# NOTE(review): in the visible cells `era` only ends up in output file names; the
# regex below always selects 2016-2018 regardless of its value — confirm.
era = 9999
# Raw string: `\.` must reach the regex engine as an escaped literal dot. In a
# normal string literal it is an invalid escape sequence (a warning today, an
# error in future Python versions).
file_re = r'tw_(?:2016|2017|2018).+\.csv'
#get files matching regex
file_dict = get_files(file_re)
file_dict

In [None]:
#make df of background
# Join the file collections of the four background groups returned by get_files
# (presumably lists of file names — verify against get_files).
backgrounds = file_dict['DY']+file_dict['TT']+file_dict['ST']+file_dict['VB']
# Load the files with make_df, then pass the frame through the callable that
# preselection() returns.
background_df = preselection()(make_df(backgrounds))

In [None]:
# Group the signal (BFF) file names by sample. Keys look like "<mass> <d>.<dd>"
# (e.g. "125 0.04"), built from the `_M_<mass>_dbs<d>p<dd>` tag in the file name.
import re
bff_dict_names = {}
for fname in file_dict['BFF']:
    # raw string so that `\d` is a regex digit class, not an invalid string escape
    name = re.findall(r'_M_([0-9]+)_dbs(\d)p(\d+)', fname)
    # ignore file names that do not carry exactly one mass/dbs tag
    if len(name) != 1: continue
    name = "{} {}.{}".format(*name[0])
    bff_dict_names.setdefault(name, []).append(fname)

In [None]:
# Build one preselected dataframe per signal sample. A sample is kept only when
# it has exactly three files (one per era 16/17/18) and '0.04' occurs in its name.
bff_dict = {}
for name, item in bff_dict_names.items():
    if len(item) == 3 and '0.04' in name:
        print(name)
        bff_dict[name] = preselection()(make_df(item))
# samples (and their left-to-right order) used by the plotting functions below
bff_samples = ['125 0.04', '150 0.04', '200 0.04', '350 0.04', '500 0.04']
bff_dict.keys()

In [None]:
#defines plots to make, and ranges to plot them over
# Each entry is [column name, [three numbers]]; the list is handed unchanged to
# produce_bff_hists by the functions below.
# NOTE(review): the triple looks like (n_bins, low, high) — confirm against
# produce_bff_hists.
columns = [
    ['DiLepMass', [139,105,800]],
    ['TMB_nom',       [80,0, 800]],
    ['HTLT_nom',      [100,-500,500]],
    ['RelMET_nom',    [100, 0,1]]
]

In [None]:
#optimize RelMET cut:
def column_1d_sig(sig_df, bck_df, mass_band=1, filter_func=lambda x: x):
    """Histogram signal and background inside a DiLepMass window around the signal.

    Both frames are passed through ``filter_func`` and then restricted with
    ``band_cut`` to ``mean +/- mass_band * std`` of the signal ``DiLepMass``.

    NOTE(review): mean and std are computed on the *unfiltered* ``sig_df``, not
    on the filtered signal — confirm this is intended.

    Returns ``(bck_1d, sig_1d, bck_2d, sig_2d)`` as produced by
    ``produce_bff_hists`` for the module-level ``columns``.
    """
    filtered_sig = filter_func(sig_df)
    filtered_bck = filter_func(bck_df)

    # window centre and half-width from the signal mass distribution
    dilep_mass = sig_df['DiLepMass']
    center = dilep_mass.mean()
    half_width = dilep_mass.std()*mass_band
    lower, upper = center-half_width, center+half_width

    windowed_sig = band_cut('DiLepMass', lower, upper)(filtered_sig)
    windowed_bck = band_cut('DiLepMass', lower, upper)(filtered_bck)

    bck_1d, bck_2d = produce_bff_hists(windowed_bck, "", columns, weight='Weight')
    sig_1d, sig_2d = produce_bff_hists(windowed_sig, "", columns, weight='Weight')
    return bck_1d, sig_1d, bck_2d, sig_2d

In [None]:
def calc_sig_cut(s,b, direction=1):
    '''Significance (s/sqrt(s+b), via ``significance``) of a cut placed at each bin.

    The per-bin yields are replaced by running totals before the significance is
    evaluated. With ``direction=1`` bin i holds the sum of bins 0..i; with
    ``direction=-1`` the accumulation runs from the other end, so bin i holds
    the sum of bins i..last.
    '''
    from itertools import accumulate

    def _running_total(counts):
        # accumulate in the requested direction, then restore the original bin order
        ordered = counts[::direction]
        return np.array(list(accumulate(ordered)))[::direction]

    return significance(_running_total(s), _running_total(b))

In [None]:
from scipy.stats import norm
def map_signf(value, signf, centers):
    '''Gaussian-smoothed lookup of the significance curve at ``value``.

    Every bin centre is weighted with a normal pdf of its distance to ``value``
    (sigma = one bin width, taken as ``centers[1] - centers[0]``). The weighted
    mean of ``signf`` is formed and its ``nominal_value`` returned, so the
    entries of ``signf`` must produce an object with that attribute
    (presumably ``uncertainties`` values — verify against the caller).
    '''
    bin_width = centers[1] - centers[0]
    distance = np.abs(centers - value)
    kernel = norm.pdf(distance, loc=0, scale=bin_width)
    # 1e-12 is added to every weight so the denominator can never be zero
    normalisation = np.sum(kernel+1e-12)
    smoothed = np.dot(signf, kernel)/normalisation
    return smoothed.nominal_value


In [None]:
def fit_sig(masses, sigs, centers, fit_function, *popt):
    '''Objective for the cut-function fit: minus the summed significance.

    For every (mass, significance curve) pair the cut position
    ``fit_function(mass, *popt)`` is evaluated and the smoothed significance at
    that position is read off with ``map_signf``. The negative sum of those
    values (each offset by 1e-12) is returned, so minimising it maximises the
    total significance.
    '''
    signf_at_cut = [
        map_signf(fit_function(mass, *popt), curve, centers)+1e-12
        for mass, curve in zip(masses, sigs)
    ]
    # np.power(masses, 0) is a vector of ones: every mass point has equal weight
    unit_weights = np.power(masses, 0)
    return np.dot(-np.array(signf_at_cut), unit_weights)

In [None]:
from scipy.optimize import curve_fit, minimize, differential_evolution
from bff_processor.utils import vunc2nom, vunc2std
import uncertainties

In [None]:
def plot_opt_sig(column,bff_dict, background_df, func2fit, minimize_func=True,  filter_func=lambda x: x, postfix="",direction=1, p0=[0,.5], title_size=20):
    '''Plot distributions and cut significance per signal sample and fit a cut function.

    Top row: background and signal histograms of ``column`` inside a DiLepMass
    window (``column_1d_sig`` with ``mass_band=2``). Bottom row: significance of
    a cut at every bin (``calc_sig_cut``) with the fitted cut location marked.

    Parameters
    ----------
    column : str
        Histogrammed column; must be one of the module-level ``columns``.
    bff_dict : dict
        Sample name -> signal dataframe. Samples listed in ``bff_samples`` but
        missing here are skipped and their subplot column is left empty.
    background_df : DataFrame
        Background events.
    func2fit : callable
        ``f(mass, *params)`` giving the cut position as a function of mass.
    minimize_func : bool
        If True, ``params`` are fitted with ``scipy.optimize.minimize`` starting
        from ``p0``; if False, ``p0`` is used unchanged.
    filter_func : callable
        Selection applied to signal and background before histogramming.
    postfix : str
        Suffix of the output file name.
    direction : int
        Forwarded to ``calc_sig_cut`` (1 or -1, accumulation direction).
    p0 : sequence of float
        Initial parameters; never modified.
    title_size : int
        Font size of the subplot titles.

    Returns
    -------
    numpy.ndarray
        Parameters of ``func2fit``.

    Raises
    ------
    ValueError
        If none of ``bff_samples`` is present in ``bff_dict``.

    Side effects: prints the optimiser result and the significance at each cut,
    and writes ``output/sig_tuning/{era}_{column}_{postfix}.pdf``.
    '''
    # Keep the subplot column of every available sample, so that the markers
    # drawn after the fit land on the right axes even when samples are skipped.
    available = [(j, m) for j, m in enumerate(bff_samples) if m in bff_dict]
    if not available:
        raise ValueError("none of the samples in bff_samples are present in bff_dict")

    # squeeze=False keeps `ax` two-dimensional even for a single sample
    fig,ax = plt.subplots(2,len(bff_samples), figsize=[38,15], squeeze=False)
    sigs = []
    masses = []
    for j, m in available:
        mass = int(re.findall('([0-9]+)',m)[0])
        bck_1d_hist, sig_1d_hist, _, _ = column_1d_sig(bff_dict[m], background_df, mass_band=2,filter_func=filter_func)

        top_ax = ax[0,j]
        top_ax.set_xlabel(column.replace('_',' '))
        boost_plot(top_ax, bck_1d_hist[column] ,label='bck')
        boost_plot(top_ax, sig_1d_hist[column] ,label='sig')
        title = r'{} GeV $\delta_{{bs}}$ {}'.format(*m.split(' '))
        top_ax.set_title(title, fontsize=title_size)
        top_ax.legend()

        s,b = hist2unc(sig_1d_hist[column]), hist2unc(bck_1d_hist[column])
        signf_val = calc_sig_cut(s,b, direction=direction)

        # All samples are histogrammed with the module-level `columns`, so the
        # centres of the last sample are reused for every sample after the loop
        # (presumably identical binning — verify against produce_bff_hists).
        centers = sig_1d_hist[column].axes[0].centers
        unc_plot(ax[1][j], centers, signf_val, zorder=1)
        peak_index = np.argmax(signf_val)
        peak_center = centers[peak_index]
        peak_height = signf_val[peak_index]

        sigs.append(signf_val)
        masses.append(mass)

        # Uncertainty of the peak position: largest distance from the peak to the
        # outermost bins whose significance + 1 std still exceeds the peak height.
        upper_limit = vunc2nom(signf_val)+vunc2std(signf_val)
        in_range = centers[upper_limit > peak_height]
        sigma_range = np.asarray((in_range[0], in_range[-1]))
        unc = np.max(np.abs(sigma_range-peak_center))

        ax[1][j].set_title("max: {:.2f}+/-{:.2f}".format(peak_center, unc), fontsize=title_size)

    ax[0][0].set_ylabel('Counts')
    ax[1][0].set_ylabel(r'Significance $\frac{s}{\sqrt{s+b}}$')

    if minimize_func:
        fit_func = lambda popt: fit_sig(masses, sigs, centers, func2fit, *popt)
        min_val = minimize(fit_func, p0)
        print(min_val)
        popt = min_val.x
    else:
        # no fit requested: evaluate and return the supplied parameters
        popt = np.asarray(p0, dtype=float)
    masses = np.array(masses)
    cut_points = func2fit(masses, *popt)

    for (j, _), c, s in zip(available, cut_points, sigs):
        sig_at_cut = map_signf(c, s, centers)
        print(sig_at_cut, c)
        ax[1][j].scatter([c], [sig_at_cut], zorder=1,s=150, c='red', marker=(5, 1), label='cut loc.')
        ax[1][j].legend(loc=4)
    fig.savefig("output/sig_tuning/{}_{}_{}.pdf".format(era, column, postfix), transparent=False)
    return popt

In [None]:
# Fit the RelMET cut first, separately for the SR1 ('_one_jet') and
# SR2 ('_two_jet') selections, with a power-law cut function.
popt_relmet_1 = plot_opt_sig(
    'RelMET_nom', bff_dict, background_df, power_func,
    filter_func=isin('SR1_nom'), postfix='_one_jet', p0=[300,-1.33],
)
popt_relmet_2 = plot_opt_sig(
    'RelMET_nom', bff_dict, background_df, power_func,
    filter_func=isin('SR2_nom'), postfix='_two_jet', p0=[300,-1.2],
)

In [None]:
def band_cut2d(column1, column2, function, low=[0,-np.inf], high=[0,np.inf]):
    '''Build a filter keeping rows that lie between two curves in 2d space.

    Parameters
    ----------
    column1, column2 : str
        Columns used as x and y coordinate.
    function : callable
        ``f(x, *params)`` evaluated on ``df[column1]``; it is called once with
        ``low`` and once with ``high`` as parameters.
    low, high : sequence
        Parameters of the lower and upper boundary curve.

    Returns
    -------
    callable
        ``cut_df(df)`` returning the rows with
        ``function(x, *low) < y < function(x, *high)`` (strict on both sides).
    '''
    # Snapshot the parameters so that later mutation of the caller's lists (or
    # of the shared default lists) cannot change an already-built filter.
    low, high = tuple(low), tuple(high)
    def cut_df(df):
        x, y = df[column1], df[column2]
        return df[(function(x,*low) < y) & (function(x,*high) > y)]
    return cut_df

In [None]:
def replace_bf_metadata(bf_dict, name1, name2):
    '''Set the metadata of the first two axes of every histogram in ``bf_dict``.

    ``axes[0].metadata`` becomes ``name1`` and ``axes[1].metadata`` becomes
    ``name2``. The histograms are modified in place and the same dict is returned.
    '''
    for hist in bf_dict.values():
        for position, label in enumerate((name1, name2)):
            hist.axes[position].metadata = label
    return bf_dict

def visualize_2d(col2, bff_dict, background_df, name='', filter_func=lambda x: x, title_size=20,**kwargs):
    '''Draw the 2d (DiLepMass, ``col2``) histogram for background and each signal sample.

    The 2d histograms come from ``produce_bff_hists``; its 2d output must contain
    the key ``'DiLepMass_<col2>'``, otherwise the lookup fails.

    Parameters
    ----------
    col2 : str
        Column plotted against DiLepMass.
    bff_dict : dict
        Sample name -> signal dataframe. Samples of ``bff_samples`` that are not
        in this dict are skipped.
    background_df : DataFrame
        Background events (first panel).
    name : str
        Passed to ``produce_bff_hists`` and used in the output file name.
    filter_func : callable
        Selection applied to every dataframe before histogramming.
    title_size : int
        Font size of the signal panel titles.
    **kwargs
        Accepted for call compatibility; not used.

    Side effect: writes ``output/sig_tuning/{name}_DiLepMass_{col2}_2d.pdf``.
    '''
    col1= 'DiLepMass'
    hist_key = '{}_{}'.format(col1,col2)
    axis_label = col2.replace('_',' ')

    def _hist2d(df):
        # filter, histogram, and relabel the axes of the 2d histograms
        _, hists = produce_bff_hists(filter_func(df), name, columns, weight='Weight')
        return replace_bf_metadata(hists, col1, axis_label)

    test_2d = _hist2d(background_df)
    # skip samples that were not loaded instead of failing with a KeyError
    sig_plts = {m: _hist2d(bff_dict[m]) for m in bff_samples if m in bff_dict}

    # squeeze=False keeps the axes indexable even when only the background panel is drawn
    fig,ax = plt.subplots(1,1+len(sig_plts), figsize=[38,7], squeeze=False)
    ax = ax[0]
    boost_plot2d(ax[0], test_2d[hist_key],log=1)
    ax[0].set_title('background')
    for i, (m,tdict) in enumerate(sig_plts.items()):
        boost_plot2d(ax[i+1], tdict[hist_key],log=1)
        ax[i+1].set_ylabel('')
        ax[i+1].set_title(r'{} GeV $\delta_{{bs}}$ {}'.format(*m.split(' ')), fontsize=title_size)
    fig.savefig('output/sig_tuning/{}_{}_{}_2d.pdf'.format(name,col1,col2))

In [None]:
# RelMET vs DiLepMass before any RelMET cut, for the SR1 and SR2 selections.
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='oneJet_nocut', filter_func=isin('SR1_nom'),
)
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='twoJet_nocut', filter_func=isin('SR2_nom'),
)

In [None]:
# Show the fitted RelMET cut parameters as the cell output
# (SR1 / '_one_jet' fit first, SR2 / '_two_jet' fit second).
popt_relmet_1, popt_relmet_2

In [None]:
def RelMET_filter(df, col, *popt):
    '''Boolean mask: True where ``df[col]`` is below ``power_func(DiLepMass, *popt)``.'''
    threshold = power_func(df.DiLepMass, *popt)
    return threshold > df[col]

# RelMET masks using the fitted parameters. The parameters are looked up when
# the function is called, not when it is defined.
def RelMET_filter1(df, col):
    return RelMET_filter(df, col, *popt_relmet_1)

def RelMET_filter2(df, col):
    return RelMET_filter(df, col, *popt_relmet_2)

# full selection: signal-region flag combined with the RelMET mask
def RelMET_f1_SR1_nom(df):
    return apply_multiple_filters(df, [df.SR1_nom, RelMET_filter1(df, 'RelMET_nom')])

def RelMET_f2_SR2_nom(df):
    return apply_multiple_filters(df, [df.SR2_nom, RelMET_filter2(df, 'RelMET_nom')])

# RelMET vs DiLepMass after the RelMET cut
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='oneJet_cut', filter_func=RelMET_f1_SR1_nom,
)
visualize_2d(
    'RelMET_nom', bff_dict, background_df,
    name='twoJet_cut', filter_func=RelMET_f2_SR2_nom,
)

In [None]:
# Fit the HTLT cut (linear in mass) on events that already pass the RelMET selection.
popt_htlt_1 = plot_opt_sig(
    'HTLT_nom', bff_dict, background_df, linear,
    filter_func=RelMET_f1_SR1_nom, postfix='_one_jet', p0=[-1.13,351],
)
popt_htlt_2 = plot_opt_sig(
    'HTLT_nom', bff_dict, background_df, linear,
    filter_func=RelMET_f2_SR2_nom, postfix='_two_jet', p0=[-1.11,332],
)

In [None]:
# HTLT vs DiLepMass after the RelMET selection, before the HTLT cut.
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='oneJet_nocut', filter_func=RelMET_f1_SR1_nom,
)
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='twoJet_nocut', filter_func=RelMET_f2_SR2_nom,
)

In [None]:
def HTLT_filter(df, col, *popt):
    '''Boolean mask: True where ``df[col]`` is below ``linear(DiLepMass, *popt)``.'''
    threshold = linear(df.DiLepMass, *popt)
    return threshold > df[col]

# HTLT masks using the fitted parameters (looked up at call time).
def HTLT_filter1(df, col):
    return HTLT_filter(df, col, *popt_htlt_1)

def HTLT_filter2(df, col):
    return HTLT_filter(df, col, *popt_htlt_2)


# full selection: signal-region flag, RelMET mask and HTLT mask
def HTLT_f1_SR1_nom(df):
    return apply_multiple_filters(df, [
        df.SR1_nom,
        RelMET_filter1(df, 'RelMET_nom'),
        HTLT_filter1(df, 'HTLT_nom'),
    ])

def HTLT_f2_SR2_nom(df):
    return apply_multiple_filters(df, [
        df.SR2_nom,
        RelMET_filter2(df, 'RelMET_nom'),
        HTLT_filter2(df, 'HTLT_nom'),
    ])

# HTLT vs DiLepMass after the HTLT cut
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='oneJet_cut', filter_func=HTLT_f1_SR1_nom,
)
visualize_2d(
    'HTLT_nom', bff_dict, background_df,
    name='twoJet_cut', filter_func=HTLT_f2_SR2_nom,
)

In [None]:
def heaviside(x, cutoff, scale):
    '''Step function: 0 for ``x <= cutoff`` and ``scale`` for ``x > cutoff``.'''
    # second argument of np.heaviside is the value taken exactly at x == cutoff
    step = np.heaviside(x-cutoff, 0)
    return step*scale

In [None]:
# Fit the TMB cut (step function in mass) on events passing the RelMET and HTLT
# selections; direction=-1 accumulates the significance from the last bin down.
# NOTE(review): the step function is piecewise constant in `cutoff`, so the
# optimiser in plot_opt_sig may leave that parameter near p0 — check the
# printed optimiser result.
popt_TMB_1 = plot_opt_sig(
    'TMB_nom', bff_dict, background_df, heaviside,
    filter_func=HTLT_f1_SR1_nom, postfix='_one_jet', direction=-1, p0=[300,100],
)
popt_TMB_2 = plot_opt_sig(
    'TMB_nom', bff_dict, background_df, heaviside,
    filter_func=HTLT_f2_SR2_nom, postfix='_two_jet', direction=-1, p0=[225,100],
)

In [None]:
# TMB vs DiLepMass after the RelMET and HTLT selections, before the TMB cut.
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='oneJet_nocut', filter_func=HTLT_f1_SR1_nom,
)
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='twoJet_nocut', filter_func=HTLT_f2_SR2_nom,
)

In [None]:
# Show the fitted TMB cut parameters of the '_two_jet' fit as the cell output.
popt_TMB_2

In [None]:
def TMB_filter(df, col, *popt):
    '''Boolean mask: True where ``df[col]`` is above ``heaviside(DiLepMass, *popt)``.'''
    floor = heaviside(df.DiLepMass, *popt)
    return floor < df[col]

# TMB masks using the fitted parameters (looked up at call time).
def TMB_filter1(df, col):
    return TMB_filter(df, col, *popt_TMB_1)

def TMB_filter2(df, col):
    return TMB_filter(df, col, *popt_TMB_2)


# full selection: signal-region flag plus RelMET, HTLT and TMB masks
def TMB_f1_SR1_nom(df):
    return apply_multiple_filters(df, [
        df.SR1_nom,
        RelMET_filter1(df, 'RelMET_nom'),
        HTLT_filter1(df, 'HTLT_nom'),
        TMB_filter1(df, 'TMB_nom'),
    ])

def TMB_f2_SR2_nom(df):
    return apply_multiple_filters(df, [
        df.SR2_nom,
        RelMET_filter2(df, 'RelMET_nom'),
        HTLT_filter2(df, 'HTLT_nom'),
        TMB_filter2(df, 'TMB_nom'),
    ])


# TMB vs DiLepMass after the TMB cut
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='oneJet_cut', filter_func=TMB_f1_SR1_nom,
)
visualize_2d(
    'TMB_nom', bff_dict, background_df,
    name='twoJet_cut', filter_func=TMB_f2_SR2_nom,
)

In [None]:
import dill as pickle

In [None]:
# Fitted cut parameters, keyed by the variable name they were fitted under.
# Only the parameter arrays are stored; the cut functions and filter callables
# themselves are not part of this dict.
param_dict = dict(
    popt_TMB_1=popt_TMB_1,
    popt_TMB_2=popt_TMB_2,
    popt_htlt_1=popt_htlt_1,
    popt_htlt_2=popt_htlt_2,
    popt_relmet_1=popt_relmet_1,
    popt_relmet_2=popt_relmet_2,
)

In [None]:
import yaml
# YAML has no native representation for numpy arrays: dumping them directly
# emits python-object tags that yaml.safe_load (and non-Python readers) cannot
# parse. Convert every parameter set to a plain list of floats first.
with open('bff_cut.yml', 'w') as f:
    yaml.dump({key: np.asarray(val).tolist() for key, val in param_dict.items()}, f)

In [None]:
# Also store the parameters in binary form. `pickle` here is the dill module
# (imported as `import dill as pickle`), so the file is written by dill.
with open('bff_cut.dill', 'wb') as f:
    pickle.dump(param_dict, f)

In [None]:
# Read the file written above back in as a round-trip check.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('bff_cut.dill', 'rb') as f:
    param_dict_loaded = pickle.load(f)

In [None]:
# Show the stored cut parameters as the cell output.
param_dict