In [None]:
import pandas as pd
import numpy as np
# Suppress divide-by-zero / invalid-value floating point warnings for the whole session.
# np.errstate only takes effect as a context manager (``with np.errstate(...):``);
# calling it bare builds the object and throws it away, so np.seterr is used here.
np.seterr(invalid='ignore', divide='ignore')
import matplotlib.pyplot as plt
import pickle
from src.data_tools.get_data import get_data

In [None]:
from IPython.display import display, HTML
# Inject CSS that stretches the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from src.plotting_tools.draw_stack_plot_hists import draw_bckground, draw_signals, draw_data, draw_stackplot
from src.plotting_tools.SysHist import SysHist
from src.plotting_tools.Bins import Bins, bins
from src.plotting_tools.utils import ratio_plot_template
from src.data_tools.StackPlotter import get_stack_plotter
from src.plotting_tools.latexAssets import mll

In [None]:
# Alias for the imported `bins` object; its `bin_edges` are used further down
# whenever histograms are rebinned for the ratio / ABCD plots.
split_bins = bins

In [None]:
from src.general.array_utils import moving_average, moving_sum, super_sample, super_sample_function, moving_avg_func, unp_array_to_nom_std
from src.plotting_tools.cms_format import cms_style, cms_format_fig

In [None]:
from scipy.optimize import curve_fit
from src.general.functions import power_func, power_law, make_bpoly, linear, parabola, make_bpoly_exp

In [None]:
from scipy.optimize import curve_fit
from src.general.functions import make_bpoly, lognorm, log_norm_np, log_norm_unp
from src.plotting_tools.SysHist import SysHist
import uncertainties
import uncertainties.unumpy as unp

In [None]:
# Apply the plotting style provided by src.plotting_tools.cms_format before any figure is made.
cms_style()

In [None]:
# Run configuration shared by every cell below.
era = '2018'
output_dir = 'assets_feb_23'
# All plots, tables and pickles produced by this notebook are written under outdir.
outdir = '{}/abcd'.format(output_dir)
# ismc selects the fit input: 0 -> observed data, non-zero -> simulated background.
ismc = 0
isdata = (ismc == 0)

In [None]:
# Stack plotter used for every histogram in this notebook; built with bins='none'.
# Later cells set `sp.rebin` and `sp.x_range` on this object before requesting histograms.
sp = get_stack_plotter(output_dir, era, bins='none')

In [None]:
def compare_hists(fhist, dhist, ismc_pull = False, n=10, ndof=0, log=1, color='red', flabel="", dlabel="", 
                  dhist_isdata=0, fhist_is_data=0, ratio=False,  isabcd=0, **kwargs):
    """Draw two histograms on a main panel with a ratio or pull sub-panel.

    Reads the notebook globals ``era`` (passed to ``cms_format_fig``) and ``mll``
    (x-axis label).

    Parameters
    ----------
    fhist : histogram drawn with ``fhist.draw`` (the fit / prediction).
    dhist : histogram it is compared against.
    ismc_pull : if truthy the pull denominator is sqrt(var_f + var_d) built from
        both histograms' ``std``; otherwise it is sqrt(fhist nominal).
    n : window length for the moving sums / moving average used in the pull.
    ndof : subtracted from the number of bins of ``fhist`` when normalising chi2.
    log : if truthy the main panel uses a logarithmic y axis.
    color : colour used when drawing ``dhist`` with ``dhist.draw``.
    flabel, dlabel : legend labels for ``fhist`` and ``dhist``.
    dhist_isdata : if truthy ``dhist`` is drawn as black error-bar points.
    fhist_is_data, isabcd : only select the y label of the ratio panel.
    ratio : if truthy the lower panel shows fhist/dhist, otherwise the pull.
    **kwargs : accepted and ignored.

    Returns
    -------
    dict with keys ``'chi2'`` (sum of squared pulls divided by
    ``fhist.calc_nBins() - ndof``), ``'fig'``, ``'ax'`` and ``'rax'``.
    """
    fig, ax, rax = ratio_plot_template(figsize=(10,10))

    # main panel
    fhist.draw(ax, label=flabel, zorder=3)
    if dhist_isdata:
        ax.errorbar(dhist.calc_bin_centers(), dhist.nominal, yerr=dhist.std, color='black', label=dlabel,
           ls='', marker='o', zorder=3)
    else:
        dhist.draw(ax, label=dlabel, zorder=1, color=color, errorbar=False, draw_sys=0)

    x = fhist.calc_bin_centers()

    cms_format_fig(era, ax)
    if log:
        # Matplotlib scale names are lowercase; the capitalised spelling relied on
        # case-insensitive lookup, which has been deprecated and removed.
        ax.set_yscale('log')
    ax.set_ylabel('Counts per GeV')

    # pull, computed on moving sums of n neighbouring bins
    nom1_avg = moving_sum(fhist.nominal, n = n)
    nom2_avg = moving_sum(dhist.nominal, n = n)
    var1_avg = moving_sum(fhist.std**2, n = n)
    var2_avg = moving_sum(dhist.std**2, n = n)
    x_avg = moving_average(x, n=n)
    if ismc_pull:
        pull = (nom1_avg-nom2_avg)/(var2_avg+var1_avg)**.5
    else:
        pull = (nom1_avg-nom2_avg)/nom1_avg**.5
    pullsquare = pull**2

    # lower panel
    if ratio:
        # guide lines at 0.5, 0 and 1.5
        rax.plot(x, np.full(len(x), .5) , color='black', linestyle=':')
        rax.plot(x, np.full(len(x), 0) , color='black')
        rax.plot(x, np.full(len(x), 1.5) , color='black', linestyle=':') 
        rax.set_ylim(0,2)
        if fhist_is_data:
            rax.set_ylabel('Obs./MC')
        else:
            if isabcd: rax.set_ylabel('ABCD/MC')
            else: rax.set_ylabel('Fit/MC')
        fhist.calc_ratio(dhist.nominal).draw(rax, zorder=1)
        # dhist divided by its own nominal: the reference drawn underneath the ratio
        if dhist_isdata:
            dhist.calc_ratio(dhist.nominal).draw(rax, color=color, zorder=0)
        else:
            dhist.calc_ratio(dhist.nominal).draw(rax, color=color, zorder=0, errorbar=False)
    else:
        # guide lines at +1, 0 and -1
        rax.plot(x, np.full(len(x), 1) , color='black', linestyle=':')
        rax.plot(x, np.full(len(x), 0) , color='black')
        rax.plot(x, np.full(len(x), -1) , color='black', linestyle=':')
        rax.set_ylim(-5,5)
        rax.set_ylabel('Pull')
        rax.plot(x_avg, pull)

    rax.set_xlabel('{} [GeV]'.format(mll))
    return  {'chi2': (pullsquare).sum()/(fhist.calc_nBins()-ndof),
             'fig': fig,
             'ax': ax,
             'rax': rax}
    

In [None]:
def fit_hist(func, hist, n=10, comp_hist_n =10, do_super_sample=1, ismc=False,  do_unc=1, color='red', 
             flabel="", dlabel="", dhist_isdata=0, fhist_is_data=0, **kwargs):
    """Fit ``func`` to ``hist`` with ``curve_fit`` and compare the fit to the histogram.

    Parameters
    ----------
    func : callable ``func(x, *params)`` handed to ``scipy.optimize.curve_fit``.
    hist : histogram to fit; its bin centres are the x values and ``hist.nominal``
        the y values.
    n, do_super_sample : unused; kept so existing calls keep working.
    comp_hist_n : moving-window length forwarded to ``compare_hists`` as ``n``.
    ismc : if truthy the fit is weighted with ``sigma=hist.std`` and the pull in
        ``compare_hists`` uses the variance-based denominator; otherwise the fit
        is unweighted.
    do_unc : if truthy the fitted curve and its uncertainty are obtained by
        propagating the correlated fit parameters through ``log_norm_unp``
        (note: this is done regardless of ``func``); otherwise the curve is
        ``func(x, *popt)`` with ``sqrt(curve)`` as its uncertainty.
    color, flabel, dlabel, dhist_isdata, fhist_is_data : forwarded to ``compare_hists``.
    **kwargs : forwarded to ``curve_fit`` (e.g. ``p0``, ``bounds``).

    Returns
    -------
    The dict returned by ``compare_hists`` extended with ``'popt'``, ``'pcov'``
    and ``'fit_hist'``.
    """
    x = np.array(hist.calc_bin_centers())
    # Decide on the function's own argument rather than on the notebook-global flag.
    if ismc:
        popt, pcov = curve_fit(func, x, hist.nominal, 
                       **kwargs,
                       sigma=hist.std, maxfev = int(1e6))  
    else:
        # std is not optimal for data: zero and low count bins give suboptimal error estimates
        popt, pcov = curve_fit(func, x, hist.nominal, 
               **kwargs, maxfev = int(1e6))  
    if do_unc:
        # fit parameters as correlated uncertain values
        popt_unc = uncertainties.correlated_values(popt, pcov)
        y = log_norm_unp(x, *popt_unc)
        y_nom, y_std = unp_array_to_nom_std(y)
    else: 
        y_nom = func(x, *popt)
        y_std = y_nom**.5
    # Build the fitted histogram on the input binning; normalize() followed by
    # calc_ratio(1/sum) puts it back on the scale of the input histogram's sum.
    fitted_hist = SysHist(
            y_nom,
            x*0, x*0, 
            y_std,
            np.array(hist.bin_edges)
        ).normalize().calc_ratio(1/hist.calc_sum())

    # NOTE(review): ndof is fixed at 5 while the call in this notebook passes a
    # four-element p0 - confirm the intended number of degrees of freedom.
    compare_dict = compare_hists(fitted_hist, hist, ismc_pull=ismc, n=comp_hist_n, color = color, ndof=5,
                                 flabel=flabel, dlabel=dlabel, dhist_isdata=dhist_isdata,
                                 fhist_is_data=fhist_is_data)

    return {**compare_dict, 
            "popt" : np.array(popt),
            "pcov": np.array(pcov),
            "fit_hist": fitted_hist
           }

In [None]:
# Observable that is fitted and the [bottom_value, top_value] window used for the fits.
feature = 'DiLepMass'
bottom_value, top_value = 120, 400
# Restrict the stack plotter to the same window.
sp.x_range = (bottom_value, top_value)

In [None]:
from scipy.ndimage import gaussian_filter

In [None]:
# Fit `log_norm_np` to the `feature` histogram of every control region, store the
# result in `fit_dict[reg]`, and save a pull plot and a ratio plot per region.
# Relies on notebook globals: sp, isdata, ismc, feature, bottom_value, top_value,
# split_bins, outdir and era.
comp_hist_n = 1
sp.rebin = 0
fit_dict = {}
for reg in ['CR14','CR24', 'CR10','CR20', 'CR13', 'CR23']:
    # input to the fit: observed data when isdata, otherwise the combined background
    if isdata:
        _plot_dict = sp.make_data_hist(feature, reg)
    else:
        _plot_dict = sp.combine_back(feature, reg)
    # reduce the range for fitting
    hist = _plot_dict.reduce_range(bottom=bottom_value, top=top_value)
    # MC only: add a Gaussian-smoothed copy of std on top of std (in place)
    if ismc: hist.std += gaussian_filter(hist.std,2)
    #hist.up *=0
    #hist.down *=0
    total_events = hist.nominal.sum()
    # legend labels for the fit and for the fitted histogram
    flabel='Observed Fit' if isdata else 'MC Fit'
    dlabel = 'Observed' if isdata else None
    # fit and pull plot
    
    
    # p0 and bounds describe four parameters; the first one scales with total_events
    curve_fit_chi2 = fit_hist(log_norm_np, hist, comp_hist_n=comp_hist_n, do_super_sample=0, 
                              ismc=ismc, p0=[total_events*10, .8, 80, 70], 
                              bounds = ([0, .2, 50, 50], [total_events*100, 1, 100, 100]),
                              do_unc=1, flabel=flabel, dlabel=dlabel, dhist_isdata=isdata,fhist_is_data=isdata,
                             )
    
    
    sp.draw_background(curve_fit_chi2['ax'], feature, reg, sys_label='Stat. + Sys.', errorbar=False)
    print(reg)
    fit_dict[reg] = curve_fit_chi2
    print(reg, curve_fit_chi2['chi2'],  repr(curve_fit_chi2['popt']))
    # reorder legend
    # NOTE(review): the hard-coded permutation indexes handles 0..7, so it assumes
    # at least eight legend entries exist when isdata - confirm against the drawn artists.
    handles, labels = curve_fit_chi2['ax'].get_legend_handles_labels()
    order = [0,1,2,3,4,5,7,6]
    # for MC keep the handles in their natural order
    if ismc: order = np.linspace(0,len(handles)-1, len(handles), dtype=int)
    curve_fit_chi2['ax'].legend([handles[idx] for idx in order],[labels[idx] for idx in order], 
                                ncol=2)
    curve_fit_chi2['ax'].set_ylim(bottom=1e0, top=1e4)
    curve_fit_chi2['fig'].savefig('{}/fit_data_pull_era{}_ismc{}_reg{}_bottom{}_lognorm.pdf'.format(outdir,era,ismc,reg,bottom_value))
    
    # same pull figure, zoomed to bottom_value..200 GeV and saved again
    curve_fit_chi2['ax'].set_xlim(left=bottom_value, right=200)
    curve_fit_chi2['rax'].set_xlim(left=bottom_value, right=200)
    curve_fit_chi2['fig'].savefig('{}/fit_data_pull_era{}_ismc{}_reg{}_bottom{}_lognorm_200GeVMax.pdf'.format(outdir,era,ismc,reg,bottom_value))

    ### ratio plot
    # rebin to split binning for these plots (sp.rebin is reset to 0 at the end of the iteration)
    sp.rebin = split_bins.bin_edges
    bhist = sp.combine_back(feature, reg).make_density_hist()
    # tiny offset added in place, presumably to avoid dividing by empty bins in the ratio
    bhist.nominal += 1e-10
    fit_hist_rebin = curve_fit_chi2['fit_hist'].rebin(split_bins.bin_edges).reduce_range(bottom=bottom_value, top=top_value).make_density_hist()
    ratio_dict = compare_hists(fit_hist_rebin, bhist, n=comp_hist_n, 
                               color = 'red', ndof=5, flabel=flabel, ratio=True, dhist_isdata=0, fhist_is_data=isdata)
    ratio_dict['ax'].legend()
    sp.draw_background(ratio_dict['ax'], feature, reg, sys_label='Stat. + Sys.', errorbar=False)
    if isdata:
        # overlay the observed data on both the main panel and the ratio panel
        data_plot = sp.make_data_hist(feature, reg).make_density_hist()
        ratio_dict['ax'].errorbar(data_plot.calc_bin_centers(), data_plot.nominal, yerr=data_plot.std, 
                    color='black', label='Observed', ls='', marker='o', zorder=2)
        
        ratio_dict['rax'].errorbar(data_plot.calc_bin_centers(), data_plot.nominal/bhist.nominal,
                                   yerr=data_plot.std/bhist.nominal, 
                    color='black', ls='', marker='o', zorder=.5)        
        
    # reorder legend (same caveat as above: the permutation assumes eight handles when isdata)
    handles, labels = ratio_dict['ax'].get_legend_handles_labels()
    order = [0,1,2,3,4,6,5,7]
    if ismc: order = np.linspace(0,len(handles)-1, len(handles), dtype=int)
    ratio_dict['ax'].legend([handles[idx] for idx in order],[labels[idx] for idx in order], 
                                ncol=2)

    ratio_dict['ax'].set_ylim(bottom=1e0, top=1e4)
    ratio_dict['fig'].savefig('{}/fit_mc_ratio_era{}_ismc{}_reg{}_bottom{}_lognorm.pdf'.format(outdir,era,ismc,reg,bottom_value))
    # same ratio figure, zoomed to bottom_value..200 GeV and saved again
    ratio_dict['ax'].set_xlim(left=bottom_value, right=200)
    ratio_dict['rax'].set_xlim(left=bottom_value, right=200)
    ratio_dict['fig'].savefig('{}/fit_mc_ratio_era{}_ismc{}_reg{}_bottom{}_lognorm_200GeVMax.pdf'.format(outdir,era,ismc,reg,bottom_value))
    # restore the unbinned setting for the next region's fit
    sp.rebin = 0
    




In [None]:
#### ABCD estimate built from the control-region fits

In [None]:
def make_fit_hist(template_hist, reg):
    """Evaluate the stored fit of region ``reg`` on the full (unrestricted) binning.

    Parameters
    ----------
    template_hist : ignored. The binning template is always re-read from
        ``sp.combine_back(feature, reg)``; the parameter is kept so existing
        calls keep working.
    reg : key into ``fit_dict`` whose ``'popt'`` / ``'pcov'`` are used.

    Returns
    -------
    SysHist holding the ``log_norm_unp`` curve (nominal and std) evaluated at the
    template's bin centres, with the template's bin edges.

    ``sp.x_range`` is temporarily set to ``(-inf, inf)`` and restored afterwards,
    also when an exception is raised in between.
    """
    saved_x_range = sp.x_range
    sp.x_range = (-np.inf, np.inf)
    try:
        template_hist = sp.combine_back(feature, reg)
        x = np.array(template_hist.calc_bin_centers())
        popt, pcov = fit_dict[reg]['popt'], fit_dict[reg]['pcov']
        # correlated parameters so the curve carries the fit uncertainty
        popt_unc = uncertainties.correlated_values(popt, pcov)
        y = log_norm_unp(x, *popt_unc)
        y_nom, y_std = unp_array_to_nom_std(y)
        fit_hist = SysHist(
                y_nom,
                x*0, x*0, 
                y_std,
                np.array(template_hist.bin_edges)
            )
    finally:
        # never leave the shared plotter with the widened range
        sp.x_range = saved_x_range
    return fit_hist

In [None]:
def make_abcd(nJets):
    """Combine three control-region fit histograms into the estimate A*B/C.

    A, B and C are the fit histograms of regions ``CR{nJets}0``, ``CR{nJets}3``
    and ``CR{nJets}4``. Returns a SysHist with the nominal and std of the
    combination on A's bin edges.
    """
    # make_fit_hist overwrites its first argument, so no template is passed; this
    # removes the dependency on a notebook-global histogram being defined.
    A = make_fit_hist(None, 'CR{}0'.format(nJets))
    B = make_fit_hist(None, 'CR{}3'.format(nJets))
    C = make_fit_hist(None, 'CR{}4'.format(nJets))
    abcd = A.uncertainty_std_dev()*B.uncertainty_std_dev()/C.uncertainty_std_dev()
    abcd_nom, abcd_std = unp_array_to_nom_std(abcd)
    return SysHist(abcd_nom, abcd_nom*0,abcd_nom*0,abcd_std, A.bin_edges)

In [None]:
# ABCD plots: for each jet multiplicity build the ABCD histogram, compare it to the
# combined background of the signal region in a ratio plot and a pull plot, save
# both (full range and zoomed to 200 GeV), and store the result in fit_dict['SR<nJets>'].
for nJets in [1,2]:
    reg = 'SR{}'.format(nJets)
    # make abcd hist
    # NOTE(review): make_abcd reads the global `bhist` before it is reassigned in
    # this iteration, so on the first pass it sees the value left by an earlier cell.
    abcd_hist = make_abcd(nJets)
    abcd_hist = abcd_hist.rebin(split_bins.bin_edges).reduce_range(bottom=bottom_value, top=top_value)
    # background hist, taken with the split binning (sp.rebin is reset at the end of the iteration)
    sp.rebin = split_bins.bin_edges
    bhist = sp.combine_back(feature, reg)

    # ratio
    flabel = 'Obs. ABCD' if isdata else 'MC ABCD'
    # `isdata=0` is not a named parameter of compare_hists; it lands in its **kwargs
    ratio_dict = compare_hists(abcd_hist.make_density_hist(), bhist.make_density_hist(), isdata=0, n=comp_hist_n, 
                                   color = 'red', ndof=5, flabel=flabel, ratio=True, isabcd=1)
    sp.draw_background(ratio_dict['ax'], feature, reg, sys_label='Stat. + Sys.', errorbar=False)
    
    # reorder legend; the hard-coded permutation assumes seven handles when isdata
    handles, labels = ratio_dict['ax'].get_legend_handles_labels()
    order = [0,1,2,3,4,6,5]
    if ismc: order = np.linspace(0,len(handles)-1, len(handles), dtype=int)
    ratio_dict['ax'].legend([handles[idx] for idx in order],[labels[idx] for idx in order], 
                                ncol=2)
    
    ratio_dict['ax'].set_ylim(bottom=1e0, top=1e3)
    ratio_dict['fig'].savefig('{}/abcd_mc_ratio_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning.pdf'.format(outdir,era,ismc,nJets,bottom_value))
    # 200 GeV zoom in, saved as a second file
    ratio_dict['ax'].set_xlim(left=bottom_value, right=200)
    ratio_dict['rax'].set_xlim(left=bottom_value, right=200)
    ratio_dict['fig'].savefig('{}/abcd_mc_ratio_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning_200GeVMax.pdf'.format(outdir,era,ismc,nJets,bottom_value))
    
    # pull: same comparison on a new figure with the pull panel instead of the ratio
    pull_dict = compare_hists(abcd_hist.make_density_hist(), bhist.make_density_hist(), isdata=0, n=comp_hist_n, 
                                   color = 'red', ndof=5, flabel=flabel, ratio=False, isabcd=1)
    sp.draw_background(pull_dict['ax'], feature, reg, sys_label='Stat. + Sys.', errorbar=False)
    handles, labels = pull_dict['ax'].get_legend_handles_labels()
    order = [0,1,2,3,4,6,5]
    if ismc: order = np.linspace(0,len(handles)-1, len(handles), dtype=int)
    pull_dict['ax'].legend([handles[idx] for idx in order],[labels[idx] for idx in order], 
                                ncol=2)
    pull_dict['ax'].set_ylim(bottom=1e0, top=1e3)
    pull_dict['fig'].savefig('{}/abcd_mc_pull_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning.pdf'.format(outdir,era,ismc,nJets,bottom_value))
    # 200 GeV zoom in, saved as a second file
    pull_dict['ax'].set_xlim(left=bottom_value, right=200)
    pull_dict['rax'].set_xlim(left=bottom_value, right=200)
    pull_dict['fig'].savefig('{}/abcd_mc_pull_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning_200GeVMax.pdf'.format(outdir,era,ismc,nJets,bottom_value))
    sp.rebin = 0

    # save data: the stored entry is the ratio comparison (its chi2, fig, ax, rax)
    # plus the rebinned, range-reduced ABCD histogram under 'fit_hist'
    fit_dict['SR{}'.format(nJets)]= ratio_dict
    fit_dict['SR{}'.format(nJets)]['fit_hist'] = abcd_hist

In [None]:
# chi2 of every stored region, shown as the cell output
dict((region, entry['chi2']) for region, entry in fit_dict.items())

In [None]:
# One table row per entry of fit_dict: summed yields and the chi2 of the comparison.
fit_list = []
for reg, item in fit_dict.items():

    # NOTE(review): the 'n_{background}' column is filled from sp.make_data_hist,
    # i.e. from the data histogram of the region - confirm the label is intended.
    _plot_dict = sp.make_data_hist(feature, reg).reduce_range(bottom=bottom_value, top=top_value)
    _fit_dict = {
        "era": era,
        "region": reg,
    'n_{background}': "{:.2f}".format(_plot_dict.uncertainty_std_dev().sum()),
    'n_{ABCD,Data}': "{:.2f}".format(item['fit_hist'].inverse_make_density_hist().uncertainty_std_dev().sum()),
    # raw string: '\c' is not a valid escape sequence in a normal string literal;
    # the resulting key text is unchanged
    r' Data $\chi^2/n_{DOF}$': "{:.2f}".format(item['chi2'])
    }
    fit_list.append(_fit_dict)

In [None]:
# Render the collected rows as a LaTeX table, echo it, and write it under outdir.
stats_path = '{}/fit_stats_data_{}_ismc{}.txt'.format(outdir,era, ismc)
with open(stats_path, 'w') as f:
    latex = pd.DataFrame(fit_list).to_latex()
    print(latex)
    f.write(latex)

In [None]:
# Format for saving: keep only each region's fit histogram, converted with to_dict().
limit_dict = {
    reg: item['fit_hist'].to_dict()
    for reg, item in fit_dict.items()
}

In [None]:
import pickle as pkl
# Persist the per-region histogram dictionaries under outdir.
abcd_pickle_path = '{}/abcd_dict_data_{}_ismc{}.pkl'.format(outdir, era, ismc)
with open(abcd_pickle_path, 'wb') as f:
    pkl.dump(limit_dict, f)

In [None]:
# Copy of fit_dict without the matplotlib objects ('fig', 'ax', 'rax').
fit_dict_skimmed = {
    region: {key: value for key, value in entry.items() if key not in ['fig', 'ax','rax']}
    for region, entry in fit_dict.items()
}

In [None]:
import pickle as pkl
# Persist the skimmed fit results under outdir.
fit_pickle_path = '{}/fit_dict_data_{}_ismc{}.pkl'.format(outdir, era, ismc)
with open(fit_pickle_path, 'wb') as f:
    pkl.dump(fit_dict_skimmed, f)