In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.data_tools.get_data import get_data

In [None]:
from IPython.display import display, HTML
# Inject CSS so that elements with class `container` span 100% of the page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from src.plotting_tools.draw_stack_plot_hists import draw_bckground, draw_signals, draw_data, draw_stackplot
from src.plotting_tools.SysHist import SysHist
from src.plotting_tools.Bins import Bins, bins
from src.plotting_tools.utils import ratio_plot_template

In [None]:
# Alias for the imported `bins` object. fit_hist() reads this module-level name
# when it evaluates the fitted function on a second binning.
bins_other_binning = bins

In [None]:
from src.general.array_utils import moving_average, moving_sum, super_sample, super_sample_function, moving_avg_func, unp_array_to_nom_std
from src.plotting_tools.cms_format import cms_style, cms_format_fig

In [None]:
# Apply the project's cms_style() helper; it is called ahead of all plotting cells.
cms_style()

In [None]:
# Era string; handed to the loading/plotting helpers and embedded in output file names.
era = '2016'

# Two passes through get_data with different row filters on the `type` column:
#   type contains 'bck'  -> mdf
#   type contains 'data' -> mdf_data
# `lumi` is rebound by the second call, so the value from the 'data' pass is the one kept.
mdf, lumi = get_data(
    era,
    df_filter=lambda x: x.type.str.contains('bck'),
    stitch_dy=1,
)
mdf_data, lumi = get_data(
    era,
    df_filter=lambda x: x.type.str.contains('data'),
    stitch_dy=1,
)

In [None]:
# Output directory for the PDFs, the LaTeX table and the pickle written further down.
outdir = 'assets_nov_2022/abcd'
# Create it up front: savefig() and open(..., 'w') do not create missing parent
# directories, so a fresh checkout would otherwise fail at the first save.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Display the distinct values of the `type` column present in the data frame.
mdf_data.type.unique()

In [None]:
# Fine binning used for the control-region histograms that get fitted:
# 300 evenly spaced edges between 105 and 900 (299 bins, about 2.66 wide).
# An alternative with 5-wide bins, np.linspace(105, 900, int((900-105)/5+1)), used to be
# assigned immediately before this line and was overwritten straight away; it never took
# effect, so the dead assignment is dropped.
bins_red = Bins(np.linspace(105, 900, 300))

bins_red

In [None]:
# Draw DiLepMass from the data frame for each control region on the `bins_red` edges
# and keep the histogram returned by draw_data, keyed by region name.
data_hists = {}
for reg in ('CR14', 'CR24', 'CR10', 'CR20', 'CR13', 'CR23'):
    fig, ax = plt.subplots(1)
    data_hists[reg] = draw_data(
        ax, mdf_data, 'DiLepMass', reg, era,
        bin_edges=bins_red.bin_edges,
        return_hist=1,
        make_density=0,
        isdata=1,
    )
    ax.set_yscale('log')

In [None]:
# Same per-region loop as for data, but fed with the 'bck' frame `mdf` and the edges of
# the imported `bins` object.
# NOTE(review): draw_data is called with isdata=1 on the MC frame - confirm this is intended.
mc_hists_other_binning = {}
for reg in ('CR14', 'CR24', 'CR10', 'CR20', 'CR13', 'CR23'):
    fig, ax = plt.subplots(1)
    mc_hists_other_binning[reg] = draw_data(
        ax, mdf, 'DiLepMass', reg, era,
        bin_edges=bins.bin_edges,
        return_hist=1,
        isdata=1,
    )
    ax.set_yscale('log')

In [None]:
# Data histograms per control region on the edges of the imported `bins` object.
# Each returned histogram gets its `sys` attribute replaced by an empty dict.
data_hists_other_binning = {}
for reg in ('CR14', 'CR24', 'CR10', 'CR20', 'CR13', 'CR23'):
    fig, ax = plt.subplots(1)
    data_hists_other_binning[reg] = draw_data(
        ax, mdf_data, 'DiLepMass', reg, era,
        bin_edges=bins.bin_edges,
        return_hist=1,
        isdata=1,
    )
    data_hists_other_binning[reg].sys = {}
    ax.set_yscale('log')

In [None]:
#def open_and_convert_plotting_dict(fn):
#    with open(fn, 'rb') as f:
#        plot_dict= pickle.load(f)
#        
#    for reg in plot_dict:
#        if 'hist' in plot_dict[reg]:
#            plot_dict[reg]['hist'] = SysHist.from_dict(plot_dict[reg]['hist'])
#        plot_dict[reg]['fit'] = SysHist.from_dict(plot_dict[reg]['fit'])
#    return plot_dict

In [None]:
def compare_hists(hist1, hist2, ismc = False, n=10, ndof=0, log=1, color='red', label1="", label2="", isdata=0, **kwargs):
    """Overlay two histograms and plot their pull in the lower panel.

    The figure comes from ratio_plot_template(); the upper axis is formatted with
    cms_format_fig(era, ax), where `era` is read from module scope.

    Parameters
    ----------
    hist1, hist2 : histogram objects
        Must provide draw(ax, ...), calc_bin_centers(), `nominal` and `std`;
        hist1 must also provide calc_nBins().
    ismc : bool
        If true the pull denominator is sqrt(var1 + var2) built from both `std`
        members; otherwise it is sqrt(hist1 yield).
    n : int
        Window size handed to moving_sum / moving_average before the pull is formed.
    ndof : int
        Subtracted from hist1.calc_nBins() in the chi2 denominator.
    log : bool
        If true the upper axis uses a logarithmic y scale.
    color : str
        Colour for hist2 when it is drawn through hist2.draw() (isdata false).
    label1, label2 : str
        Legend labels for hist1 and hist2.
    isdata : bool
        If true hist2 is drawn as black error-bar points instead of hist2.draw().
    **kwargs
        Accepted for call-site compatibility; not used by the body.

    Returns
    -------
    dict
        Keys 'chi2' (sum of squared pulls divided by calc_nBins() - ndof),
        'fig', 'ax' and 'rax'.
    """
    fig, ax, rax = ratio_plot_template(figsize=(10,10))

    # Upper panel: hist1 always through its own draw(); hist2 either as points or via draw().
    hist1.draw(ax, label=label1, zorder=3)
    if isdata:
        ax.errorbar(hist2.calc_bin_centers(), hist2.nominal, yerr=hist2.std, color='black', label=label2,
           ls='', marker='o', zorder=1)
    else:
        hist2.draw(ax, label=label2, color=color)

    x = hist1.calc_bin_centers()

    # Lower panel guide lines at +1 (dotted), 0 (solid) and -1 (dotted).
    rax.plot(x, np.full(len(x), 1) , color='black', linestyle=':')
    rax.plot(x, np.full(len(x), 0) , color='black')
    rax.plot(x, np.full(len(x), -1) , color='black', linestyle=':')

    cms_format_fig(era, ax)
    if log:
        # Scale names must be lowercase: case-insensitive names such as 'Log' were
        # deprecated in Matplotlib 3.3 and raise ValueError in current releases.
        ax.set_yscale('log')
    ax.set_ylabel('Counts per GeV')
    rax.set_xlabel('DiLepMass [GeV]')
    rax.set_ylim(-5,5)
    rax.set_ylabel('Pull')

    # Window-summed yields and variances, and window-averaged bin centres.
    nom1_avg = moving_sum(hist1.nominal, n = n)
    nom2_avg = moving_sum(hist2.nominal, n = n)
    var1_avg = moving_sum(hist1.std**2, n = n)
    var2_avg = moving_sum(hist2.std**2, n = n)
    x_avg = moving_average(x, n=n)
    if ismc:
        pull = (nom1_avg-nom2_avg)/(var2_avg+var1_avg)**.5
    else:
        # NOTE(review): windows where hist1 sums to zero make this denominator zero.
        pull = (nom1_avg-nom2_avg)/nom1_avg**.5
    pullsquare = pull**2

    rax.plot(x_avg, pull)

    # NOTE(review): the denominator uses the un-windowed bin count of hist1; if the
    # moving-window helpers shorten the arrays for n > 1 the two would not match - confirm.
    return  {'chi2': (pullsquare).sum()/(hist1.calc_nBins()-ndof),
             'fig': fig,
             'ax': ax,
             'rax': rax}
    

In [None]:
from scipy.optimize import curve_fit
from src.general.functions import power_func, power_law, make_bpoly, linear, parabola, make_bpoly_exp

In [None]:
from scipy.optimize import curve_fit
from src.general.functions import make_bpoly, lognorm, log_norm_np, log_norm_unp
from src.plotting_tools.SysHist import SysHist

In [None]:
import uncertainties
import uncertainties.unumpy as unp

In [None]:
def fit_hist(func, hist, n=10, comp_hist_n =10, do_super_sample=1, ismc=False,  do_unc=1, color='red', label1="", label2="", isdata=0, unc_func=None, **kwargs):
    """Fit `func` to a histogram and return the fit evaluated on two binnings.

    Besides its arguments the function reads two module-level names:
    `bins_other_binning` (second binning) and `bottom_value` (lower edge used when
    reducing the range of the second-binning histogram).

    Parameters
    ----------
    func : callable
        Model f(x, *params) passed to scipy.optimize.curve_fit.
    hist : histogram object
        Must provide calc_bin_centers(), `nominal`, `std`, `bin_edges` and calc_sum().
    n, do_super_sample
        Accepted for call-site compatibility; not used by the body.
    comp_hist_n : int
        Forwarded to compare_hists as its window size `n`.
    ismc, color, label1, label2, isdata
        Forwarded to compare_hists. `isdata` additionally selects an unweighted fit
        (no `sigma`), because `std` is a poor error estimate for empty/low-count bins.
    do_unc : bool
        If true the per-bin uncertainty is propagated from the fit covariance with
        the `uncertainties` package; otherwise sqrt(y) is used on the histogram's own
        binning and 10% of y on the second binning.
    unc_func : callable, optional
        Version of `func` that accepts uncertainty-carrying parameters. Defaults to
        log_norm_unp, which is what was previously hard-coded; pass a matching
        function when `func` is not the log-normal.
    **kwargs
        Forwarded to curve_fit (e.g. p0, bounds).

    Returns
    -------
    dict
        Everything returned by compare_hists plus 'popt', 'fit_hist' and
        'fit_hist_other_binning'.
    """
    x = np.array(hist.calc_bin_centers())
    if not isdata:
        popt, pcov = curve_fit(func, x, hist.nominal,
                       **kwargs,
                       sigma=hist.std, maxfev = int(1e6))
    else:
        # std is not optimal for data: zero and low-count bins give poor error estimates
        popt, pcov = curve_fit(func, x, hist.nominal,
               **kwargs, maxfev = int(1e6))

    if do_unc:
        if unc_func is None:
            # Resolved lazily so the name is only needed when uncertainties are requested.
            unc_func = log_norm_unp
        # Fit parameters with their correlations, built once and reused for both binnings.
        popt_unc = uncertainties.correlated_values(popt, pcov)
        y_nom, y_std = unp_array_to_nom_std(unc_func(x, *popt_unc))
    else:
        y_nom = func(x, *popt)
        y_std = y_nom**.5
    fitted = SysHist(
            y_nom,
            x*0, x*0,
            y_std,
            np.array(hist.bin_edges)
        ).normalize().calc_ratio(1/hist.calc_sum())

    # Same fit evaluated on the second (analysis) binning.
    x_other = np.array(bins_other_binning.calc_bin_centers())
    y_nom_other = func(x_other, *popt)
    if do_unc:
        _, y_std_other = unp_array_to_nom_std(unc_func(x_other, *popt_unc))
    else:
        y_std_other = y_nom_other*.1
    fitted_other_binning = SysHist(
            y_nom_other,
            x_other*0, x_other*0,
            y_std_other,
            np.array(bins_other_binning.bin_edges)
        ).reduce_range(bottom=bottom_value, top=1000).inverse_make_density_hist().normalize().calc_ratio(1/hist.calc_sum())

    # NOTE(review): ndof is fixed at 5 here regardless of how many parameters `func` has.
    compare_dict = compare_hists(fitted, hist, ismc=ismc, n=comp_hist_n, color = color, ndof=5, label1=label1, label2=label2, isdata=isdata)
    return {**compare_dict,
            "popt" : popt,
            "fit_hist": fitted,
            "fit_hist_other_binning": fitted_other_binning
           }

In [None]:
# Lower edge passed to reduce_range(bottom=...) in the fit and ABCD cells.
# fit_hist() also reads this module-level name directly, so it must be set before it is called.
bottom_value = 120
# Upper edge passed to reduce_range(top=...) for the fitted / compared range.
top_value=400

In [None]:
# Fit log_norm_np to the data DiLepMass histogram of every control region, restricted to
# [bottom_value, top_value], store the result per region in fit_dict and save one PDF each.
comp_hist_n = 1
fit_dict = {}
ismc=0
isdata = ismc==0
for reg in ['CR14','CR24', 'CR10','CR20', 'CR13', 'CR23']:
    print(reg)

    hist = data_hists[reg].reduce_range(bottom=bottom_value, top=top_value)
    # Zero the up/down members in place.
    # NOTE(review): if reduce_range() returns views rather than copies this also
    # alters data_hists[reg] - confirm.
    hist.up *=0
    hist.down *=0

    total_events = hist.nominal.sum()
    # The first start value and its upper bound scale with the observed yield;
    # the remaining three parameters are bounded by fixed numbers.
    curve_fit_chi2 = fit_hist(log_norm_np, hist, comp_hist_n=comp_hist_n, do_super_sample=0,
                              ismc=ismc, p0=[total_events*10, .8, 80, 70],
                              bounds = ([0, .2, 50, 50], [total_events*100, 1, 100, 100]),
                              do_unc=1, label1='fit', label2='Data', isdata=isdata,
                             )

    # NOTE(review): the data frame mdf_data is handed to draw_bckground here, whereas the
    # signal-region cell passes the 'bck' frame mdf - confirm which one is intended.
    draw_bckground(curve_fit_chi2['ax'], mdf_data, 'DiLepMass', reg, era, bin_edges=hist.bin_edges, make_density=0)
    fit_dict[reg] = curve_fit_chi2
    print(reg, curve_fit_chi2['chi2'],  repr(curve_fit_chi2['popt']))
    curve_fit_chi2['ax'].legend(ncol=2)
    curve_fit_chi2['fig'].savefig('{}/data_fit_era{}_ismc{}_reg{}_bottom{}_lognorm.pdf'.format(outdir,era,ismc,reg,bottom_value))

In [None]:
# Background stack from the 'bck' frame in the two signal regions, on the edges of the
# imported `bins` object; the value returned by draw_bckground is kept per region.
real_mc_hists_other_binning = {}
for reg in ('SR1', 'SR2'):
    fig, ax = plt.subplots(1)
    real_mc_hists_other_binning[reg] = draw_bckground(
        ax, mdf, 'DiLepMass', reg, era,
        bin_edges=bins.bin_edges,
        make_density=0,
    )
    ax.set_yscale('log')

In [None]:
# ABCD estimate per jet multiplicity: combine the fitted control-region histograms
# CR{n}0, CR{n}3 and CR{n}4 bin by bin as A*B/C, compare the result with the MC
# histogram of SR{n}, save two PDFs, and store the outcome in fit_dict under 'SR{n}'.
# Note that this adds 'SR1'/'SR2' keys to fit_dict, which later cells iterate over.
for nJets in [1,2]:
    reg = 'SR{}'.format(nJets)
    # Fit results on the second binning, cut to [bottom_value, 1000].
    A = fit_dict['CR{}0'.format(nJets)]['fit_hist_other_binning'].reduce_range(bottom=bottom_value, top=1000)
    B = fit_dict['CR{}3'.format(nJets)]['fit_hist_other_binning'].reduce_range(bottom=bottom_value, top=1000)
    C = fit_dict['CR{}4'.format(nJets)]['fit_hist_other_binning'].reduce_range(bottom=bottom_value, top=1000)
    # Print the three input sums and the summed A*B/C prediction (the product is recomputed on the next line).
    print(A.calc_sum(), B.calc_sum(), C.calc_sum(), ( A.uncertainty_std_dev()*B.uncertainty_std_dev()/C.uncertainty_std_dev()).sum() )
    abcd = A.uncertainty_std_dev()*B.uncertainty_std_dev()/C.uncertainty_std_dev()
    # Split the combined result into two arrays: nominal values and standard deviations.
    abcd_nom, abcd_std = unp_array_to_nom_std(abcd)
    #abcd_std = abcd_nom**.5
    
    # MC histogram of the signal region on the same range, converted with make_density_hist().
    hist =  real_mc_hists_other_binning['SR{}'.format(nJets)].reduce_range(bottom=bottom_value, top=1000).make_density_hist()
    # Zero the up/down members in place.
    # NOTE(review): whether this touches real_mc_hists_other_binning depends on the copy semantics of reduce_range/make_density_hist - confirm.
    hist.up *=0
    hist.down *=0
    
    #abcd_hist = SysHist(abcd_nom, abcd_nom*0,abcd_nom*0,abcd_std, hist.bin_edges)#.reduce_range(bottom=bottom_value, top=top_value).make_density_hist()
    # Wrap the prediction in a SysHist on the MC histogram's bin edges, with zero up/down arrays.
    abcd_hist = SysHist(abcd_nom, abcd_nom*0,abcd_nom*0,abcd_std, hist.bin_edges).make_density_hist()
    # Print both sums after inverse_make_density_hist() for a side-by-side check.
    print(hist.inverse_make_density_hist().calc_sum(), abcd_hist.inverse_make_density_hist().calc_sum())
    #reduce the range to the area used for chi2
    compare_hist_dict = compare_hists(abcd_hist.reduce_range(bottom=bottom_value, top=top_value), 
                                      hist.reduce_range(bottom=bottom_value, top=top_value), 
                                      ismc=True, n=1, log=True, color='black', label1='Data ABCD pred.', label2='MC',  isdata=0)
    
    #draw stack plot background
    draw_bckground(compare_hist_dict['ax'], mdf, 'DiLepMass', reg, era, 
                   bin_edges=abcd_hist.reduce_range(bottom=bottom_value, top=top_value).bin_edges, make_density=1)
    
    compare_hist_dict['ax'].legend(ncol=2)
    
    #compare_hist_dict['fig'].savefig('fits/curve_fit_closure_test_22/abcd_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning.pdf'.format(era,ismc,nJets,bottom_value))
    # First PDF: full compared range, with the y scale chosen inside compare_hists (log=True).
    compare_hist_dict['fig'].savefig('{}/data_abcd_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning.pdf'.format(outdir,era,ismc,nJets,bottom_value))
    compare_hist_dict['fig'].show()
    
    # Second PDF: same figure zoomed to 120-200 on x, with fixed y limits and a linear y scale.
    compare_hist_dict['ax'].set_xlim(120,200)
    compare_hist_dict['rax'].set_xlim(120,200)
    compare_hist_dict['ax'].set_ylim(bottom=0.001, top=150)
    compare_hist_dict['ax'].set_yscale('linear')
    compare_hist_dict['fig'].savefig('{}/data_abcd_era{}_ismc{}_regSR{}_bottom{}_lognorm_splt_binning_200GeV_max.pdf'.format(outdir,era,ismc,nJets,bottom_value))

    # Store the comparison dict (chi2, fig, ax, rax) together with the prediction under 'SR{n}'.
    fit_dict['SR{}'.format(nJets)]= compare_hist_dict
    fit_dict['SR{}'.format(nJets)]['fit_hist'] = abcd_hist
    

In [None]:
# Display the 'chi2' entry stored for every region currently in fit_dict.
{reg: item['chi2'] for reg, item in fit_dict.items()}

In [None]:
# Build one summary row per entry of fit_dict: era, region, two yields and the chi2.
fit_list = []
for reg, item in fit_dict.items():
    # Regions present in data_hists use the data histogram; every other key of fit_dict
    # is looked up in the MC dictionary. An explicit key test replaces the former bare
    # `except:`, which also hid unrelated errors (and KeyboardInterrupt) raised while
    # reducing the range.
    region_hists = data_hists if reg in data_hists else real_mc_hists_other_binning
    _plot_dict = region_hists[reg].reduce_range(bottom=bottom_value, top=top_value)
    # Column names holding LaTeX are raw strings so that '\chi' is not parsed as an
    # (invalid) escape sequence; the resulting keys are unchanged.
    _fit_dict = {
        "era": era,
        "region": reg,
        r'n_{background}': "{:.2f}".format(_plot_dict.uncertainty_std_dev().sum()),
        r'n_{ABCD,Data}': "{:.2f}".format(item['fit_hist'].inverse_make_density_hist().uncertainty_std_dev().sum()),
        r' Data $\chi^2/n_{DOF}$': "{:.2f}".format(item['chi2'])
    }
    fit_list.append(_fit_dict)

In [None]:
# Render the summary rows as a LaTeX table, echo it, and write it to
# <outdir>/fit_stats_data_<era>.txt.
with open('{}/fit_stats_data_{}.txt'.format(outdir,era), 'w') as f:
    latex = pd.DataFrame(fit_list).to_latex()
    print(latex)
    f.write(latex)

In [None]:
# Convert each region's 'fit_hist' to a plain dict via to_dict() so it can be saved.
limit_dict = {reg: item['fit_hist'].to_dict() for reg, item in fit_dict.items()}

In [None]:
import pickle as pkl
# Pickle the per-region dicts to <outdir>/abcd_dict_data_<era>.pkl.
with open('{}/abcd_dict_data_{}.pkl'.format(outdir, era), 'wb') as f:
    pkl.dump(limit_dict, f)

In [None]:
# Display the full dictionary that was just pickled (one entry per region).
limit_dict