In [1]:
import pandas as pd
import numpy as np
from bff_processor.data_tools import regex_select, get_files, make_df
from bff_processor.bff_meta import preselection, band_cut, isin, identity, all_reg, jet_variations
from bff_processor.bff_cuts import *
from glob import glob
import re
import pyarrow.feather as feather
import uncertainties

In [2]:
# Era identifier; it is substituted into the input and output feather file names.
era = 2018

In [3]:
# Load the combined frame for the configured era and report its shape.
combined_df = feather.read_feather('data/combined_{}.feather'.format(era))
print(combined_df.shape)
# Keep only rows with deltaR > 0.4. The explicit .copy() makes the filtered
# frame an independent object, so the .loc / column assignments performed on
# combined_df later in the notebook do not raise SettingWithCopyWarning.
combined_df = combined_df[combined_df.deltaR > 0.4].copy()
print(combined_df.shape)

(2041378, 116)
(1900695, 116)


In [4]:
# Group label -> sample names that are summed together under that label.
sample_groups = {
    'DY': ['ZToEE_M_120_200', 'ZToEE_M_200_400', 'ZToEE_M_400_800',
           'ZToEE_M_50_120', 'ZToEE_M_800_1400', 'ZToMuMu_M_120_200',
           'ZToMuMu_M_200_400', 'ZToMuMu_M_400_800', 'ZToMuMu_M_50_120',
           'ZToMuMu_M_800_1400'],
    'ST': ['mc_santitop', 'mc_stop'],
    'TT': ['mc_ttbar'],
    'DB': ['mc_ww', 'mc_wz', 'mc_zz'],
}
# Write the label into 'groupname' for every row whose sample name is in the group.
for group_label, sample_names in sample_groups.items():
    combined_df.loc[combined_df.name.isin(sample_names), 'groupname'] = group_label

In [5]:
# Split the rows by the 'type' column into a background frame and a data frame.
bck_df = combined_df[combined_df['type'] == 'bck']
data_df = combined_df[combined_df['type'] == 'data']

In [6]:
# Sorted, de-duplicated sample names found in the background frame.
samples = np.unique(bck_df['name'])
samples

array(['ZToEE_M_120_200', 'ZToEE_M_200_400', 'ZToEE_M_400_800',
       'ZToEE_M_50_120', 'ZToEE_M_800_1400', 'ZToMuMu_M_120_200',
       'ZToMuMu_M_200_400', 'ZToMuMu_M_400_800', 'ZToMuMu_M_50_120',
       'ZToMuMu_M_800_1400', 'mc_santitop', 'mc_stop', 'mc_ttbar',
       'mc_ww', 'mc_wz', 'mc_zz'], dtype=object)

In [7]:
# Sorted, de-duplicated group labels found in the background frame.
groupnames = np.unique(bck_df['groupname'])

In [8]:
# All ten CR* region names.
# NOTE(review): the following cell reassigns `regions` before it is read, so
# this full list currently has no effect on the fit.
regions = [
    "CR10_nom", "CR11_nom", "CR12_nom", "CR13_nom", "CR14_nom",
    "CR20_nom", "CR21_nom", "CR22_nom", "CR23_nom", "CR24_nom",
]

In [9]:
# Regions looped over when building the yield table for the fit:
# the CR* list without CR11, CR12, CR21 and CR22.
regions = [
    "CR10_nom", "CR13_nom", "CR14_nom",
    "CR20_nom", "CR23_nom", "CR24_nom",
]

In [10]:
def weight_and_sum(reg, df):
    """Return the weighted yield of region ``reg`` together with its uncertainty.

    Parameters
    ----------
    reg : str
        Name of the region flag column; only rows where it equals 1 are used.
    df : pandas.DataFrame
        Frame providing the ``reg`` column and a ``Weight`` column.

    Returns
    -------
    uncertainties.ufloat
        Nominal value is the sum of the selected weights; the standard
        deviation is the square root of the sum of the squared weights.
    """
    weights = df.loc[df[reg] == 1, 'Weight']
    total = weights.sum()
    stat_err = (weights ** 2).sum() ** .5
    return uncertainties.ufloat(total, stat_err)

def reg_sum(reg, df):
    """Return the summed ``Weight`` of the rows of ``df`` flagged for region ``reg``.

    Parameters
    ----------
    reg : str
        Name of the region flag column; only rows where it equals 1 are used.
    df : pandas.DataFrame
        Frame providing the ``reg`` column and a ``Weight`` column.

    Returns
    -------
    Sum of ``Weight`` over the selected rows (0 when no row is selected).
    """
    in_region = df[reg] == 1
    return df.loc[in_region, 'Weight'].sum()

In [11]:
# Define the per-region object multiplicities and the functions that compute scale factors from them

In [12]:
# Object multiplicity table: one column per SR*/CR* region, one row per
# object kind (mu, el, lj, bj).
_OBJECT_KINDS = ("mu", "el", "lj", "bj")
_REGION_MULTIPLICITIES = (
    # region,    (mu, el, lj, bj)
    ("SR1_nom",  (2, 0, 0, 1)),
    ("CR10_nom", (2, 0, 1, 0)),
    ("CR11_nom", (1, 1, 0, 1)),
    ("CR12_nom", (1, 1, 1, 0)),
    ("CR13_nom", (0, 2, 0, 1)),
    ("CR14_nom", (0, 2, 1, 0)),
    ("SR2_nom",  (2, 0, 1, 1)),
    ("CR20_nom", (2, 0, 2, 0)),
    ("CR21_nom", (1, 1, 1, 1)),
    ("CR22_nom", (1, 1, 2, 0)),
    ("CR23_nom", (0, 2, 1, 1)),
    ("CR24_nom", (0, 2, 2, 0)),
)
obj_count = pd.DataFrame(
    {region: dict(zip(_OBJECT_KINDS, counts)) for region, counts in _REGION_MULTIPLICITIES}
)
obj_count

Unnamed: 0,SR1_nom,CR10_nom,CR11_nom,CR12_nom,CR13_nom,CR14_nom,SR2_nom,CR20_nom,CR21_nom,CR22_nom,CR23_nom,CR24_nom
mu,2,2,1,1,0,0,2,2,1,1,0,0
el,0,0,1,1,2,2,0,0,1,1,2,2
lj,0,1,0,1,0,1,1,2,1,2,1,2
bj,1,0,1,0,1,0,1,0,1,0,1,0


In [13]:
# One row per region in `regions`: the summed weight of every background
# group, the summed weight of the data, the region label, and the object
# multiplicities looked up in obj_count.
region_dicts = []
for reg in regions:
    count_dict = {
        groupname: reg_sum(reg, bck_df[bck_df.groupname == groupname])
        for groupname in groupnames
    }
    datasum = reg_sum(reg, data_df)
    count_dict.update(data=datasum, reg=reg)
    region_dicts.append({**count_dict, **obj_count[reg]})
region_counts = pd.DataFrame(region_dicts)
# Four unit scale factors; this value is replaced once the fit has been run.
obj_sf = np.ones(4)
region_counts

Unnamed: 0,DB,DY,ST,TT,data,reg,mu,el,lj,bj
0,322.768608,57855.787945,433.43439,2104.122857,56319.0,CR10_nom,2,0,1,0
1,24.582435,2913.074002,655.858431,3024.705746,6186.0,CR13_nom,0,2,0,1
2,270.498409,41082.658498,304.713775,1486.680185,37689.0,CR14_nom,0,2,1,0
3,151.215726,9658.195621,95.921099,1078.973574,12697.0,CR20_nom,2,0,2,0
4,20.083532,908.528908,304.361793,4794.929059,5426.0,CR23_nom,0,2,1,1
5,110.888514,6814.962764,76.081153,777.187174,8478.0,CR24_nom,0,2,2,0


In [14]:
def sf_from_obj_sf(region_counts, obj_sf):
    """Combine per-object scale factors into one scale factor per row.

    Parameters
    ----------
    region_counts : pandas.DataFrame
        Must contain the columns ``mu``, ``el``, ``lj`` and ``bj`` holding the
        exponent applied to the corresponding entry of ``obj_sf``.
    obj_sf : array-like of length 4
        Scale factors, in the order ``mu``, ``el``, ``lj``, ``bj``.

    Returns
    -------
    pandas.Series
        For each row, the product over the four columns of
        ``obj_sf[column] ** region_counts[column]``.
    """
    per_object = np.array(obj_sf)
    multiplicities = region_counts[['mu', 'el', 'lj', 'bj']]
    # Raise each factor to the multiplicity of its column, then multiply across columns.
    raised = np.power(per_object, multiplicities)
    return raised.prod(axis=1)

def compute_estimate(region_counts, sf, smp_sf):
    """Return the scaled total of the four group columns for every row.

    Parameters
    ----------
    region_counts : pandas.DataFrame
        Must contain the columns ``DB``, ``DY``, ``ST`` and ``TT``.
    sf : scalar or per-row values
        Factor multiplied onto the row totals.
    smp_sf : array-like of length 4
        Factors multiplied onto the ``DB``, ``DY``, ``ST``, ``TT`` columns
        (in that order) before they are summed.

    Returns
    -------
    pandas.Series
        ``sum_over_columns(column * smp_sf) * sf`` for every row.
    """
    group_yields = region_counts[['DB', 'DY', 'ST', 'TT']]
    scaled_total = (group_yields * smp_sf).sum(axis=1)
    return scaled_total * sf

def l2(x1, x2):
    """Return the sum of squared differences, each divided by ``x2``.

    Despite the name, every squared residual ``(x1 - x2)**2`` is divided by
    the matching entry of ``x2`` before the terms are summed.

    Parameters
    ----------
    x1, x2 : array-like (e.g. pandas.Series or numpy.ndarray)
        Values to compare; ``x2`` also serves as the per-term denominator.
    """
    residual = x1 - x2
    return (residual ** 2 / x2).sum()

def regularization(sf, width=.001):
    """Return a penalty that grows as the entries of ``sf`` move away from 1.

    Parameters
    ----------
    sf : numpy.ndarray
        Values to penalise.
    width : float, default 0.001
        Divisor of every squared deviation; smaller values give a stronger penalty.

    Returns
    -------
    Sum over all entries of ``(sf - 1)**2 / width``.
    """
    deviation = sf - 1
    return np.sum(deviation ** 2 / width)

def produce_estimate(region_counts, params):
    """Evaluate the scaled estimate for every row of ``region_counts``.

    Parameters
    ----------
    region_counts : pandas.DataFrame
        Table passed on to ``sf_from_obj_sf`` and ``compute_estimate``.
    params : sequence of length >= 8
        Entries 0-3 are used as the per-object scale factors, entries 4-7 as
        the factors applied to the group columns.

    Returns
    -------
    The result of ``compute_estimate`` for the derived per-row scale factor.
    """
    obj_sf, smp_sf = params[0:4], params[4:8]
    region_sf = sf_from_obj_sf(region_counts, obj_sf)
    return compute_estimate(region_counts, region_sf, smp_sf)

def loss_sf(params):
    """Objective function of the scale-factor fit.

    Reads the module-level ``region_counts`` table, compares the estimate
    produced for ``params`` with its ``data`` column via ``l2``, and adds the
    ``regularization`` penalty evaluated on all entries of ``params``.

    Parameters
    ----------
    params : numpy.ndarray
        The eight fit parameters, laid out as expected by ``produce_estimate``.
    """
    observed = region_counts['data']
    predicted = produce_estimate(region_counts, params)
    penalty = regularization(params)
    return l2(predicted, observed) + penalty

In [15]:
from scipy.optimize import minimize

In [16]:
# Loss with all eight parameters set to 1, i.e. before any fitting.
loss_sf(np.ones(8))

1518.611142190529

In [17]:
# Minimise the loss over all eight parameters, starting each of them at 1.
res = minimize(loss_sf, np.ones(8), tol=1e-6)
sf = res.x
# Same layout as in produce_estimate: entries 0-3 are the per-object factors,
# entries 4-7 the factors of the group columns.
obj_sf, smp_sf = sf[:4], sf[4:8]
res

      fun: 232.48962091546906
 hess_inv: array([[ 2.00032831e-05,  1.98792190e-05, -2.54838266e-05,
        -4.12624670e-05, -5.31449951e-05, -1.54241380e-05,
         3.94151848e-05,  6.55758409e-06],
       [ 1.98792190e-05,  2.34957830e-05, -2.57492951e-05,
        -4.66710074e-05, -5.19053698e-05, -1.80150195e-05,
         4.19099938e-05,  3.34934019e-06],
       [-2.54838266e-05, -2.57492951e-05,  3.94151798e-05,
         5.69102240e-05,  7.33653937e-05,  1.52075974e-05,
        -4.83525749e-05, -8.10100002e-06],
       [-4.12624670e-05, -4.66710074e-05,  5.69102240e-05,
         1.21395978e-04,  1.22537698e-04,  3.12467456e-05,
        -9.58976828e-05,  6.58590530e-06],
       [-5.31449951e-05, -5.19053698e-05,  7.33653937e-05,
         1.22537698e-04,  1.53656468e-04,  3.79427625e-05,
        -1.12364503e-04, -1.24922501e-05],
       [-1.54241380e-05, -1.80150195e-05,  1.52075974e-05,
         3.12467456e-05,  3.79427625e-05,  2.05218141e-05,
        -3.79737053e-05, -4.93344875

In [18]:
# Estimate obtained with every parameter fixed to 1 (no scale factors applied).
nosf = produce_estimate(region_counts, np.ones(8, dtype=int))

In [19]:
# Estimate with the fitted parameters, and the mean over regions of
# (estimate - data)**2 / data as a summary of the remaining disagreement.
estimate = produce_estimate(region_counts, sf)
data = region_counts['data']
((estimate - data) ** 2 / data).mean()

27.224976722021026

In [20]:
# Display, per region row: the fitted estimate, the data column, and the estimate with all parameters at 1.
estimate, data, nosf

(0    56890.811607
 1     5515.383689
 2    38186.335794
 3    12129.721513
 4     5876.426182
 5     8114.972306
 dtype: float64,
 0    56319.0
 1     6186.0
 2    37689.0
 3    12697.0
 4     5426.0
 5     8478.0
 Name: data, dtype: float64,
 0    60716.113800
 1     6618.220614
 2    43144.550867
 3    10984.306020
 4     6027.903291
 5     7779.119604
 dtype: float64)

In [21]:
# Scale factor of every region in obj_count: transposing the table puts the
# mu/el/lj/bj multiplicities into columns, which is the layout sf_from_obj_sf reads.
sfs_by_region = sf_from_obj_sf(obj_count.T, obj_sf)

In [22]:
# Display the scale factor derived for each column of obj_count (SR* regions included).
sfs_by_region

SR1_nom     0.936475
CR10_nom    1.002688
CR11_nom    0.910130
CR12_nom    0.974480
CR13_nom    0.884526
CR14_nom    0.947066
SR2_nom     1.103039
CR20_nom    1.181029
CR21_nom    1.072008
CR22_nom    1.147805
CR23_nom    1.041850
CR24_nom    1.115514
dtype: float64

In [23]:
# Boolean matrix with one row per event and one column per group, in the
# same DB, DY, ST, TT order as the entries of smp_sf.
group_membership = np.array(
    [combined_df.groupname == group for group in ['DB', 'DY', 'ST', 'TT']]
).T
# Per-row sum of membership * factor; rows belonging to none of the groups give 0.
sfs_by_smp = (group_membership * smp_sf).sum(axis=1)

In [24]:
# Multiply every region flag column by that region's scale factor and sum
# across the columns to obtain one value per row.
reg_sf = combined_df[sfs_by_region.index].mul(sfs_by_region, axis='columns').sum(axis=1)

In [25]:
# Entries equal to 0 (e.g. rows that matched no group or no region) and all
# rows of type 'data' receive the neutral factor 1.
is_data = (combined_df.type == 'data').values
sfs_by_smp[(sfs_by_smp == 0) | is_data] = 1
reg_sf[(reg_sf == 0) | is_data] = 1

In [26]:
# Visual check: the region scale factor of the rows with type 'data' was set to 1 above.
reg_sf[combined_df.type=='data']

120532     1.0
120533     1.0
120534     1.0
120535     1.0
120536     1.0
          ... 
1244141    1.0
1244142    1.0
1244145    1.0
1244147    1.0
1244148    1.0
Length: 192436, dtype: float64

In [27]:
# Store both per-row scale factors as columns of combined_df.
combined_df['reg_sf'] = reg_sf
combined_df['smp_sf'] = sfs_by_smp

In [28]:
# Write the frame, including the reg_sf and smp_sf columns, to the era-specific 'combined_scaled' feather file.
feather.write_feather(combined_df,'data/combined_scaled_{}.feather'.format(era))