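### Validate WeightXsLumi by recomputing it from sum(genWeight)

The stored `WeightXsLumi` is compared against `xsection * 35860 * genWeight / sum(genWeight)`, where the sum runs over all events of a sample (with `_ext` datasets merged into their base sample).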

In [1]:
import glob
import yaml
import zquery
import functools
import numpy as np
import pandas as pd

In [2]:
# Every MC table of this production, in deterministic (lexicographic) order.
paths = sorted(
    glob.iglob(
        "/vols/cms/sdb15/Analysis/ZinvWidth/databases/full/2020/02_Feb/10_SingleTable_FixObjectWeights/MC/*.h5"
    )
)

In [3]:
# How each table is read: the "Events" key, streamed in chunks, only the
# two columns the aggregation needs.
hist_input = {
    "key": "Events",
    "iterator": True,
    "chunksize": 500_000,
    "columns": ["sample", "genWeight"],
}

# Per-sample aggregation: event count, sum of weights, sum of squared weights.
hist_spec = {
    "evals": ["count = 1", "sum_w = genWeight", "sum_ww = genWeight**2"],
    "columns": ["sample", "count", "sum_w", "sum_ww"],
    "groupby": ["sample"],
}

cfg = {
    "histogram": {
        "func": "zquery.analysis_functions:histogram",
        "kwargs": {"input": hist_input, "cfgs": [hist_spec]},
    }
}

# Persist the configuration next to the notebook.
with open("configs/mc.yaml", 'w') as cfg_file:
    yaml.dump(cfg, cfg_file, indent=4)

In [4]:
# DISABLED: batch submission of the "histogram" step from `cfg` over all
# files in `paths` (200 jobs, submitted through pysge's `sge_submit`).
# Kept for reference only; presumably this is how the cached aggregates
# loaded further down were produced -- confirm before re-enabling.
#results = zquery.process_tables(
#    cfg,
#    ["histogram"],
#    paths,
#    njobs=200,
#    pysge_func='sge_submit',
#    pysge_args=("hist", "_ccsp_temp"),
#    pysge_kwargs={"options": "-q hep.q -l h_rt=3:0:0 -l h_vmem=12G"},
#)

In [5]:
# DISABLED: fold every per-job result in `results` into a single frame with
# zquery's `_df_merge`, then write it to data/genweights.h5 under the key
# "MCAggEvents" (overwriting, zlib level 9). The next cell reads that file,
# so this only needs to be re-run when the aggregates must be regenerated.
#df = functools.reduce(
#    lambda x, y: zquery.analysis_functions._df_merge(x, y),
#    [r for rs in results for r in rs],
#)
#df.to_hdf(
#    "data/genweights.h5", "MCAggEvents",
#    format='table', append=False,
#    complib='zlib', complevel=9,
#)

In [20]:
# Load the cached per-sample aggregates and move the index back into columns.
df = pd.read_hdf("data/genweights.h5", "MCAggEvents").reset_index()

# Rows whose sample name carries an "_ext" tag are relabelled with the text
# preceding the tag, so they are summed together with their base sample.
mask = df["sample"].str.contains("_ext")
df.loc[mask, "sample"] = df.loc[mask, "sample"].str.extract(r'(.*)_ext(.*)')[0]

df = df.groupby("sample").sum()
df

Unnamed: 0_level_0,count,sum_w,sum_ww
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DYJetsToLL_Inclusive,120777245.0,1.897142e+12,6.637846e+16
DYJetsToLL_Pt-100To250,84284977.0,1.674622e+10,2.615875e+13
DYJetsToLL_Pt-250To400,21176899.0,1.525587e+08,8.142405e+09
DYJetsToLL_Pt-400To650,1625936.0,1.458753e+06,8.778160e+06
DYJetsToLL_Pt-50To100,130517314.0,1.192598e+11,8.205251e+14
...,...,...,...
ZZTo2L2Nu,57586850.0,5.758685e+07,5.758685e+07
ZZTo2L2Q,15462693.0,7.844768e+07,9.984828e+08
ZZTo2Q2Nu,30493038.0,1.990256e+08,3.445795e+09
ZZTo4L,10711278.0,2.047934e+07,8.223470e+07


In [7]:
# Cross-section lookup table, reduced to one row per base sample name.
xsec = pd.read_hdf("data/xsection.h5", "CrossSections")
xsec.columns = ["sample", "xsection"]

# Strip the "_ext" tag the same way as for the aggregated weights.
mask = xsec["sample"].str.contains('_ext')
xsec.loc[mask, "sample"] = xsec.loc[mask, "sample"].str.extract(r'(.*)_ext(.*)')[0]

# Where several rows share a base name, keep the largest value.
xsec = xsec.groupby("sample", as_index=False).max()

In [13]:
# Reference events: up to the first million rows of one MC table, restricted
# to the columns needed to cross-check the stored weight.
df_ref = pd.read_hdf(
    "/vols/cms/sdb15/Analysis/ZinvWidth/databases/full/2020/02_Feb/10_SingleTable_FixObjectWeights/MC/result_00150.h5",
    key="Events",
    start=0,
    stop=1_000_000,
    columns=["parent", "sample", "genWeight", "WeightXsLumi"],
)

# Collapse "_ext" sample names onto their base name so they match the
# keys used in the cross-section and weight-sum tables.
mask = df_ref["sample"].str.contains('_ext')
df_ref.loc[mask, "sample"] = df_ref.loc[mask, "sample"].str.extract(r'(.*)_ext(.*)')[0]
df_ref

Unnamed: 0,parent,sample,genWeight,WeightXsLumi
0,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.033752,-0.096881
1,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881
2,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881
3,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881
4,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881
...,...,...,...,...
232043,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896
232044,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896
232045,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896
232046,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896


In [14]:
# Merge in xsection values (one cross-section per sample, attached to every
# reference event).
#
# The cell overwrites `df_ref`, so a second execution used to merge on top of
# the already-merged frame and produce `xsection_x` / `xsection_y`, breaking
# the later weight formula. Dropping any existing `xsection` column first
# makes the cell safe to re-run. `validate="many_to_one"` raises if `xsec`
# ever contains a duplicated sample, instead of silently duplicating events.
df_ref = (
    df_ref
    .drop(columns=["xsection"], errors="ignore")
    .merge(xsec, how='left', on='sample', validate="many_to_one")
)
df_ref

Unnamed: 0,parent,sample,genWeight,WeightXsLumi,xsection
0,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.033752,-0.096881,81.22
1,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22
2,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22
3,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22
4,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22
...,...,...,...,...,...
732043,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896,81.22
732044,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896,81.22
732045,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896,81.22
732046,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896,81.22


In [21]:
# Reduce the aggregates to a two-column lookup: sample -> genWeightSum.
#
# The cell overwrites `df`, so it is guarded on the presence of `sum_w`:
# on a second execution the column has already been renamed and the frame
# is left untouched (previously this raised a KeyError). The column is
# selected with a list so the result is always a DataFrame.
if "sum_w" in df.columns:
    df = (
        df.loc[:, ["sum_w"]]
        .rename(columns={"sum_w": "genWeightSum"})
        .reset_index()
    )
df

Unnamed: 0,sample,genWeightSum
0,DYJetsToLL_Inclusive,1.897142e+12
1,DYJetsToLL_Pt-100To250,1.674622e+10
2,DYJetsToLL_Pt-250To400,1.525587e+08
3,DYJetsToLL_Pt-400To650,1.458753e+06
4,DYJetsToLL_Pt-50To100,1.192598e+11
...,...,...
57,ZZTo2L2Nu,5.758685e+07
58,ZZTo2L2Q,7.844768e+07
59,ZZTo2Q2Nu,1.990256e+08
60,ZZTo4L,2.047934e+07


In [22]:
# Attach the per-sample sum of generator weights to every reference event.
# The inner join keeps only events whose sample has an aggregated sum.
#
# Any `genWeightSum` column left by an earlier execution is dropped first so
# re-running the cell does not create `genWeightSum_x` / `genWeightSum_y`.
# `validate="many_to_one"` raises if `df` holds a sample more than once,
# which would otherwise duplicate events silently.
df_ref = (
    df_ref
    .drop(columns=["genWeightSum"], errors="ignore")
    .merge(df, how='inner', on='sample', validate="many_to_one")
)
df_ref

Unnamed: 0,parent,sample,genWeight,WeightXsLumi,xsection,genWeightSum
0,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.033752,-0.096881,81.22,1.674622e+10
1,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22,1.674622e+10
2,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22,1.674622e+10
3,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22,1.674622e+10
4,DYJetsToLL,DYJetsToLL_Pt-100To250,557.033752,0.096881,81.22,1.674622e+10
...,...,...,...,...,...,...
732043,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896,81.22,1.674622e+10
732044,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896,81.22,1.674622e+10
732045,DYJetsToLL,DYJetsToLL_Pt-100To250,557.119690,0.096896,81.22,1.674622e+10
732046,DYJetsToLL,DYJetsToLL_Pt-100To250,-557.119690,-0.096896,81.22,1.674622e+10


In [23]:
# Side-by-side comparison of the stored weight ("old") and the weight
# recomputed from cross-section and normalised generator weight ("new").
# NOTE(review): 35860 is presumably the integrated luminosity the stored
# weight was built with -- confirm the value and its units.
old_n_new = (
    df_ref[["WeightXsLumi"]]
    .rename(columns={"WeightXsLumi": "old"})
    .assign(new=df_ref.eval("xsection*35860*genWeight/genWeightSum"))
)

# True only if every event agrees within np.isclose's default tolerances.
np.isclose(old_n_new["old"], old_n_new["new"]).all()

True