In [1]:
import dask_awkward as dak
import awkward as ak
import pandas as pd

from distributed import Client
import glob
import os

# Dask distributed client: 40 single-threaded worker processes,
# each limited to 8 GiB of memory.
client = Client(
    n_workers=40,
    threads_per_worker=1,
    processes=True,
    memory_limit='8 GiB',
)

In [2]:
from modules import selection

In [3]:
# Columns read from the parquet input for the yield computation.
# The order of this list is the field order of the zipped event record.
# "gjj_mass" is handled per sample inside getYield (requested for
# non-data processes only), which is why it is commented out here.
V1_fields_2compute = [
    # "gjj_mass",
    "wgt_nominal",
    "nBtagLoose_nominal",
    "nBtagMedium_nominal",
    "mu1_pt",
    "mu2_pt",
    "mu1_eta",
    "mu2_eta",
    "mu1_phi",
    "mu2_phi",
    "dimuon_pt",
    "dimuon_eta",
    "dimuon_phi",
    "dimuon_mass",
    "jet1_phi_nominal",
    "jet1_pt_nominal",
    "jet2_pt_nominal",
    "jet2_phi_nominal",
    "jet1_eta_nominal",
    "jet2_eta_nominal",
    "jj_mass_nominal",
    "jj_dEta_nominal",
    # "region",
    "event",
    "njets_nominal",
    # "run",
    # "event",
    # "luminosityBlock",
    "nfatJets_drmuon",
    "MET_pt"
]

In [4]:
# Category handed to selection.applyRegionCatCuts; alternative: "ggh".
category = "vbf"
# Region name handed to selection.applyRegionCatCuts;
# alternatives: "signal", "h-peak".
region = "h-sidebands"


In [5]:
def getYield(process, load_path, do_vbf_filter_study=False, year="-"):
    """Sum the nominal event weights of one process after the region/category cuts.

    Every directory matching ``{load_path}/{process}`` is read from
    ``*/*.parquet``, reduced to the columns in ``V1_fields_2compute``
    (plus ``gjj_mass`` when ``"data"`` is not part of ``process``), passed
    through ``selection.applyRegionCatCuts`` with the module-level
    ``category`` and ``region``, and its ``wgt_nominal`` column is summed
    (missing weights count as 1.0).

    Parameters
    ----------
    process : str
        Sample name or glob pattern (e.g. ``"data*"``) relative to ``load_path``.
    load_path : str
        Directory holding one sub-directory per sample.
    do_vbf_filter_study : bool, optional
        Forwarded unchanged to ``selection.applyRegionCatCuts``.
    year : str, optional
        Label copied into the returned tuple; not used for loading.

    Returns
    -------
    tuple
        ``(process, region, year, total_integral)``, where ``region`` is the
        module-level region name and ``total_integral`` is the summed yield
        over all matched directories (0 if nothing matched).
    """
    print("{line} {process} {line}".format(line="="*5, process=process))

    # Build a private column list. Aliasing V1_fields_2compute and appending
    # to it would grow the shared list on every simulated-sample call and
    # eventually make data samples request the MC-only "gjj_mass" column.
    fields = [field for field in V1_fields_2compute if field != "gjj_mass"]
    if "data" not in process:
        fields.append("gjj_mass")

    filelist = glob.glob(f"{load_path}/{process}")
    if not filelist:
        # Make a wrong path or pattern visible instead of a silent zero yield.
        print(f"\tWARNING: no input found for {load_path}/{process}")

    total_integral = 0
    for file in filelist:
        events_data = dak.from_parquet(f"{file}/*/*.parquet")
        events_data = ak.zip({field: events_data[field] for field in fields}).compute()
        events_data = selection.applyRegionCatCuts(
            events_data,
            category=category,
            region_name=region,
            process=process,
            variation="nominal",
            do_vbf_filter_study=do_vbf_filter_study,
        )

        wgts = ak.fill_none(events_data.wgt_nominal, value=1.0)
        data_yield = ak.sum(wgts)
        # Computed outside the f-string: reusing the same quote character
        # inside an f-string is a SyntaxError before Python 3.12.
        sample_name = os.path.basename(file)
        print(f"\tdata_yield for {sample_name}: {data_yield}")
        total_integral += data_yield
    print(f"\t==> Total Yield: {total_integral:.3f}")
    return process,region,year,total_integral

In [6]:
import csv

outfile = "yield.csv"

# Full list of data-taking periods, kept for reference; it is overridden on
# the next line so that only 2018 is processed.
years = ["2018", "2017", "2016preVFP", "2016postVFP"]
years = ["2018"]

# Samples whose yields are written out, in processing order.
sample_patterns = ("data*", "vbf_powheg_dipole", "ggh_powhegPS")
# Further samples that can be added to the tuple above:
#   dy_VBF_filter, dy_M*_MiNNLO (both with do_vbf_filter_study=True),
#   ewk_lljj_mll50_mjj120, st_*, ttjets_*, ww_2l2nu, wz_1l1nu2q, wz_2l2q,
#   wz_3lnu, zz, www, wwz, wzz, zzz

info = []
for year in years:
    print("{vspace}{lspace} {year} {lspace}".format(vspace="\n\n", lspace="*"*5, year=year))
    # Alternative input:
    # /depot/cms/users/shar1172/hmm/copperheadV1clean/Run2_nanoAODv12_UpdatedQGL_FixPUJetIDWgt/stage1_output/{year}/compacted
    load_path = f"/depot/cms/hmm/shar1172/hmm_ntuples/copperheadV1clean/Run2_nanoAODv12_AK8jets/stage1_output/{year}/f1_0"
    for pattern in sample_patterns:
        info.append(getYield(process=pattern, load_path=load_path, year=year))

# One header row followed by one (sample, region, year, yield) row per call.
with open(outfile, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows([['sample', 'region', 'year', 'yield'], *info])



***** 2018 *****
===== data* =====
	data_yield for data_C: 339.0
	data_yield for data_A: 773.0
	data_yield for data_B: 386.0
	data_yield for data_D: 1557.0
	==> Total Yield: 3055.000
===== vbf_powheg_dipole =====
	data_yield for vbf_powheg_dipole: 0.14962994221889378
	==> Total Yield: 0.150
===== ggh_powhegPS =====
	data_yield for ggh_powhegPS: 0.1175762412861803
	==> Total Yield: 0.118


# With FatJet and MET veto

```bash
***** 2017 *****
===== data* =====
	data_yield for data_E: 475.0
	data_yield for data_C: 477.0
	data_yield for data_F: 727.0
	data_yield for data_B: 283.0
	data_yield for data_D: 201.0
	==> Total Yield: 2163.000
===== vbf_powheg_dipole =====
	data_yield for vbf_powheg_dipole: 0.10710499408020374
	==> Total Yield: 0.107
===== ggh_powhegPS =====
	data_yield for ggh_powhegPS: 0.07959645111818892
	==> Total Yield: 0.080
```

# With default selection

```bash
***** 2017 *****
===== data* =====
	data_yield for data_E: 491.0
	data_yield for data_C: 492.0
	data_yield for data_F: 744.0
	data_yield for data_B: 289.0
	data_yield for data_D: 206.0
	==> Total Yield: 2222.000
===== vbf_powheg_dipole =====
	data_yield for vbf_powheg_dipole: 0.10736833694516268
	==> Total Yield: 0.107
===== ggh_powhegPS =====
	data_yield for ggh_powhegPS: 0.07959645111818892
	==> Total Yield: 0.080
```