In [1]:
import dask_awkward as dak
import awkward as ak
import pandas as pd
import itertools
import csv

from distributed import Client
import glob
import os

# Dask distributed client used by the dask-awkward reads below:
# 40 worker processes, one thread each, with memory_limit='8 GiB'
# (applied per worker according to the distributed docs - confirm for this site).
client = Client(
    n_workers=40,
    threads_per_worker=1,
    processes=True,
    memory_limit='8 GiB',
)

In [2]:
# from modules import selection_debug as selection
from modules import selection as selection

In [3]:
# Columns read from the stage-1 parquet files for the yield computation.
# "gjj_mass" is intentionally absent here: getYield adds it for any process
# whose name does not contain "data".
V1_fields_2compute = [
    # event weight and b-tag multiplicities
    "wgt_nominal", "nBtagLoose_nominal", "nBtagMedium_nominal",
    # single-muon kinematics
    "mu1_pt", "mu2_pt",
    "mu1_eta", "mu2_eta",
    "mu1_phi", "mu2_phi",
    # dimuon system
    "dimuon_pt", "dimuon_eta", "dimuon_phi", "dimuon_mass",
    # leading / sub-leading jets
    "jet1_phi_nominal", "jet1_pt_nominal",
    "jet2_pt_nominal", "jet2_phi_nominal",
    "jet1_eta_nominal", "jet2_eta_nominal",
    # dijet system
    "jj_mass_nominal", "jj_dEta_nominal",
    # event identifier and object counts
    "event", "njets_nominal", "nfatJets_drmuon",
    # missing transverse momentum
    "MET_pt",
]

In [7]:
def getYield(process, load_path, do_vbf_filter_study=False, year="-", do_VH_veto=False, category="vbf", region="h-sidebands"):
    """Sum the nominal event weights of a process after region/category cuts.

    Every directory matching ``{load_path}/{process}`` is read from
    ``*/*.parquet``, reduced to the columns in ``V1_fields_2compute``, passed
    through ``selection.applyRegionCatCuts`` and its ``wgt_nominal`` column is
    summed. Per-directory and total yields are printed.

    Parameters
    ----------
    process : str
        Glob pattern (relative to ``load_path``) selecting sample directories.
        If it contains the substring ``"data"`` the ``gjj_mass`` column is not
        requested; otherwise it is.
    load_path : str
        Directory that contains the per-sample sub-directories.
    do_vbf_filter_study, do_VH_veto : bool
        Forwarded unchanged to ``selection.applyRegionCatCuts``.
    year : str
        Not used in the computation; only echoed in the returned tuple.
    category, region : str
        Forwarded to ``selection.applyRegionCatCuts`` (``region`` as
        ``region_name``) and echoed in the returned tuple.

    Returns
    -------
    tuple
        ``(process, category, region, year, total_yield)``. ``total_yield`` is
        ``0`` when no directory matches the pattern.
    """
    print("{line} {process} {line}".format(line="="*5, process=process))

    # Work on a private copy: editing the module-level list in place would
    # leak the data/MC column choice into later calls and other cells.
    fields = list(V1_fields_2compute)
    if "data" in process:
        fields = [field for field in fields if field != "gjj_mass"]
    elif "gjj_mass" not in fields:
        fields.append("gjj_mass")

    # Sorted so the printout and the float summation order are reproducible.
    filelist = sorted(glob.glob(f"{load_path}/{process}"))
    if not filelist:
        # An empty match would otherwise look like a genuine zero yield.
        print(f"\tWARNING: no inputs match {load_path}/{process}")

    total_integral = 0
    for file in filelist:
        events_data = dak.from_parquet(f"{file}/*/*.parquet")
        # Materialise only the requested columns before applying the cuts.
        events_data = ak.zip({field: events_data[field] for field in fields}).compute()
        events_data = selection.applyRegionCatCuts(events_data, category=category, region_name=region, process=process, variation="nominal", do_vbf_filter_study=do_vbf_filter_study, do_VH_veto=do_VH_veto)

        # Events with a missing weight are counted with weight 1.0.
        wgts = ak.fill_none(events_data.wgt_nominal, value=1.0)
        data_yield = ak.sum(wgts)

        # Computed outside the f-string: nesting the same quote type inside an
        # f-string is a SyntaxError before Python 3.12.
        sample_name = file.split("/")[-1]
        print(f"\tdata_yield for {sample_name}: {data_yield}")
        total_integral += data_yield
    print(f"\t==> Total Yield: {total_integral:.3f}")
    return process, category, region, year, total_integral

In [8]:
# Mass region passed to getYield by the driver cell below.
# Other values tried here: "signal", "h-peak".
region = "h-sidebands"


In [9]:
# Driver: compute yields for every (category, year, sample) combination and
# write them to a CSV file. Uses `region` from the cell above.
suffix = "oct23_2025"
outfile = f"yield_{suffix}.csv"

categories = ["ggh", "vbf"]
# Year tags to process. Single assignment on purpose: the earlier list
# ("2018", "2017", "2016preVFP", "2016postVFP") was overwritten before use.
years = ["2018PR"]

# (process glob, do_vbf_filter_study) for each sample, in processing order.
samples = [
    ("data*", False),
    ("vbf_powheg_dipole", False),
    ("ggh_powhegPS", False),
    ("dy_M-100To200_aMCatNLO", True),
    # Currently disabled:
    # ("dy_M*_MiNNLO", True),
    # ("ewk_lljj_mll50_mjj120", False),
    # ("ttjets_*", False),
    # also not included: st_*, ww_2l2nu, wz_1l1nu2q, wz_2l2q, wz_3lnu, zz, www, wwz, wzz, zzz
]

banner = "*" * 5
info = []
for category, year in itertools.product(categories, years):
    print(f"\n\n{banner} {year} {banner}")
    # Previously used inputs:
    # load_path = f"/depot/cms/users/shar1172/hmm/copperheadV1clean/Run2_nanoAODv12_UpdatedQGL_FixPUJetIDWgt/stage1_output/{year}/compacted"
    # load_path = f"/depot/cms/hmm/shar1172/hmm_ntuples/copperheadV1clean/Run2_nanoAODv12_AK8jets/stage1_output/{year}/f1_0"
    load_path = f"/depot/cms/hmm/shar1172/hmm_ntuples/copperheadV1clean/CrossCheckCutFlow_BR_fatjet//stage1_output/{year}/f1_0"
    for process, do_vbf_filter_study in samples:
        info.append(
            getYield(
                process=process,
                load_path=load_path,
                year=year,
                category=category,
                region=region,
                do_vbf_filter_study=do_vbf_filter_study,
                do_VH_veto=False,
            )
        )

# One row per getYield call, in the order the calls were made.
with open(outfile, 'w', newline='') as out_f:
    writer = csv.writer(out_f)
    writer.writerow(['sample', 'category', 'region', 'year', 'yield'])
    writer.writerows(info)



***** 2018PR *****
===== data* =====
	data_yield for data_A: 270.0
	==> Total Yield: 270.000
===== vbf_powheg_dipole =====
	data_yield for vbf_powheg_dipole: 0.3043358403787821
	==> Total Yield: 0.304
===== ggh_powhegPS =====
	data_yield for ggh_powhegPS: 3.843323388156139
	==> Total Yield: 3.843
===== dy_M-100To200_aMCatNLO =====
	data_yield for dy_M-100To200_aMCatNLO: 292148.45529218967
	==> Total Yield: 292148.455


***** 2018PR *****
===== data* =====
	data_yield for data_A: 3.0
	==> Total Yield: 3.000
===== vbf_powheg_dipole =====
	data_yield for vbf_powheg_dipole: 0.11618856564753652
	==> Total Yield: 0.116
===== ggh_powhegPS =====
	data_yield for ggh_powhegPS: 0.17102386781875445
	==> Total Yield: 0.171
===== dy_M-100To200_aMCatNLO =====
	data_yield for dy_M-100To200_aMCatNLO: 19.11414148712845
	==> Total Yield: 19.114
