In [25]:
def ranges_overlap(range1, range2):
    """
    Tell whether two (start, end) ranges share an interval of non-zero length.

    The comparison is strict, so two ranges that merely touch at an endpoint,
    e.g. (0, 1) and (1, 2), are reported as NOT overlapping.

    Parameters:
    range1 (tuple): First range, given as (start1, end1).
    range2 (tuple): Second range, given as (start2, end2).

    Returns:
    bool: True when the later of the two starts lies strictly below the
        earlier of the two ends, False otherwise.
    """
    (start1, end1), (start2, end2) = range1, range2

    latest_start = max(start1, start2)
    earliest_end = min(end1, end2)
    return latest_start < earliest_end

def getMidpoint(a: float, b: float) -> float:
    """
    Return the value halfway between two numbers.

    :param a: One end of the interval.
    :param b: The other end of the interval.
    :return: The arithmetic mean of ``a`` and ``b``.
    """
    total = a + b
    return total / 2

def checkNewCharacteristic(new_hist, new_hist_err):
    """
    Decide whether splitting one bin into two revealed a distinct feature.

    Each of the two sub-bins is turned into the interval
    ``(value - |error|, value + |error|)``. The split is considered worthwhile
    only when those two intervals do not overlap (as judged by
    ``ranges_overlap``).

    Parameters:
    new_hist (sequence): The two sub-bin values, lower sub-bin first.
    new_hist_err (sequence): The uncertainties of the two sub-bin values.

    Returns:
    bool: True if the two intervals are disjoint. False if they overlap, if
        either interval has zero width, or if any value/uncertainty is NaN
        or infinite.

    Raises:
    AssertionError: if ``new_hist`` does not hold exactly two entries.
    """
    import math  # local import: keeps this definition usable regardless of cell order

    assert len(new_hist) == 2, "new_hist is expected to hold exactly two sub-bins"

    # Every comparison against NaN is False, so a NaN value or error (e.g. from an
    # empty sub-bin) would make the intervals look disjoint and wrongly count as a
    # new characteristic. Treat anything non-finite as "nothing learned".
    inputs = (new_hist[0], new_hist[1], new_hist_err[0], new_hist_err[1])
    if not all(math.isfinite(entry) for entry in inputs):
        return False

    # An uncertainty is a magnitude; abs() keeps each interval ordered as (low, high)
    # even if the error arrives with a negative sign.
    lower_err = abs(new_hist_err[0])
    upper_err = abs(new_hist_err[1])
    lower_bin_down_up = (new_hist[0] - lower_err, new_hist[0] + lower_err)
    upper_bin_down_up = (new_hist[1] - upper_err, new_hist[1] + upper_err)

    # A zero-width interval means there is no usable uncertainty for that sub-bin,
    # so the comparison below would be meaningless.
    if lower_bin_down_up[0] == lower_bin_down_up[1]:
        return False
    if upper_bin_down_up[0] == upper_bin_down_up[1]:
        return False

    # Disjoint intervals mean the coarser bin was hiding a characteristic.
    return not ranges_overlap(lower_bin_down_up, upper_bin_down_up)



In [5]:
import numpy as np
import matplotlib.pyplot as plt
import copy

# Example data: three Gaussian peaks with different yields, concatenated into one sample.
# The generator is seeded so the toy sample is identical on every fresh run of the notebook.
EXAMPLE_DATA_SEED = 42
rng = np.random.default_rng(EXAMPLE_DATA_SEED)
data1 = rng.normal(loc=5, scale=1, size=100_000)
data2 = rng.normal(loc=10, scale=1, size=200_000)
data3 = rng.normal(loc=20, scale=1, size=1_000_000)
data = np.concatenate([data1, data2, data3])

In [6]:
import dask_awkward as dak
import numpy as np
import awkward as ak
import argparse
import sys
from distributed import LocalCluster, Client, progress
# Print NumPy arrays in full: with the summarization threshold at sys.maxsize,
# arrays are never abbreviated with "..." when printed.
np.set_printoptions(threshold=sys.maxsize)
import correctionlib
from correctionlib import schemav2 as schema


In [7]:
# Start a local dask cluster and connect a client to it.
from distributed import LocalCluster, Client
# processes=True: workers run as separate processes rather than threads
cluster = LocalCluster(processes=True)
# Let the cluster scale adaptively between 8 and 31 workers.
cluster.adapt(minimum=8, maximum=31) # NOTE(review): an earlier comment said "max: 32" but the code passes 31 - confirm the intended limit
client = Client(cluster)
print("Local scale Client created")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40707 instead


Local scale Client created




In [8]:
def filterRegion(events, region="h-peak"):
    """
    Keep only the events whose dimuon mass lies inside the named mass region.

    Supported regions (bounds are applied to ``events.dimuon_mass``):
      - "h-peak":      115.03 < m < 135.03
      - "h-sidebands": 110 < m < 115.03  or  135.03 < m < 150
      - "signal":      110 <= m <= 150
      - "z-peak":      70 <= m <= 110

    Parameters:
    events: Event collection exposing a ``dimuon_mass`` attribute and
        supporting boolean-mask indexing (``events[mask]``).
    region (str): Name of the region to select; defaults to "h-peak".

    Returns:
    The subset of ``events`` selected by the region mask.

    Raises:
    ValueError: if ``region`` is not one of the supported names. Previously an
        unknown name fell through and was used directly as the index.
    """
    dimuon_mass = events.dimuon_mass
    if region == "h-peak":
        mask = (dimuon_mass > 115.03) & (dimuon_mass < 135.03)
    elif region == "h-sidebands":
        below_peak = (dimuon_mass > 110) & (dimuon_mass < 115.03)
        above_peak = (dimuon_mass > 135.03) & (dimuon_mass < 150)
        mask = below_peak | above_peak
    elif region == "signal":
        mask = (dimuon_mass >= 110) & (dimuon_mass <= 150.0)
    elif region == "z-peak":
        mask = (dimuon_mass >= 70) & (dimuon_mass <= 110.0)
    else:
        raise ValueError(
            f"Unknown region {region!r}; expected one of "
            "'h-peak', 'h-sidebands', 'signal', 'z-peak'"
        )

    return events[mask]


# --- Where the stage1 outputs live ---------------------------------------------
run_label = "V2_Jan09_ForZptReWgt"  # alternatively taken from args.label
year = "2018"
# save path of the stage1 outputs for this run label and year
# (a 2016 cross-check would point at ".../stage1_output/2016*/f1_0" instead)
base_path = f"/depot/cms/users/yun79/hmm/copperheadV1clean/{run_label}/stage1_output/{year}/f1_0"

# --- Names of the columns used downstream --------------------------------------
njet_field = "njets_nominal"
value_field = "dimuon_pt"
weight_field = "wgt_nominal"
fields2load = [njet_field, value_field, weight_field]

# --- Open the data and DY samples lazily ---------------------------------------
data_events = dak.from_parquet(f"{base_path}/data_*/*/*.parquet")
dy_events = dak.from_parquet(f"{base_path}/dy_M-50/*/*.parquet")

# Restrict both samples to the z-peak region; no other selection is applied
data_events, dy_events = (
    filterRegion(sample, region="z-peak") for sample in (data_events, dy_events)
)

# --- Pick the jet-multiplicity category ----------------------------------------
SF_hists = []
# Only njet = 0 is processed here; a loop over [0, 1, 2] is currently disabled.
njet = 0
if njet == 2:
    # the last category is inclusive: two or more jets
    data_events_loop = data_events[data_events[njet_field] >=njet]
    dy_events_loop = dy_events[dy_events[njet_field] >=njet]
else:
    # exclusive categories: exactly njet jets
    data_events_loop = data_events[data_events[njet_field] ==njet]
    dy_events_loop = dy_events[dy_events[njet_field] ==njet]






In [9]:
def _weighted_hist_with_err(events, binning, value_field, weight_field):
    """Histogram one sample: return (sum of weights, sqrt(sum of weights^2)) per bin as float arrays."""
    values = events[value_field]
    weights = events[weight_field]
    sumw, _ = np.histogram(values, bins=binning, weights=weights)
    sumw2, _ = np.histogram(values, bins=binning, weights=weights * weights)
    # np.asarray(..., dtype=float) accepts both NumPy and awkward results and
    # guarantees a float array even when the weights are integers.
    return np.asarray(sumw, dtype=float), np.sqrt(np.asarray(sumw2, dtype=float))


def getSF_hist(data_event, dy_event, binning, value_field="dimuon_pt", weight_field="wgt_nominal"):
    """
    Return the SF (Data/DY ratio) histogram together with its uncertainty.

    Parameters:
    data_event: Mapping/record of arrays for data; must provide ``value_field``
        and ``weight_field``.
    dy_event: Same as ``data_event`` but for the DY sample.
    binning: Bin edges passed to ``np.histogram``.
    value_field (str): Name of the histogrammed quantity (default "dimuon_pt").
    weight_field (str): Name of the per-event weight (default "wgt_nominal").

    Returns:
    tuple: ``(ratio_hist, ratio_err)``, two float arrays with one entry per bin.
        Bins where the DY yield is not positive get ratio 0 and error 0 instead
        of NaN/inf, so downstream comparisons stay well defined.
    """
    data_hist, data_hist_err = _weighted_hist_with_err(data_event, binning, value_field, weight_field)
    dy_hist, dy_hist_err = _weighted_hist_with_err(dy_event, binning, value_field, weight_field)

    # Explicit float outputs: zeros_like(data_hist) would inherit an integer dtype
    # from integer data weights and silently truncate the ratios.
    ratio_hist = np.zeros(data_hist.shape, dtype=float)
    ratio_err = np.zeros(data_hist.shape, dtype=float)

    valid = dy_hist > 0  # the ratio is only defined where DY has a positive yield
    ratio_hist[valid] = data_hist[valid] / dy_hist[valid]

    # Uncertainty of data/dy with both relative uncertainties added in quadrature,
    # written in absolute form so that bins with zero data do not divide by zero:
    #   ratio * sqrt((dy_err/dy)^2 + (data_err/data)^2)
    #     == sqrt(data_err^2 + (ratio * dy_err)^2) / dy
    ratio_err[valid] = (
        np.sqrt(data_hist_err[valid] ** 2 + (ratio_hist[valid] * dy_hist_err[valid]) ** 2)
        / dy_hist[valid]
    )
    return ratio_hist, ratio_err

In [12]:
# Coarse starting grid: four equal-width bins spanning [xmin, xmax]
xmin, xmax = 0, 200
initial_bins = np.linspace(xmin, xmax, num=5)

# Work on an independent copy so that the starting edges stay untouched
old_bins = initial_bins
current_bins = old_bins.copy()
print(f"current_bins: {current_bins}")

bin_has_changed = True

# Bring only the columns needed for the SF histograms into local memory
fields2load = ["wgt_nominal", "dimuon_pt"]
data_dict = dict((name, data_events_loop[name].compute()) for name in fields2load)
dy_dict = dict((name, dy_events_loop[name].compute()) for name in fields2load)


        

current_bins: [  0.  50. 100. 150. 200.]


2025-01-14 22:43:17,474 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/depot/cms/kernels/coffea_latest/lib/python3.11/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/depot/cms/kernels/coffea_latest/lib/python3.11/site-packages/distributed/worker.py", line 1250, in heartbeat
    response = await retry_operation(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/depot/cms/kernels/coffea_latest/lib/python3.11/site-packages/distributed/utils_comm.py", line 461, in retry_operation
    return await retry(
           ^^^^^^^^^^^^
  File "/depot/cms/kernels/coffea_latest/lib/python3.11/site-packages/distributed

new loop start ---------------------------------------------------------------------
current_bins length: 5
current_bins: [  0.  50. 100. 150. 200.]


2025-01-14 22:43:45,370 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:43:45,371 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:43:45,371 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.


bin_low_edge: 0.0
bin_low_edge: 50.0


KeyboardInterrupt: 

In [13]:
# Materialize the "wgt_nominal" column of the selected data events (triggers the dask computation).
# NOTE(review): as the last expression of the cell this displays the whole array - consider showing a slice.
data_events_loop["wgt_nominal"].compute()

2025-01-14 22:44:24,359 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,359 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,359 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,360 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,360 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,361 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,639 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:24,639 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:29,382 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:29,382 - distributed.core - INFO - Con

2025-01-14 22:44:31,377 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:31,377 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:31,377 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:31,377 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:31,378 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.
2025-01-14 22:44:31,379 - distributed.core - INFO - Connection to tcp://127.0.0.1:43543 has been closed.


In [21]:
# Show which container type each computed column of data_dict ended up as
for field_name in data_dict:
    print(type(data_dict[field_name]))

<class 'awkward.highlevel.Array'>
<class 'awkward.highlevel.Array'>


In [43]:
# Bin edges to resume from: fine below 50, coarse 50-wide bins above.
old_bins = np.array(
    [
        0.0, 3.125, 6.25, 9.375, 12.5, 15.625, 18.75,
        21.875, 25.0, 31.25, 37.5, 40.625, 42.1875, 42.96875,
        43.75, 46.875, 48.4375, 50.0, 100.0, 150.0, 200.0,
    ],
    dtype=np.float64,
)
# Independent working copies: the edges being scanned and the edges being accumulated
current_bins = old_bins.copy()
new_bins = current_bins.copy()
print(f"current_bins: {current_bins}")

current_bins: [  0.        3.125     6.25      9.375    12.5      15.625    18.75
  21.875    25.       31.25     37.5      40.625    42.1875   42.96875
  43.75     46.875    48.4375   50.      100.      150.      200.     ]


In [44]:
# Adaptive bin refinement.
# Every pass tries to split each current bin at its midpoint. A midpoint becomes a
# new edge when checkNewCharacteristic says the two halves differ; rejected
# midpoints are remembered so they are never tested again. The passes repeat
# until one full pass adds no edge.
bin_values_already_tested = []

# Start the accumulated edges from current_bins so that this cell does not depend
# on a `new_bins` variable left behind by another cell.
new_bins = np.array(current_bins, dtype=float)

while True:
    bin_has_changed = False # stays False unless this pass adds at least one edge
    print("new loop start ---------------------------------------------------------------------")
    print(f"current_bins length: {len(current_bins)}")
    print(f"current_bins: {', '.join(map(str, current_bins.tolist()))}")

    for bin_idx in range(len(current_bins)-1):
        bin_low_edge = current_bins[bin_idx]
        bin_high_edge = current_bins[bin_idx+1]

        bin_mid = getMidpoint(bin_low_edge, bin_high_edge)
        # skip midpoints that were already tested and rejected
        if bin_mid in bin_values_already_tested:
            print(f"{bin_mid} has been already tested. Skipping!")
            continue

        # A bin can become so narrow that its midpoint coincides with one of its
        # edges. "Adding" such a midpoint would leave the edges unchanged while still
        # flagging a change, and the while-loop would never terminate.
        if not (bin_low_edge < bin_mid < bin_high_edge):
            print(f"bin [{bin_low_edge}, {bin_high_edge}] cannot be split any further. Skipping!")
            bin_values_already_tested.append(bin_mid)
            continue

        # Histogram the two halves of this bin and compare them
        new_binning = np.array([bin_low_edge, bin_mid, bin_high_edge])
        new_hist, new_hist_err = getSF_hist(data_dict, dy_dict, new_binning)
        new_characteristic = checkNewCharacteristic(new_hist, new_hist_err)
        print(f"bin_low_edge: {bin_low_edge}")

        if new_characteristic:
            # the split revealed a new characteristic: keep the midpoint as an edge
            # (set() drops duplicates, sorted() restores ascending order)
            new_bins = np.array(sorted(set(list(new_bins) + [bin_mid])))
            print(f"adding edge {bin_mid}")
            bin_has_changed = True
        else:
            print(f"NOT adding edge {bin_mid}")
            bin_values_already_tested.append(bin_mid)

    # repeat until a full pass adds no new bin edge, then end the loop
    if bin_has_changed:
        current_bins = new_bins
    else:
        print("No new bins were found. Ending Loop!")
        print(f"final binning: {current_bins}")
        break # no bin has changed in this pass

new loop start ---------------------------------------------------------------------
current_bins length: 21
current_bins: 0.0, 3.125, 6.25, 9.375, 12.5, 15.625, 18.75, 21.875, 25.0, 31.25, 37.5, 40.625, 42.1875, 42.96875, 43.75, 46.875, 48.4375, 50.0, 100.0, 150.0, 200.0
bin_low_edge: 0.0
adding edge 1.5625
bin_low_edge: 3.125
adding edge 4.6875
bin_low_edge: 6.25
adding edge 7.8125
bin_low_edge: 9.375
adding edge 10.9375
bin_low_edge: 12.5
NOT adding edge 14.0625
bin_low_edge: 15.625
adding edge 17.1875
bin_low_edge: 18.75
adding edge 20.3125
bin_low_edge: 21.875
NOT adding edge 23.4375
bin_low_edge: 25.0
NOT adding edge 28.125
bin_low_edge: 31.25
NOT adding edge 34.375
bin_low_edge: 37.5
NOT adding edge 39.0625
bin_low_edge: 40.625
NOT adding edge 41.40625
bin_low_edge: 42.1875
NOT adding edge 42.578125
bin_low_edge: 42.96875
NOT adding edge 43.359375
bin_low_edge: 43.75
NOT adding edge 45.3125
bin_low_edge: 46.875
NOT adding edge 47.65625
bin_low_edge: 48.4375
NOT adding edge 49.21

In [48]:
# Inspect the accumulated bin edges (`new_bins`) as last assigned by the refinement loop.
new_bins

array([  0.        ,   1.5625    ,   1.953125  ,   2.1484375 ,
         2.24609375,   2.34375   ,   2.734375  ,   3.125     ,
         3.22265625,   3.3203125 ,   3.515625  ,   3.7109375 ,
         3.75976562,   3.80859375,   3.90625   ,   4.00390625,
         4.05273438,   4.1015625 ,   4.296875  ,   4.4921875 ,
         4.6875    ,   4.8828125 ,   5.078125  ,   5.2734375 ,
         5.37109375,   5.46875   ,   5.6640625 ,   5.859375  ,
         6.25      ,   6.4453125 ,   6.54296875,   6.640625  ,
         6.8359375 ,   7.03125   ,   7.421875  ,   7.51953125,
         7.6171875 ,   7.8125    ,   8.0078125 ,   8.203125  ,
         8.59375   ,   8.7890625 ,   8.984375  ,   9.375     ,
        10.9375    ,  12.5       ,  15.625     ,  16.015625  ,
        16.2109375 ,  16.30859375,  16.40625   ,  17.1875    ,
        18.75      ,  20.3125    ,  21.875     ,  25.        ,
        31.25      ,  37.5       ,  40.625     ,  42.1875    ,
        42.96875   ,  43.75      ,  46.875     ,  48.43

In [46]:
# Inspect the bin edges (`current_bins`) that the refinement loop scans on each pass.
current_bins

array([  0.        ,   1.5625    ,   1.953125  ,   2.1484375 ,
         2.24609375,   2.34375   ,   2.734375  ,   3.125     ,
         3.22265625,   3.3203125 ,   3.515625  ,   3.7109375 ,
         3.75976562,   3.80859375,   3.90625   ,   4.00390625,
         4.05273438,   4.1015625 ,   4.296875  ,   4.4921875 ,
         4.6875    ,   4.8828125 ,   5.078125  ,   5.2734375 ,
         5.37109375,   5.46875   ,   5.6640625 ,   5.859375  ,
         6.25      ,   6.4453125 ,   6.54296875,   6.640625  ,
         6.8359375 ,   7.03125   ,   7.421875  ,   7.51953125,
         7.6171875 ,   7.8125    ,   8.0078125 ,   8.203125  ,
         8.59375   ,   8.7890625 ,   8.984375  ,   9.375     ,
        10.9375    ,  12.5       ,  15.625     ,  16.015625  ,
        16.2109375 ,  16.30859375,  16.40625   ,  17.1875    ,
        18.75      ,  20.3125    ,  21.875     ,  25.        ,
        31.25      ,  37.5       ,  40.625     ,  42.1875    ,
        42.96875   ,  43.75      ,  46.875     ,  48.43