In [1]:
import os
import re
import sys
import h5py
import shutil

import numpy as np

from pathlib import Path

sys.path.append('..')
import utils_HDF5 as utils

# TriHiggs

In [2]:
def print_triHiggs_h5_info(file_path):
    # 印出 triHiggs HDF5 資料中，各 Higgs 數目的事件數
    print(file_path)
    with h5py.File(file_path, 'r') as f:

        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quark_jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T

        h1_mask = utils.get_particle_mask(quark_jet, (0, 1))
        h2_mask = utils.get_particle_mask(quark_jet, (2, 3))
        h3_mask = utils.get_particle_mask(quark_jet, (4, 5))
        
        n_tot = h1_mask.shape[0]
        n_0h = ((~h1_mask) & (~h2_mask) & (~h3_mask)).sum()
        # 任一個 Higgs 有對應的 jet
        n_1h = ((h1_mask & (~h2_mask) & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & (~h2_mask) & h3_mask)).sum()
        
        # 任兩個 Higgs 有對應的 jet
        n_2h = ((h1_mask & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & h3_mask) | 
                (h1_mask & (~h2_mask) & h3_mask)).sum()
        n_3h = (h1_mask & h2_mask & h3_mask).sum()

    print(f'Dataset size: {n_tot}')
    print(f'Number of 0 Higgs events: {n_0h}')
    print(f'Number of 1 Higgs events: {n_1h}')
    print(f'Number of 2 Higgs events: {n_2h}')
    print(f'Number of 3 Higgs events: {n_3h}')
    
    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item 1h sample size: {n_1h:,}')
    print(f'\\item 2h sample size: {n_2h:,}')
    print(f'\\item 3h sample size: {n_3h:,}')
    
    result = {
        'total': n_tot,
        '0h': n_0h,
        '1h': n_1h,
        '2h': n_2h,
        '3h': n_3h
    }
    return result

def print_h5_sb_info(file):
    # 印出訊號與背景的事件數
    with h5py.File(file,'r') as f:
        n_tot = f['CLASSIFICATIONS/EVENT/signal'][...].shape[0]
        ns = (f['CLASSIFICATIONS/EVENT/signal'][...] == 1).sum()
        nb = (f['CLASSIFICATIONS/EVENT/signal'][...] == 0).sum()

    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item Signal sample size: {ns:,}')
    print(f'\\item Background sample size: {nb:,}')
    

def select_3h_event(file, output_file):
    # 選取 triHiggs HDF5 資料中，有 3 個 Higgs 的事件
    # root, ext = os.path.splitext(file)
    # new_file = root + '_3h' + ext

    with h5py.File(file, 'r') as f:
        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quark_jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T

        h1_mask = utils.get_particle_mask(quark_jet, (0, 1))
        h2_mask = utils.get_particle_mask(quark_jet, (2, 3))
        h3_mask = utils.get_particle_mask(quark_jet, (4, 5))
        
        mask = h1_mask & h2_mask & h3_mask
        n_3h = mask.sum()

        print(f'Number of 3 Higgs events: {n_3h}')

        # copy 3h events to new file
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as f_new:
            for key in utils.get_dataset_keys(f):
                f_new[key].resize(n_3h, axis=0)
                f_new[key][:] = f[key][:][mask]

    return output_file

def select_nb_event(file, output_file, nb=6):
    # 選取 triHiggs HDF5 資料中，有 nb 個 b-jets 的事件
    # root, ext = os.path.splitext(file)

    with h5py.File(file, 'r') as f:
        bTag = f['INPUTS/Source/btag'][...]
        n_b_jet = np.sum(bTag, axis=1)

        mask = n_b_jet >= nb
        n_event = mask.sum()

        print(f'Number of {nb} b events: {n_event}')

        with h5py.File(output_file, 'w') as f_new:    
            for key in utils.get_dataset_keys(f):
                maxShape = list(f[key].maxshape)
                maxShape[0] = None
                f_new.create_dataset(key, maxshape=maxShape, data=f[key][:][mask])

    return output_file

def select_45b_event(file, output_file):
    # 選取 triHiggs HDF5 資料中，有 4 或 5 個 b-jets 的事件
    # root, ext = os.path.splitext(file)

    with h5py.File(file, 'r') as f:
        bTag = f['INPUTS/Source/btag'][...]
        n_b_jet = np.sum(bTag, axis=1)

        mask = (n_b_jet == 4) | (n_b_jet == 5)
        n_event = mask.sum()

        print(f'Number of 4, 5 b events: {n_event}')

        with h5py.File(output_file, 'w') as f_new:    
            for key in utils.get_dataset_keys(f):
                maxShape = list(f[key].maxshape)
                maxShape[0] = None
                f_new.create_dataset(key, maxshape=maxShape, data=f[key][:][mask])


    return output_file

def select_4pT40_event(file, output_file):
    # 選取 triHiggs HDF5 資料中，有 nb 個 b-jets 的事件
    # root, ext = os.path.splitext(file)

    with h5py.File(file, 'r') as f:
        pt = f['INPUTS/Source/pt'][...]
        pt_mask = pt[:, 3] > 40

        n_event = pt_mask.sum()
        print(f'Number of 4 pT > 40 GeV events: {n_event}')

        # copy 3h events to new file
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as f_new:
            for key in utils.get_dataset_keys(f):
                f_new[key].resize(n_event, axis=0)
                f_new[key][:] = f[key][:][pt_mask]

    return output_file

# Example

In [3]:
file_path = './SPANet/bkg/pp6b-pT25_0b.h5'
utils.print_h5_info(file_path, 600)

output_file = './SPANet/bkg/pp6b-pT25_4b.h5'
four_b_file = select_nb_event(file_path, output_file, nb=4)
print_triHiggs_h5_info(four_b_file)

output_file = './SPANet/bkg/pp6b-4pT40_4b.h5'
four_pT40_file = select_4pT40_event(four_b_file, output_file)
print_triHiggs_h5_info(four_pT40_file)

output_file = './SPANet/bkg/pp6b-4pT40_6b.h5'
six_b_file = select_nb_event(four_pT40_file, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./SPANet/bkg/pp6b-pT25_0b.h5
Dataset size: 5570742
CLASSIFICATIONS/EVENT/signal 0
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True False False  True  True False False False False False False
 False False False]
INPUTS/Source/eta [-1.5613574   1.7147591   0.50634855 -2.4052203  -2.3219755   1.6762717
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [18.679089   8.476983   9.943685   6.022513   7.6641026  3.0452356
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [ 0.15385643  2.5918746   3.0130267  -1.8666257  -0.7792846   2.0502343
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [80.86402  63.279446 46.487667 33.09533  30.195988 25.177929  0.
  0.        0.        0.        0.        0. 

{'total': 139246, '0h': 139246, '1h': 0, '2h': 0, '3h': 0}

In [4]:
spanet_data_dir = Path('~/SPANet2.3/data/triHiggs/').expanduser()
print_triHiggs_h5_info(spanet_data_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-train.h5')
print_triHiggs_h5_info(spanet_data_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-test.h5')

print_h5_sb_info(spanet_data_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-train.h5')
print_h5_sb_info(spanet_data_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-test.h5')

/home/r10222035/SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_5-train.h5
Dataset size: 6000000
Number of 0 Higgs events: 1234667
Number of 1 Higgs events: 1200697
Number of 2 Higgs events: 1985367
Number of 3 Higgs events: 1579269
\item Total sample size: 6,000,000
\item 1h sample size: 1,200,697
\item 2h sample size: 1,985,367
\item 3h sample size: 1,579,269
/home/r10222035/SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_5-test.h5
Dataset size: 300000
Number of 0 Higgs events: 61689
Number of 1 Higgs events: 59966
Number of 2 Higgs events: 99105
Number of 3 Higgs events: 79240
\item Total sample size: 300,000
\item 1h sample size: 59,966
\item 2h sample size: 99,105
\item 3h sample size: 79,240
\item Total sample size: 6,000,000
\item Signal sample size: 5,000,000
\item Background sample size: 1,000,000
\item Total sample size: 300,000
\item Signal sample size: 250,000
\item Background sample size: 50,000


In [5]:
def process_partcile_label(file_path):
    print('Process particle mask.')
    with h5py.File(file_path, 'r+') as f:
        nevent = f['INPUTS/Source/pt'].shape[0]
        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quarks_Jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T
        # 取得 quarks 的 mask
        h1_mask = utils.get_particle_mask(quarks_Jet, (0, 1))
        h2_mask = utils.get_particle_mask(quarks_Jet, (2, 3))
        h3_mask = utils.get_particle_mask(quarks_Jet, (4, 5))

        # 如果 quarks 的 mask 為 False，則將對應的 jet 設為 -1
        f['TARGETS/h1/b1'][...] = np.where(h1_mask == False, -1, h1b1)
        f['TARGETS/h1/b2'][...] = np.where(h1_mask == False, -1, h1b2)
        f['TARGETS/h2/b1'][...] = np.where(h2_mask == False, -1, h2b1)
        f['TARGETS/h2/b2'][...] = np.where(h2_mask == False, -1, h2b2)
        f['TARGETS/h3/b1'][...] = np.where(h3_mask == False, -1, h3b1)
        f['TARGETS/h3/b2'][...] = np.where(h3_mask == False, -1, h3b2)

# replace all nan by 0
def replace_nan_to_zero(file_path):
    print('Replace all nan by 0.')
    with h5py.File(file_path, 'r+') as f:
        dataset_keys = utils.get_dataset_keys(f)
        for key in dataset_keys:
            data = f[key][...]
            data[np.isnan(data)] = 0
            f[key][...] = data

In [6]:
file_path = './SPANet/TRSM/TRSM_500_350/gghhh-4pT40_4b-99.h5'
process_partcile_label(file_path)
replace_nan_to_zero(file_path)

Process particle mask.
Replace all nan by 0.


In [7]:
file_path = './SPANet/TRSM/TRSM_500_350/gghhh-4pT40_4b-99.h5'
utils.print_h5_info(file_path, 100)

./SPANet/TRSM/TRSM_500_350/gghhh-4pT40_4b-99.h5
Dataset size: 1214086
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/eta [-2.3142445  -0.6806595  -1.0417455   0.91909975  0.34773642 -0.53816724
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [42.25673   12.320855  20.852262  15.874163   9.8029785  4.000685
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [ 1.1038225 -1.2953217 -2.2855241  1.0666269 -2.6694667  2.4733875
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/pt [278.38937  210.61337  154.87965   89.28765   65.267235  36.080963
   0.         0.         0.         0

# Make training and testing dataset

## Background

In [8]:
bkg_dir = Path('./SPANet/bkg/')
file_path = bkg_dir / 'pp6b-4pT40_4b.h5'

# prepare 4b training and testing datasets
train_size = 1000000
test_size = 50000
size = train_size + test_size
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, bkg_dir / 'pp6b-4pT40_4b-1.h5') # 4b train + test
os.rename(split_file2, bkg_dir / 'pp6b-4pT40_4b-2.h5') # 6b test

file_path = bkg_dir / 'pp6b-4pT40_4b-1.h5'
utils.split_h5_size(file_path, train_size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, bkg_dir / 'pp6b-4pT40_4b-train.h5')
os.rename(split_file2, bkg_dir / 'pp6b-4pT40_4b-test.h5')


# prepare 6b testing sample
output_file = bkg_dir / 'pp6b-4pT40_6b.h5'
six_b_file = select_nb_event(bkg_dir / 'pp6b-4pT40_4b-2.h5', output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

size = 50000
utils.split_h5_size(six_b_file, size)

root, ext = os.path.splitext(six_b_file)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, bkg_dir / 'pp6b-4pT40_6b-test.h5')

# remove the split file
os.remove(bkg_dir / 'pp6b-4pT40_4b-1.h5')
os.remove(bkg_dir / 'pp6b-4pT40_4b-2.h5')
os.remove(split_file2)

Size of SPANet/bkg/pp6b-4pT40_4b.h5: 2030855
Size of SPANet/bkg/pp6b-4pT40_4b_split1.h5: 1050000
Size of SPANet/bkg/pp6b-4pT40_4b_split2.h5: 980855
Size of SPANet/bkg/pp6b-4pT40_4b-1.h5: 1050000
Size of SPANet/bkg/pp6b-4pT40_4b-1_split1.h5: 1000000
Size of SPANet/bkg/pp6b-4pT40_4b-1_split2.h5: 50000
Number of 6 b events: 67436
SPANet/bkg/pp6b-4pT40_6b.h5
Dataset size: 67436
Number of 0 Higgs events: 67436
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 67,436
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Size of SPANet/bkg/pp6b-4pT40_6b.h5: 67436
Size of SPANet/bkg/pp6b-4pT40_6b_split1.h5: 50000
Size of SPANet/bkg/pp6b-4pT40_6b_split2.h5: 17436


## DM-CPV Signal

In [3]:
# merge all .h5 files in the folder
# for m3_m2 in ['500_275', '500_300', '520_325']:
for m3_m2 in ['500_275', '500_300', '520_325', '570_250', '600_325', '700_325', '800_325', '700_400', '800_400']:
    files_path = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/'
    files = [os.path.join(files_path, name) for name in os.listdir(files_path) if name.startswith('delphes_events_')]
    merged_h5 = utils.merge_h5_file(*files)

    new_file = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b.h5'
    os.rename(merged_h5, new_file)

'./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5' and ('./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_191926.h5',) are same structure, can be merged.
./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5 not exist. Copy ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5 to ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5: 263169
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_191926.h5: 397665
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5: 660834
'./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241209_222922.h5' and ('./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241206_025512.h5',) are same structure, can be merged.
./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241209_222922_merged.h5 not exist. Copy ./Sam

In [7]:
for m3_m2 in ['570_250', '600_325', '700_325', '800_325', '700_400', '800_400']:
    file_name = f'./Sample/45b_400k/{m3_m2}_45b_400000.h5'
    new_file = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b.h5'
    # copy 3h events to new file
    shutil.copyfile(file_name, new_file)

In [None]:
# m3_m2_list = ['420_280','500_275', '500_300', '520_325']
m3_m2_list = ['420_280', '500_275', '500_300', '520_325',
              '570_250', '600_325', '700_325',
              '800_325', '700_400', '800_400'
              ]
n_mass = len(m3_m2_list)
n_4b_total = 500000
for m3_m2 in m3_m2_list:

    # split signal files for 4b and 6b
    file_dir = Path(f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/')
    file_path = file_dir / 'gghhh-4pT40_4b.h5'
    size = n_4b_total // n_mass
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-2.h5')

    file_path = file_dir / 'gghhh-4pT40_4b-1.h5'
    size = int(n_4b_total // n_mass * 0.9)
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-train.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-test.h5')

    output_file = file_dir / 'gghhh-4pT40_6b.h5'
    six_b_file = select_nb_event(file_dir / 'gghhh-4pT40_4b-2.h5', output_file, nb=6)
    print_triHiggs_h5_info(six_b_file)

    size = 50000 // n_mass
    utils.split_h5_size(six_b_file, size)

    root, ext = os.path.splitext(six_b_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_6b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_6b-2.h5')

Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b.h5: 400000
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b_split1.h5: 83333
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b_split2.h5: 316667
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1.h5: 83333
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1_split1.h5: 74999
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1_split2.h5: 8334
Number of 6 b events: 0
Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b.h5
Dataset size: 0
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 0
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b.h5: 0
Split size 8333 is greater than the input file size 0.
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b_split1.h5: 0
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT4

### $4b$ dataset

In [None]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-train.h5' for m3_m2 in m3_m2_list]
# files = [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-train.h5' for m3_m2 in m3_m2_list]
# files = [f'./Sample/45b_400k/{m3_m2}_45b_400000.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-train.h5'
os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-test.h5'
os.rename(merged_h5, new_file)

'./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5' and ('./Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_700_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_800_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_700_400/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_800_400/gghhh-4pT40_4b-train.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5: 900000
Size of ./Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-train.h5: 74999
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 974999
Size of ./Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_4b-train.h5: 74999
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 1049998
Size of ./Sample/

TypeError: rename: src should be string, bytes or os.PathLike, not NoneType

In [None]:
utils.shuffle_h5(f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-train.h5')
utils.shuffle_h5(f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-test.h5')

Dataset size: 1349994


IndexError: list index out of range

### $6b$ dataset: 50k + 50K

In [None]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = './Sample/SPANet/triHiggs-4pT40_6b-mix_10.h5'
os.rename(merged_h5, new_file)

'./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5' and ('./Sample/SPANet/sig/gghhh_bsm_420_280/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_500_300/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_520_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_700_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_800_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_700_400/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_800_400/gghhh-4pT40_6b-1.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5: 50000
Size of ./Sample/SPANet/sig/gghhh_bsm_420_280/gghhh-4pT40_6b-1.h5: 5000
Size of ./Sample/SPANet/bkg/pp6b-

## TRSM signal

In [9]:
m3_m2_list = [(420, 280), (500, 275), (500, 300), (520, 325), (500, 350)]
for m3, m2 in m3_m2_list:
    files_path = Path(f'./SPANet/TRSM/TRSM_{m3}_{m2}/')

    files = [files_path / name for name in os.listdir(files_path) if re.match(r'gghhh-4pT40_4b-\d{2}.h5', name)]
    merged_h5 = utils.merge_h5_file(*files)

    new_file = files_path / 'gghhh-4pT40_4b.h5'
    os.rename(merged_h5, new_file)

'SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03.h5' and (PosixPath('SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-05.h5'), PosixPath('SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-01.h5'), PosixPath('SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-02.h5'), PosixPath('SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-04.h5'), PosixPath('SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-06.h5')) are same structure, can be merged.
SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5 not exist. Copy SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03.h5 to SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5


Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03.h5: 324213
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-05.h5: 323620
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5: 647833
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-01.h5: 3203
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5: 651036
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-02.h5: 32385
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5: 683421
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-04.h5: 323849
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5: 1007270
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-06.h5: 323585
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03_merged.h5: 1330855
'SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-03.h5' and (PosixPath('SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-05.h5'), PosixPath('SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-01.h5'), PosixPath('SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-02.h5'), PosixPath('SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-

In [10]:
n_mass = len(m3_m2_list)
size_4b_train = 1000000
size_4b_test = 50000
size_4b = size_4b_train + size_4b_test
size_6b = 10000
for m3, m2 in m3_m2_list:

    # split signal files for 4b and 6b
    file_dir = Path(f'./SPANet/TRSM/TRSM_{m3}_{m2}/')
    file_path = file_dir / 'gghhh-4pT40_4b.h5'
    utils.split_h5_size(file_path, size_4b)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-2.h5')

    file_path = file_dir / 'gghhh-4pT40_4b-1.h5'
    utils.split_h5_size(file_path, size_4b_train)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-train.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-test.h5')

    output_file = file_dir / 'gghhh-4pT40_6b.h5'
    six_b_file = select_nb_event(file_dir / 'gghhh-4pT40_4b-2.h5', output_file, nb=6)
    print_triHiggs_h5_info(six_b_file)

    utils.split_h5_size(six_b_file, size_6b)

    root, ext = os.path.splitext(six_b_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_6b-test.h5')

    # remove the split file
    os.remove(file_dir / 'gghhh-4pT40_4b-1.h5')
    os.remove(file_dir / 'gghhh-4pT40_4b-2.h5')
    os.remove(split_file2)

Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b.h5: 1330855
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b_split1.h5: 1050000
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b_split2.h5: 280855
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1.h5: 1050000
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1_split1.h5: 1000000
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1_split2.h5: 50000
Number of 6 b events: 34946
SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b.h5
Dataset size: 34946
Number of 0 Higgs events: 463
Number of 1 Higgs events: 3747
Number of 2 Higgs events: 6963
Number of 3 Higgs events: 23773
\item Total sample size: 34,946
\item 1h sample size: 3,747
\item 2h sample size: 6,963
\item 3h sample size: 23,773
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b.h5: 34946
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b_split1.h5: 10000
Size of SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b_split2.h5: 24946
Size of SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b.h5: 1220598
Size of SPANet/TRSM/TRSM_500_

### $4b$ dataset

In [11]:
files = ['./SPANet/bkg/pp6b-4pT40_4b-train.h5'] + [f'./SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b-train.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

train_file = f'./SPANet/TRSM/triHiggs_TRSM-4pT40_4b-mix_{n_mass}-train.h5'
os.rename(merged_h5, train_file)

utils.shuffle_h5(train_file)

spanet_data_dir = Path('~/SPANet2.3/data/triHiggs/').expanduser()
shutil.copyfile(train_file, spanet_data_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-train.h5')

'./SPANet/bkg/pp6b-4pT40_4b-train.h5' and ('./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-train.h5', './SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-train.h5', './SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-train.h5', './SPANet/TRSM/TRSM_520_325/gghhh-4pT40_4b-train.h5', './SPANet/TRSM/TRSM_500_350/gghhh-4pT40_4b-train.h5') are same structure, can be merged.
./SPANet/bkg/pp6b-4pT40_4b-train_merged.h5 not exist. Copy ./SPANet/bkg/pp6b-4pT40_4b-train.h5 to ./SPANet/bkg/pp6b-4pT40_4b-train_merged.h5
Size of ./SPANet/bkg/pp6b-4pT40_4b-train.h5: 1000000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-train.h5: 1000000
Size of ./SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 2000000
Size of ./SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-train.h5: 1000000
Size of ./SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 3000000
Size of ./SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-train.h5: 1000000
Size of ./SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 4000000
Size of ./SPANet/TRSM/TRSM_520_325/gghhh-4pT40_4b-train.h5: 1000000
Size o

PosixPath('/home/r10222035/SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_5-train.h5')

In [12]:
files = [f'./SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b-test.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

signal_file = f'./SPANet/TRSM/gghhh-4pT40_4b-mix_{n_mass}-test.h5'
os.rename(merged_h5, signal_file)

files = ['./SPANet/bkg/pp6b-4pT40_4b-test.h5'] + [f'./SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b-test.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

test_file = f'./SPANet/TRSM/triHiggs_TRSM-4pT40_4b-mix_{n_mass}-test.h5'
os.rename(merged_h5, test_file)

'./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test.h5' and ('./SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-test.h5', './SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-test.h5', './SPANet/TRSM/TRSM_520_325/gghhh-4pT40_4b-test.h5', './SPANet/TRSM/TRSM_500_350/gghhh-4pT40_4b-test.h5') are same structure, can be merged.
./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test_merged.h5 not exist. Copy ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test.h5 to ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test_merged.h5
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test.h5: 50000
Size of ./SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-test.h5: 50000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test_merged.h5: 100000
Size of ./SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-test.h5: 50000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test_merged.h5: 150000
Size of ./SPANet/TRSM/TRSM_520_325/gghhh-4pT40_4b-test.h5: 50000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test_merged.h5: 200000
Size of ./SPANet/TRSM/TRSM_500_350

### $6b$ dataset: 50k + 50K

In [13]:
files = [f'./SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_6b-test.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

signal_file = f'./SPANet/TRSM/gghhh-4pT40_6b-mix_{n_mass}-test.h5'
os.rename(merged_h5, signal_file)

files = ['./SPANet/bkg/pp6b-4pT40_6b-test.h5'] + [f'./SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_6b-test.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./SPANet/TRSM/triHiggs_TRSM-4pT40_6b-mix_{n_mass}-test.h5'
os.rename(merged_h5, new_file)

'./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test.h5' and ('./SPANet/TRSM/TRSM_500_275/gghhh-4pT40_6b-test.h5', './SPANet/TRSM/TRSM_500_300/gghhh-4pT40_6b-test.h5', './SPANet/TRSM/TRSM_520_325/gghhh-4pT40_6b-test.h5', './SPANet/TRSM/TRSM_500_350/gghhh-4pT40_6b-test.h5') are same structure, can be merged.
./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test_merged.h5 not exist. Copy ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test.h5 to ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test_merged.h5
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test.h5: 10000
Size of ./SPANet/TRSM/TRSM_500_275/gghhh-4pT40_6b-test.h5: 10000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test_merged.h5: 20000
Size of ./SPANet/TRSM/TRSM_500_300/gghhh-4pT40_6b-test.h5: 10000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test_merged.h5: 30000
Size of ./SPANet/TRSM/TRSM_520_325/gghhh-4pT40_6b-test.h5: 10000
Size of ./SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-test_merged.h5: 40000
Size of ./SPANet/TRSM/TRSM_500_350/gg

## Process SPANet predict file for pairing and DNN
Change the dataset name and replace the predict results by true labels

In [3]:
def replace_labels(file_path, label_path):
    with h5py.File(label_path, 'r') as f:
        label = f['CLASSIFICATIONS/EVENT/signal'][...]

    with h5py.File(file_path, 'r+') as f:
        if 'CLASSIFICATIONS/EVENT/signal' in f:
            del f['CLASSIFICATIONS/EVENT/signal']
        f.create_dataset('CLASSIFICATIONS/EVENT/signal', data=label, chunks=True, maxshape=(None,))

def rename_dataset(file_path):
    with h5py.File(file_path, 'r+') as f:
        for key in utils.get_dataset_keys(f):
            if key.startswith('SpecialKey.'):
                new_key = key.replace('SpecialKey.', '')
            else:
                new_key = key
            # first word capitalize
            new_key = new_key.split('/')
            new_key = '/'.join([new_key[0].upper()] + new_key[1:])
            if new_key == key:
                continue
            
            maxShape = list(f[key].maxshape)
            maxShape[0] = None
            f.create_dataset(new_key, data=f[key][...], chunks=True, maxshape=maxShape)
            del f[key]

### DM-CPV

In [None]:
sample_dir = './Sample/SPANet/TRSM/'
rename_dataset('Sample/SPANet/triHiggs-4pT40_4b-mix-train-4b_SPANet_pairing.h5')
rename_dataset('Sample/SPANet/triHiggs-4pT40_4b-mix-test-4b_SPANet_pairing.h5')

file_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-train-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-train.h5'
replace_labels(file_path, label_path)

file_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-test-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-test.h5'
replace_labels(file_path, label_path)

In [None]:
sig_dir = sample_dir / 'sig'
for m3_m2 in ['420_280', '500_275', '500_300', '520_325']:
    file_path = sig_dir / f'gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test-4b_SPANet_pairing.h5'
    label_path = sig_dir / f'gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test.h5'
    rename_dataset(file_path)
    replace_labels(file_path, label_path)

    file_path = sig_dir / f'gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1-4b_SPANet_pairing.h5'
    label_path = sig_dir / f'gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1.h5'
    rename_dataset(file_path)
    replace_labels(file_path, label_path)

file_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-test-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

file_path = './Sample/SPANet/bkg/pp6b-4pT40_6b-1-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

In [None]:
file_path = 'Sample/SPANet/triHiggs-4pT40_6b-mix-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_6b-mix.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

### TRSM

In [4]:
# for DNN training
sample_dir = Path('./Sample/SPANet/TRSM/')

file_path = sample_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-train-4b_SPANet_pairing.h5'
label_path = sample_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-train.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

file_path = sample_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-test-4b_SPANet_pairing.h5'
label_path = sample_dir / 'triHiggs_TRSM-4pT40_4b-mix_5-test.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

file_path = sample_dir / 'triHiggs_TRSM-4pT40_6b-mix_5-4b_SPANet_pairing.h5'
label_path = sample_dir / 'triHiggs_TRSM-4pT40_6b-mix_5.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

In [5]:
# for evaluating AUC on each mass point
for m3, m2 in [(420, 280), (500, 275), (500, 300), (520, 325), (500, 350)]:
    sig_dir = sample_dir / f'TRSM_{m3}_{m2}'
    file_path = sig_dir / f'gghhh-4pT40_4b-test-mix_5-4b_SPANet_pairing.h5'
    label_path = sig_dir / f'gghhh-4pT40_4b-test.h5'
    rename_dataset(file_path)
    replace_labels(file_path, label_path)

    file_path = sig_dir / f'gghhh-4pT40_6b-1-mix_5-4b_SPANet_pairing.h5'
    label_path = sig_dir / f'gghhh-4pT40_6b-1.h5'
    rename_dataset(file_path)
    replace_labels(file_path, label_path)

file_path = './Sample/SPANet/TRSM/pp6b-4pT40_4b-test-mix_5-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)

file_path = './Sample/SPANet/TRSM/pp6b-4pT40_6b-1-mix_5-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'
rename_dataset(file_path)
replace_labels(file_path, label_path)