In [1]:
import os
import h5py
import shutil

import numpy as np

from pathlib import Path

import utils_HDF5 as utils

# TriHiggs

In [2]:
def print_triHiggs_h5_info(file_path):
    # 印出 triHiggs HDF5 資料中，各 Higgs 數目的事件數
    print(file_path)
    with h5py.File(file_path, 'r') as f:

        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quark_jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T

        h1_mask = utils.get_particle_mask(quark_jet, (0, 1))
        h2_mask = utils.get_particle_mask(quark_jet, (2, 3))
        h3_mask = utils.get_particle_mask(quark_jet, (4, 5))
        
        n_tot = h1_mask.shape[0]
        n_0h = ((~h1_mask) & (~h2_mask) & (~h3_mask)).sum()
        # 任一個 Higgs 有對應的 jet
        n_1h = ((h1_mask & (~h2_mask) & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & (~h2_mask) & h3_mask)).sum()
        
        # 任兩個 Higgs 有對應的 jet
        n_2h = ((h1_mask & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & h3_mask) | 
                (h1_mask & (~h2_mask) & h3_mask)).sum()
        n_3h = (h1_mask & h2_mask & h3_mask).sum()

    print(f'Dataset size: {n_tot}')
    print(f'Number of 0 Higgs events: {n_0h}')
    print(f'Number of 1 Higgs events: {n_1h}')
    print(f'Number of 2 Higgs events: {n_2h}')
    print(f'Number of 3 Higgs events: {n_3h}')
    
    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item 1h sample size: {n_1h:,}')
    print(f'\\item 2h sample size: {n_2h:,}')
    print(f'\\item 3h sample size: {n_3h:,}')
    
    result = {
        'total': n_tot,
        '0h': n_0h,
        '1h': n_1h,
        '2h': n_2h,
        '3h': n_3h
    }
    return result

def print_h5_sb_info(file):
    # 印出訊號與背景的事件數
    with h5py.File(file,'r') as f:
        n_tot = f['CLASSIFICATIONS/EVENT/signal'][...].shape[0]
        ns = (f['CLASSIFICATIONS/EVENT/signal'][...] == 1).sum()
        nb = (f['CLASSIFICATIONS/EVENT/signal'][...] == 0).sum()

    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item Signal sample size: {ns:,}')
    print(f'\\item Background sample size: {nb:,}')
    

def select_3h_event(file, output_file):
    # 選取 triHiggs HDF5 資料中，有 3 個 Higgs 的事件
    # root, ext = os.path.splitext(file)
    # new_file = root + '_3h' + ext

    with h5py.File(file, 'r') as f:
        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quark_jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T

        h1_mask = utils.get_particle_mask(quark_jet, (0, 1))
        h2_mask = utils.get_particle_mask(quark_jet, (2, 3))
        h3_mask = utils.get_particle_mask(quark_jet, (4, 5))
        
        mask = h1_mask & h2_mask & h3_mask
        n_3h = mask.sum()

        print(f'Number of 3 Higgs events: {n_3h}')

        # copy 3h events to new file
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as f_new:
            for key in utils.get_dataset_keys(f):
                f_new[key].resize(n_3h, axis=0)
                f_new[key][:] = f[key][:][mask]

    return output_file

def select_nb_event(file, output_file, nb=6):
    # 選取 triHiggs HDF5 資料中，有 nb 個 b-jets 的事件
    # root, ext = os.path.splitext(file)

    with h5py.File(file, 'r') as f:
        bTag = f['INPUTS/Source/btag'][...]
        n_b_jet = np.sum(bTag, axis=1)

        mask = n_b_jet >= nb
        n_6b = mask.sum()

        print(f'Number of {nb} b events: {n_6b}')

        # copy 3h events to new file
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as f_new:
            for key in utils.get_dataset_keys(f):
                f_new[key].resize(n_6b, axis=0)
                f_new[key][:] = f[key][:][mask]

    return output_file

def select_4pT40_event(file, output_file):
    # 選取 triHiggs HDF5 資料中，有 nb 個 b-jets 的事件
    # root, ext = os.path.splitext(file)

    with h5py.File(file, 'r') as f:
        pt = f['INPUTS/Source/pt'][...]
        pt_mask = pt[:, 3] > 40

        n_event = pt_mask.sum()
        print(f'Number of 4 pT > 40 GeV events: {n_event}')

        # copy 3h events to new file
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as f_new:
            for key in utils.get_dataset_keys(f):
                f_new[key].resize(n_event, axis=0)
                f_new[key][:] = f[key][:][pt_mask]

    return output_file

# Example

In [14]:
file_path = './Sample/SPANet/bkg/pp6b-pT25_0b.h5'
utils.print_h5_info(file_path, 600)

output_file = './Sample/SPANet/bkg/pp6b-pT25_4b.h5'
four_b_file = select_nb_event(file_path, output_file, nb=4)
print_triHiggs_h5_info(four_b_file)

output_file = './Sample/SPANet/bkg/pp6b-4pT40_4b.h5'
four_pT40_file = select_4pT40_event(four_b_file, output_file)
print_triHiggs_h5_info(four_pT40_file)

output_file = './Sample/SPANet/bkg/pp6b-4pT40_6b.h5'
six_b_file = select_nb_event(four_pT40_file, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/bkg/pp6b-pT25_0b.h5
Dataset size: 5570742
CLASSIFICATIONS/EVENT/signal 0
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True False False  True  True False False False False False False
 False False False]
INPUTS/Source/eta [-1.5613574   1.7147591   0.50634855 -2.4052203  -2.3219755   1.6762717
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [18.679089   8.476983   9.943685   6.022513   7.6641026  3.0452356
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [ 0.15385643  2.5918746   3.0130267  -1.8666257  -0.7792846   2.0502343
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [80.86402  63.279446 46.487667 33.09533  30.195988 25.177929  0.
  0.        0.        0.        0.    

{'total': 139246, '0h': 139246, '1h': 0, '2h': 0, '3h': 0}

In [4]:
file_path = './Sample/signals_h5/570_250_1192477_fix.h5'
output_file = './Sample/SPANet/sig/gghhh_bsm_570_250/gghhh_570_250_4b.h5'
four_b_file = select_nb_event(file_path, output_file, nb=4)
print_triHiggs_h5_info(four_b_file)

# output_file = './Sample/SPANet/gghhh_bsm_570_250/gghhh-4pT40_4b.h5'
# four_pT40_file = select_4pT40_event(four_b_file, output_file)
# print_triHiggs_h5_info(four_pT40_file)

Number of 4 b events: 1192477
./Sample/SPANet/sig/gghhh_bsm_570_250/gghhh_570_250_4b.h5
Dataset size: 1192477
Number of 0 Higgs events: 14055
Number of 1 Higgs events: 174725
Number of 2 Higgs events: 601569
Number of 3 Higgs events: 402128
\item Total sample size: 1,192,477
\item 1h sample size: 174,725
\item 2h sample size: 601,569
\item 3h sample size: 402,128


{'total': 1192477, '0h': 14055, '1h': 174725, '2h': 601569, '3h': 402128}

In [15]:
file_path = './Sample/SPANet/sig/gghhh-4pT40_4b.h5'

output_file = './Sample/SPANet/sig/gghhh-4pT40_6b.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/sig/gghhh_4b.h5
Dataset size: 3653696
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [ True False  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/eta [0.97436297 0.7894716  2.3529906  1.225184   0.7835297  0.6211857
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
INPUTS/Source/mass [17.857948  10.667569   8.483025  11.8251505  5.751204   3.7765465
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-0.39327845  1.9963444  -0.4675336   2.6990483  -1.6782572   2.7238095
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [103.46912   56.578156  51.16346   42.79874   37.01474   25.889942
   0.         0.         0.         0.         0.      

{'total': 451431, '0h': 5085, '1h': 44775, '2h': 92249, '3h': 309322}

In [4]:
file_path = './Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_4b_20241206_005920.h5'
utils.print_h5_info(file_path, 600)

file_path = './Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_4b_20241206_005920.h5'

output_file = './Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_6b.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_4b_20241206_005920.h5
Dataset size: 263169
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True  True  True False False False False
 False False False]
INPUTS/Source/btag [False  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/eta [ 1.5676259  -0.7082688  -1.4816394  -0.11534607 -1.3581086  -1.5187671
  1.6142794   0.5776378   0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [31.525133  40.825253  10.715859  17.185774   6.4766912  6.9549823
  5.088252   6.411431   0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-1.0851897   2.2579503   1.7293612  -0.51168835  1.0672604  -1.816191
 -3.0935354  -0.20700423  0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [197.18651  189.17502   84.165276  69.00303   45.805927  27.921621
  

{'total': 22235, '0h': 577, '1h': 3497, '2h': 5723, '3h': 12438}

In [4]:
print_triHiggs_h5_info('../SPANet2/data/triHiggs/triHiggs-4pT40_4b-train.h5')
print_triHiggs_h5_info('../SPANet2/data/triHiggs/triHiggs-4pT40_4b-test.h5')

print_h5_sb_info('../SPANet2/data/triHiggs/triHiggs-4pT40_4b-train.h5')
print_h5_sb_info('../SPANet2/data/triHiggs/triHiggs-4pT40_4b-test.h5')

../SPANet2/data/triHiggs/triHiggs-4pT40_4b-train.h5
Dataset size: 1800000
Number of 0 Higgs events: 954693
Number of 1 Higgs events: 246462
Number of 2 Higgs events: 318057
Number of 3 Higgs events: 280788
\item Total sample size: 1,800,000
\item 1h sample size: 246,462
\item 2h sample size: 318,057
\item 3h sample size: 280,788
../SPANet2/data/triHiggs/triHiggs-4pT40_4b-test.h5
Dataset size: 200000
Number of 0 Higgs events: 106208
Number of 1 Higgs events: 27243
Number of 2 Higgs events: 35050
Number of 3 Higgs events: 31499
\item Total sample size: 200,000
\item 1h sample size: 27,243
\item 2h sample size: 35,050
\item 3h sample size: 31,499
\item Total sample size: 1,800,000
\item Signal sample size: 900,000
\item Background sample size: 900,000
\item Total sample size: 200,000
\item Signal sample size: 100,000
\item Background sample size: 100,000


In [4]:
print_triHiggs_h5_info('../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-train.h5')
print_triHiggs_h5_info('../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-test.h5')

print_h5_sb_info('../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-train.h5')
print_h5_sb_info('../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-test.h5')

../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-train.h5
Dataset size: 1800000
Number of 0 Higgs events: 958041
Number of 1 Higgs events: 246041
Number of 2 Higgs events: 316793
Number of 3 Higgs events: 279125
\item Total sample size: 1,800,000
\item 1h sample size: 246,041
\item 2h sample size: 316,793
\item 3h sample size: 279,125
../SPANet2.2/data/triHiggs/triHiggs-4pT40_4b-mix-test.h5
Dataset size: 200000
Number of 0 Higgs events: 106345
Number of 1 Higgs events: 27329
Number of 2 Higgs events: 35226
Number of 3 Higgs events: 31100
\item Total sample size: 200,000
\item 1h sample size: 27,329
\item 2h sample size: 35,226
\item 3h sample size: 31,100
\item Total sample size: 1,800,000
\item Signal sample size: 900,000
\item Background sample size: 900,000
\item Total sample size: 200,000
\item Signal sample size: 100,000
\item Background sample size: 100,000


In [3]:
print_triHiggs_h5_info('../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-train.h5')
print_triHiggs_h5_info('../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-test.h5')

print_h5_sb_info('../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-train.h5')
print_h5_sb_info('../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-test.h5')

../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-train.h5
Dataset size: 1800000
Number of 0 Higgs events: 950820
Number of 1 Higgs events: 234321
Number of 2 Higgs events: 331837
Number of 3 Higgs events: 283022
\item Total sample size: 1,800,000
\item 1h sample size: 234,321
\item 2h sample size: 331,837
\item 3h sample size: 283,022
../SPANet2.3/data/triHiggs/triHiggs_TRSM-4pT40_4b-mix_4-test.h5
Dataset size: 200000
Number of 0 Higgs events: 104772
Number of 1 Higgs events: 24744
Number of 2 Higgs events: 38116
Number of 3 Higgs events: 32368
\item Total sample size: 200,000
\item 1h sample size: 24,744
\item 2h sample size: 38,116
\item 3h sample size: 32,368
\item Total sample size: 1,800,000
\item Signal sample size: 900,000
\item Background sample size: 900,000
\item Total sample size: 200,000
\item Signal sample size: 100,000
\item Background sample size: 100,000


In [8]:
utils.print_h5_info('./Sample/SPANet/sig/gghhh_bsm_570_250/delphes_events_20241226_175330.h5', 423440)

./Sample/SPANet/sig/gghhh_bsm_570_250/delphes_events_20241226_175330.h5
Dataset size: 423448
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True  True False False False False False
 False False False]
INPUTS/Source/btag [ True  True False  True False  True  True False False False False False
 False False False]
INPUTS/Source/eta [ 0.1386547  -0.5504687  -0.7677351  -1.0969951  -0.85081923 -1.2476897
 -1.1906663   0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [13.674232  15.549836  10.475461   7.7784557  6.708787   9.658851
  4.7329345  0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-1.6626706   0.78136003  1.1796929  -2.194342   -1.18122     2.4228196
  0.45473677  0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [125.52984  117.52345  108.345436  57.491123  43.42492   41.985878
  

In [3]:
files = [f'Sample/SPANet/gghhh_0b_{i:02}.h5' for i in range(2, 9)] + [f'Sample/SPANet/gghhh_0b_{rnd}.h5' for rnd in [323, 423, 523, 614, 714]]

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/gghhh_4b.h5'

os.rename(merged_h5, new_file)

'Sample/SPANet/gghhh_0b_02.h5' and ('Sample/SPANet/gghhh_0b_03.h5', 'Sample/SPANet/gghhh_0b_04.h5', 'Sample/SPANet/gghhh_0b_05.h5', 'Sample/SPANet/gghhh_0b_06.h5', 'Sample/SPANet/gghhh_0b_07.h5', 'Sample/SPANet/gghhh_0b_08.h5', 'Sample/SPANet/gghhh_0b_323.h5', 'Sample/SPANet/gghhh_0b_423.h5', 'Sample/SPANet/gghhh_0b_523.h5', 'Sample/SPANet/gghhh_0b_614.h5', 'Sample/SPANet/gghhh_0b_714.h5') are same structure, can be merged.
Sample/SPANet/gghhh_0b_02_merged.h5 not exist. Copy Sample/SPANet/gghhh_0b_02.h5 to Sample/SPANet/gghhh_0b_02_merged.h5
Size of Sample/SPANet/gghhh_0b_02.h5: 304372
Size of Sample/SPANet/gghhh_0b_03.h5: 303994
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 608366
Size of Sample/SPANet/gghhh_0b_04.h5: 303915
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 912281
Size of Sample/SPANet/gghhh_0b_05.h5: 304049
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 1216330
Size of Sample/SPANet/gghhh_0b_06.h5: 304151
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 1520481
Size of Sample/SP

In [3]:
files = [f'Sample/SPANet/gghhh_6b_{i:02}.h5' for i in range(2, 9)] + [f'Sample/SPANet/gghhh_6b_{rnd}.h5' for rnd in [323, 423, 523, 614, 714]]

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/gghhh_6b.h5'

os.rename(merged_h5, new_file)

'Sample/SPANet/gghhh_6b_02.h5' and ('Sample/SPANet/gghhh_6b_03.h5', 'Sample/SPANet/gghhh_6b_04.h5', 'Sample/SPANet/gghhh_6b_05.h5', 'Sample/SPANet/gghhh_6b_06.h5', 'Sample/SPANet/gghhh_6b_07.h5', 'Sample/SPANet/gghhh_6b_08.h5', 'Sample/SPANet/gghhh_6b_323.h5', 'Sample/SPANet/gghhh_6b_423.h5', 'Sample/SPANet/gghhh_6b_523.h5', 'Sample/SPANet/gghhh_6b_614.h5', 'Sample/SPANet/gghhh_6b_714.h5') are same structure, can be merged.
Sample/SPANet/gghhh_6b_02_merged.h5 not exist. Copy Sample/SPANet/gghhh_6b_02.h5 to Sample/SPANet/gghhh_6b_02_merged.h5
Size of Sample/SPANet/gghhh_6b_02.h5: 40565
Size of Sample/SPANet/gghhh_6b_03.h5: 40866
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 81431
Size of Sample/SPANet/gghhh_6b_04.h5: 40383
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 121814
Size of Sample/SPANet/gghhh_6b_05.h5: 40879
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 162693
Size of Sample/SPANet/gghhh_6b_06.h5: 40538
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 203231
Size of Sample/SPANet/ggh

# Make training and testing dataset

## Background

In [3]:
file_path = './Sample/SPANet/bkg/pp6b-4pT40_4b.h5'
size = 1000000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/bkg/pp6b-4pT40_4b-1.h5')
os.rename(split_file2, './Sample/SPANet/bkg/pp6b-4pT40_4b-2.h5')

Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b.h5: 2030855
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b_split1.h5: 1000000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b_split2.h5: 1030855


In [4]:
file_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-1.h5'
size = 900000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5')
os.rename(split_file2, './Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5')

Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-1.h5: 1000000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-1_split1.h5: 900000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-1_split2.h5: 100000


In [5]:
output_file = './Sample/SPANet/bkg/pp6b-4pT40_6b.h5'
six_b_file = select_nb_event('./Sample/SPANet/bkg/pp6b-4pT40_4b-2.h5', output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

size = 50000
utils.split_h5_size(six_b_file, size)

root, ext = os.path.splitext(six_b_file)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5')
os.rename(split_file2, './Sample/SPANet/bkg/pp6b-4pT40_6b-2.h5')

Number of 6 b events: 70848
./Sample/SPANet/bkg/pp6b-4pT40_6b.h5
Dataset size: 70848
Number of 0 Higgs events: 70848
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 70,848
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b.h5: 70848
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b_split1.h5: 50000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b_split2.h5: 20848


## DM-CPV Signal

In [3]:
# merge all .h5 files in the folder
# for m3_m2 in ['500_275', '500_300', '520_325']:
for m3_m2 in ['500_275', '500_300', '520_325', '570_250', '600_325', '700_325', '800_325', '700_400', '800_400']:
    files_path = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/'
    files = [os.path.join(files_path, name) for name in os.listdir(files_path) if name.startswith('delphes_events_')]
    merged_h5 = utils.merge_h5_file(*files)

    new_file = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b.h5'
    os.rename(merged_h5, new_file)

'./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5' and ('./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_191926.h5',) are same structure, can be merged.
./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5 not exist. Copy ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5 to ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920.h5: 263169
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_191926.h5: 397665
Size of ./Sample/SPANet/sig/gghhh_bsm_500_275/delphes_events_20241206_005920_merged.h5: 660834
'./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241209_222922.h5' and ('./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241206_025512.h5',) are same structure, can be merged.
./Sample/SPANet/sig/gghhh_bsm_500_300/delphes_events_20241209_222922_merged.h5 not exist. Copy ./Sam

In [7]:
for m3_m2 in ['570_250', '600_325', '700_325', '800_325', '700_400', '800_400']:
    file_name = f'./Sample/45b_400k/{m3_m2}_45b_400000.h5'
    new_file = f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b.h5'
    # copy 3h events to new file
    shutil.copyfile(file_name, new_file)

In [None]:
# m3_m2_list = ['420_280','500_275', '500_300', '520_325']
m3_m2_list = ['420_280', '500_275', '500_300', '520_325',
              '570_250', '600_325', '700_325',
              '800_325', '700_400', '800_400'
              ]
n_mass = len(m3_m2_list)
n_4b_total = 500000
for m3_m2 in m3_m2_list:

    # split signal files for 4b and 6b
    file_dir = Path(f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/')
    file_path = file_dir / 'gghhh-4pT40_4b.h5'
    size = n_4b_total // n_mass
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-2.h5')

    file_path = file_dir / 'gghhh-4pT40_4b-1.h5'
    size = int(n_4b_total // n_mass * 0.9)
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-train.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-test.h5')

    output_file = file_dir / 'gghhh-4pT40_6b.h5'
    six_b_file = select_nb_event(file_dir / 'gghhh-4pT40_4b-2.h5', output_file, nb=6)
    print_triHiggs_h5_info(six_b_file)

    size = 50000 // n_mass
    utils.split_h5_size(six_b_file, size)

    root, ext = os.path.splitext(six_b_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_6b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_6b-2.h5')

Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b.h5: 400000
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b_split1.h5: 83333
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b_split2.h5: 316667
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1.h5: 83333
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1_split1.h5: 74999
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-1_split2.h5: 8334
Number of 6 b events: 0
Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b.h5
Dataset size: 0
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 0
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b.h5: 0
Split size 8333 is greater than the input file size 0.
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b_split1.h5: 0
Size of Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT4

### $4b$ dataset

In [None]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-train.h5' for m3_m2 in m3_m2_list]
# files = [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-train.h5' for m3_m2 in m3_m2_list]
# files = [f'./Sample/45b_400k/{m3_m2}_45b_400000.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-train.h5'
os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-test.h5'
os.rename(merged_h5, new_file)

'./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5' and ('./Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_700_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_800_325/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_700_400/gghhh-4pT40_4b-train.h5', './Sample/SPANet/sig/gghhh_bsm_800_400/gghhh-4pT40_4b-train.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5: 900000
Size of ./Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_4b-train.h5: 74999
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 974999
Size of ./Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_4b-train.h5: 74999
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 1049998
Size of ./Sample/

TypeError: rename: src should be string, bytes or os.PathLike, not NoneType

In [None]:
utils.shuffle_h5(f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-train.h5')
utils.shuffle_h5(f'./Sample/SPANet/triHiggs-4pT40_4b-mix_{n_mass}-test.h5')

Dataset size: 1349994


IndexError: list index out of range

### $6b$ dataset: 50k + 50K

In [None]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'] + [f'./Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = './Sample/SPANet/triHiggs-4pT40_6b-mix_10.h5'
os.rename(merged_h5, new_file)

'./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5' and ('./Sample/SPANet/sig/gghhh_bsm_420_280/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_500_275/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_500_300/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_520_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_570_250/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_600_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_700_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_800_325/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_700_400/gghhh-4pT40_6b-1.h5', './Sample/SPANet/sig/gghhh_bsm_800_400/gghhh-4pT40_6b-1.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5: 50000
Size of ./Sample/SPANet/sig/gghhh_bsm_420_280/gghhh-4pT40_6b-1.h5: 5000
Size of ./Sample/SPANet/bkg/pp6b-

## TRSM signal

In [3]:
m3_m2_list = [(420, 280), (500, 275), (500, 300), (520, 325)]
for m3, m2 in m3_m2_list:
    files_path = f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/'
    files = [os.path.join(files_path, name) for name in os.listdir(files_path) if name.startswith('gghhh-4pT40_4b')]
    merged_h5 = utils.merge_h5_file(*files)

    new_file = f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b.h5'
    os.rename(merged_h5, new_file)

'./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2.h5' and ('./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test.h5', './Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03.h5', './Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-train.h5', './Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1.h5', './Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b.h5') are same structure, can be merged.
./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2_merged.h5 not exist. Copy ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2.h5 to ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2_merged.h5
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2.h5: 972639
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-test.h5: 25000
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2_merged.h5: 997639
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-03.h5: 324213
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-2_merged.h5: 1321852
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4p

In [4]:
n_mass = len(m3_m2_list)
n_4b_total = 1000000
for m3, m2 in m3_m2_list:

    # split signal files for 4b and 6b
    file_dir = Path(f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/')
    file_path = file_dir / 'gghhh-4pT40_4b.h5'
    size = n_4b_total // n_mass
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-2.h5')

    file_path = file_dir / 'gghhh-4pT40_4b-1.h5'
    size = int(n_4b_total // n_mass * 0.9)
    utils.split_h5_size(file_path, size)

    root, ext = os.path.splitext(file_path)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_4b-train.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_4b-test.h5')

    output_file = file_dir / 'gghhh-4pT40_6b.h5'
    six_b_file = select_nb_event(file_dir / 'gghhh-4pT40_4b-2.h5', output_file, nb=6)
    print_triHiggs_h5_info(six_b_file)

    size = 50000 // n_mass
    utils.split_h5_size(six_b_file, size)

    root, ext = os.path.splitext(six_b_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    os.rename(split_file1, file_dir / 'gghhh-4pT40_6b-1.h5')
    os.rename(split_file2, file_dir / 'gghhh-4pT40_6b-2.h5')

Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b.h5: 3019491
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b_split1.h5: 250000
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b_split2.h5: 2769491
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1.h5: 250000
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1_split1.h5: 225000
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-1_split2.h5: 25000
Number of 6 b events: 347113
Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b.h5
Dataset size: 347113
Number of 0 Higgs events: 4568
Number of 1 Higgs events: 37083
Number of 2 Higgs events: 69957
Number of 3 Higgs events: 235505
\item Total sample size: 347,113
\item 1h sample size: 37,083
\item 2h sample size: 69,957
\item 3h sample size: 235,505
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b.h5: 347113
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b_split1.h5: 12500
Size of Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b_split2.h5: 334613
Size of S

### $4b$ dataset

In [5]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5'] + [f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b-train.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

train_file = f'./Sample/SPANet/triHiggs_TRSM-4pT40_4b-mix_{n_mass}-train.h5'
os.rename(merged_h5, train_file)

files = ['./Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'] + [f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_4b-test.h5' for m3_m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

test_file = f'./Sample/SPANet/triHiggs_TRSM-4pT40_4b-mix_{n_mass}-test.h5'
os.rename(merged_h5, test_file)

'./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5' and ('./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-train.h5', './Sample/SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-train.h5', './Sample/SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-train.h5', './Sample/SPANet/TRSM/TRSM_520_325/gghhh-4pT40_4b-train.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train.h5: 900000
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_4b-train.h5: 225000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 1125000
Size of ./Sample/SPANet/TRSM/TRSM_500_275/gghhh-4pT40_4b-train.h5: 225000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 1350000
Size of ./Sample/SPANet/TRSM/TRSM_500_300/gghhh-4pT40_4b-train.h5: 225000
Size of ./Sample/SPANet/bkg/pp6b-4pT40_4b-train_merged.h5: 1575000
Size of ./Sample/SPANet/TRS

In [6]:
utils.shuffle_h5(train_file)
utils.shuffle_h5(test_file)

Dataset size: 1800000
Dataset size: 200000


### $6b$ dataset: 50k + 50K

In [7]:
files = ['./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'] + [f'./Sample/SPANet/TRSM/TRSM_{m3}_{m2}/gghhh-4pT40_6b-1.h5' for m3, m2 in m3_m2_list]
merged_h5 = utils.merge_h5_file(*files)

new_file = f'./Sample/SPANet/triHiggs_TRSM-4pT40_6b-mix_{n_mass}.h5'
os.rename(merged_h5, new_file)

'./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5' and ('./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-1.h5', './Sample/SPANet/TRSM/TRSM_500_275/gghhh-4pT40_6b-1.h5', './Sample/SPANet/TRSM/TRSM_500_300/gghhh-4pT40_6b-1.h5', './Sample/SPANet/TRSM/TRSM_520_325/gghhh-4pT40_6b-1.h5') are same structure, can be merged.
./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5 not exist. Copy ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5 to ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5: 50000
Size of ./Sample/SPANet/TRSM/TRSM_420_280/gghhh-4pT40_6b-1.h5: 12500
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5: 62500
Size of ./Sample/SPANet/TRSM/TRSM_500_275/gghhh-4pT40_6b-1.h5: 12500
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5: 75000
Size of ./Sample/SPANet/TRSM/TRSM_500_300/gghhh-4pT40_6b-1.h5: 12500
Size of ./Sample/SPANet/bkg/pp6b-4pT40_6b-1_merged.h5: 87500
Size of ./Sample/SPANet/TRSM/TRSM_520_325/gghhh-4pT40_6b-1.h5: 12500
Size of ./Sample/SPANet/bkg/

## Process SPANet predict file for pairing and DNN

In [None]:
def replace_labels(file_path, label_path):
    with h5py.File(label_path, 'r') as f:
        label = f['CLASSIFICATIONS/EVENT/signal'][...]

    with h5py.File(file_path, 'r+') as f:
        if 'CLASSIFICATIONS/EVENT/signal' in f:
            del f['CLASSIFICATIONS/EVENT/signal']
        f.create_dataset('CLASSIFICATIONS/EVENT/signal', data=label, chunks=True, maxshape=(None,))

def rename_dataset(file_path):
    with h5py.File(file_path, 'r+') as f:
        for key in utils.get_dataset_keys(f):
            if key.startswith('SpecialKey.'):
                new_key = key.replace('SpecialKey.', '')
            else:
                new_key = key
            # first word capitalize
            new_key = new_key.split('/')
            new_key = '/'.join([new_key[0].upper()] + new_key[1:])
            if new_key == key:
                continue
            
            maxShape = list(f[key].maxshape)
            maxShape[0] = None
            f.create_dataset(new_key, data=f[key][...], chunks=True, maxshape=maxShape)
            del f[key]


In [None]:
rename_dataset('Sample/SPANet/triHiggs-4pT40_4b-mix-train-4b_SPANet_pairing.h5')
rename_dataset('Sample/SPANet/triHiggs-4pT40_4b-mix-test-4b_SPANet_pairing.h5')

file_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-train-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-train.h5'
replace_labels(file_path, label_path)

file_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-test-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_4b-mix-test.h5'
replace_labels(file_path, label_path)

In [None]:
for m3_m2 in ['420_280', '500_275', '500_300', '520_325']:
    rename_dataset(f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test-4b_SPANet_pairing.h5')
    rename_dataset(f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1-4b_SPANet_pairing.h5')

    file_path = f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test-4b_SPANet_pairing.h5'
    label_path = f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_4b-test.h5'
    replace_labels(file_path, label_path)

    file_path = f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1-4b_SPANet_pairing.h5'
    label_path = f'Sample/SPANet/sig/gghhh_bsm_{m3_m2}/gghhh-4pT40_6b-1.h5'
    replace_labels(file_path, label_path)

rename_dataset('./Sample/SPANet/bkg/pp6b-4pT40_4b-test-4b_SPANet_pairing.h5')
rename_dataset('./Sample/SPANet/bkg/pp6b-4pT40_6b-1-4b_SPANet_pairing.h5')

file_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-test-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_4b-test.h5'
replace_labels(file_path, label_path)

file_path = './Sample/SPANet/bkg/pp6b-4pT40_6b-1-4b_SPANet_pairing.h5'
label_path = './Sample/SPANet/bkg/pp6b-4pT40_6b-1.h5'
replace_labels(file_path, label_path)

In [None]:
rename_dataset('Sample/SPANet/triHiggs-4pT40_6b-mix-4b_SPANet_pairing.h5')

file_path = 'Sample/SPANet/triHiggs-4pT40_6b-mix-4b_SPANet_pairing.h5'
label_path = 'Sample/SPANet/triHiggs-4pT40_6b-mix.h5'
replace_labels(file_path, label_path)

# Generate 6b event

In [5]:
file_path = './Sample/SPANet/pp6b_0b.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/pp6b_6b.h5'
six_b_file = select_6b_event(file_path, output_file)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/pp6b_0b.h5
Dataset size: 1000000
Number of 0 Higgs events: 1000000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 1,000,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Number of 6 b events: 28755
./Sample/SPANet/pp6b_6b.h5
Dataset size: 28755
Number of 0 Higgs events: 28755
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 28,755
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0


{'total': 28755, '0h': 28755, '1h': 0, '2h': 0, '3h': 0}

In [5]:
files = ['./Sample/SPANet/pp6b_0b.h5', './Sample/SPANet/pp6b_0b_2.h5']
file_path = utils.merge_h5_file(*files)
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/pp6b_4b.h5'
nb_file = select_nb_event(file_path, output_file, nb=4)
print_triHiggs_h5_info(nb_file)

'./Sample/SPANet/pp6b_0b.h5' and ('./Sample/SPANet/pp6b_0b_2.h5',) are same structure, can be merged.
./Sample/SPANet/pp6b_0b_merged.h5 not exist. Copy ./Sample/SPANet/pp6b_0b.h5 to ./Sample/SPANet/pp6b_0b_merged.h5
Size of ./Sample/SPANet/pp6b_0b.h5: 1000000
Size of ./Sample/SPANet/pp6b_0b_2.h5: 808254
Size of ./Sample/SPANet/pp6b_0b_merged.h5: 1808254
./Sample/SPANet/pp6b_0b_merged.h5
Dataset size: 1808254
Number of 0 Higgs events: 1808254
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 1,808,254
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Number of 4 b events: 790372
./Sample/SPANet/pp6b_4b.h5
Dataset size: 790372
Number of 0 Higgs events: 790372
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 790,372
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0


{'total': 790372, '0h': 790372, '1h': 0, '2h': 0, '3h': 0}

In [5]:
file_path = '../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5'
six_b_file = select_6b_event(file_path, output_file)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5
Dataset size: 200000
Number of 0 Higgs events: 100000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 100000
\item Total sample size: 200,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 100,000
Number of 6 b events: 11521
../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5
Dataset size: 11521
Number of 0 Higgs events: 2905
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 8616
\item Total sample size: 11,521
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 8,616


{'total': 11521, '0h': 2905, '1h': 0, '2h': 0, '3h': 8616}

In [None]:
file_path = '../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5'
six_b_file = select_6b_event(file_path, output_file)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5
Dataset size: 200000
Number of 0 Higgs events: 100000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 100000
\item Total sample size: 200,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 100,000
Number of 6 b events: 11521
../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5
Dataset size: 11521
Number of 0 Higgs events: 2905
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 8616
\item Total sample size: 11,521
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 8,616


{'total': 11521, '0h': 2905, '1h': 0, '2h': 0, '3h': 8616}

# Prepare training and testing datasets with at least 4 jet $ > 40 \text{ GeV}$ and 4 $b$-tagged jet

In [4]:
file_path = 'Sample/SPANet/gghhh_4b.h5'
size = 1000000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40.h5')

size = 500000
utils.split_h5_size(split_file2, size)

root, ext = os.path.splitext(split_file2)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_DNN.h5')
os.remove(split_file2)

Size of Sample/SPANet/gghhh_4b.h5: 3653696
Size of Sample/SPANet/gghhh_4b_split1.h5: 1000000
Size of Sample/SPANet/gghhh_4b_split2.h5: 2653696
Size of Sample/SPANet/gghhh_4b_split2.h5: 2653696
Size of Sample/SPANet/gghhh_4b_split2_split1.h5: 500000
Size of Sample/SPANet/gghhh_4b_split2_split2.h5: 2153696


In [5]:
file_path = './Sample/SPANet/gghhh_4b_PT40.h5'
r = 0.9
utils.split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_4b_PT40_test.h5')

Size of ./Sample/SPANet/gghhh_4b_PT40.h5: 1000000
Size of ./Sample/SPANet/gghhh_4b_PT40_split1.h5: 900000
Size of ./Sample/SPANet/gghhh_4b_PT40_split2.h5: 100000


In [4]:
file_path = '../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_train.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5
Dataset size: 900000
Number of 0 Higgs events: 54693
Number of 1 Higgs events: 246462
Number of 2 Higgs events: 318057
Number of 3 Higgs events: 280788
\item Total sample size: 900,000
\item 1h sample size: 246,462
\item 2h sample size: 318,057
\item 3h sample size: 280,788
Number of 3 Higgs events: 280788
../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_train.h5
Dataset size: 280788
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 280788
\item Total sample size: 280,788
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 280,788


{'total': 280788, '0h': 0, '1h': 0, '2h': 0, '3h': 280788}

In [5]:
file_path = '../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_test.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5
Dataset size: 100000
Number of 0 Higgs events: 6208
Number of 1 Higgs events: 27243
Number of 2 Higgs events: 35050
Number of 3 Higgs events: 31499
\item Total sample size: 100,000
\item 1h sample size: 27,243
\item 2h sample size: 35,050
\item 3h sample size: 31,499
Number of 3 Higgs events: 31499
../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_test.h5
Dataset size: 31499
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 31499
\item Total sample size: 31,499
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 31,499


{'total': 31499, '0h': 0, '1h': 0, '2h': 0, '3h': 31499}

In [7]:
file_path = '../SPANet2/data/triHiggs/triHiggs_4b_PT40_3h_train.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/triHiggs_6b_PT40_3h_train.h5'
six_b_file = select_6b_event(file_path, output_file)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_4b_PT40_3h_train.h5
Dataset size: 280788
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 280788
\item Total sample size: 280,788
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 280,788
Number of 6 b events: 76506
../SPANet2/data/triHiggs/triHiggs_6b_PT40_3h_train.h5
Dataset size: 76506
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 76506
\item Total sample size: 76,506
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 76,506


{'total': 76506, '0h': 0, '1h': 0, '2h': 0, '3h': 76506}

In [3]:
file_path = './Sample/SPANet/gghhh_4b_PT40_new.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/gghhh_4b_PT40_3h_new.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_4b_PT40_new.h5
Dataset size: 32337
Number of 0 Higgs events: 2111
Number of 1 Higgs events: 8941
Number of 2 Higgs events: 11341
Number of 3 Higgs events: 9944
\item Total sample size: 32,337
\item 1h sample size: 8,941
\item 2h sample size: 11,341
\item 3h sample size: 9,944
Number of 3 Higgs events: 9944
./Sample/SPANet/gghhh_4b_PT40_3h_new.h5
Dataset size: 9944
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 9944
\item Total sample size: 9,944
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 9,944


{'total': 9944, '0h': 0, '1h': 0, '2h': 0, '3h': 9944}

## classification

In [5]:
file_path = 'Sample/SPANet/gghhh_4b.h5'
print_triHiggs_h5_info(file_path)

Sample/SPANet/gghhh_4b.h5
Dataset size: 3653696
Number of 0 Higgs events: 223197
Number of 1 Higgs events: 998906
Number of 2 Higgs events: 1290603
Number of 3 Higgs events: 1140990
\item Total sample size: 3,653,696
\item 1h sample size: 998,906
\item 2h sample size: 1,290,603
\item 3h sample size: 1,140,990


{'total': 3653696, '0h': 223197, '1h': 998906, '2h': 1290603, '3h': 1140990}

In [6]:
file_path = 'Sample/SPANet/gghhh_4b.h5'
size = 500000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40.h5')
os.remove(split_file2)

Size of Sample/SPANet/gghhh_4b.h5: 3653696
Size of Sample/SPANet/gghhh_4b_split1.h5: 500000
Size of Sample/SPANet/gghhh_4b_split2.h5: 3153696


In [7]:
file_path = 'Sample/SPANet/gghhh_4b_PT40.h5'
r = 0.9
utils.split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_4b_PT40_test.h5')

Size of Sample/SPANet/gghhh_4b_PT40.h5: 500000
Size of Sample/SPANet/gghhh_4b_PT40_split1.h5: 450000
Size of Sample/SPANet/gghhh_4b_PT40_split2.h5: 50000


In [8]:
file_path = './Sample/SPANet/pp6b_4b.h5'
size = 500000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_4b_PT40.h5')
os.remove(split_file2)

Size of ./Sample/SPANet/pp6b_4b.h5: 790372
Size of ./Sample/SPANet/pp6b_4b_split1.h5: 500000
Size of ./Sample/SPANet/pp6b_4b_split2.h5: 290372


In [9]:
file_path = './Sample/SPANet/pp6b_4b_PT40.h5'
r = 0.9
utils.split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/pp6b_4b_PT40_test.h5')


Size of ./Sample/SPANet/pp6b_4b_PT40.h5: 500000
Size of ./Sample/SPANet/pp6b_4b_PT40_split1.h5: 450000
Size of ./Sample/SPANet/pp6b_4b_PT40_split2.h5: 50000


In [10]:
files = ['./Sample/SPANet/gghhh_4b_PT40_train.h5', 
         './Sample/SPANet/pp6b_4b_PT40_train.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_4b_PT40_train.h5'

os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/gghhh_4b_PT40_test.h5', 
         './Sample/SPANet/pp6b_4b_PT40_test.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_4b_PT40_test.h5'

os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_4b_PT40_train.h5' and ('./Sample/SPANet/pp6b_4b_PT40_train.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_4b_PT40_train_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_4b_PT40_train.h5 to ./Sample/SPANet/gghhh_4b_PT40_train_merged.h5
Size of ./Sample/SPANet/gghhh_4b_PT40_train.h5: 450000
Size of ./Sample/SPANet/pp6b_4b_PT40_train.h5: 450000
Size of ./Sample/SPANet/gghhh_4b_PT40_train_merged.h5: 900000
'./Sample/SPANet/gghhh_4b_PT40_test.h5' and ('./Sample/SPANet/pp6b_4b_PT40_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_4b_PT40_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_4b_PT40_test.h5 to ./Sample/SPANet/gghhh_4b_PT40_test_merged.h5
Size of ./Sample/SPANet/gghhh_4b_PT40_test.h5: 50000
Size of ./Sample/SPANet/pp6b_4b_PT40_test.h5: 50000
Size of ./Sample/SPANet/gghhh_4b_PT40_test_merged.h5: 100000


In [11]:
utils.shuffle_h5('Sample/SPANet/triHiggs_4b_PT40_train.h5')
utils.shuffle_h5('Sample/SPANet/triHiggs_4b_PT40_test.h5')

Dataset size: 900000
Dataset size: 100000


# Prepare training and testing datasets in the $6b$ region 

In [6]:
file_path = 'Sample/SPANet/gghhh_6b.h5'
size = 400000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_6b_PT40.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_6b_PT40_DNN.h5')

Size of Sample/SPANet/gghhh_6b.h5: 486694
Size of Sample/SPANet/gghhh_6b_split1.h5: 400000
Size of Sample/SPANet/gghhh_6b_split2.h5: 86694


In [5]:
file_path = './Sample/SPANet/gghhh_6b_PT40.h5'
r = 0.9
utils.split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_6b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_6b_PT40_test.h5')

Size of ./Sample/SPANet/gghhh_6b_PT40.h5: 400000
Size of ./Sample/SPANet/gghhh_6b_PT40_split1.h5: 360000
Size of ./Sample/SPANet/gghhh_6b_PT40_split2.h5: 40000


In [8]:
file_path = './Sample/SPANet/gghhh_6b_PT40_train.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/gghhh_6b_PT40_3h_train.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_6b_PT40_train.h5
Dataset size: 360000
Number of 0 Higgs events: 4607
Number of 1 Higgs events: 38595
Number of 2 Higgs events: 73036
Number of 3 Higgs events: 243762
\item Total sample size: 360,000
\item 1h sample size: 38,595
\item 2h sample size: 73,036
\item 3h sample size: 243,762
Number of 3 Higgs events: 243762
./Sample/SPANet/gghhh_6b_PT40_3h_train.h5
Dataset size: 243762
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 243762
\item Total sample size: 243,762
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 243,762


{'total': 243762, '0h': 0, '1h': 0, '2h': 0, '3h': 243762}

In [9]:
file_path = './Sample/SPANet/gghhh_6b_PT40_test.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/gghhh_6b_PT40_3h_test.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_6b_PT40_test.h5
Dataset size: 40000
Number of 0 Higgs events: 483
Number of 1 Higgs events: 4360
Number of 2 Higgs events: 8070
Number of 3 Higgs events: 27087
\item Total sample size: 40,000
\item 1h sample size: 4,360
\item 2h sample size: 8,070
\item 3h sample size: 27,087
Number of 3 Higgs events: 27087
./Sample/SPANet/gghhh_6b_PT40_3h_test.h5
Dataset size: 27087
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 27087
\item Total sample size: 27,087
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 27,087


{'total': 27087, '0h': 0, '1h': 0, '2h': 0, '3h': 27087}