In [1]:
import os
import h5py
import shutil

import numpy as np

from pathlib import Path

import utils_HDF5 as utils

# TriHiggs

In [2]:
def print_triHiggs_h5_info(file_path):
    """Print and return the number of events with 0, 1, 2 and 3 matched Higgs bosons.

    The six ``TARGETS/h{1,2,3}/b{1,2}`` datasets are stacked into one
    ``(n_events, 6)`` array and handed to ``utils.get_particle_mask`` once per
    Higgs, using that Higgs' two columns. Events are then grouped by how many
    of the three per-Higgs masks are set.
    (presumably a set mask means both b-quarks of that Higgs have a matching
    jet -- confirm against ``utils_HDF5.get_particle_mask``)

    Args:
        file_path: Path to a triHiggs HDF5 file.

    Returns:
        dict with the keys ``'total'``, ``'0h'``, ``'1h'``, ``'2h'`` and
        ``'3h'`` holding the corresponding event counts.
    """
    print(file_path)

    # Column order: h1b1, h1b2, h2b1, h2b2, h3b1, h3b2.
    target_keys = [f'TARGETS/h{h}/b{b}' for h in (1, 2, 3) for b in (1, 2)]
    with h5py.File(file_path, 'r') as f:
        quark_jet = np.array([f[key][...] for key in target_keys]).T

    higgs_masks = [utils.get_particle_mask(quark_jet, columns)
                   for columns in ((0, 1), (2, 3), (4, 5))]

    n_tot = higgs_masks[0].shape[0]

    # Number of set masks per event (0..3); "exactly k Higgs" is then a simple
    # equality test instead of an explicit and/or/not combination.
    n_matched = np.sum(higgs_masks, axis=0)
    n_0h, n_1h, n_2h, n_3h = ((n_matched == k).sum() for k in range(4))

    print(f'Dataset size: {n_tot}')
    for label, count in (('0', n_0h), ('1', n_1h), ('2', n_2h), ('3', n_3h)):
        print(f'Number of {label} Higgs events: {count}')

    # LaTeX-ready summary lines (the 0h count is intentionally not listed here).
    print(f'\\item Total sample size: {n_tot:,}')
    for label, count in (('1h', n_1h), ('2h', n_2h), ('3h', n_3h)):
        print(f'\\item {label} sample size: {count:,}')

    return {
        'total': n_tot,
        '0h': n_0h,
        '1h': n_1h,
        '2h': n_2h,
        '3h': n_3h,
    }

def print_h5_sb_info(file):
    """Print the total, signal and background event counts of an HDF5 sample.

    The event labels are read from ``CLASSIFICATIONS/EVENT/signal``: entries
    equal to 1 are counted as signal and entries equal to 0 as background.
    The three counts are printed as LaTeX item lines.

    Args:
        file: Path to the HDF5 file.
    """
    # Load the label array once; the counts below all work on the in-memory copy
    # instead of re-reading the dataset from disk for each of them.
    with h5py.File(file, 'r') as f:
        signal = f['CLASSIFICATIONS/EVENT/signal'][...]

    n_tot = signal.shape[0]
    ns = (signal == 1).sum()
    nb = (signal == 0).sum()

    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item Signal sample size: {ns:,}')
    print(f'\\item Background sample size: {nb:,}')
    

def select_3h_event(file, output_file):
    """Write the events in which all three Higgs masks are set to ``output_file``.

    ``file`` is first copied to ``output_file``; every dataset of the copy is
    then resized along axis 0 to the number of selected events and overwritten
    with the selected rows of the original.

    Args:
        file: Path to the input triHiggs HDF5 file.
        output_file: Path of the HDF5 file to create.

    Returns:
        ``output_file``.
    """
    with h5py.File(file, 'r') as src:
        # Column order: h1b1, h1b2, h2b1, h2b2, h3b1, h3b2.
        targets = [src[f'TARGETS/h{h}/b{b}'][...] for h in (1, 2, 3) for b in (1, 2)]
        quark_jet = np.array(targets).T

        h1_mask, h2_mask, h3_mask = (
            utils.get_particle_mask(quark_jet, columns)
            for columns in ((0, 1), (2, 3), (4, 5))
        )

        mask = h1_mask & h2_mask & h3_mask
        n_3h = mask.sum()

        print(f'Number of 3 Higgs events: {n_3h}')

        # Start from a full copy so the output keeps the input's layout, then
        # shrink each dataset and fill it with the selected rows.
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as dst:
            for key in utils.get_dataset_keys(src):
                target = dst[key]
                target.resize(n_3h, axis=0)
                target[:] = src[key][:][mask]

    return output_file

def select_nb_event(file, output_file, nb=6):
    """Write the events with at least ``nb`` b-tagged jets to ``output_file``.

    The per-event b-jet multiplicity is the sum of ``INPUTS/Source/btag`` along
    axis 1; events whose multiplicity is ``>= nb`` are kept. ``file`` is copied
    to ``output_file`` and every dataset of the copy is resized along axis 0
    and overwritten with the selected rows of the original.

    Args:
        file: Path to the input HDF5 file.
        output_file: Path of the HDF5 file to create.
        nb: Minimum number of b-tagged jets an event must have (default 6).

    Returns:
        ``output_file``.
    """
    with h5py.File(file, 'r') as src:
        btag_flags = src['INPUTS/Source/btag'][...]
        mask = np.sum(btag_flags, axis=1) >= nb
        n_selected = mask.sum()

        print(f'Number of {nb} b events: {n_selected}')

        # Copy the whole file, then shrink each dataset to the selected events.
        shutil.copyfile(file, output_file)
        with h5py.File(output_file, 'a') as dst:
            for key in utils.get_dataset_keys(src):
                target = dst[key]
                target.resize(n_selected, axis=0)
                target[:] = src[key][:][mask]

    return output_file

# Example

In [6]:
file_path = './Sample/SPANet/gghhh_4b_PT40_test-chi2_pairing.h5'
utils.print_h5_info(file_path, 600)

./Sample/SPANet/gghhh_4b_PT40_test-chi2_pairing.h5
Dataset size: 100000
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True  True False False False False False
 False False False]
INPUTS/Source/btag [False  True False  True  True  True  True False False False False False
 False False False]
INPUTS/Source/eta [-1.7665203  -1.6837091  -2.2155464  -1.8894336  -1.5082266  -2.0863128
  0.42711097  0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [13.935255  20.908175  10.868314   7.240814   5.5055876  5.445733
  6.033167   0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-0.38953632 -2.6495998   0.8172901  -3.0919943   2.8169796   0.20008782
 -3.092532    0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [151.24182  125.23633   53.704266  45.350212  44.67909   43.811172
  27.066412   0.      

In [7]:
file_path = './Sample/SPANet/pp6b_4b-chi2_pairing.h5'
utils.print_h5_info(file_path, 600)

./Sample/SPANet/pp6b_4b-chi2_pairing.h5
Dataset size: 790372
CLASSIFICATIONS/EVENT/signal 0
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [ True False  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/eta [-0.26721513  2.1262574   1.4021034   0.74789894  0.98507136 -0.9648843
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [21.368443  16.701664   9.614164   8.568227   7.7939844  6.4743977
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-0.07474402  1.8793612  -2.0537896   1.8073521   2.9197757   2.2181008
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [115.262054  68.29485   42.868797  36.175068  27.822865  25.04221
   0.         0.         0.       

In [4]:
file_path = './Sample/SPANet/pp6b_0b.h5'
utils.print_h5_info(file_path, 600)

./Sample/SPANet/pp6b_0b.h5
Dataset size: 1000000
CLASSIFICATIONS/EVENT/signal 0
INPUTS/Source/MASK [ True  True  True  True  True  True  True False False False False False
 False False False]
INPUTS/Source/btag [ True  True  True  True  True False  True False False False False False
 False False False]
INPUTS/Source/eta [ 2.0852013  -0.12259393 -0.97791094  1.5488727  -1.2496984   1.7120894
 -1.1295338   0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [21.311949  31.538649   9.084752   6.073683  11.769427   9.110075
  6.4713535  0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-0.6082822  -2.9872162   1.1448561   1.6370041  -2.5668838  -0.15825841
 -1.85583     0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [106.98244  103.65977   66.599106  60.311066  50.98297   37.67056
  26.527233   0.         0.         0.        

In [8]:
print_h5_sb_info('./Sample/SPANet/triHiggs_0b_train.h5')
print_h5_sb_info('./Sample/SPANet/triHiggs_0b_test.h5')

\item Total sample size: 1,800,000
\item Signal sample size: 900,000
\item Background sample size: 900,000
\item Total sample size: 200,000
\item Signal sample size: 100,000
\item Background sample size: 100,000


In [5]:
print_triHiggs_h5_info('./Sample/SPANet/triHiggs_0b_train.h5')
print_triHiggs_h5_info('./Sample/SPANet/triHiggs_0b_test.h5')

./Sample/SPANet/triHiggs_0b_train.h5
Dataset size: 1800000
Number of 0 Higgs events: 1041627
Number of 1 Higgs events: 318053
Number of 2 Higgs events: 277876
Number of 3 Higgs events: 162444
\item Total sample size: 1,800,000
\item 1h sample size: 318,053
\item 2h sample size: 277,876
\item 3h sample size: 162,444
./Sample/SPANet/triHiggs_0b_test.h5
Dataset size: 200000
Number of 0 Higgs events: 115771
Number of 1 Higgs events: 35372
Number of 2 Higgs events: 30853
Number of 3 Higgs events: 18004
\item Total sample size: 200,000
\item 1h sample size: 35,372
\item 2h sample size: 30,853
\item 3h sample size: 18,004


{'total': 200000, '0h': 115771, '1h': 35372, '2h': 30853, '3h': 18004}

In [8]:
print_triHiggs_h5_info('../SPANet2/data/triHiggs/gghhh_6b_PT40_train.h5')
print_triHiggs_h5_info('../SPANet2/data/triHiggs/gghhh_6b_PT40_test.h5')

../SPANet2/data/triHiggs/gghhh_6b_PT40_train.h5
Dataset size: 360000
Number of 0 Higgs events: 4607
Number of 1 Higgs events: 38595
Number of 2 Higgs events: 73036
Number of 3 Higgs events: 243762
\item Total sample size: 360,000
\item 1h sample size: 38,595
\item 2h sample size: 73,036
\item 3h sample size: 243,762
../SPANet2/data/triHiggs/gghhh_6b_PT40_test.h5
Dataset size: 40000
Number of 0 Higgs events: 483
Number of 1 Higgs events: 4360
Number of 2 Higgs events: 8070
Number of 3 Higgs events: 27087
\item Total sample size: 40,000
\item 1h sample size: 4,360
\item 2h sample size: 8,070
\item 3h sample size: 27,087


{'total': 40000, '0h': 483, '1h': 4360, '2h': 8070, '3h': 27087}

In [3]:
print_triHiggs_h5_info('../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5')
print_triHiggs_h5_info('../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5')

../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5
Dataset size: 900000
Number of 0 Higgs events: 54693
Number of 1 Higgs events: 246462
Number of 2 Higgs events: 318057
Number of 3 Higgs events: 280788
\item Total sample size: 900,000
\item 1h sample size: 246,462
\item 2h sample size: 318,057
\item 3h sample size: 280,788
../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5
Dataset size: 100000
Number of 0 Higgs events: 6208
Number of 1 Higgs events: 27243
Number of 2 Higgs events: 35050
Number of 3 Higgs events: 31499
\item Total sample size: 100,000
\item 1h sample size: 27,243
\item 2h sample size: 35,050
\item 3h sample size: 31,499


{'total': 100000, '0h': 6208, '1h': 27243, '2h': 35050, '3h': 31499}

In [3]:
print_triHiggs_h5_info('../SPANet2/data/triHiggs/triHiggs_4b_PT40_train.h5')
print_triHiggs_h5_info('../SPANet2/data/triHiggs/triHiggs_4b_PT40_test.h5')

../SPANet2/data/triHiggs/triHiggs_4b_PT40_train.h5
Dataset size: 900000
Number of 0 Higgs events: 477508
Number of 1 Higgs events: 123141
Number of 2 Higgs events: 159219
Number of 3 Higgs events: 140132
\item Total sample size: 900,000
\item 1h sample size: 123,141
\item 2h sample size: 159,219
\item 3h sample size: 140,132
../SPANet2/data/triHiggs/triHiggs_4b_PT40_test.h5
Dataset size: 100000
Number of 0 Higgs events: 53099
Number of 1 Higgs events: 13785
Number of 2 Higgs events: 17448
Number of 3 Higgs events: 15668
\item Total sample size: 100,000
\item 1h sample size: 13,785
\item 2h sample size: 17,448
\item 3h sample size: 15,668


{'total': 100000, '0h': 53099, '1h': 13785, '2h': 17448, '3h': 15668}

In [3]:
print_triHiggs_h5_info('Sample/SPANet/gghhh_4b_PT40_new.h5')

Sample/SPANet/gghhh_4b_PT40_new.h5
Dataset size: 32337
Number of 0 Higgs events: 2111
Number of 1 Higgs events: 8941
Number of 2 Higgs events: 11341
Number of 3 Higgs events: 9944
\item Total sample size: 32,337
\item 1h sample size: 8,941
\item 2h sample size: 11,341
\item 3h sample size: 9,944


{'total': 32337, '0h': 2111, '1h': 8941, '2h': 11341, '3h': 9944}

In [4]:
print_triHiggs_h5_info('Sample/SPANet/pphhh_sm_4b_PT40.h5')

Sample/SPANet/pphhh_sm_4b_PT40.h5
Dataset size: 48734
Number of 0 Higgs events: 1775
Number of 1 Higgs events: 10759
Number of 2 Higgs events: 18601
Number of 3 Higgs events: 17599
\item Total sample size: 48,734
\item 1h sample size: 10,759
\item 2h sample size: 18,601
\item 3h sample size: 17,599


{'total': 48734, '0h': 1775, '1h': 10759, '2h': 18601, '3h': 17599}

In [3]:
files = [f'Sample/SPANet/gghhh_0b_{i:02}.h5' for i in range(2, 9)] + [f'Sample/SPANet/gghhh_0b_{rnd}.h5' for rnd in [323, 423, 523, 614, 714]]

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/gghhh_4b.h5'

os.rename(merged_h5, new_file)

'Sample/SPANet/gghhh_0b_02.h5' and ('Sample/SPANet/gghhh_0b_03.h5', 'Sample/SPANet/gghhh_0b_04.h5', 'Sample/SPANet/gghhh_0b_05.h5', 'Sample/SPANet/gghhh_0b_06.h5', 'Sample/SPANet/gghhh_0b_07.h5', 'Sample/SPANet/gghhh_0b_08.h5', 'Sample/SPANet/gghhh_0b_323.h5', 'Sample/SPANet/gghhh_0b_423.h5', 'Sample/SPANet/gghhh_0b_523.h5', 'Sample/SPANet/gghhh_0b_614.h5', 'Sample/SPANet/gghhh_0b_714.h5') are same structure, can be merged.
Sample/SPANet/gghhh_0b_02_merged.h5 not exist. Copy Sample/SPANet/gghhh_0b_02.h5 to Sample/SPANet/gghhh_0b_02_merged.h5
Size of Sample/SPANet/gghhh_0b_02.h5: 304372
Size of Sample/SPANet/gghhh_0b_03.h5: 303994
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 608366
Size of Sample/SPANet/gghhh_0b_04.h5: 303915
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 912281
Size of Sample/SPANet/gghhh_0b_05.h5: 304049
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 1216330
Size of Sample/SPANet/gghhh_0b_06.h5: 304151
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 1520481
Size of Sample/SP

In [3]:
files = [f'Sample/SPANet/gghhh_6b_{i:02}.h5' for i in range(2, 9)] + [f'Sample/SPANet/gghhh_6b_{rnd}.h5' for rnd in [323, 423, 523, 614, 714]]

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/gghhh_6b.h5'

os.rename(merged_h5, new_file)

'Sample/SPANet/gghhh_6b_02.h5' and ('Sample/SPANet/gghhh_6b_03.h5', 'Sample/SPANet/gghhh_6b_04.h5', 'Sample/SPANet/gghhh_6b_05.h5', 'Sample/SPANet/gghhh_6b_06.h5', 'Sample/SPANet/gghhh_6b_07.h5', 'Sample/SPANet/gghhh_6b_08.h5', 'Sample/SPANet/gghhh_6b_323.h5', 'Sample/SPANet/gghhh_6b_423.h5', 'Sample/SPANet/gghhh_6b_523.h5', 'Sample/SPANet/gghhh_6b_614.h5', 'Sample/SPANet/gghhh_6b_714.h5') are same structure, can be merged.
Sample/SPANet/gghhh_6b_02_merged.h5 not exist. Copy Sample/SPANet/gghhh_6b_02.h5 to Sample/SPANet/gghhh_6b_02_merged.h5
Size of Sample/SPANet/gghhh_6b_02.h5: 40565
Size of Sample/SPANet/gghhh_6b_03.h5: 40866
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 81431
Size of Sample/SPANet/gghhh_6b_04.h5: 40383
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 121814
Size of Sample/SPANet/gghhh_6b_05.h5: 40879
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 162693
Size of Sample/SPANet/gghhh_6b_06.h5: 40538
Size of Sample/SPANet/gghhh_6b_02_merged.h5: 203231
Size of Sample/SPANet/ggh

# Make training and testing datasets

In [4]:
# Keep the 1,000,000-event part of the split as the new gghhh_0b.h5 and delete
# the remainder. The helper lives in utils_HDF5 (imported as `utils`).
file_path = 'Sample/SPANet/gghhh_0b.h5'
size = 1000000
utils.split_h5_size(file_path, size)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_0b.h5')
os.remove(split_file2)

Size of Sample/SPANet/gghhh_0b.h5: 1045417
Size of Sample/SPANet/gghhh_0b_split1.h5: 1000000
Size of Sample/SPANet/gghhh_0b_split2.h5: 45417


In [5]:
# Split the signal sample 90% / 10% into training and testing files.
# The helper lives in utils_HDF5 (imported as `utils`).
file_path = 'Sample/SPANet/gghhh_0b.h5'
r = 0.9
utils.split_h5_file(file_path, r)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_0b_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_0b_test.h5')

Size of Sample/SPANet/gghhh_0b.h5: 1000000
Size of Sample/SPANet/gghhh_0b_split1.h5: 900000
Size of Sample/SPANet/gghhh_0b_split2.h5: 100000


In [6]:
# Take a 1,000,000-event part of the background sample as pp6b_0b.h5 and delete
# the remainder. The helper lives in utils_HDF5 (imported as `utils`).
file_path = '/home/public/3h6b_samples/bg_1m.h5'
size = 1000000
utils.split_h5_size(file_path, size)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b.h5')
os.remove(split_file2)

Size of /home/public/3h6b_samples/bg_1m.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split1.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split2.h5: 0


In [7]:
# Split the background sample 90% / 10% into training and testing files.
# The helper lives in utils_HDF5 (imported as `utils`).
file_path = './Sample/SPANet/pp6b_0b.h5'
r = 0.9
utils.split_h5_file(file_path, r)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b_train.h5')
os.rename(split_file2, './Sample/SPANet/pp6b_0b_test.h5')


Size of ./Sample/SPANet/pp6b_0b.h5: 1000000
Size of ./Sample/SPANet/pp6b_0b_split1.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_split2.h5: 100000


In [8]:
# Merge signal and background into the combined triHiggs training and testing
# files. The merge helper lives in utils_HDF5 (imported as `utils`) and returns
# the path of the merged file, which is then renamed.
files = ['./Sample/SPANet/gghhh_0b_train.h5', 
         './Sample/SPANet/pp6b_0b_train.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_train.h5'

os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/gghhh_0b_test.h5', 
         './Sample/SPANet/pp6b_0b_test.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_test.h5'

os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_0b_train.h5' and ('./Sample/SPANet/pp6b_0b_train.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_train_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_train.h5 to ./Sample/SPANet/gghhh_0b_train_merged.h5
Size of ./Sample/SPANet/gghhh_0b_train.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_train.h5: 900000
Size of ./Sample/SPANet/gghhh_0b_train_merged.h5: 1800000
'./Sample/SPANet/gghhh_0b_test.h5' and ('./Sample/SPANet/pp6b_0b_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_test.h5 to ./Sample/SPANet/gghhh_0b_test_merged.h5
Size of ./Sample/SPANet/gghhh_0b_test.h5: 100000
Size of ./Sample/SPANet/pp6b_0b_test.h5: 100000
Size of ./Sample/SPANet/gghhh_0b_test_merged.h5: 200000


In [12]:
# Shuffle the merged files so signal and background events are interleaved.
# NOTE(review): assumes shuffle_h5 is provided by utils_HDF5 like the other
# HDF5 helpers used in this notebook -- confirm.
utils.shuffle_h5('Sample/SPANet/triHiggs_0b_train.h5')
utils.shuffle_h5('Sample/SPANet/triHiggs_0b_test.h5')

Dataset size: 1800000
Dataset size: 200000


In [20]:
# Inspect the merged files. print_h5_info lives in utils_HDF5 (imported as `utils`).
# NOTE(review): confirm that utils.print_h5_info accepts the `event` keyword.
utils.print_h5_info('Sample/SPANet/triHiggs_0b_train.h5', event=10)
utils.print_h5_info('Sample/SPANet/triHiggs_0b_test.h5')

Sample/SPANet/triHiggs_0b_train.h5
Dataset size: 1800000
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True False  True False False False False False False False False
 False False False]
INPUTS/Source/eta [-1.7856659 -1.171275  -1.3175582 -2.0036418 -0.5670005 -1.6876249
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/mass [21.988707 27.441921 16.656717 16.225178  8.112517  5.55458   0.
  0.        0.        0.        0.        0.        0.        0.
  0.      ]
INPUTS/Source/phi [ 2.7874901   0.42819452 -0.32609227 -1.9737867  -1.2101762  -2.9208257
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [100.418755  99.07493   57.353085  54.81301   35.30132   26.057392
   0.         0.         0.         0.         0.         0.
   0.    

In [3]:
file_path = './Sample/SPANet/pp6b_2.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/pp6b_6b_2.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/pp6b_2.h5
Dataset size: 2971310
Number of 0 Higgs events: 2971310
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 2,971,310
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Number of 6 b events: 84031
./Sample/SPANet/pp6b_6b_2.h5
Dataset size: 84031
Number of 0 Higgs events: 84031
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 84,031
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0


{'total': 84031, '0h': 84031, '1h': 0, '2h': 0, '3h': 0}

In [4]:
file_path = './Sample/SPANet/pp6b_6b_2.h5'
size = 40000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_6b_test.h5')
os.remove(split_file2)

Size of ./Sample/SPANet/pp6b_6b_2.h5: 84031
Size of ./Sample/SPANet/pp6b_6b_2_split1.h5: 40000
Size of ./Sample/SPANet/pp6b_6b_2_split2.h5: 44031


In [6]:
files = ['./Sample/SPANet/gghhh_6b_PT40_test.h5', 
         './Sample/SPANet/pp6b_6b_test.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = './Sample/SPANet/triHiggs_6b_test.h5'

os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_6b_PT40_test.h5' and ('./Sample/SPANet/pp6b_6b_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_6b_PT40_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_6b_PT40_test.h5 to ./Sample/SPANet/gghhh_6b_PT40_test_merged.h5
Size of ./Sample/SPANet/gghhh_6b_PT40_test.h5: 40000
Size of ./Sample/SPANet/pp6b_6b_test.h5: 40000
Size of ./Sample/SPANet/gghhh_6b_PT40_test_merged.h5: 80000


# Prepare 3h training dataset

In [5]:
# Keep only the events with three matched Higgs bosons.
file_path = './Sample/SPANet/gghhh_0b.h5'
print_triHiggs_h5_info(file_path)

# select_3h_event requires the output path explicitly; this is the file the
# following cells read.
output_file = './Sample/SPANet/gghhh_0b_3h.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_0b.h5
Dataset size: 6275278
Number of 0 Higgs events: 990165
Number of 1 Higgs events: 2214899
Number of 2 Higgs events: 1937205
Number of 3 Higgs events: 1133009
\item Total sample size: 6,275,278
\item 1h sample size: 2,214,899
\item 2h sample size: 1,937,205
\item 3h sample size: 1,133,009
Number of 3 Higgs events: 1133009
./Sample/SPANet/gghhh_0b_3h.h5
Dataset size: 1133009
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 1133009
\item Total sample size: 1,133,009
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 1,133,009


{'total': 1133009, '0h': 0, '1h': 0, '2h': 0, '3h': 1133009}

In [6]:
# Keep the 1,000,000-event part of the 3h sample under the original file name
# and delete the remainder. The helper lives in utils_HDF5 (imported as `utils`).
file_path = 'Sample/SPANet/gghhh_0b_3h.h5'
size = 1000000
utils.split_h5_size(file_path, size)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, file_path)
os.remove(split_file2)

Size of Sample/SPANet/gghhh_0b_3h.h5: 1133009
Size of Sample/SPANet/gghhh_0b_3h_split1.h5: 1000000
Size of Sample/SPANet/gghhh_0b_3h_split2.h5: 133009


In [7]:
# Split the 3h signal sample 90% / 10% into training and testing files.
# The helper lives in utils_HDF5 (imported as `utils`).
file_path = 'Sample/SPANet/gghhh_0b_3h.h5'
r = 0.9
utils.split_h5_file(file_path, r)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_0b_3h_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_0b_3h_test.h5')

Size of Sample/SPANet/gghhh_0b_3h.h5: 1000000
Size of Sample/SPANet/gghhh_0b_3h_split1.h5: 900000
Size of Sample/SPANet/gghhh_0b_3h_split2.h5: 100000


In [8]:
# Take a 1,000,000-event part of the background sample as pp6b_0b.h5 and delete
# the remainder. The helper lives in utils_HDF5 (imported as `utils`).
file_path = '/home/public/3h6b_samples/bg_1m.h5'
size = 1000000
utils.split_h5_size(file_path, size)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b.h5')
os.remove(split_file2)

Size of /home/public/3h6b_samples/bg_1m.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split1.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split2.h5: 0


In [9]:
# Split the background sample 90% / 10% into training and testing files.
# The helper lives in utils_HDF5 (imported as `utils`).
file_path = './Sample/SPANet/pp6b_0b.h5'
r = 0.9
utils.split_h5_file(file_path, r)

# The split writes <root>_split1<ext> and <root>_split2<ext> next to the input.
root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b_train.h5')
os.rename(split_file2, './Sample/SPANet/pp6b_0b_test.h5')


Size of ./Sample/SPANet/pp6b_0b.h5: 1000000
Size of ./Sample/SPANet/pp6b_0b_split1.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_split2.h5: 100000


In [10]:
# Merge the 3h signal and the background into the combined training and testing
# files. The merge helper lives in utils_HDF5 (imported as `utils`) and returns
# the path of the merged file, which is then renamed.
files = ['./Sample/SPANet/gghhh_0b_3h_train.h5', 
         './Sample/SPANet/pp6b_0b_train.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_3h_train.h5'

os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/gghhh_0b_3h_test.h5', 
         './Sample/SPANet/pp6b_0b_test.h5']

merged_h5 = utils.merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_3h_test.h5'

os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_0b_3h_train.h5' and ('./Sample/SPANet/pp6b_0b_train.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_3h_train_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_3h_train.h5 to ./Sample/SPANet/gghhh_0b_3h_train_merged.h5
Size of ./Sample/SPANet/gghhh_0b_3h_train.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_train.h5: 900000
Size of ./Sample/SPANet/gghhh_0b_3h_train_merged.h5: 1800000
'./Sample/SPANet/gghhh_0b_3h_test.h5' and ('./Sample/SPANet/pp6b_0b_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_3h_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_3h_test.h5 to ./Sample/SPANet/gghhh_0b_3h_test_merged.h5
Size of ./Sample/SPANet/gghhh_0b_3h_test.h5: 100000
Size of ./Sample/SPANet/pp6b_0b_test.h5: 100000
Size of ./Sample/SPANet/gghhh_0b_3h_test_merged.h5: 200000


In [11]:
# Shuffle the merged files so signal and background events are interleaved.
# NOTE(review): assumes shuffle_h5 is provided by utils_HDF5 like the other
# HDF5 helpers used in this notebook -- confirm.
utils.shuffle_h5('Sample/SPANet/triHiggs_0b_3h_train.h5')
utils.shuffle_h5('Sample/SPANet/triHiggs_0b_3h_test.h5')

Dataset size: 1800000
Dataset size: 200000


In [12]:
# Inspect the merged files. print_h5_info lives in utils_HDF5 (imported as `utils`).
# NOTE(review): confirm that utils.print_h5_info accepts the `event` keyword.
utils.print_h5_info('Sample/SPANet/triHiggs_0b_3h_train.h5', event=10)
utils.print_h5_info('Sample/SPANet/triHiggs_0b_3h_test.h5')

Sample/SPANet/triHiggs_0b_3h_train.h5
Dataset size: 1800000
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True  True False False False False False
 False False False]
INPUTS/Source/btag [False False False False False False  True False False False False False
 False False False]
INPUTS/Source/eta [-0.20713721  1.3114525  -0.56443137  0.05710578 -0.4570189   0.25234416
  0.90252006  0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [ 9.674998  12.387378   6.9249     8.384139   8.646945   3.5128188
  6.0978813  0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [-2.292494    2.751582   -0.76119405  1.7267805  -0.05098353 -1.8268727
  1.4313099   0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [60.725346 57.250782 54.543266 53.631428 52.888084 48.29721  29.313828
  0.        0.        0.     

In [4]:
file_path = './Sample/SPANet/pphhh_sm_4b_PT40.h5'
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/pphhh_sm_4b_PT40_3h.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/pphhh_sm_4b_PT40.h5
Dataset size: 48734
Number of 0 Higgs events: 1775
Number of 1 Higgs events: 10759
Number of 2 Higgs events: 18601
Number of 3 Higgs events: 17599
\item Total sample size: 48,734
\item 1h sample size: 10,759
\item 2h sample size: 18,601
\item 3h sample size: 17,599
Number of 3 Higgs events: 17599
./Sample/SPANet/pphhh_sm_4b_PT40_3h.h5
Dataset size: 17599
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 17599
\item Total sample size: 17,599
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 17,599


{'total': 17599, '0h': 0, '1h': 0, '2h': 0, '3h': 17599}

In [3]:
file_path = '../SPANet2/data/triHiggs/gghhh_6b_PT40_test.h5'
print_triHiggs_h5_info(file_path)

output_file = '../SPANet2/data/triHiggs/gghhh_6b_PT40_3h_test.h5'
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

../SPANet2/data/triHiggs/gghhh_6b_PT40_test.h5
Dataset size: 40000
Number of 0 Higgs events: 483
Number of 1 Higgs events: 4360
Number of 2 Higgs events: 8070
Number of 3 Higgs events: 27087
\item Total sample size: 40,000
\item 1h sample size: 4,360
\item 2h sample size: 8,070
\item 3h sample size: 27,087
Number of 3 Higgs events: 27087
../SPANet2/data/triHiggs/gghhh_6b_PT40_3h_test.h5
Dataset size: 27087
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 27087
\item Total sample size: 27,087
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 27,087


{'total': 27087, '0h': 0, '1h': 0, '2h': 0, '3h': 27087}

# Generate 6b events

In [5]:
# Keep the background events with at least six b-tagged jets.
file_path = './Sample/SPANet/pp6b_0b.h5'
print_triHiggs_h5_info(file_path)

# select_nb_event with nb=6 is the notebook's b-jet multiplicity selection;
# no select_6b_event function is defined.
output_file = './Sample/SPANet/pp6b_6b.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

./Sample/SPANet/pp6b_0b.h5
Dataset size: 1000000
Number of 0 Higgs events: 1000000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 1,000,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Number of 6 b events: 28755
./Sample/SPANet/pp6b_6b.h5
Dataset size: 28755
Number of 0 Higgs events: 28755
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 28,755
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0


{'total': 28755, '0h': 28755, '1h': 0, '2h': 0, '3h': 0}

In [5]:
files = ['./Sample/SPANet/pp6b_0b.h5', './Sample/SPANet/pp6b_0b_2.h5']
file_path = utils.merge_h5_file(*files)
print_triHiggs_h5_info(file_path)

output_file = './Sample/SPANet/pp6b_4b.h5'
nb_file = select_nb_event(file_path, output_file, nb=4)
print_triHiggs_h5_info(nb_file)

'./Sample/SPANet/pp6b_0b.h5' and ('./Sample/SPANet/pp6b_0b_2.h5',) are same structure, can be merged.
./Sample/SPANet/pp6b_0b_merged.h5 not exist. Copy ./Sample/SPANet/pp6b_0b.h5 to ./Sample/SPANet/pp6b_0b_merged.h5
Size of ./Sample/SPANet/pp6b_0b.h5: 1000000
Size of ./Sample/SPANet/pp6b_0b_2.h5: 808254
Size of ./Sample/SPANet/pp6b_0b_merged.h5: 1808254
./Sample/SPANet/pp6b_0b_merged.h5
Dataset size: 1808254
Number of 0 Higgs events: 1808254
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 1,808,254
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0
Number of 4 b events: 790372
./Sample/SPANet/pp6b_4b.h5
Dataset size: 790372
Number of 0 Higgs events: 790372
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 0
\item Total sample size: 790,372
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 0


{'total': 790372, '0h': 790372, '1h': 0, '2h': 0, '3h': 0}

In [5]:
# Keep the test events with at least six b-tagged jets.
file_path = '../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5'
print_triHiggs_h5_info(file_path)

# select_nb_event with nb=6 is the notebook's b-jet multiplicity selection;
# no select_6b_event function is defined.
output_file = '../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5
Dataset size: 200000
Number of 0 Higgs events: 100000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 100000
\item Total sample size: 200,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 100,000
Number of 6 b events: 11521
../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5
Dataset size: 11521
Number of 0 Higgs events: 2905
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 8616
\item Total sample size: 11,521
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 8,616


{'total': 11521, '0h': 2905, '1h': 0, '2h': 0, '3h': 8616}

In [None]:
# Keep the test events with at least six b-tagged jets.
file_path = '../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5'
print_triHiggs_h5_info(file_path)

# select_nb_event with nb=6 is the notebook's b-jet multiplicity selection;
# no select_6b_event function is defined.
output_file = '../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5'
six_b_file = select_nb_event(file_path, output_file, nb=6)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_0b_3h_test.h5
Dataset size: 200000
Number of 0 Higgs events: 100000
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 100000
\item Total sample size: 200,000
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 100,000
Number of 6 b events: 11521
../SPANet2/data/triHiggs/triHiggs_6b_3h_test.h5
Dataset size: 11521
Number of 0 Higgs events: 2905
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 8616
\item Total sample size: 11,521
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 8,616


{'total': 11521, '0h': 2905, '1h': 0, '2h': 0, '3h': 8616}

# Prepare training and testing datasets with at least 4 jets with $p_{\text{T}} > 40 \text{ GeV}$ and 4 $b$-tagged jets

In [4]:
# Carve two samples out of the full gghhh 4b file:
#   1. `size` events      -> gghhh_4b_PT40.h5
#   2. a further `size`   -> gghhh_4b_PT40_DNN.h5 (taken from the remainder of step 1)
# utils.split_h5_size writes '<root>_split1<ext>' (holding `size` events) and
# '<root>_split2<ext>' (the rest) next to its input, per the recorded output.
file_path = 'Sample/SPANet/gghhh_4b.h5'
size = 1000000
utils.split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40.h5')

# Keep a handle on the first-stage remainder: `split_file2` is re-bound below,
# and without this name the intermediate file could never be cleaned up.
remainder_file = split_file2

size = 500000
utils.split_h5_size(remainder_file, size)

root, ext = os.path.splitext(remainder_file)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_DNN.h5')
# Discard the events not used by either sample.
os.remove(split_file2)
# Also discard the first-stage remainder, which the original cell left on disk.
# Guarded in case the split helper already removed its input.
if os.path.exists(remainder_file):
    os.remove(remainder_file)

Size of Sample/SPANet/gghhh_4b.h5: 3653696
Size of Sample/SPANet/gghhh_4b_split1.h5: 1000000
Size of Sample/SPANet/gghhh_4b_split2.h5: 2653696
Size of Sample/SPANet/gghhh_4b_split2.h5: 2653696
Size of Sample/SPANet/gghhh_4b_split2_split1.h5: 500000
Size of Sample/SPANet/gghhh_4b_split2_split2.h5: 2153696


In [5]:
# Train/test split of the PT40 sample; with r = 0.9 the recorded output shows
# 90 % of the events in split1 and 10 % in split2.
file_path = './Sample/SPANet/gghhh_4b_PT40.h5'
r = 0.9

# Names of the two files the split helper produces next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_file(file_path, r)

# Give the two parts their final names.
os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_4b_PT40_test.h5')

Size of ./Sample/SPANet/gghhh_4b_PT40.h5: 1000000
Size of ./Sample/SPANet/gghhh_4b_PT40_split1.h5: 900000
Size of ./Sample/SPANet/gghhh_4b_PT40_split2.h5: 100000


In [4]:
# Training sample and destination of the 3-Higgs selection.
file_path, output_file = (
    '../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5',
    '../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_train.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Per the recorded output, the file returned by select_3h_event contains only
# events counted as "3 Higgs" by print_triHiggs_h5_info.
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

../SPANet2/data/triHiggs/gghhh_4b_PT40_train.h5
Dataset size: 900000
Number of 0 Higgs events: 54693
Number of 1 Higgs events: 246462
Number of 2 Higgs events: 318057
Number of 3 Higgs events: 280788
\item Total sample size: 900,000
\item 1h sample size: 246,462
\item 2h sample size: 318,057
\item 3h sample size: 280,788
Number of 3 Higgs events: 280788
../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_train.h5
Dataset size: 280788
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 280788
\item Total sample size: 280,788
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 280,788


{'total': 280788, '0h': 0, '1h': 0, '2h': 0, '3h': 280788}

In [5]:
# Testing sample and destination of the 3-Higgs selection.
file_path, output_file = (
    '../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5',
    '../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_test.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Composition of the file returned by the 3-Higgs selection.
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

../SPANet2/data/triHiggs/gghhh_4b_PT40_test.h5
Dataset size: 100000
Number of 0 Higgs events: 6208
Number of 1 Higgs events: 27243
Number of 2 Higgs events: 35050
Number of 3 Higgs events: 31499
\item Total sample size: 100,000
\item 1h sample size: 27,243
\item 2h sample size: 35,050
\item 3h sample size: 31,499
Number of 3 Higgs events: 31499
../SPANet2/data/triHiggs/gghhh_4b_PT40_3h_test.h5
Dataset size: 31499
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 31499
\item Total sample size: 31,499
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 31,499


{'total': 31499, '0h': 0, '1h': 0, '2h': 0, '3h': 31499}

In [7]:
# 3-Higgs training sample and destination of the 6b selection.
file_path, output_file = (
    '../SPANet2/data/triHiggs/triHiggs_4b_PT40_3h_train.h5',
    '../SPANet2/data/triHiggs/triHiggs_6b_PT40_3h_train.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Composition of the file returned by select_6b_event
# (presumably the 6-b-jet subset -- confirm against its definition).
six_b_file = select_6b_event(file_path, output_file)
print_triHiggs_h5_info(six_b_file)

../SPANet2/data/triHiggs/triHiggs_4b_PT40_3h_train.h5
Dataset size: 280788
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 280788
\item Total sample size: 280,788
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 280,788
Number of 6 b events: 76506
../SPANet2/data/triHiggs/triHiggs_6b_PT40_3h_train.h5
Dataset size: 76506
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 76506
\item Total sample size: 76,506
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 76,506


{'total': 76506, '0h': 0, '1h': 0, '2h': 0, '3h': 76506}

In [3]:
# "_new" sample and destination of the 3-Higgs selection.
file_path, output_file = (
    './Sample/SPANet/gghhh_4b_PT40_new.h5',
    './Sample/SPANet/gghhh_4b_PT40_3h_new.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Composition of the file returned by the 3-Higgs selection.
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_4b_PT40_new.h5
Dataset size: 32337
Number of 0 Higgs events: 2111
Number of 1 Higgs events: 8941
Number of 2 Higgs events: 11341
Number of 3 Higgs events: 9944
\item Total sample size: 32,337
\item 1h sample size: 8,941
\item 2h sample size: 11,341
\item 3h sample size: 9,944
Number of 3 Higgs events: 9944
./Sample/SPANet/gghhh_4b_PT40_3h_new.h5
Dataset size: 9944
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 9944
\item Total sample size: 9,944
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 9,944


{'total': 9944, '0h': 0, '1h': 0, '2h': 0, '3h': 9944}

## Classification

In [5]:
# Summarise the Higgs-matching composition of the full gghhh 4b sample
# (the following cell splits this same file).
file_path = 'Sample/SPANet/gghhh_4b.h5'
print_triHiggs_h5_info(file_path)

Sample/SPANet/gghhh_4b.h5
Dataset size: 3653696
Number of 0 Higgs events: 223197
Number of 1 Higgs events: 998906
Number of 2 Higgs events: 1290603
Number of 3 Higgs events: 1140990
\item Total sample size: 3,653,696
\item 1h sample size: 998,906
\item 2h sample size: 1,290,603
\item 3h sample size: 1,140,990


{'total': 3653696, '0h': 223197, '1h': 998906, '2h': 1290603, '3h': 1140990}

In [6]:
# Keep `size` signal events for the classification sample and drop the rest.
file_path = 'Sample/SPANet/gghhh_4b.h5'
size = 500000

# The split helper writes '<root>_split1<ext>' (holding `size` events per the
# recorded output) and '<root>_split2<ext>' (the remainder) next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_size(file_path, size)

os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40.h5')
# The remainder is not used for this sample.
os.remove(split_file2)

Size of Sample/SPANet/gghhh_4b.h5: 3653696
Size of Sample/SPANet/gghhh_4b_split1.h5: 500000
Size of Sample/SPANet/gghhh_4b_split2.h5: 3153696


In [7]:
# Train/test split of the signal sample; r = 0.9 puts 90 % of the events in
# split1 per the recorded output.
file_path = 'Sample/SPANet/gghhh_4b_PT40.h5'
r = 0.9

# Names of the two files the split helper produces next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_file(file_path, r)

# Give the two parts their final names.
os.rename(split_file1, './Sample/SPANet/gghhh_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_4b_PT40_test.h5')

Size of Sample/SPANet/gghhh_4b_PT40.h5: 500000
Size of Sample/SPANet/gghhh_4b_PT40_split1.h5: 450000
Size of Sample/SPANet/gghhh_4b_PT40_split2.h5: 50000


In [8]:
# Keep `size` pp6b events (the same count as the signal sample) and drop the rest.
file_path = './Sample/SPANet/pp6b_4b.h5'
size = 500000

# The split helper writes '<root>_split1<ext>' (holding `size` events per the
# recorded output) and '<root>_split2<ext>' (the remainder) next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_size(file_path, size)

os.rename(split_file1, './Sample/SPANet/pp6b_4b_PT40.h5')
# The remainder is not used for this sample.
os.remove(split_file2)

Size of ./Sample/SPANet/pp6b_4b.h5: 790372
Size of ./Sample/SPANet/pp6b_4b_split1.h5: 500000
Size of ./Sample/SPANet/pp6b_4b_split2.h5: 290372


In [9]:
# Train/test split of the pp6b sample; r = 0.9 puts 90 % of the events in
# split1 per the recorded output.
file_path = './Sample/SPANet/pp6b_4b_PT40.h5'
r = 0.9

# Names of the two files the split helper produces next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_file(file_path, r)

# Give the two parts their final names.
os.rename(split_file1, './Sample/SPANet/pp6b_4b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/pp6b_4b_PT40_test.h5')


Size of ./Sample/SPANet/pp6b_4b_PT40.h5: 500000
Size of ./Sample/SPANet/pp6b_4b_PT40_split1.h5: 450000
Size of ./Sample/SPANet/pp6b_4b_PT40_split2.h5: 50000


In [10]:
# Merge the gghhh and pp6b samples into one file per split.
# Each job is (input files, final name of the merged file).
merge_jobs = [
    (['./Sample/SPANet/gghhh_4b_PT40_train.h5',
      './Sample/SPANet/pp6b_4b_PT40_train.h5'],
     'Sample/SPANet/triHiggs_4b_PT40_train.h5'),
    (['./Sample/SPANet/gghhh_4b_PT40_test.h5',
      './Sample/SPANet/pp6b_4b_PT40_test.h5'],
     'Sample/SPANet/triHiggs_4b_PT40_test.h5'),
]

for files, new_file in merge_jobs:
    # merge_h5_file returns the path of the merged file it wrote (named
    # '<first input>_merged.h5' in the recorded output); move it into place.
    merged_h5 = utils.merge_h5_file(*files)
    os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_4b_PT40_train.h5' and ('./Sample/SPANet/pp6b_4b_PT40_train.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_4b_PT40_train_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_4b_PT40_train.h5 to ./Sample/SPANet/gghhh_4b_PT40_train_merged.h5
Size of ./Sample/SPANet/gghhh_4b_PT40_train.h5: 450000
Size of ./Sample/SPANet/pp6b_4b_PT40_train.h5: 450000
Size of ./Sample/SPANet/gghhh_4b_PT40_train_merged.h5: 900000
'./Sample/SPANet/gghhh_4b_PT40_test.h5' and ('./Sample/SPANet/pp6b_4b_PT40_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_4b_PT40_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_4b_PT40_test.h5 to ./Sample/SPANet/gghhh_4b_PT40_test_merged.h5
Size of ./Sample/SPANet/gghhh_4b_PT40_test.h5: 50000
Size of ./Sample/SPANet/pp6b_4b_PT40_test.h5: 50000
Size of ./Sample/SPANet/gghhh_4b_PT40_test_merged.h5: 100000


In [11]:
# Shuffle both merged files (train first, then test).
for merged_path in ('Sample/SPANet/triHiggs_4b_PT40_train.h5',
                    'Sample/SPANet/triHiggs_4b_PT40_test.h5'):
    utils.shuffle_h5(merged_path)

Dataset size: 900000
Dataset size: 100000


# Prepare training and testing datasets in the $6b$ region 

In [6]:
# Split the 6b signal file: `size` events for the PT40 sample, the remainder
# for the DNN sample.
file_path = 'Sample/SPANet/gghhh_6b.h5'
size = 400000

# The split helper writes '<root>_split1<ext>' (holding `size` events per the
# recorded output) and '<root>_split2<ext>' (the remainder) next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_size(file_path, size)

# Both parts are kept here, under their final names.
os.rename(split_file1, './Sample/SPANet/gghhh_6b_PT40.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_6b_PT40_DNN.h5')

Size of Sample/SPANet/gghhh_6b.h5: 486694
Size of Sample/SPANet/gghhh_6b_split1.h5: 400000
Size of Sample/SPANet/gghhh_6b_split2.h5: 86694


In [5]:
# Train/test split of the 6b sample; r = 0.9 puts 90 % of the events in
# split1 per the recorded output.
file_path = './Sample/SPANet/gghhh_6b_PT40.h5'
r = 0.9

# Names of the two files the split helper produces next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (f'{root}_split{idx}{ext}' for idx in (1, 2))

utils.split_h5_file(file_path, r)

# Give the two parts their final names.
os.rename(split_file1, './Sample/SPANet/gghhh_6b_PT40_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_6b_PT40_test.h5')

Size of ./Sample/SPANet/gghhh_6b_PT40.h5: 400000
Size of ./Sample/SPANet/gghhh_6b_PT40_split1.h5: 360000
Size of ./Sample/SPANet/gghhh_6b_PT40_split2.h5: 40000


In [8]:
# 6b training sample and destination of the 3-Higgs selection.
file_path, output_file = (
    './Sample/SPANet/gghhh_6b_PT40_train.h5',
    './Sample/SPANet/gghhh_6b_PT40_3h_train.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Composition of the file returned by the 3-Higgs selection.
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_6b_PT40_train.h5
Dataset size: 360000
Number of 0 Higgs events: 4607
Number of 1 Higgs events: 38595
Number of 2 Higgs events: 73036
Number of 3 Higgs events: 243762
\item Total sample size: 360,000
\item 1h sample size: 38,595
\item 2h sample size: 73,036
\item 3h sample size: 243,762
Number of 3 Higgs events: 243762
./Sample/SPANet/gghhh_6b_PT40_3h_train.h5
Dataset size: 243762
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 243762
\item Total sample size: 243,762
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 243,762


{'total': 243762, '0h': 0, '1h': 0, '2h': 0, '3h': 243762}

In [9]:
# 6b testing sample and destination of the 3-Higgs selection.
file_path, output_file = (
    './Sample/SPANet/gghhh_6b_PT40_test.h5',
    './Sample/SPANet/gghhh_6b_PT40_3h_test.h5',
)

# Composition before the selection.
print_triHiggs_h5_info(file_path)

# Composition of the file returned by the 3-Higgs selection.
triHiggs_file = select_3h_event(file_path, output_file)
print_triHiggs_h5_info(triHiggs_file)

./Sample/SPANet/gghhh_6b_PT40_test.h5
Dataset size: 40000
Number of 0 Higgs events: 483
Number of 1 Higgs events: 4360
Number of 2 Higgs events: 8070
Number of 3 Higgs events: 27087
\item Total sample size: 40,000
\item 1h sample size: 4,360
\item 2h sample size: 8,070
\item 3h sample size: 27,087
Number of 3 Higgs events: 27087
./Sample/SPANet/gghhh_6b_PT40_3h_test.h5
Dataset size: 27087
Number of 0 Higgs events: 0
Number of 1 Higgs events: 0
Number of 2 Higgs events: 0
Number of 3 Higgs events: 27087
\item Total sample size: 27,087
\item 1h sample size: 0
\item 2h sample size: 0
\item 3h sample size: 27,087


{'total': 27087, '0h': 0, '1h': 0, '2h': 0, '3h': 27087}