In [5]:
import os
import h5py
import shutil
import random

import numpy as np

In [6]:
def get_dataset_keys(f):
    # 取得所有 Dataset 的名稱
    keys = []
    f.visit(lambda key: keys.append(key) if isinstance(f[key], h5py.Dataset) else None)
    return keys


def print_h5_info(file, event=0):
    # 印出所有 Dataset
    print(file)
    with h5py.File(file, 'r') as f:
        dataset_keys = get_dataset_keys(f)
        print('Dataset size:', f[dataset_keys[0]].shape[0])
        for key in dataset_keys:
            print(key, end=' ')
            print(f[key][event])


def split_h5_size(main_file, size=1000):
    # 將輸入的 HDF5 檔案以 size 分成兩個
    root, ext = os.path.splitext(main_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    with h5py.File(main_file, 'r') as f_main:
        dataset_keys = get_dataset_keys(f_main)
        key0 = dataset_keys[0]
        total_size = f_main[key0].shape[0]
        print(f'Size of {main_file}: {total_size}')
        
        if size > total_size:
            print(f'Split size {size} is greater than the input file size {total_size}.')
            
        with h5py.File(split_file1, 'w') as f_sp1, h5py.File(split_file2, 'w') as f_sp2:    
            sp_size = size
            for key in dataset_keys:
                maxShape = list(f_main[key].maxshape)
                maxShape[0] = None
                f_sp1.create_dataset(key, maxshape=maxShape, data=f_main[key][:sp_size])
                f_sp2.create_dataset(key, maxshape=maxShape, data=f_main[key][sp_size:])
            
            print(f'Size of {split_file1}: {f_sp1[key0].shape[0]}')
            print(f'Size of {split_file2}: {f_sp2[key0].shape[0]}')

            
def split_h5_file(main_file, r=0.9):
    # 將輸入的 HDF5 檔案以 r 的比例分成兩個
    root, ext = os.path.splitext(main_file)
    split_file1 = root + '_split1' + ext
    split_file2 = root + '_split2' + ext

    with h5py.File(main_file, 'r') as f_main:
        dataset_keys = get_dataset_keys(f_main)
        key0 = dataset_keys[0]
        total_size = f_main[key0].shape[0]
        print(f'Size of {main_file}: {total_size}')
        
        size = int(total_size * r)
        if size > total_size:
            print(f'Split size {size} is greater than the input file size {total_size}.')

        with h5py.File(split_file1, 'w') as f_sp1, h5py.File(split_file2, 'w') as f_sp2:    
            sp_size = size
            for key in dataset_keys:
                maxShape = list(f_main[key].maxshape)
                maxShape[0] = None
                f_sp1.create_dataset(key, maxshape=maxShape, data=f_main[key][:sp_size])
                f_sp2.create_dataset(key, maxshape=maxShape, data=f_main[key][sp_size:])
                    
            print(f'Size of {split_file1}: {f_sp1[key0].shape[0]}')
            print(f'Size of {split_file2}: {f_sp2[key0].shape[0]}')
    

def merge_h5_file(main_file, *arg):
    # 合併傳入的 HDF5 檔案    
    
    # 檢查傳入檔案結構是否都相同
    same_structure = True
    with h5py.File(main_file, 'r') as f_main:
        main_dataset_keys = get_dataset_keys(f_main)
        for append_file in arg:
            with h5py.File(append_file, 'r') as f_append:
                append_dataset_keys = get_dataset_keys(f_append)    
                if set(main_dataset_keys) != set(append_dataset_keys):
                    same_structure = False
                    print(f"'{main_file}' and '{append_file}' are not same structure, can not be merged.")
                    break

    # 檢查檔案結構是否都相同
    if not same_structure:
        return
    print(f"'{main_file}' and {arg} are same structure, can be merged.")

    root, ext = os.path.splitext(main_file)
    new_file = root + '_merged' + ext

    # 檢查合併檔案是否存在
    if os.path.isfile(new_file):
        print(f'{new_file} exist. Can not copy {main_file} to {new_file}')
        return

    print(f'{new_file} not exist. Copy {main_file} to {new_file}')
    shutil.copyfile(main_file, new_file)

    with h5py.File(new_file, 'a') as f_main:
        key0 = main_dataset_keys[0]
        total_size = f_main[key0].shape[0]
        print(f'Size of {main_file}: {total_size}')

        for append_file in arg:
            with h5py.File(append_file, 'r') as f_append:
                size_of_append = f_append[key0].shape[0]            
                print(f'Size of {append_file}: {size_of_append}')

                total_size += size_of_append   
                for key in main_dataset_keys:
                    f_main[key].resize(total_size, axis=0)
                    f_main[key][-size_of_append:] = f_append[key]
                
                print(f'Size of {new_file}: {f_main[key0].shape[0]}')
    return new_file

def shuffle_h5(file_path):

    with h5py.File(file_path,'a') as f:
        dataset_keys = get_dataset_keys(f)
        nevent = f[dataset_keys[0]].shape[0]
        print(f'Dataset size: {nevent}')
        
        ind_list = list(range(nevent))
        random.shuffle(ind_list)
        for key in dataset_keys:      
            f[key][...] = np.array(f[key])[ind_list]

In [7]:
def get_particle_mask(quark_jet, particle_quarks):
    # quark_jet: 每個夸克對應的 jet 編號，shape 為 (n_event, 6)
    # particle_quarks: 粒子對應的夸克編號，shape 為 (n_quarks,)

    # 檢查是否每個夸克都有對應的 jet
    mask1 = np.all(quark_jet[:, particle_quarks] != -1, axis=1)

    # 對每一個事件，檢查每個夸克對應的 jet 都不重複
    count = np.array([[np.sum(event == event[i]) for i in particle_quarks] for event in quark_jet])
    mask2 = np.all(count == 1, axis=1)

    return mask1 & mask2


def print_triHiggs_h5_info(file_path):
    # 印出 triHiggs HDF5 資料中，各 Higgs 數目的事件數
    print(file_path)
    with h5py.File(file_path, 'r') as f:

        h1b1 = f['TARGETS/h1/b1'][...]
        h1b2 = f['TARGETS/h1/b2'][...]
        h2b1 = f['TARGETS/h2/b1'][...]
        h2b2 = f['TARGETS/h2/b2'][...]
        h3b1 = f['TARGETS/h3/b1'][...]
        h3b2 = f['TARGETS/h3/b2'][...]

        quark_jet = np.array([h1b1, h1b2, h2b1, h2b2, h3b1, h3b2]).T

        h1_mask = get_particle_mask(quark_jet, (0, 1))
        h2_mask = get_particle_mask(quark_jet, (2, 3))
        h3_mask = get_particle_mask(quark_jet, (4, 5))
        
        n_tot = h1_mask.shape[0]
        n_0h = ((~h1_mask) & (~h2_mask) & (~h3_mask)).sum()
        # 任一個 Higgs 有對應的 jet
        n_1h = ((h1_mask & (~h2_mask) & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & (~h2_mask) & h3_mask)).sum()
        
        # 任兩個 Higgs 有對應的 jet
        n_2h = ((h1_mask & h2_mask & (~h3_mask)) | 
                ((~h1_mask) & h2_mask & h3_mask) | 
                (h1_mask & (~h2_mask) & h3_mask)).sum()
        n_3h = (h1_mask & h2_mask & h3_mask).sum()

    print(f'Dataset size: {n_tot}')
    print(f'Number of 0 Higgs events: {n_0h}')
    print(f'Number of 1 Higgs events: {n_1h}')
    print(f'Number of 2 Higgs events: {n_2h}')
    print(f'Number of 3 Higgs events: {n_3h}')
    
    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item 1h sample size: {n_1h:,}')
    print(f'\\item 2h sample size: {n_2h:,}')
    print(f'\\item 3h sample size: {n_3h:,}')
    
    result = {
        'total': n_tot,
        '0h': n_0h,
        '1h': n_1h,
        '2h': n_2h,
        '3h': n_3h
    }
    return result

def print_h5_sb_info(file):
    # 印出訊號與背景的事件數
    with h5py.File(file,'r') as f:
        n_tot = f['CLASSIFICATIONS/EVENT/signal'][...].shape[0]
        ns = (f['CLASSIFICATIONS/EVENT/signal'][...] == 1).sum()
        nb = (f['CLASSIFICATIONS/EVENT/signal'][...] == 0).sum()

    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item Signal sample size: {ns:,}')
    print(f'\\item Background sample size: {nb:,}')

# Example

In [12]:
# 取得資料夾內所有檔案的檔名，返回路徑
def get_all_files(folder, ext=None):
    files = []
    for root, dirs, filenames in os.walk(folder):
        for filename in filenames:
            if ext:
                if filename.endswith(ext):
                    files.append(os.path.join(root, filename))
            else:
                files.append(os.path.join(root, filename))
    return files


files = get_all_files('Sample/h5')
merged_h5 = merge_h5_file(files[0], *files[1:])
new_file = 'Sample/SPANet/pp6b_0b.h5'

os.rename(merged_h5, new_file)

'Sample/h5/out_977-00.h5' and ('Sample/h5/out_1173-00.h5', 'Sample/h5/out_441-00.h5', 'Sample/h5/out_1640-00.h5', 'Sample/h5/out_1300-00.h5', 'Sample/h5/out_1331-00.h5', 'Sample/h5/out_478-00.h5', 'Sample/h5/out_1177-00.h5', 'Sample/h5/out_856-00.h5', 'Sample/h5/out_1397-00.h5', 'Sample/h5/out_655-00.h5', 'Sample/h5/out_345-00.h5', 'Sample/h5/out_689-00.h5', 'Sample/h5/out_449-00.h5', 'Sample/h5/out_1903-00.h5', 'Sample/h5/out_1351-00.h5', 'Sample/h5/out_1942-00.h5', 'Sample/h5/out_129-00.h5', 'Sample/h5/out_1576-00.h5', 'Sample/h5/out_818-00.h5', 'Sample/h5/out_665-00.h5', 'Sample/h5/out_1854-00.h5', 'Sample/h5/out_680-00.h5', 'Sample/h5/out_1815-00.h5', 'Sample/h5/out_1695-00.h5', 'Sample/h5/out_1178-00.h5', 'Sample/h5/out_594-00.h5', 'Sample/h5/out_1617-00.h5', 'Sample/h5/out_1628-00.h5', 'Sample/h5/out_1115-00.h5', 'Sample/h5/out_1607-00.h5', 'Sample/h5/out_579-00.h5', 'Sample/h5/out_1225-00.h5', 'Sample/h5/out_630-00.h5', 'Sample/h5/out_251-00.h5', 'Sample/h5/out_1133-00.h5', 'Sam

In [5]:
file_path = 'Sample/SPANet/gghhh_0b_01.h5'
print_h5_info(file_path)

Sample/SPANet/gghhh_0b_01.h5
Dataset size: 52061
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True  True  True False False False False False False False False
 False False False]
INPUTS/Source/eta [ 0.01805974 -0.9333278  -0.08137434  1.2160542  -1.7853693  -0.31661886
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [23.57334   18.395107   6.7061844 14.5643     6.147192   4.2319746
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [ 0.7539842   1.285823    2.8136048   0.17197756 -0.63893867 -1.8587049
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [92.083786 78.457825 51.525585 46.00039  33.061516 27.063663  0.
  0.        0.        0.        0.        0.  

In [8]:
print_h5_sb_info('./Sample/SPANet/triHiggs_0b_train.h5')
print_h5_sb_info('./Sample/SPANet/triHiggs_0b_test.h5')

\item Total sample size: 1,800,000
\item Signal sample size: 900,000
\item Background sample size: 900,000
\item Total sample size: 200,000
\item Signal sample size: 100,000
\item Background sample size: 100,000


In [9]:
print_triHiggs_h5_info('./Sample/SPANet/triHiggs_0b_train.h5')
print_triHiggs_h5_info('./Sample/SPANet/triHiggs_0b_test.h5')

./Sample/SPANet/triHiggs_0b_train.h5
Dataset size: 1800000
Number of 0 Higgs events: 1041627
Number of 1 Higgs events: 318053
Number of 2 Higgs events: 277876
Number of 3 Higgs events: 162444
\item Total sample size: 1,800,000
\item 1h sample size: 318,053
\item 2h sample size: 277,876
\item 3h sample size: 162,444
./Sample/SPANet/triHiggs_0b_test.h5
Dataset size: 200000
Number of 0 Higgs events: 115771
Number of 1 Higgs events: 35372
Number of 2 Higgs events: 30853
Number of 3 Higgs events: 18004
\item Total sample size: 200,000
\item 1h sample size: 35,372
\item 2h sample size: 30,853
\item 3h sample size: 18,004


{'total': 200000, '0h': 115771, '1h': 35372, '2h': 30853, '3h': 18004}

# Make training and testing dataset

In [3]:
files = ['Sample/SPANet/gghhh_0b_02.h5', 
         'Sample/SPANet/gghhh_0b_03.h5']

merged_h5 = merge_h5_file(*files)

new_file = 'Sample/SPANet/gghhh_0b.h5'

os.rename(merged_h5, new_file)

'Sample/SPANet/gghhh_0b_02.h5' and ('Sample/SPANet/gghhh_0b_03.h5',) are same structure, can be merged.
Sample/SPANet/gghhh_0b_02_merged.h5 not exist. Copy Sample/SPANet/gghhh_0b_02.h5 to Sample/SPANet/gghhh_0b_02_merged.h5
Size of Sample/SPANet/gghhh_0b_02.h5: 522899
Size of Sample/SPANet/gghhh_0b_03.h5: 522518
Size of Sample/SPANet/gghhh_0b_02_merged.h5: 1045417


In [4]:
file_path = 'Sample/SPANet/gghhh_0b.h5'
size = 1000000
split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_0b.h5')
os.remove(split_file2)

Size of Sample/SPANet/gghhh_0b.h5: 1045417
Size of Sample/SPANet/gghhh_0b_split1.h5: 1000000
Size of Sample/SPANet/gghhh_0b_split2.h5: 45417


In [5]:
file_path = 'Sample/SPANet/gghhh_0b.h5'
r = 0.9
split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/gghhh_0b_train.h5')
os.rename(split_file2, './Sample/SPANet/gghhh_0b_test.h5')

Size of Sample/SPANet/gghhh_0b.h5: 1000000
Size of Sample/SPANet/gghhh_0b_split1.h5: 900000
Size of Sample/SPANet/gghhh_0b_split2.h5: 100000


In [6]:
file_path = '/home/public/3h6b_samples/bg_1m.h5'
size = 1000000
split_h5_size(file_path, size)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b.h5')
os.remove(split_file2)

Size of /home/public/3h6b_samples/bg_1m.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split1.h5: 1000000
Size of /home/public/3h6b_samples/bg_1m_split2.h5: 0


In [7]:
file_path = './Sample/SPANet/pp6b_0b.h5'
r = 0.9
split_h5_file(file_path, r)

root, ext = os.path.splitext(file_path)
split_file1 = root + '_split1' + ext
split_file2 = root + '_split2' + ext

os.rename(split_file1, './Sample/SPANet/pp6b_0b_train.h5')
os.rename(split_file2, './Sample/SPANet/pp6b_0b_test.h5')


Size of ./Sample/SPANet/pp6b_0b.h5: 1000000
Size of ./Sample/SPANet/pp6b_0b_split1.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_split2.h5: 100000


In [8]:
files = ['./Sample/SPANet/gghhh_0b_train.h5', 
         './Sample/SPANet/pp6b_0b_train.h5']

merged_h5 = merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_train.h5'

os.rename(merged_h5, new_file)

files = ['./Sample/SPANet/gghhh_0b_test.h5', 
         './Sample/SPANet/pp6b_0b_test.h5']

merged_h5 = merge_h5_file(*files)

new_file = 'Sample/SPANet/triHiggs_0b_test.h5'

os.rename(merged_h5, new_file)

'./Sample/SPANet/gghhh_0b_train.h5' and ('./Sample/SPANet/pp6b_0b_train.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_train_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_train.h5 to ./Sample/SPANet/gghhh_0b_train_merged.h5
Size of ./Sample/SPANet/gghhh_0b_train.h5: 900000
Size of ./Sample/SPANet/pp6b_0b_train.h5: 900000
Size of ./Sample/SPANet/gghhh_0b_train_merged.h5: 1800000
'./Sample/SPANet/gghhh_0b_test.h5' and ('./Sample/SPANet/pp6b_0b_test.h5',) are same structure, can be merged.
./Sample/SPANet/gghhh_0b_test_merged.h5 not exist. Copy ./Sample/SPANet/gghhh_0b_test.h5 to ./Sample/SPANet/gghhh_0b_test_merged.h5
Size of ./Sample/SPANet/gghhh_0b_test.h5: 100000
Size of ./Sample/SPANet/pp6b_0b_test.h5: 100000
Size of ./Sample/SPANet/gghhh_0b_test_merged.h5: 200000


In [12]:
shuffle_h5('Sample/SPANet/triHiggs_0b_train.h5')
shuffle_h5('Sample/SPANet/triHiggs_0b_test.h5')

Dataset size: 1800000
Dataset size: 200000


In [20]:
print_h5_info('Sample/SPANet/triHiggs_0b_train.h5', event=10)
print_h5_info('Sample/SPANet/triHiggs_0b_test.h5')

Sample/SPANet/triHiggs_0b_train.h5
Dataset size: 1800000
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True False  True False False False False False False False False
 False False False]
INPUTS/Source/eta [-1.7856659 -1.171275  -1.3175582 -2.0036418 -0.5670005 -1.6876249
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/mass [21.988707 27.441921 16.656717 16.225178  8.112517  5.55458   0.
  0.        0.        0.        0.        0.        0.        0.
  0.      ]
INPUTS/Source/phi [ 2.7874901   0.42819452 -0.32609227 -1.9737867  -1.2101762  -2.9208257
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [100.418755  99.07493   57.353085  54.81301   35.30132   26.057392
   0.         0.         0.         0.         0.         0.
   0.    