In [1]:
import os
import h5py
import shutil

import numpy as np

In [2]:
def get_dataset_keys(f):
    """Collect the names of all datasets reachable from ``f``.

    Parameters
    ----------
    f : h5py.File or h5py.Group
        Open HDF5 object providing ``visit`` and lookup by name.

    Returns
    -------
    list of str
        Every visited name whose object is an ``h5py.Dataset``, in the
        order ``f.visit`` reports them.
    """
    dataset_names = []

    def _collect(name):
        # Anything that is not a dataset is skipped. The callback always
        # returns None, which lets the traversal continue to the end.
        if isinstance(f[name], h5py.Dataset):
            dataset_names.append(name)

    f.visit(_collect)
    return dataset_names


def print_h5_info(file, event=0):
    """Print an overview of an HDF5 file.

    Prints the file path, the first-axis length of the first dataset found
    (labelled 'Dataset size:'), and then, for every dataset, its name
    followed by the entry at index ``event``.

    Parameters
    ----------
    file : str
        Path of the HDF5 file to open read-only.
    event : int, optional
        Index along the first axis of the entry to print (default 0).
    """
    print(file)
    with h5py.File(file, 'r') as h5:
        names = get_dataset_keys(h5)
        reference = h5[names[0]]
        print('Dataset size:', reference.shape[0])
        for name in names:
            # The name is printed first so it stays on the same line as the value.
            print(name, end=' ')
            print(h5[name][event])


def split_h5_size(main_file, size=1000):
    """Split an HDF5 file into two files at row index ``size``.

    Every dataset of ``main_file`` is sliced along its first axis:
    rows ``[:size]`` go to '<root>_split1<ext>' and rows ``[size:]`` go to
    '<root>_split2<ext>', both written next to the input file. The output
    datasets are created with an unlimited first axis.

    Parameters
    ----------
    main_file : str
        Path of the HDF5 file to split.
    size : int, optional
        Number of leading rows that go into the first output file.
        When it exceeds the input size a message is printed and the split
        is still performed.
    """
    stem, suffix = os.path.splitext(main_file)
    split_file1 = stem + '_split1' + suffix
    split_file2 = stem + '_split2' + suffix

    with h5py.File(main_file, 'r') as f_main:
        names = get_dataset_keys(f_main)
        # The first dataset is used as the reference for the row count.
        key0 = names[0]
        total_size = f_main[key0].shape[0]
        print(f'Size of {main_file}: {total_size}')

        if size > total_size:
            print(f'Split size {size} is greater than the input file size {total_size}.')

        with h5py.File(split_file1, 'w') as f_sp1, h5py.File(split_file2, 'w') as f_sp2:
            for name in names:
                source = f_main[name]
                # Same maximum shape as the source, except that the first
                # axis is made unlimited.
                unlimited = list(source.maxshape)
                unlimited[0] = None
                f_sp1.create_dataset(name, maxshape=unlimited, data=source[:size])
                f_sp2.create_dataset(name, maxshape=unlimited, data=source[size:])

            print(f'Size of {split_file1}: {f_sp1[key0].shape[0]}')
            print(f'Size of {split_file2}: {f_sp2[key0].shape[0]}')

            
def split_h5_file(main_file, r=0.9):
    """Split an HDF5 file into two files by a ratio.

    The first ``int(total_size * r)`` rows of every dataset are written to
    '<root>_split1<ext>' and the remaining rows to '<root>_split2<ext>',
    where ``total_size`` is the first-axis length of the first dataset in
    ``main_file``.

    The actual splitting is delegated to ``split_h5_size`` so that the two
    entry points share one implementation; the printed messages are the
    same as before.

    Parameters
    ----------
    main_file : str
        Path of the HDF5 file to split.
    r : float, optional
        Fraction of rows that goes into the first output file (default 0.9).
    """
    # Only the row count is needed here; close the file again before
    # split_h5_size reopens it.
    with h5py.File(main_file, 'r') as f_main:
        key0 = get_dataset_keys(f_main)[0]
        total_size = f_main[key0].shape[0]

    split_h5_size(main_file, size=int(total_size * r))
    

def merge_h5_file(main_file, *arg):
    """Merge HDF5 files that share the same dataset names into a new file.

    ``main_file`` is copied to '<root>_merged<ext>' and the rows of every
    dataset of each file in ``arg`` are appended, in the given order, along
    the first axis.

    Parameters
    ----------
    main_file : str
        Path of the file used as the base of the merge.
    *arg : str
        Paths of the files whose rows are appended to the copy.

    Returns
    -------
    str or None
        Path of the merged file, or None when the files do not have the
        same set of dataset names or the merged file already exists (a
        message is printed in both cases and nothing is written).
    """
    # Check that every file to append has the same dataset names as main_file.
    with h5py.File(main_file, 'r') as f_main:
        main_dataset_keys = get_dataset_keys(f_main)
        for append_file in arg:
            with h5py.File(append_file, 'r') as f_append:
                append_dataset_keys = get_dataset_keys(f_append)
                if set(main_dataset_keys) != set(append_dataset_keys):
                    print(f"'{main_file}' and '{append_file}' are not same structure, can not be merged.")
                    return None

    print(f"'{main_file}' and {arg} are same structure, can be merged.")

    root, ext = os.path.splitext(main_file)
    new_file = root + '_merged' + ext

    # Never overwrite an existing merged file.
    if os.path.isfile(new_file):
        print(f'{new_file} exist. Can not copy {main_file} to {new_file}')
        return None

    print(f'{new_file} not exist. Copy {main_file} to {new_file}')
    shutil.copyfile(main_file, new_file)

    with h5py.File(new_file, 'a') as f_main:
        # The first dataset is used as the reference for the row counts.
        key0 = main_dataset_keys[0]
        total_size = f_main[key0].shape[0]
        print(f'Size of {main_file}: {total_size}')

        for append_file in arg:
            with h5py.File(append_file, 'r') as f_append:
                size_of_append = f_append[key0].shape[0]
                print(f'Size of {append_file}: {size_of_append}')

                # An empty file contributes nothing. It is skipped explicitly
                # because a slice written as [-0:] would select the whole
                # dataset instead of an empty tail.
                if size_of_append > 0:
                    start = total_size
                    total_size += size_of_append
                    for key in main_dataset_keys:
                        f_main[key].resize(total_size, axis=0)
                        # Explicit bounds: write exactly the newly added rows.
                        f_main[key][start:total_size] = f_append[key]

                print(f'Size of {new_file}: {f_main[key0].shape[0]}')
    return new_file

In [3]:
def get_particle_mask(quark_jet, particle_quarks):
    """Flag the events in which a particle's quarks are all matched to distinct jets.

    Parameters
    ----------
    quark_jet : array_like, shape (n_event, n_quark)
        Jet index assigned to each quark in each event; -1 marks a quark
        without a jet.
    particle_quarks : sequence of int
        Column indices of ``quark_jet`` belonging to the particle.

    Returns
    -------
    numpy.ndarray of bool, shape (n_event,)
        True where every selected quark has a jet (value != -1) and the jet
        of each selected quark occurs exactly once in the whole event row,
        i.e. it is not shared with any other quark of the event.
    """
    quark_jet = np.asarray(quark_jet)
    quark_idx = np.asarray(particle_quarks, dtype=np.intp)

    # Jets of the selected quarks, shape (n_event, n_selected).
    selected = quark_jet[:, quark_idx]

    # Every selected quark must have a jet.
    matched = np.all(selected != -1, axis=1)

    # For each selected quark, count how often its jet index appears in the
    # full event row. Broadcasting (n_event, 1, n_quark) against
    # (n_event, n_selected, 1) replaces the per-event Python loop and also
    # keeps the result 2-D when there are no events at all.
    occurrences = (quark_jet[:, None, :] == selected[:, :, None]).sum(axis=2)
    unique = np.all(occurrences == 1, axis=1)

    return matched & unique


def print_triHiggs_h5_info(file_path):
    """Print and return how many events have 0, 1, 2 or 3 matched Higgs bosons.

    The six 'TARGETS/h*/b*' datasets are read from ``file_path`` and stacked
    into one (n_event, 6) array. A Higgs counts as matched in an event when
    ``get_particle_mask`` is True for its pair of columns.

    Parameters
    ----------
    file_path : str
        Path of the triHiggs HDF5 file.

    Returns
    -------
    dict
        Keys 'total', '0h', '1h', '2h' and '3h' holding the total number of
        events and the number of events with exactly that many matched Higgs.
    """
    # Column order of quark_jet: (h1b1, h1b2, h2b1, h2b2, h3b1, h3b2).
    target_paths = (
        'TARGETS/h1/b1',
        'TARGETS/h1/b2',
        'TARGETS/h2/b1',
        'TARGETS/h2/b2',
        'TARGETS/h3/b1',
        'TARGETS/h3/b2',
    )
    higgs_columns = ((0, 1), (2, 3), (4, 5))

    print(file_path)
    with h5py.File(file_path, 'r') as h5:
        quark_jet = np.array([h5[path][...] for path in target_paths]).T

    higgs_masks = [get_particle_mask(quark_jet, columns) for columns in higgs_columns]

    # Number of matched Higgs per event; "exactly k matched" is then a
    # simple comparison instead of enumerating the mask combinations.
    n_matched = np.sum(higgs_masks, axis=0)

    n_tot = higgs_masks[0].shape[0]
    n_0h = (n_matched == 0).sum()
    n_1h = (n_matched == 1).sum()
    n_2h = (n_matched == 2).sum()
    n_3h = (n_matched == 3).sum()

    print(f'Dataset size: {n_tot}')
    print(f'Number of 0 Higgs events: {n_0h}')
    print(f'Number of 1 Higgs events: {n_1h}')
    print(f'Number of 2 Higgs events: {n_2h}')
    print(f'Number of 3 Higgs events: {n_3h}')

    # The same numbers again, formatted as LaTeX \item lines.
    print(f'\\item Total sample size: {n_tot:,}')
    print(f'\\item 1h sample size: {n_1h:,}')
    print(f'\\item 2h sample size: {n_2h:,}')
    print(f'\\item 3h sample size: {n_3h:,}')

    return {
        'total': n_tot,
        '0h': n_0h,
        '1h': n_1h,
        '2h': n_2h,
        '3h': n_3h,
    }

# Example

In [None]:
def get_all_files(folder, ext=None):
    """Return the paths of all files below ``folder``, sorted.

    Parameters
    ----------
    folder : str
        Directory that is walked recursively.
    ext : str or tuple of str, optional
        When given (and non-empty), only files whose name ends with it are
        returned.

    Returns
    -------
    list of str
        File paths joined with the directory they were found in, in sorted
        order. ``os.walk`` reports entries in an arbitrary, filesystem
        dependent order; sorting makes the result (and anything that relies
        on e.g. ``files[0]``) reproducible.
    """
    files = [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(folder)
        for filename in filenames
        if not ext or filename.endswith(ext)
    ]
    files.sort()
    return files


# Merge every HDF5 file below 'Sample/h5' into a single background file.
new_file = 'Sample/h5/background.h5'

# Only '.h5' files are merged. The target itself is left out so that running
# this cell again does not merge a previous result into the new one, and the
# list is sorted so that the base file and the append order are reproducible.
files = sorted(
    path
    for path in get_all_files('Sample/h5', ext='.h5')
    if os.path.normpath(path) != os.path.normpath(new_file)
)

if not files:
    print('No HDF5 files found in Sample/h5, nothing to merge.')
else:
    merged_h5 = merge_h5_file(files[0], *files[1:])
    # merge_h5_file returns None (after printing the reason) when it did not
    # produce a merged file; there is nothing to rename in that case.
    if merged_h5 is not None:
        os.rename(merged_h5, new_file)

In [5]:
# Print the dataset size and the first entry (event=0) of every dataset in one sample file.
file_path = 'Sample/SPANet/gghhh_0b_01.h5'
print_h5_info(file_path)


Sample/SPANet/gghhh_0b_01.h5
Dataset size: 52061
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False  True  True  True False False False False False False False False
 False False False]
INPUTS/Source/eta [ 0.01805974 -0.9333278  -0.08137434  1.2160542  -1.7853693  -0.31661886
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [23.57334   18.395107   6.7061844 14.5643     6.147192   4.2319746
  0.         0.         0.         0.         0.         0.
  0.         0.         0.       ]
INPUTS/Source/phi [ 0.7539842   1.285823    2.8136048   0.17197756 -0.63893867 -1.8587049
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [92.083786 78.457825 51.525585 46.00039  33.061516 27.063663  0.
  0.        0.        0.        0.        0.  

In [6]:
# Count the events with 0, 1, 2 or 3 matched Higgs bosons. The call is the last
# expression of the cell, so the returned dict is displayed after the printed lines.
file_path = 'Sample/SPANet/gghhh_0b_02.h5'
print_triHiggs_h5_info(file_path)

Sample/SPANet/gghhh_0b_02.h5


Dataset size: 522899
Number of 0 Higgs events: 82190
Number of 1 Higgs events: 184769
Number of 2 Higgs events: 161476
Number of 3 Higgs events: 94464
\item Total sample size: 522,899
\item 1h sample size: 184,769
\item 2h sample size: 161,476
\item 3h sample size: 94,464


{'total': 522899, '0h': 82190, '1h': 184769, '2h': 161476, '3h': 94464}

# Make training and testing datasets

In [7]:
# Split the sample by the ratio r and move the two parts to the SPANet data folder.
file_path = 'Sample/SPANet/gghhh_0b_02.h5'
r = 0.9
split_h5_file(file_path, r)

# split_h5_file writes '<root>_split1<ext>' and '<root>_split2<ext>' next to the input.
root, ext = os.path.splitext(file_path)
split_file1, split_file2 = (root + tag + ext for tag in ('_split1', '_split2'))

# The first part becomes the training file, the second part the testing file.
for split_path, target_path in (
    (split_file1, '../SPANet2/data/triHiggs/gghhh_0b_train.h5'),
    (split_file2, '../SPANet2/data/triHiggs/gghhh_0b_test.h5'),
):
    os.rename(split_path, target_path)


Size of Sample/SPANet/gghhh_0b_02.h5: 522899
Size of Sample/SPANet/gghhh_0b_02_split1.h5: 470609
Size of Sample/SPANet/gghhh_0b_02_split2.h5: 52290


In [8]:
# Print the size and the first entry of every dataset in the new training and testing files.
for split_path in (
    '../SPANet2/data/triHiggs/gghhh_0b_train.h5',
    '../SPANet2/data/triHiggs/gghhh_0b_test.h5',
):
    print_h5_info(split_path)

../SPANet2/data/triHiggs/gghhh_0b_train.h5
Dataset size: 470609
CLASSIFICATIONS/EVENT/signal 1
INPUTS/Source/MASK [ True  True  True  True  True  True False False False False False False
 False False False]
INPUTS/Source/btag [False False  True False False False False False False False False False
 False False False]
INPUTS/Source/eta [-0.6438533   1.0463842   0.7722511   2.124152   -0.17052057  1.7725807
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/mass [16.58891  12.245288  7.338748 13.670185  8.0877    8.103769  0.
  0.        0.        0.        0.        0.        0.        0.
  0.      ]
INPUTS/Source/phi [ 2.3027658  -0.5379352   0.29748732  2.5127425  -2.0321293   1.7991484
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
INPUTS/Source/pt [118.89806  100.983536  79.25531   47.51311   37.596863  36.342655
   0.         0.         0.         0.         0. 