Convert a directory containing 1D HDF5 output from AMR-Wind to 3D `.npy` files. Terminology: `big_box` refers to the entire domain. `little_box` refers to the smaller chunks of data, each of which is presumably output by a single process in the AMR-Wind writer.

In [1]:
import h5py
import hdf5plugin
from matplotlib.colors import TwoSlopeNorm, BoundaryNorm, LinearSegmentedColormap
import numpy as np
import matplotlib.pyplot as plt
from netCDF4 import Dataset
import os
from pathlib import Path

In [2]:
### User inputs
# Names of the fields to extract; they are matched against the values of the
# `component_<n>` attributes stored in the HDF5 files.
desired_channels = [
    'velocityx',
    'velocityy',
    'velocityz',
]

# Directory that is searched for `*.h5` files
input_dir = Path(
    '/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train'
)

# Directory that receives the `.npy` files and the statistics files
output_dir = Path(
    '/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/npy'
)

In [3]:
### Identify HDF5 files
# Paths are ordered by plain path comparison (lexicographic), so e.g.
# `plt100080.h5` comes before `plt99480.h5`.
h5_files = sorted(Path(input_dir).glob('*.h5'))

In [4]:
# Report how many files were found, then show the first and last five paths
print(len(h5_files))
for preview in (h5_files[:5], h5_files[-5:]):
    print(preview)

5041
[PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt100080.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt100200.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt100320.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt100440.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt100560.h5')]
[PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt99480.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt99600.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt99720.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt99840.h5'), PosixPath('/scratch/orybchuk/wakedynamics/ldm-3d/simulations/train/plt99960.h5')]


In [5]:
### Extract some parameters based on the first data file
# Defines, for use by the later cells:
#   nvars_out, nvars_tot, component_nums_out, filetime (if a `time` attribute
#   exists), little_shape0, little_di/dj/dk, little_var_len, boxstride,
#   nlittle, big_nx/ny/nz
with h5py.File(h5_files[0], mode='r') as f_ex:
    ## ~~~~~~~~~~ File structure ~~~~~~~~~~
    ## Count total number of variables in HDF5 file & identify desired channel numbers
    nvars_out = len(desired_channels)
    nvars_tot = 0
    component_nums_out = {}
    for key, val in f_ex.attrs.items():
        print(key, ', ', val)

        if key.startswith('component_'):
            # Count the total number of variables in the HDF5 file
            nvars_tot += 1

            # Identify the component number of desired variables.
            # Only `component_<n>` keys are parsed, so other bytes-valued
            # attributes (e.g. `plotfile_type`) can never reach the int() parse.
            if isinstance(val, np.bytes_):
                keyval = val.decode('UTF-8')
                if keyval in desired_channels:
                    component_nums_out[keyval] = int(key.split("_")[1])

        # Identify time
        if key == 'time':
            filetime = val[0]
            assert (filetime).is_integer(), "Filetime is not an integer! Write additional pre-processing code."
            filetime = int(filetime)

    # Fail loudly if a requested channel is absent: otherwise the output arrays
    # would silently contain all-zero channels (nvars_out counts every request).
    missing_channels = [name for name in desired_channels
                        if name not in component_nums_out]
    if missing_channels:
        raise KeyError(f"Channels not found in {h5_files[0]}: {missing_channels}")

    # Attribute keys are not visited in component order (e.g. `component_10`
    # comes before `component_2`), so re-key the dict to follow the order the
    # user asked for. Downstream code indexes channels by position.
    component_nums_out = {name: component_nums_out[name] for name in desired_channels}

    print("\nnvars_tot:", nvars_tot)
    print("component_nums_out:", component_nums_out)

    ## ~~~~~~~~~~ Geometry ~~~~~~~~~~
    ## Characterize little boxes
    # Each box record is indexed as (lo_i, lo_j, lo_k, hi_i, hi_j, hi_k), inclusive bounds.
    # Read the table once rather than issuing one HDF5 read per box.
    boxes_ex = f_ex['level_0']['boxes'][:]
    little_shape0 = boxes_ex[0]
    # Convert to Python ints so later offset arithmetic cannot wrap around in a
    # fixed-width NumPy integer type on large domains.
    little_di = int(little_shape0[3] - little_shape0[0] + 1)
    little_dj = int(little_shape0[4] - little_shape0[1] + 1)
    little_dk = int(little_shape0[5] - little_shape0[2] + 1)
    little_var_len = little_di*little_dj*little_dk
    boxstride = little_var_len*nvars_tot  # Length of one box's data for all variables
    nlittle = len(boxes_ex)

    # Check that di/dj/dk are true for all shapes
    for box in boxes_ex:
        assert box[0] + little_di - 1 == box[3], box
        assert box[1] + little_dj - 1 == box[4], box
        assert box[2] + little_dk - 1 == box[5], box

    ## Characterize big box
    # prob_domain[3:6] are read as the highest (inclusive) cell indices
    prob_domain_ex = f_ex['level_0'].attrs['prob_domain']
    big_nx = int(prob_domain_ex[3] + 1)
    big_ny = int(prob_domain_ex[4] + 1)
    big_nz = int(prob_domain_ex[5] + 1)

    print("(little_di, little_dj, little_dk):", (little_di, little_dj, little_dk))
    print("(big_nx, big_ny, big_nz):", (big_nx, big_ny, big_nz))

component_0 ,  b'density'
component_1 ,  b'gpx'
component_10 ,  b'velocity_mueff'
component_2 ,  b'gpy'
component_3 ,  b'gpz'
component_4 ,  b'mu_turb'
component_5 ,  b'p'
component_6 ,  b'temperature'
component_7 ,  b'velocityx'
component_8 ,  b'velocityy'
component_9 ,  b'velocityz'
coordinate_system ,  [0]
dim ,  [3]
finest_level ,  [0]
num_components ,  [11]
num_levels ,  [1]
plotfile_type ,  b'VanillaHDF5'
time ,  [50040.]
version_name ,  b'HyperCLaw-V1.1'

nvars_tot: 11
component_nums_out: {'velocityx': 7, 'velocityy': 8, 'velocityz': 9}
(little_di, little_dj, little_dk): (16, 16, 16)
(big_nx, big_ny, big_nz): (128, 128, 64)


In [6]:
# ### Extract key data file parameters
# ## Characterize little boxes
# little_shape0 = f3d['level_0']['boxes'][0]
# little_di = little_shape0[3] - little_shape0[0] + 1
# little_dj = little_shape0[4] - little_shape0[1] + 1
# little_dk = little_shape0[5] - little_shape0[2] + 1
# little_var_len = little_di*little_dj*little_dk
# boxstride = little_var_len*nvars_tot
# nlittle = len(f3d['level_0']['boxes'])

# # Check that di/dj/dk are true for all shapes
# for box in f3d['level_0']['boxes']:
#     assert box[0] + little_di - 1 == box[3], box
#     assert box[1] + little_di - 1 == box[4], box
#     assert box[2] + little_di - 1 == box[5], box
    
# ## Characterize big box
# big_nx = f3d['level_0'].attrs['prob_domain'][3] + 1
# big_ny = f3d['level_0'].attrs['prob_domain'][4] + 1
# big_nz = f3d['level_0'].attrs['prob_domain'][5] + 1

# print("(little_di, little_dj, little_dk):", (little_di, little_dk, little_dk))
# print("(big_nx, big_ny, big_nz):", (big_nx, big_ny, big_nz))

In [7]:
### Prepare to convert the files
# Per-channel running extrema over the whole sample set (one entry per output
# channel); the conversion loop overwrites them with real values.
var_maxes, var_mins = (np.zeros(nvars_out) for _ in range(2))

In [8]:
### Operate on stats files before starting the main conversion
fsample_stats = "sample_stats.csv"  # One row of per-channel max/min per sample
fall_stats = "all_stats.txt"        # Max/min over the whole sample set

## Make sure the output directory exists, otherwise every write below
## (and the later np.save calls) would fail on a fresh run
Path(output_dir).mkdir(parents=True, exist_ok=True)

## Purge stats files if they exist
for stale_name in (fsample_stats, fall_stats):
    stale_path = Path(output_dir, stale_name)
    if stale_path.is_file():
        stale_path.unlink()
        print(f"Purged pre-existing {stale_name}!")

## Create a header
# NOTE: the column names are hard-coded for three channels (u, v, w)
with open(Path(output_dir, fsample_stats), 'w') as fstats:
    fstats.write("sample, umax, umin, vmax, vmin, wmax, wmin\n")

In [None]:
### Iterate over files and convert HDF5 to .npy
# For every input file:
#   1. verify its box layout and domain size against the reference values,
#   2. scatter the flat data of each desired variable into a
#      (nvars_out, big_nx, big_ny, big_nz) array,
#   3. save it as gt<number>.npy and append its per-channel max/min to the
#      sample stats file,
#   4. update the sample-set-wide max/min.
# Channel i of big_out is the i-th entry of component_nums_out.
component_ids = list(component_nums_out.values())

# Python ints keep the offset arithmetic below free of fixed-width overflow
var_len = int(little_var_len)
box_stride = int(boxstride)

for fnum, fname in enumerate(h5_files):
    if fnum % 100 == 0: print(fnum)

    ### ~~~~~ Read the .h5 file ~~~~~
    # The context manager closes the file even when a check below fails
    with h5py.File(fname, mode='r') as f:
        level0 = f['level_0']
        boxes = level0['boxes'][:]  # Read the box table once per file
        prob_domain = level0.attrs['prob_domain']
        flat_data = level0['data:datatype=0']

        ### ~~~~~ Check that nothing about the geometry changed w.r.t. the reference file ~~~~~
        # Check little boxes
        assert len(boxes) == nlittle, "Number of little boxes changed!"
        for box in boxes:
            assert box[0] + little_di - 1 == box[3], box
            assert box[1] + little_dj - 1 == box[4], box
            assert box[2] + little_dk - 1 == box[5], box

        # Check big box
        assert prob_domain[3] + 1 == big_nx, "x-dimension error!"
        assert prob_domain[4] + 1 == big_ny, "y-dimension error!"
        assert prob_domain[5] + 1 == big_nz, "z-dimension error!"

        ### ~~~~~ Reformat 1D into 3D variables ~~~~~
        ## Prepare the volume array
        big_out = np.zeros((nvars_out, big_nx, big_ny, big_nz))

        ## Reformat 1D into 3D data, iterating over variables
        for i, var in enumerate(component_ids):  # Iterate over variables
            # Within one box's chunk, variables are stored back to back
            var_offset = var*var_len

            for lb, box in enumerate(boxes):  # Iterate over little boxes
                # Get index of little box (hi_* converted to exclusive bounds)
                lo_i, lo_j, lo_k = box[0], box[1], box[2]
                hi_i, hi_j, hi_k = box[3]+1, box[4]+1, box[5]+1

                # Get 1D data
                start = var_offset + lb*box_stride
                data1d = flat_data[start:start+var_len]

                # Reshape into 3D (first index varies fastest) and place the
                # little 3D data into the big domain
                big_out[i,lo_i:hi_i,lo_j:hi_j,lo_k:hi_k] = np.reshape(
                    data1d,
                    (hi_i-lo_i,hi_j-lo_j,hi_k-lo_k),
                    order='F')

    ### ~~~~~ Save .npy file ~~~~~
    savenum = fname.stem[3:]  # Drop the first three characters of the file stem
    savename = 'gt' + savenum + '.npy'
    np.save(Path(output_dir, savename), big_out)

    ### ~~~~~ Update statistics ~~~~~
    chan_maxes = [big_out[j,:,:,:].max() for j in range(nvars_out)]
    chan_mins = [big_out[j,:,:,:].min() for j in range(nvars_out)]

    ## Write min/max of each field for each sample to a file
    # Fields are joined so that no trailing separator is written; a trailing
    # ", " would add an extra empty field to every row.
    row = [f"{savenum}"]  # Sample ID
    for j in range(nvars_out):
        row.append(f"{chan_maxes[j]}")
        row.append(f"{chan_mins[j]}")
    with open(Path(output_dir, fsample_stats), 'a') as fstats:
        fstats.write(", ".join(row) + "\n")

    ## Update samplewide min/max
    if fnum == 0:
        # The first file seeds the running extrema
        for j in range(nvars_out):
            var_maxes[j] = chan_maxes[j]
            var_mins[j] = chan_mins[j]
    else:
        for j in range(nvars_out):
            var_maxes[j] = max(var_maxes[j], chan_maxes[j])
            var_mins[j] = min(var_mins[j], chan_mins[j])

# Leave fnum equal to the number of processed files after the loop
fnum = len(h5_files)

## Save sample-set statistics
with open(Path(output_dir, fall_stats), 'w') as fstats:
    for i in range(nvars_out):
        fstats.write(f"Variable number {i} max: {var_maxes[i]}\n")
        fstats.write(f"Variable number {i} min: {var_mins[i]}\n")

print("Variable maximums:", var_maxes)
print("Variable minimums:", var_mins)

0
100


In [None]:
### Do a quick check for obvious outliers or reshaping artifacts
# Shows one slice of every channel of `big_out` (the array left over from the
# last converted file). Channel i of big_out corresponds to the i-th key of
# component_nums_out, so the titles are taken from that dict.
channel_names = list(component_nums_out)
nplots = len(channel_names)

# squeeze=False keeps `ax` an array even when only one channel is plotted
fig, ax = plt.subplots(1, nplots, figsize=(3*nplots,3), dpi=125, squeeze=False)
ax = ax.reshape(-1)

for axs in ax:
    axs.set_box_aspect(1)

myind = 1  # Index of the slice to display

for i, channel_name in enumerate(channel_names):  # Iterate over variables
#     # Top-down view
#     ax[i].imshow(big_out[i,:,:,myind].T)

    # Side view
    ax[i].imshow(big_out[i,:,myind,:].T,
                 origin='lower')

    ax[i].set_title(channel_name)


plt.show()