# Convert HDF5 file from (H,W,N,C) into (N,C,H,W)

In [1]:
import h5py
import numpy as np

data_dir = './'

input_file_path = data_dir + 'train_200000.hdf5'
output_file_path = data_dir + 'train_200000_reshaped.hdf5'

# Number of samples copied per loop iteration. This bounds peak memory (one
# batch of 'k' plus one batch of 'S' is held in RAM at a time). It is
# independent of the on-disk HDF5 chunk layout chosen in create_dataset below.
chunk_size = 2000

# Open the existing HDF5 file for reading
with h5py.File(input_file_path, 'r') as hf_r:
    # Dataset handles; the data itself is only read when they are sliced below.
    train_x = hf_r['k']
    train_y = hf_r['S']

    # Source layout is (H, W, N, C). Read the channel counts from the data
    # rather than hard-coding them, so the output shapes always match the input.
    H, W, N, C_x = train_x.shape
    C_y = train_y.shape[-1]
    if train_y.shape[:3] != (H, W, N):
        raise ValueError(
            f"'S' has shape {train_y.shape}, which does not match the "
            f"(H, W, N) = {(H, W, N)} layout of 'k'"
        )

    total = 0

    # Create the output file. Mode 'w' truncates any existing file at this path.
    with h5py.File(output_file_path, 'w', libver='latest') as hf_w:
        # Target layout is (N, C, H, W). One HDF5 storage chunk per sample, so a
        # single sample can later be read without touching its neighbours.
        out_x = hf_w.create_dataset('k', shape=(N, C_x, H, W), dtype=train_x.dtype, chunks=(1, C_x, H, W))
        out_y = hf_w.create_dataset('S', shape=(N, C_y, H, W), dtype=train_y.dtype, chunks=(1, C_y, H, W))

        # Copy batch by batch: slice along the sample axis, then reorder the
        # axes (H, W, n, C) -> (n, C, H, W) in memory before writing.
        for start in range(0, N, chunk_size):
            end = min(start + chunk_size, N)
            chunk_x = train_x[:, :, start:end, :].transpose((2, 3, 0, 1))
            chunk_y = train_y[:, :, start:end, :].transpose((2, 3, 0, 1))

            # Write the transposed batch to the new datasets
            out_x[start:end, :, :, :] = chunk_x
            out_y[start:end, :, :, :] = chunk_y

            # Report samples actually written; using `end` keeps the counter
            # from overshooting N when the last batch is shorter than chunk_size.
            total = end
            print(f"{total}/{N}")


2000/200000
4000/200000
6000/200000
8000/200000
10000/200000
12000/200000
14000/200000
16000/200000
18000/200000
20000/200000
22000/200000
24000/200000
26000/200000
28000/200000
30000/200000
32000/200000
34000/200000
36000/200000
38000/200000
40000/200000
42000/200000
44000/200000
46000/200000
48000/200000
50000/200000
52000/200000
54000/200000
56000/200000
58000/200000
60000/200000
62000/200000
64000/200000
66000/200000
68000/200000
70000/200000
72000/200000
74000/200000
76000/200000
78000/200000
80000/200000
82000/200000
84000/200000
86000/200000
88000/200000
90000/200000
92000/200000
94000/200000
96000/200000
98000/200000
100000/200000
102000/200000
104000/200000
106000/200000
108000/200000
110000/200000
112000/200000
114000/200000
116000/200000
118000/200000
120000/200000
122000/200000
124000/200000
126000/200000
128000/200000
130000/200000
132000/200000
134000/200000
136000/200000
138000/200000
140000/200000
142000/200000
144000/200000
146000/200000
148000/200000
150000/200000
152

# Summary of original and transposed data

In [2]:
import h5py
# Directory containing the HDF5 files summarized below; set again in this cell
# so the file paths built from it do not depend on the conversion cell having run.
data_dir = './'

def print_hdf5_summary(file_path):
    """Print a short report of every dataset stored in one HDF5 file.

    The file is opened read-only. A header naming the file is printed, then
    one line per dataset (produced by ``print_dataset_info`` as the file's
    items are visited), then a dashed separator line.

    Args:
        file_path: Path of the HDF5 file to inspect.
    """
    with h5py.File(file_path, 'r') as h5_file:
        # Header followed by one blank line.
        print("Summary of HDF5 file:", file_path, end="\n\n")

        # visititems calls the callback for each item in the file; the callback
        # itself decides which items are datasets worth printing.
        print("Datasets:")
        h5_file.visititems(print_dataset_info)
        print("---------")

def print_dataset_info(name, obj):
    """Callback for ``visititems``: print one summary line for a dataset.

    Objects that are not ``h5py.Dataset`` instances are skipped silently.
    The function always returns None.

    Args:
        name: Name of the visited object, as supplied by the caller.
        obj: The visited object.
    """
    if not isinstance(obj, h5py.Dataset):
        return
    print(f"{name} - Shape: {obj.shape}, Dtype: {obj.dtype}, Compression: {obj.compression}, Compression Options: {obj.compression_opts}")

# Summarize the reshaped file first and the original file second, so the two
# layouts can be compared side by side in the cell output.
for file_path in (
    data_dir + 'train_200000_reshaped.hdf5',
    data_dir + 'train_200000.hdf5',
):
    # Print the summary of the HDF5 file
    print_hdf5_summary(file_path)

Summary of HDF5 file: ./train_200000_reshaped.hdf5

Datasets:
S - Shape: (200000, 1, 128, 128), Dtype: float64, Compression: None, Compression Options: None
k - Shape: (200000, 3, 128, 128), Dtype: float64, Compression: None, Compression Options: None
---------
Summary of HDF5 file: ./train_200000.hdf5

Datasets:
S - Shape: (128, 128, 200000, 1), Dtype: float64, Compression: None, Compression Options: None
k - Shape: (128, 128, 200000, 3), Dtype: float64, Compression: None, Compression Options: None
---------


# Read a single sample from both files and verify that they match

In [3]:
import h5py

def extract_data_from_two_datasets(file1, file2, target_index):
    """Verify that one sample of dataset 'k' is identical in both files.

    ``file1`` is indexed as ``[:, :, target_index, :]`` (sample on the third
    axis) and ``file2`` as ``[target_index, :, :, :]`` (sample on the first
    axis). The sample from ``file1`` is transposed with axes ``(2, 0, 1)``
    and compared element-wise with the sample from ``file2``. Shapes, dtypes
    and the outcome are printed.

    Note: uses ``np`` (numpy), which this notebook imports in its first cell.

    Args:
        file1: Path of the HDF5 file in the original layout.
        file2: Path of the HDF5 file in the reshaped layout.
        target_index: Integer index of the sample to compare.

    Returns:
        True if the two samples are element-wise equal, False otherwise
        (including when their shapes do not match after the transpose).
    """
    with h5py.File(file1, 'r') as infile1, h5py.File(file2, 'r') as infile2:
        # Extract the sample from the first file
        data1 = infile1['k'][:, :, target_index, :]

        # Extract the same sample from the second file
        data2 = infile2['k'][target_index, :, :, :]

    # Both samples are in-memory arrays now, so the files can be closed.
    print(data1.shape, data1.dtype)
    print(data2.shape, data2.dtype)

    transpose = np.transpose(data1, (2, 0, 1))  # (H, W, C) -> (C, H, W)

    # An element-wise `!=` is only meaningful for equal shapes; report a shape
    # mismatch explicitly instead of letting the comparison below misbehave.
    if transpose.shape != data2.shape:
        print(f"Shape mismatch: {transpose.shape} vs {data2.shape}")
        return False

    comparison_result = bool(np.array_equal(data2, transpose))
    if not comparison_result:
        diff_locations = np.where(transpose != data2)
        print(f"Differences found at positions: {diff_locations}")
    else:
        print("Same!")
    return comparison_result


# Spot-check sample index 1: the original file is passed first, the reshaped copy second.
extract_data_from_two_datasets('train_200000.hdf5', 'train_200000_reshaped.hdf5', target_index=1)


(128, 128, 3) float64
(3, 128, 128) float64
Same!
