# Partition H5 File

After the raw files have been processed with "process_data.py", the combined dataset needs to be partitioned into training and test sets.

In [1]:
import h5py as h5
import numpy as np

from os.path import join

In [2]:
# Path of the combined HDF5 file that is split into train/test sets below.
fname = join(
    '/home/squirt/Documents/data/weather_data/',
    'all_data.h5',
)

In [3]:
def _group_sort_key(name:str):
    '''
    Build a sort key that orders group names by their trailing number.
    Inputs:
        - name (str): Name of a top-level group, e.g. 'entry_12'.
    Outputs:
        - key (tuple): (name without its trailing digits, trailing number as int,
          or -1 when the name does not end in digits).
    '''
    stem = name.rstrip('0123456789')
    digits = name[len(stem):]
    return (stem, int(digits) if digits else -1)


def load_hdf5(filename:str) -> list[dict]:
    '''
    Load data from an HDF5 file and return a list of dictionaries.

    Groups are visited in numeric order of their trailing index. Plain iteration
    over an HDF5 file goes by name, i.e. lexicographically ('entry_10' before
    'entry_2'), which would scramble the order of the returned list and hence
    any positional split performed on it. Names without a trailing number keep
    their lexicographic order.
    Inputs:
        - filename (str): Path to the HDF5 file.
    Outputs:
        - data (list): List of dictionaries with keys {day, region, time, landmass, x, y},
          one per top-level group in the file.
    '''
    data = []  # List to hold dictionaries
    with h5.File(filename, 'r') as f:
        # Each top-level group holds one entry; sort so that 'entry_2' precedes 'entry_10'
        for group_name in sorted(f, key=_group_sort_key):
            group = f[group_name]
            # Reconstruct dictionary from datasets and attributes
            entry = {
                # Attributes (metadata)
                'day': group.attrs['day'],
                'region': group.attrs['region'],
                'time': group.attrs['time'],

                # Datasets (numpy arrays); [...] reads the full dataset into memory
                'landmass': group['landmass'][...],
                'x': group['x'][...],
                'y': group['y'][...],
            }
            data.append(entry)
    return data

In [4]:
# Read every entry of the combined file into memory as a list of dictionaries.
all_data = load_hdf5(fname)

In [5]:
# Fraction of the entries assigned to the training set; the remainder becomes the test set.
split = 0.6

In [6]:
# Positional split without shuffling: the leading `split` fraction of the
# entries (in load order) is used for training, everything after it for testing.
split_idx = int(split * len(all_data))
train_data, test_data = all_data[:split_idx], all_data[split_idx:]

## Save H5

In [7]:
def save_h5(data:list[dict], filename:str):
    '''
    Write a list of dictionaries to an HDF5 file, one group per dictionary.

    Entry number i is stored in a group named 'entry_<i>'. Within an entry,
    numpy arrays become datasets and every other value becomes a group
    attribute holding its str() representation, so non-array values are
    read back as strings.
    Inputs:
        - data (list[dict]): list of dictionaries to save with keys {day, region, time, landmass, x, y}
        - filename (str): path to the file to save (opened in 'w' mode)
    Outputs:
        - None, data saved to disk
    '''
    with h5.File(filename, 'w') as out_file:
        for index, record in enumerate(data):
            grp = out_file.create_group(f'entry_{index}')
            for field, payload in record.items():
                if not isinstance(payload, np.ndarray):
                    # Non-array metadata is kept as a string-valued attribute
                    grp.attrs[field] = str(payload)
                    continue
                # Array data is written as a dataset of the same name
                grp.create_dataset(field, data=payload)

In [8]:
# Output locations for the two partitions.
training_file = join(
    '/home/squirt/Documents/data/weather_data/',
    'train_data.h5',
)
testing_file = join(
    '/home/squirt/Documents/data/weather_data/',
    'test_data.h5',
)

In [9]:
# Write each partition to its own file: training set first, then test set.
for subset, path in ((train_data, training_file), (test_data, testing_file)):
    save_h5(subset, path)