In [53]:
import sys
sys.path.insert(0, "../")
from src.data.data_functions.hdf5_utils import unpack_hdf5
import statsmodels.api as sm
import matplotlib.pyplot as plt
import h5py
from collections import defaultdict
import numpy as np

In [4]:
# Relative paths to the two HDF5 files inspected in this notebook.
raw = "../data/raw/AutoPi_CAN/platoon_CPH1_HH.hdf5"    # raw input file
processed = "../data/processed/w_kpis/segments.hdf5"   # processed segments file

In [143]:
import h5py
from collections import defaultdict

def nested_dict():
    """
    Build an empty, arbitrarily deep dictionary.

    Returns:
    defaultdict: A defaultdict whose default factory is this function itself, so
        looking up a missing key creates (and stores) another nested dictionary.
    """
    tree = defaultdict(nested_dict)
    return tree

def summarize_hdf5(file_path):
    """
    Print a summary of an HDF5 file and return its structure as a nested dictionary.

    Every group and dataset in the file is printed, indented by its depth. For each
    dataset the shape and the number of values are printed as well. After the walk,
    the number of visited objects per depth ("layer") and the total number of values
    over all datasets are printed.

    Parameters:
    file_path (str or path-like): Path to the HDF5 file.

    Returns:
    tuple: ``(group_dict, total_value_count)``.
        ``group_dict`` is a nested defaultdict in which each group name maps to a
        dictionary of its children; datasets map to their number of values (int).
        ``total_value_count`` is the sum of the number of values over all datasets.
    """
    group_counts = defaultdict(int)
    total_value_count = 0
    group_dict = nested_dict()

    def print_heading(title):
        # Underline with exactly the width of the visible title (leading blank
        # lines excluded) instead of a hand-counted constant.
        print(title)
        print("=" * len(title.lstrip("\n")))

    def visit_item(name, obj):
        # NOTE: this callback must keep returning None; h5py's visititems stops
        # the traversal as soon as the callback returns anything else.
        nonlocal total_value_count
        depth = name.count('/')
        indent_str = '    ' * depth
        parts = name.split('/')

        if isinstance(obj, h5py.Dataset):
            # NOTE(review): h5py appears to report size as None for empty
            # (null-dataspace) datasets and, in some versions, as a numpy integer;
            # normalise to a plain int so the running total and the stored counts
            # are always Python ints -- confirm against the installed h5py.
            n_values = 0 if obj.size is None else int(obj.size)
            print(f"{indent_str}Dataset: {name}")
            print(f"{indent_str}    Shape: {obj.shape}")
            print(f"{indent_str}    Number of values: {n_values}")
            total_value_count += n_values
            # Walk down to the parent group's dictionary and record the dataset there.
            node = group_dict
            for part in parts[:-1]:
                node = node[part]
            node[parts[-1]] = n_values
        elif isinstance(obj, h5py.Group):
            print(f"{indent_str}Group: {name}")
            # Looking up each path component creates the (possibly empty) group entry.
            node = group_dict
            for part in parts:
                node = node[part]

        # Every visited object counts as a child of its layer, whatever its type.
        group_counts[depth] += 1

    with h5py.File(file_path, 'r') as f:
        print_heading(f"Summary of HDF5 file: {file_path}")
        f.visititems(visit_item)

        print_heading("\nChildren count by layer:")
        for layer, count in sorted(group_counts.items()):
            print(f"Layer {layer}: {count} children(s)")

        print_heading("\nTotal count of values in datasets:")
        print(f"Total count of values: {total_value_count}")
    return group_dict, total_value_count

In [144]:
# Walk the raw file: prints every group/dataset with its shape and value count,
# and returns the nested structure plus the total number of dataset values.
group_dict, total_value_count = summarize_hdf5(raw)


Summary of HDF5 file: ../data/raw/AutoPi_CAN/platoon_CPH1_HH.hdf5
Group: GM
    Group: GM/16006
        Group: GM/16006/pass_1
            Dataset: GM/16006/pass_1/acc.xyz
                Shape: (11359, 4)
                Number of values: 45436
            Dataset: GM/16006/pass_1/acc_long
                Shape: (16011, 2)
                Number of values: 32022
            Dataset: GM/16006/pass_1/acc_trans
                Shape: (16011, 2)
                Number of values: 32022
            Dataset: GM/16006/pass_1/acc_yaw
                Shape: (16011, 2)
                Number of values: 32022
            Dataset: GM/16006/pass_1/alt
                Shape: (125, 2)
                Number of values: 250
            Dataset: GM/16006/pass_1/asr_trq_req_dyn
                Shape: (8005, 2)
                Number of values: 16010
            Dataset: GM/16006/pass_1/asr_trq_req_st
                Shape: (8005, 2)
                Number of values: 16010
            Dataset: GM/16006/pa

In [145]:
# Summarize the processed file, then derive segment/second/value statistics from
# the returned tree (top level: segments, second level: the seconds of a segment).
group_dict, total_value_count = summarize_hdf5(processed)
segments = list(group_dict.keys())
seconds_in_segments = [len(group_dict[seg]) for seg in segments]

# Sensor datasets counted per second.
SENSOR_KEYS = ['aran', 'gm', 'gopro', 'p79']
# Assumes every second holds 2 kpi datasets of 4 values each, as in the printed
# summary of this file (kpis/1 and kpis/2 with shape (4,)) -- TODO confirm.
KPI_DATASETS_PER_SECOND = 2
VALUES_PER_KPI_DATASET = 4

# Use .get() instead of indexing: group_dict is a recursive defaultdict, so
# group_dict[seg][sec][key] would insert an empty child for every missing sensor
# and silently mutate the tree. numpy integers are accepted as well because the
# stored counts may not be plain Python ints.
values_in_seconds = [
    count
    for seg in segments
    for second in group_dict[seg].values()
    for count in (second.get(key) for key in SENSOR_KEYS)
    if isinstance(count, (int, np.integer))
]
kpi_values = KPI_DATASETS_PER_SECOND * VALUES_PER_KPI_DATASET * np.sum(seconds_in_segments)

print(f"Number of segments:          {len(segments):,}")
print(f"Total number of seconds:     {np.sum(seconds_in_segments):,}")
print(f"Total number of values:      {np.sum(values_in_seconds) + kpi_values:,}")

Summary of HDF5 file: ../data/processed/w_kpis/segments.hdf5
Group: 0
    Group: 0/10
        Dataset: 0/10/aran
            Shape: (14, 79)
            Number of values: 1106
        Dataset: 0/10/gm
            Shape: (250, 42)
            Number of values: 10500
        Dataset: 0/10/gopro
            Shape: (250, 15)
            Number of values: 3750
        Group: 0/10/kpis
            Dataset: 0/10/kpis/1
                Shape: (4,)
                Number of values: 4
            Dataset: 0/10/kpis/2
                Shape: (4,)
                Number of values: 4
        Dataset: 0/10/p79
            Shape: (128, 36)
            Number of values: 4608
    Group: 0/11
        Dataset: 0/11/aran
            Shape: (13, 79)
            Number of values: 1027
        Dataset: 0/11/gm
            Shape: (250, 42)
            Number of values: 10500
        Dataset: 0/11/gopro
            Shape: (250, 15)
            Number of values: 3750
        Group: 0/11/kpis
            Dataset: