In [None]:
import h5py
import json
import numpy as np # Import numpy for array handling

def _json_default(value):
    """Fallback serializer handed to ``json.dump`` via its ``default`` hook.

    ``json`` only calls this for objects it cannot encode on its own, so
    ordinary dicts, lists, strings and numbers never reach it.

    Args:
        value: The object ``json`` could not serialize.

    Returns:
        A JSON-friendly equivalent: a list for NumPy arrays, a native Python
        scalar for NumPy scalars, and text for bytes (UTF-8 if decodable,
        otherwise a hex string).

    Raises:
        TypeError: If the value is of a type this hook does not know about.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    # Checked before np.generic because np.bytes_ subclasses both.
    if isinstance(value, bytes):
        try:
            return value.decode('utf-8')
        except UnicodeDecodeError:
            return value.hex()
    if isinstance(value, np.generic):
        native = value.item()
        # Guard against scalar types whose .item() hands back a NumPy scalar
        # again; returning that would make json call this hook forever.
        if not isinstance(native, np.generic):
            return native
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def hdf5_to_json(hdf5_file_path, json_file_path):
    """Converts an HDF5 file to JSON format.

    The HDF5 tree is turned into nested Python containers by ``h5_to_dict``
    and written to ``json_file_path`` with 4-space indentation. Errors are
    reported on stdout rather than raised.

    Args:
        hdf5_file_path: Path of the HDF5 file to read.
        json_file_path: Path of the JSON file to create or overwrite.

    Returns:
        None.
    """

    try:
        with h5py.File(hdf5_file_path, 'r') as hdf5_file:
            # Convert the HDF5 file content to a Python dictionary
            data = h5_to_dict(hdf5_file)

            # Write the dictionary to a JSON file
            with open(json_file_path, 'w') as json_file:
                # NumPy scalars/arrays and bytes are not JSON serializable;
                # _json_default converts them while keeping numbers numeric
                # (default=str would turn every such value into a string).
                json.dump(data, json_file, indent=4, default=_json_default)

        print(f"Successfully converted {hdf5_file_path} to {json_file_path}")

    except FileNotFoundError as e:
        # open() records the offending path on the exception, so a missing
        # output directory is not misreported as a missing HDF5 file.
        missing_path = e.filename or hdf5_file_path
        print(f"Error: File not found at {missing_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

def _to_jsonable(value):
    """Convert a value read from an HDF5 dataset into JSON-serializable types.

    Args:
        value: Whatever ``dataset[()]`` returned, or an element nested in it.

    Returns:
        Nested lists for arrays and tuples, native Python scalars for NumPy
        scalars, and text for bytes (UTF-8 when decodable, otherwise a hex
        string). Any other value is returned unchanged.
    """
    if isinstance(value, np.ndarray):
        as_list = value.tolist()
        # Only byte-string, object and compound (void) arrays can still hold
        # bytes or tuples after tolist(); skip the per-element walk otherwise.
        if value.dtype.kind in "SOV":
            return _to_jsonable(as_list)
        return as_list
    # Checked before np.generic because np.bytes_ subclasses both.
    if isinstance(value, bytes):
        try:
            return value.decode('utf-8')
        except UnicodeDecodeError:
            # If not UTF-8, represent as hex string
            return value.hex()
    if isinstance(value, np.generic):
        native = value.item()
        # Stop if .item() could not produce a native type, to avoid recursing
        # on the same NumPy scalar forever.
        if isinstance(native, np.generic):
            return native
        return _to_jsonable(native)
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    return value


def h5_to_dict(obj):
    """
    Recursively converts an HDF5 object (file, group, or dataset) to plain Python data.

    Groups become dictionaries keyed by member name. Datasets are read in
    full and converted so the result can be passed to ``json.dump``: arrays
    become nested lists, NumPy scalars become native Python scalars, and
    bytes (scalar or inside arrays) become strings.

    Args:
        obj: An ``h5py.Group`` (including an open ``h5py.File``) or an
            ``h5py.Dataset``.

    Returns:
        A dict for groups, the converted content for datasets, and an empty
        dict for any other object type.
    """
    if isinstance(obj, h5py.Group):
        # If it's a group, recursively convert each of its members
        return {key: h5_to_dict(child) for key, child in obj.items()}
    if isinstance(obj, h5py.Dataset):
        # obj[()] reads the whole dataset into memory
        return _to_jsonable(obj[()])
    return {}

In [None]:
# Convert the demo HDF5 file; the JSON output is written next to it.
filename = "data/demo_2021/pdb_fast_with_demo_hydro.hdf"
hdf5_to_json(
    hdf5_file_path=filename,
    json_file_path="data/demo_2021/pdb_fast_with_demo_hydro.json",
)

In [None]:
import h5py
import pandas as pd
import numpy as np

# Path of the sample HDF5 file; the same path is passed to explore_hdf5 below.
filename = "data/1bzc.hdf"

def explore_hdf5(filepath, max_depth=10, csv_path="ingenni_complexes.csv"):
    """Print a first-child walk through an HDF5 file and save its root keys.

    The root keys are printed and written to ``csv_path`` as a single
    ``complexes`` column. Starting at the first root key, the function then
    repeatedly descends into the first member of each group, printing up to
    five keys per level, until it reaches a dataset, an empty group, an
    unrecognised object, or ``max_depth`` levels.

    Args:
        filepath: Path of the HDF5 file to open read-only.
        max_depth: Maximum number of levels to visit below the root.
        csv_path: Where the root-key CSV is written.

    Returns:
        None. All findings are printed.
    """
    with h5py.File(filepath, "r") as f:
        # Print root keys
        root_keys = list(f.keys())
        print(f"Root keys ({len(root_keys)}): {root_keys}")

        # Save to CSV
        pd.Series(root_keys, name="complexes").to_csv(csv_path, index=False)

        # An empty file has no first key to descend into.
        if not root_keys:
            print("Empty file: nothing to explore.")
            return

        path = root_keys[0]  # start from first key, e.g., '1bzc'
        obj = f[path]

        for depth in range(max_depth):
            print(f"\n[Depth {depth}] Path: {path}")
            if isinstance(obj, h5py.Group):
                keys = list(obj.keys())
                print(f"Group keys ({len(keys)}): {keys[:5]}")
                if not keys:
                    print("Empty group.")
                    break
                # Always follow the first member of the group.
                path += f"/{keys[0]}"
                obj = obj[keys[0]]
            elif isinstance(obj, h5py.Dataset):
                print(f"Reached dataset: {path}")
                data = obj[()]  # reads the whole dataset into memory
                print(f"Data type: {type(data)}")
                try:
                    arr = np.array(data)
                    print(f"Array shape: {arr.shape}")
                except Exception as e:
                    print(f"Could not convert to NumPy array: {e}")
                break
            else:
                print(f"Unknown HDF5 object at {path}")
                break
        else:
            # Runs only when the loop used up max_depth without a break.
            print(f"\nReached max_depth={max_depth}; stopped at {path}")

# Use the path defined at the top of this cell instead of repeating the literal.
explore_hdf5(filename)


In [None]:
import numpy as np
import json
import h5py

# Read the array straight from the HDF5 file by walking the nested groups.
with h5py.File("data/1bzc.hdf", "r") as f:
    node = f
    for key in ("1bzc", "pybel", "processed", "pdbbind", "data"):
        node = node[key]
    hdf_data = node[()]
    print("HDF5 shape:", hdf_data.shape, "dtype:", hdf_data.dtype)

# Load the JSON export and rebuild the same array from its nested lists.
with open("data/1bzc.json") as jf:
    json_data = json.load(jf)
    nested = json_data
    for key in ("1bzc", "pybel", "processed", "pdbbind", "data"):
        nested = nested[key]
    json_np = np.array(nested)
    print("JSON shape:", json_np.shape, "dtype:", json_np.dtype)

# Element-wise comparison of the HDF5 array against its JSON round-trip.
arrays_match = np.array_equal(hdf_data, json_np)
print("Arrays equal?", arrays_match)
