In [6]:
import h5py
import pandas as pd

# Step 1: Open the HDF5 file
h5_file_path = 'compressed_parkinson.h5'
h5file = h5py.File(h5_file_path, 'r')

# Step 2: Explore the structure of the file (optional but useful)
def print_hdf5_structure(item, path='/'):
    if isinstance(item, h5py.Dataset):
        print(path + item.name, item.shape, item.dtype)
    elif isinstance(item, h5py.Group):
        for key, val in item.items():
            print_hdf5_structure(val, path + key + '/')

print_hdf5_structure(h5file)


/measurements/gene_expression/celltype/average//measurements/gene_expression/celltype/average (1, 34088) float32
/measurements/gene_expression/celltype/cell_count//measurements/gene_expression/celltype/cell_count (1,) int64
/measurements/gene_expression/celltype/fraction//measurements/gene_expression/celltype/fraction (1, 34088) float32
/measurements/gene_expression/celltype/index//measurements/gene_expression/celltype/index (1,) |S16
/measurements/gene_expression/celltype/neighborhood/average//measurements/gene_expression/celltype/neighborhood/average (3, 34088) float32
/measurements/gene_expression/celltype/neighborhood/cell_count//measurements/gene_expression/celltype/neighborhood/cell_count (3, 1) int64
/measurements/gene_expression/celltype/neighborhood/convex_hull/0//measurements/gene_expression/celltype/neighborhood/convex_hull/0 (14, 2) float32
/measurements/gene_expression/celltype/neighborhood/convex_hull/1//measurements/gene_expression/celltype/neighborhood/convex_hull/1 (10

In [20]:
import h5py
import pandas as pd
import os

def h5_to_dataframe(h5_file_path):
    """
    Convert all datasets in an HDF5 file to a dictionary of Pandas DataFrames.
    
    Parameters:
    h5_file_path (str): Path to the HDF5 file.
    
    Returns:
    dict: A dictionary where keys are dataset paths and values are Pandas DataFrames.
    """
    dataframes = {}

    with h5py.File(h5_file_path, 'r') as h5file:
        def extract_dataset(name, obj):
            if isinstance(obj, h5py.Dataset):
                try:
                    data = obj[()]
                    if data.ndim == 1:
                        df = pd.DataFrame(data, columns=[name.split('/')[-1]])
                    elif data.ndim == 2:
                        df = pd.DataFrame(data)
                    else:
                        df = pd.DataFrame(data.reshape(-1, data.shape[-1]))
                    dataframes[name] = df
                except Exception as e:
                    print(f"Failed to read dataset {name}: {e}")

        h5file.visititems(extract_dataset)
    
    return dataframes

# Example usage:
h5_file_path = 'compressed_parkinson.h5'
dfs = h5_to_dataframe(h5_file_path)

# Save each DataFrame to a different sheet in the same Excel file
output_excel_path = 'output.xlsx'
count = 0

with pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for dataset_path, df in dfs.items():
        sheet_name = count
        df.to_excel(writer, sheet_name=str(sheet_name), index=False)
        count = count + 1
        print(f"Saved {dataset_path} to sheet {sheet_name}")

print(f"All datasets saved to {output_excel_path}")

Failed to read dataset measurements/gene_expression/celltype/average: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/fraction: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/average: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/0: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/1: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/2: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/coords_centroid: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/fraction: Can't read data (can't open directory)
Saved measurements/gen

In [21]:
def h5_to_dataframe(h5_file_path):
    """
    Convert all datasets in an HDF5 file to a dictionary of Pandas DataFrames.

    This function reads an HDF5 file and converts each dataset into a Pandas DataFrame.
    The resulting DataFrames are stored in a dictionary where the keys are the dataset paths
    and the values are the corresponding DataFrames.

    Args:
        h5_file_path (str): Path to the HDF5 file.

    Returns:
        dict: A dictionary where keys are dataset paths (str) and values are Pandas DataFrames.
        
    Raises:
        OSError: If the file cannot be opened.
        KeyError: If there is an issue accessing a dataset.
        Exception: For any other issues encountered during dataset reading.

    Example:
        >>> dfs = h5_to_dataframe('path_to_your_file.h5')
        >>> for path, df in dfs.items():
        >>>     print(f"Dataset {path} has shape {df.shape}")
    """
    dataframes = {}

    try:
        with h5py.File(h5_file_path, 'r') as h5file:
            def extract_dataset(name, obj):
                if isinstance(obj, h5py.Dataset):
                    try:
                        data = obj[()]
                        if data.ndim == 1:
                            df = pd.DataFrame(data, columns=[name.split('/')[-1]])
                        elif data.ndim == 2:
                            df = pd.DataFrame(data)
                        else:
                            df = pd.DataFrame(data.reshape(-1, data.shape[-1]))
                        dataframes[name] = df
                    except Exception as e:
                        print(f"Failed to read dataset {name}: {e}")

            h5file.visititems(extract_dataset)
    except OSError as e:
        print(f"Error opening file: {e}")
        raise
    except KeyError as e:
        print(f"Dataset path error: {e}")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise

    return dataframes

# Example usage:
h5_file_path = 'compressed_parkinson.h5'
dfs = h5_to_dataframe(h5_file_path)
dfs

Failed to read dataset measurements/gene_expression/celltype/average: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/fraction: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/average: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/0: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/1: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/convex_hull/2: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/coords_centroid: Can't read data (can't open directory)
Failed to read dataset measurements/gene_expression/celltype/neighborhood/fraction: Can't read data (can't open directory)


{'measurements/gene_expression/celltype/cell_count':    cell_count
 0       14903,
 'measurements/gene_expression/celltype/index':                  index
 0  b'endothelial cell',
 'measurements/gene_expression/celltype/neighborhood/cell_count':      0
 0   99
 1  145
 2   56,
 'measurements/gene_expression/celltype/neighborhood/index':   index
 0  b'0'
 1  b'1'
 2  b'2',
 'measurements/gene_expression/features':                  features
 0      b'ENSG00000237613'
 1      b'ENSG00000186092'
 2      b'ENSG00000238009'
 3      b'ENSG00000239945'
 4      b'ENSG00000239906'
 ...                   ...
 34083  b'ENSG00000202490'
 34084  b'ENSG00000212459'
 34085  b'ENSG00000202497'
 34086  b'ENSG00000263858'
 34087  b'ENSG00000264309'
 
 [34088 rows x 1 columns]}

In [23]:
??h5_to_dataframe