In [1]:
pip install dask[complete] h5py gcsfs

Note: you may need to restart the kernel to use updated packages.


In [3]:
import h5py
import dask.array as da
import gcsfs
from dask import delayed

# Path to the GCS directory containing HDF5 files
gcs_directory = 'gs://test-agb-bucket/GEDIL4A2023/'

# Create a GCSFileSystem instance
gcs = gcsfs.GCSFileSystem()

# List all HDF5 files in the GCS directory
hdf5_files = gcs.ls(gcs_directory)
hdf5_files = [f for f in hdf5_files if f.endswith('.h5')]

print(f"Found {len(hdf5_files)} HDF5 files.")

# Function to process and filter datasets within a single HDF5 file
@delayed
def process_hdf5_file(file_path):
    with gcs.open(file_path, 'rb') as f:
        print("Checkpoint1")
        with h5py.File(f, 'r') as hdf:
            print("Checkpoint2")
            filtered_results = {}
            print(hdf.keys())
            # Iterate over all datasets dynamically
            for dataset_name in hdf.keys():
                if dataset_name in ['l4_quality_flag', 'agbd']:  # Process only relevant datasets
                    data = hdf[dataset_name][:]
                    
                    # Example filtering logic for L4_flag and AGBD
                    if dataset_name == 'l4_quality_flag':
                        filtered_results['l4_quality_flag'] = data
                    elif dataset_name == 'agbd':
                        mask = ~data.isnan()  # Filter non-null AGBD values
                        filtered_results['agbd'] = data[mask]

            return filtered_results

# Process all files in parallel
processed_files = [process_hdf5_file(f) for f in hdf5_files]

# Collect results
results = delayed(lambda x: x)(processed_files)

# Compute results
final_results = results.compute()
print(final_results)


Found 58 HDF5 files.
[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
