In [None]:
# !pip install h5py s3fs xarray zstandard git+https://github.com/fsspec/kerchunk

In [None]:
from kerchunk.hdf import SingleHdf5ToZarr
import fsspec

In [None]:
# Anonymous S3 filesystem used to list and read the public ERA5 files.
fs = fsspec.filesystem('s3', anon=True)

# Keep the first two matches for each of the two variables.
pressure_files = fs.glob('s3://era5-pds/2020/*/data/air_pressure_at_mean_sea_level.nc')[:2]
sst_files = fs.glob('s3://era5-pds/2020/*/data/*sea_surface_temperature.nc')[:2]
flist = pressure_files + sst_files

# Local filesystem that the generated JSON files are written to.
fs2 = fsspec.filesystem('')

print(flist)

In [None]:
from pathlib import Path
import os
import ujson

# Keyword arguments forwarded to fs.open().
# default_fill_cache=False avoids caching data in between file chunks to lower memory usage.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
def gen_json(file_url):
    """Generate a kerchunk JSON reference file for one ERA5 NetCDF file.

    Parameters
    ----------
    file_url : str
        Location of the NetCDF file, laid out as
        ``<bucket>/<year>/<month>/data/<variable>.nc`` with or without a
        leading ``protocol://``. The file is opened at this location through
        the module-level filesystem ``fs`` (with the ``so`` open options), and
        the same string is passed to ``SingleHdf5ToZarr`` as the URL of the
        file being described.

    The references are written to ``<month>_<variable>.json`` in the current
    directory through the module-level local filesystem ``fs2``.
    """
    # Ignore an optional "protocol://" prefix so the month is found at the
    # same position whether or not the caller includes it.
    path_parts = file_url.split('://', 1)[-1].split('/')
    variable = path_parts[-1].split('.')[0]
    month = path_parts[2]
    outf = f'{month}_{variable}.json'  # file name to save json to

    with fs.open(file_url, **so) as infile:
        # inline_threshold adjusts the size below which binary blocks are included directly in the output;
        # a higher inline threshold can result in a larger json file but faster loading time.
        # TODO(review): confirm that the unit of inline_threshold is bytes.
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)
        references = h5chunks.translate()

    # Serialize before opening the output so a failure above cannot leave a
    # truncated JSON file behind (opening with 'wb' empties the target).
    payload = ujson.dumps(references).encode()
    with fs2.open(outf, 'wb') as f:
        f.write(payload)

In [None]:
%%time
for file in flist:
    gen_json(file)

The output JSON file contains references under the `refs` key that represent the hierarchical group structure of the original NetCDF4 file. `.zgroup` and `.zattrs` are strings that contain JSON objects; in a regular Zarr store, each of these would be a separate file containing one JSON object. Each dataset is split into chunks that are encoded as either:
1. a list where the elements are:
    1. the source URL of the file
    2. the byte offset to the chunk
    3. the number of bytes to read
2. a base64-encoded string if the size is less than the inline threshold

See also https://fsspec.github.io/kerchunk/spec.html

In [None]:
import xarray as xr

In [None]:
%%time
ds = xr.open_dataset(
    "reference://", 
    engine="zarr", 
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": '01_air_pressure_at_mean_sea_level.json', 
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)
print(ds)

In [None]:
type(ds.air_pressure_at_mean_sea_level)

In [None]:
%time
ds.air_pressure_at_mean_sea_level[100,100,100].values

# Using a remote reference JSON file (kerchunk does not need to be installed)

## Read a remote zstd-compressed JSON file that maps to 10 ERA5 variables across a 43 year time span.

The sidecar file has been compressed using zstd, from the original 1.8GB to 194MB. Opening this virtual dataset requires 7GB of free system memory.

In [None]:
%%time
fs = fsspec.filesystem(
    'reference', 
    fo='s3://esip-qhub-public/ecmwf/ERA5_1979_2022_multivar.json.zst',
    target_options={'compression': 'zstd', 'anon': True},
    remote_protocol='s3', 
    remote_options={'anon': True},
)

# QUESTION: is the reference JSON loaded only into memory?

In [None]:
# Expose the root of the reference filesystem as a mapping and open it with the zarr engine.
m = fs.get_mapper('')
ds = xr.open_dataset(
    m,
    engine='zarr',
    backend_kwargs={'consolidated': False},
)
print(ds)

In [None]:
ds.eastward_wind_at_100_metres[100,100,100].values

# Now try with NWB files on DANDI S3

In [None]:
from kerchunk.hdf import SingleHdf5ToZarr
import fsspec

In [None]:
# from dandi.dandiapi import DandiAPIClient

# dandiset_id = '000053'
# filepath = 'sub-npI1/sub-npI1_ses-20190413_behavior+ecephys.nwb'
# with DandiAPIClient() as client:
#     asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)
#     s3_url = asset.get_content_url(follow_redirects=1, strip_query=True)

# print(s3_url)

In [None]:
# OR get s3 URL from dandi website for this NWB file
# https://api.dandiarchive.org/api/dandisets/000053/versions/0.210819.0345/assets/22f70021-de36-44c4-8f29-4998b9ff1123/
# s3_url = "https://dandiarchive.s3.amazonaws.com/blobs/d74/1e1/d741e149-620a-4eab-a0c0-24c8133d0fc7"

In [None]:
s3_url = "s3://dandiarchive/blobs/d74/1e1/d741e149-620a-4eab-a0c0-24c8133d0fc7"

In [None]:
fs = fsspec.filesystem('s3', anon=True)  # anonymous S3 file system to read DANDI NWB files
fs2 = fsspec.filesystem('')  # local file system to save final jsons to

In [None]:
from pathlib import Path
import os
import ujson

# Keyword arguments forwarded to fs.open().
# default_fill_cache=False avoids caching data in between file chunks to lower memory usage.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
def gen_json(file_url, outf):
    """Generate a kerchunk JSON reference file for one remote HDF5/NWB file.

    Parameters
    ----------
    file_url : str
        URL of the file. It is opened at this location through the
        module-level filesystem ``fs`` (with the ``so`` open options), and the
        same string is passed to ``SingleHdf5ToZarr`` as the URL of the file
        being described.
    outf : str
        Path of the JSON file to write through the module-level local
        filesystem ``fs2``.
    """
    with fs.open(file_url, **so) as infile:
        # inline_threshold adjusts the size below which binary blocks are included directly in the output;
        # a higher inline threshold can result in a larger json file but faster loading time.
        # TODO(review): confirm that the unit of inline_threshold is bytes.
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)
        references = h5chunks.translate()

    # Serialize before opening the output so a failure above cannot leave a
    # truncated JSON file behind (opening with 'wb' empties the target).
    payload = ujson.dumps(references).encode()
    with fs2.open(outf, 'wb') as f:
        f.write(payload)

In [None]:
%%time
outf = '000053_sub-npI1_ses-20190413_behavior+ecephys.nwb.json'  # file name to save json to
gen_json(s3_url, outf)

# TODO: this step takes a long time and gives no feedback while it runs;
# add a progress bar or similar.

# TODO: compare this to downloading the file and creating the JSON locally;
# that might actually be faster.

# Try with local NWB files

In [1]:
from kerchunk.hdf import SingleHdf5ToZarr
import fsspec
import ujson

In [2]:
# Both handles are local file systems: one for reading the input file, one for writing the JSON.
fs_read = fsspec.filesystem('')  # local file system to read from
fs_write = fsspec.filesystem('')  # local file system to save final jsons to

In [3]:
def gen_json_from_local(local_file_path, final_remote_url, outf):
    """Generate a kerchunk JSON reference file from a local copy of an HDF5 file.

    This allows the reference file to be computed before the data is uploaded
    to its final storage location.

    Parameters
    ----------
    local_file_path : str
        Path of the HDF5 file on the local file system. This is the file that
        is actually opened (through the module-level ``fs_read``).
    final_remote_url : str
        URL of the file's final destination. It is not used to open the file;
        it is passed to ``SingleHdf5ToZarr`` as the URL of the file being
        described.
    outf : str
        Path of the JSON file to write through the module-level ``fs_write``.
    """
    with fs_read.open(local_file_path, 'rb') as infile:
        # inline_threshold adjusts the size below which binary blocks are included directly in the output;
        # a higher inline threshold can result in a larger json file but faster loading time.
        # TODO(review): confirm that the unit of inline_threshold is bytes.
        h5chunks = SingleHdf5ToZarr(infile, final_remote_url, inline_threshold=300, error="raise")
        references = h5chunks.translate()

    # Serialize before opening the output so a failure above cannot leave a
    # truncated JSON file behind (opening with 'wb' empties the target).
    payload = ujson.dumps(references).encode()
    with fs_write.open(outf, 'wb') as f:
        f.write(payload)

In [None]:
%%time
%pdb on
# NOTE(review): `%pdb on` is a debugging aid that stays enabled for later cells — confirm it should remain.
# NOTE(review): local_file_path is an absolute, machine-specific path; this cell only runs where that file exists.
local_file_path = "/Users/rly/Documents/NWB_Data/dandisets/000053/sub-npI1/sub-npI1_ses-20190413_behavior+ecephys.nwb"
final_remote_url = "s3://dandiarchive/blobs/d74/1e1/d741e149-620a-4eab-a0c0-24c8133d0fc7"
outf = "000053_sub-npI1_ses-20190413_behavior+ecephys.nwb.json"  # file name to save json to
gen_json_from_local(local_file_path, final_remote_url, outf)

In [None]:
%debug

In [None]:
%%time
# Convert the small local test file test_str.h5 to a reference JSON.
# NOTE(review): "s3://..." is a placeholder, not a real object URL — presumably only inlined data is expected to be readable from the result; confirm.
# NOTE(review): local_file_path is an absolute, machine-specific path.
local_file_path = "/Users/rly/Documents/NWB/kerchunk-playground/test_str.h5"
final_remote_url = "s3://..."
outf = "test_str.json"  # file name to save json to
gen_json_from_local(local_file_path, final_remote_url, outf)

In [None]:
%%time
# Same conversion for test_bytes.h5 (placeholder remote URL, machine-specific local path).
local_file_path = "/Users/rly/Documents/NWB/kerchunk-playground/test_bytes.h5"
final_remote_url = "s3://..."
outf = "test_bytes.json"  # file name to save json to
gen_json_from_local(local_file_path, final_remote_url, outf)

In [None]:
%%time
# Same conversion for test_int.h5 (placeholder remote URL, machine-specific local path).
local_file_path = "/Users/rly/Documents/NWB/kerchunk-playground/test_int.h5"
final_remote_url = "s3://..."
outf = "test_int.json"  # file name to save json to
gen_json_from_local(local_file_path, final_remote_url, outf)

In [4]:
%%time
# Same conversion for test_multi_str.h5 (placeholder remote URL, machine-specific local path).
local_file_path = "/Users/rly/Documents/NWB/kerchunk-playground/test_multi_str.h5"
final_remote_url = "s3://..."
outf = "test_multi_str.json"  # file name to save json to
gen_json_from_local(local_file_path, final_remote_url, outf)

CPU times: user 8.55 ms, sys: 5.79 ms, total: 14.3 ms
Wall time: 15.8 ms


In [10]:
import zarr
# Open the reference JSON with zarr through the "reference://" protocol and read all of "data".
# NOTE(review): no cell visible here writes "test_multi_str_json_codec.json" (the conversion cell writes
# "test_multi_str.json") — presumably a renamed copy of that output; confirm.
z = zarr.open("reference://", storage_options={"fo": "test_multi_str_json_codec.json"})
z["data"][:]

array(['test', 'more test'], dtype=object)

In [11]:
import zarr
# Open the reference JSON with zarr through the "reference://" protocol and read all of "data".
# NOTE(review): no cell visible here writes "test_multi_str_vlenutf8_codec.json" (the conversion cell writes
# "test_multi_str.json") — presumably a renamed copy of that output; confirm.
z = zarr.open("reference://", storage_options={"fo": "test_multi_str_vlenutf8_codec.json"})
z["data"][:]

array(['test', 'more test'], dtype=object)

In [13]:
import zarr
# Open the reference JSON with zarr through the "reference://" protocol.
z = zarr.open("reference://", storage_options={"fo": "test_str.json"})
# Index with an empty tuple to read "data" as a scalar rather than a slice.
z["data"][()]

'test'

# Normal streaming

In [None]:
import h5py
import pynwb

In [None]:
# Stream the NWB file directly, without a kerchunk reference file.
# The "http" filesystem cannot open an "s3://" URL, so choose the filesystem
# from the URL scheme: anonymous S3 for "s3://", HTTP for an https link.
if s3_url.startswith("s3://"):
    fs = fsspec.filesystem("s3", anon=True)
else:
    fs = fsspec.filesystem("http")
with fs.open(s3_url, "rb") as f:
    # Open read-only explicitly; the remote file object cannot be written to.
    with h5py.File(f, "r") as file:
        with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io:
            nwbfile = io.read()
            print(nwbfile)

# Zarr test with scalar dataset

In [None]:
import zarr
import numpy as np
import numcodecs

In [None]:
# Experiment 1: zero-dimensional (shape=()) zarr array with dtype=str, filled by assignment.
z = zarr.zeros(shape=tuple(), dtype=str)

In [None]:
z[...] = "test"

In [None]:
z

In [None]:
z[()]

In [None]:
# Experiment 2: build the zarr array from a zero-dimensional NumPy string array.
data = np.array("test")

In [None]:
z = zarr.array(data)

In [None]:
z

In [None]:
z[()]

In [None]:
# Experiment 3: pass the Python string directly, with the JSON object codec.
z = zarr.array(data="test", object_codec=numcodecs.JSON())
z

In [None]:
# Experiment 4: empty zero-dimensional str array with the JSON object codec, then assign.
z = zarr.empty(shape=tuple(), dtype=str, object_codec=numcodecs.JSON())
z[...] = "test"

In [None]:
# Same call as experiment 3, repeated.
z = zarr.array(data="test", object_codec=numcodecs.JSON())
z

In [None]:
# Experiment 5: explicit dtype=str together with the JSON object codec.
z = zarr.array(data="test", dtype=str, object_codec=numcodecs.JSON())

In [None]:
# Experiment 6: explicit dtype=str and no object codec.
z = zarr.array(data="test", dtype=str)

In [None]:
z

In [None]:
z[()]

In [None]:
# Experiment 7: one-dimensional array of two strings, for comparison with the scalar cases.
z = zarr.array(data=["test", "more test"], dtype=str)

In [None]:
z

In [None]:
import h5py
# Write an HDF5 file holding a single integer dataset and show the shape h5py gave it.
with h5py.File("test_int.h5", "w") as h5_out:
    scalar_ds = h5_out.create_dataset("data", data=42, shape=None, dtype=int)
    print(scalar_ds.shape)

In [None]:
# Copy every object from the HDF5 file into a zarr store opened in write mode.
with h5py.File("test_str.h5", "r") as h5_file:
    zarr_dest = zarr.open("test_out.zarr", "w")
    zarr.copy_all(h5_file, zarr_dest)