In [None]:
import h5py
import json
import numpy as np
from linked_arrays import H5ToJson

# Toy dataset: the integers 0..9999 laid out row-major as 1000 rows x 10 columns.
data = np.arange(1000 * 10).reshape((1000, 10))
data[0]  # display the first row as a quick sanity check

In [None]:
# Write `data` to an HDF5 file using 600 x 8 chunks.
hdf5_file_path = "test.h5"
with h5py.File(hdf5_file_path, mode="w") as h5_out:
    h5_out.create_dataset("data", data=data, chunks=(600, 8))

In [None]:
# Run the HDF5 -> JSON translation, then read the JSON file back and pretty-print it.
# NOTE(review): presumably translate() is what writes json_file_path - confirm in linked_arrays.
json_file_path = "test.json"
translator = H5ToJson(hdf5_file_path, json_file_path)
translator.translate()

with open(json_file_path) as json_file:
    json_dict = json.load(json_file)

pretty_json = json.dumps(json_dict, indent=4)
print(pretty_json)

In [None]:
# Hand-written reference set in the "version 1" layout: metadata keys map to
# JSON text, chunk keys map to a [file path, number, number] triple pointing
# into the HDF5 file (per the kerchunk spec linked below: byte offset, length).
mock_json = {
    "version": 1,
    "refs": {
        # ".zgroup": "{\n    \"zarr_format\": 2\n}",
        "data/.zattrs": "{}",
        "data/.zarray": '{"chunks":[600,8],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000,10],"zarr_format":2}',
        "data/0.0": [hdf5_file_path, 4016, 38400],
        "data/0.1": [hdf5_file_path, 42416, 38400],
        "data/1.0": [hdf5_file_path, 80816, 38400],
        "data/1.1": [hdf5_file_path, 119216, 38400],
    },
}
# we can also do fancier, space-efficient things with v1 spec https://fsspec.github.io/kerchunk/spec.html

# see also referencefilesystem decoding scheme
# https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/reference.py#L899
# https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/reference.py#L692

# fsspec unpacks these refs into a directory store as if they were files
# https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/reference.py#L979

# this may be the code that handles getting a requested data chunk
# https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/reference.py#L744C9-L744C17

In [None]:
import fsspec
# Expose the mock reference set through fsspec's "reference://" protocol as a mapping.
mapper = fsspec.get_mapper("reference://", fo=mock_json)

In [None]:
import zarr
# Open the reference-backed mapping with zarr and display the info summary zarr reports for it.
z = zarr.open(mapper)
z.info

In [None]:
# Look up the "data" member of the opened hierarchy and display its repr.
arr = z["data"]
arr

In [None]:
# Slice the whole array (presumably this pulls the chunk bytes from the ranges listed in mock_json).
arr[:]

In [None]:
# Same reference set as a flat dict: keys sit at the top level, with no
# "version"/"refs" wrapper around them.
mock_json = {  # reference file system format version 0
    # ".zgroup": "{\n    \"zarr_format\": 2\n}",  # <-- this does not seem to be necessary but is probably good to have
    # "data/.zattrs": "{}",
    "data/.zarray": '{"chunks":[600,8],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000,10],"zarr_format":2}',
    "data/0.0": [hdf5_file_path, 4016, 38400],
    "data/0.1": [hdf5_file_path, 42416, 38400],
    "data/1.0": [hdf5_file_path, 80816, 38400],
    "data/1.1": [hdf5_file_path, 119216, 38400],
}

# Wrap the references in an fsspec mapping ...
import fsspec
mapper = fsspec.get_mapper("reference://", fo=mock_json)

# ... open it with zarr and print the group summary ...
import zarr
z = zarr.open(mapper)
print(z.info)

# ... then print the array summary and read everything back.
arr = z["data"]
print(arr.info)
arr[:]

In [None]:
from fsspec.implementations.reference import ReferenceFileSystem
# Build the reference filesystem directly (no mapper) so individual reference keys can be fetched by name.
fs = ReferenceFileSystem(fo=mock_json)

In [None]:
# Fetch whatever is stored under the "data/0.0" reference key (presumably the raw chunk bytes).
fs.cat("data/0.0")

In [None]:
import json
# Parse the JSON text stored under "data/.zarray" into a dict and display it.
zarray_props = json.loads(fs.cat("data/.zarray"))
zarray_props

In [None]:
# basically try to reverse engineer how zarr makes arrays given the chunk info
# the simple case is easy but when filters are involved it gets more complicated
#
# This handles only the simple case declared in the mock .zarray: a 2-D,
# C-ordered array with no compressor and no filters.
arr_shape = tuple(zarray_props["shape"])
chunk_shape = tuple(zarray_props["chunks"])
chunk_dtype = np.dtype(zarray_props["dtype"])
data = np.empty(shape=arr_shape, dtype=chunk_dtype)

# Number of chunks along each axis (ceiling division, so partial edge chunks count).
n_row_chunks = -(-arr_shape[0] // chunk_shape[0])
n_col_chunks = -(-arr_shape[1] // chunk_shape[1])

for i in range(n_row_chunks):
    for j in range(n_col_chunks):
        chunk = np.frombuffer(fs.cat_file(f"data/{i}.{j}"), dtype=chunk_dtype).reshape(chunk_shape)
        row_start, col_start = i * chunk_shape[0], j * chunk_shape[1]
        row_stop = min(row_start + chunk_shape[0], arr_shape[0])
        col_stop = min(col_start + chunk_shape[1], arr_shape[1])
        # Every chunk reference is a full chunk_shape block, even at the array edge,
        # so keep only the part that falls inside the array. Assigning the full
        # (600, 8) edge chunk into a (600, 2) slice raises a broadcast ValueError.
        data[row_start:row_stop, col_start:col_stop] = chunk[: row_stop - row_start, : col_stop - col_start]
data

In [None]:
# Build a small zarr array from a Python list, slice it in full, and show its info summary
# (only the last expression, z.info, is displayed by the notebook).
z = zarr.array([1, 2, 3])
z[:]
z.info

In [None]:
# Display the `chunks` attribute of the array created in the previous cell.
z.chunks

In [None]:
# dict() cannot parse the text of a dict literal: it iterates the string character
# by character and raises ValueError. ast.literal_eval safely evaluates the literal.
import ast

parsed = ast.literal_eval("{1: 2}")
parsed

In [None]:
import h5py
import json
import numpy as np
from linked_arrays import H5ToJson

# Write a three-element dataset of UTF-8 strings to a second HDF5 file.
hdf5_file_path = "test_str.h5"
utf8_str_dtype = h5py.string_dtype("utf-8")
with h5py.File(hdf5_file_path, mode="w") as h5_out:
    h5_out.create_dataset("data", data=["a", "b", "c"], dtype=utf8_str_dtype)

In [None]:
# Translate again, this time also passing a path for the chunk references,
# then read both JSON files back and pretty-print them in turn.
json_file_path = "test.json"
chunk_refs_file_path = "test_chunks.json"
translator = H5ToJson(hdf5_file_path, json_file_path, chunk_refs_file_path)
translator.translate()

with open(json_file_path) as json_file:
    json_dict = json.load(json_file)
pretty_json = json.dumps(json_dict, indent=4)
print(pretty_json)

with open(chunk_refs_file_path) as chunk_refs_file:
    chunk_refs = json.load(chunk_refs_file)
pretty_chunk_refs = json.dumps(chunk_refs, indent=4)
print(pretty_chunk_refs)

In [None]:
# how would zarr know that these are variable-length strings? it only sees that dtype=object...

In [None]:
# Flat reference set for the string dataset: one chunk, dtype declared as "object".
mock_json = {  # reference file system format version 0
    # ".zgroup": "{\n    \"zarr_format\": 2\n}",  # <-- this does not seem to be necessary but is probably good to have
    # "data/.zattrs": "{}",
    "data/.zarray": '{"chunks":[3],"compressor":null,"dtype":"object","fill_value":null,"filters":null,"order":"C","shape":[3],"zarr_format":2}',
    "data/0": [hdf5_file_path, 2048, 48],
}

# Wrap the references in an fsspec mapping ...
import fsspec
mapper = fsspec.get_mapper("reference://", fo=mock_json)

# ... open it with zarr and print the group summary ...
import zarr
z = zarr.open(mapper)
print(z.info)

# ... then print the array summary and attempt a full read.
arr = z["data"]
print(arr.info)
arr[:]

# the first filter must be an object codec...

In [None]:
# basically reverse engineer the zarr chunk encoding scheme
# the simple case is easy but when filters are involved it gets more complicated
from fsspec.implementations.reference import ReferenceFileSystem
fs = ReferenceFileSystem(fo=mock_json)

# Parse the array metadata first, then show the raw chunk bytes next to it.
zarray_props = json.loads(fs.cat("data/.zarray"))
raw_chunk = fs.cat_file("data/0")
print(raw_chunk)
print(zarray_props)

# Allocate (but do not fill) an array with the declared shape and dtype.
declared_dtype = np.dtype(zarray_props["dtype"])
data = np.empty(shape=zarray_props["shape"], dtype=declared_dtype)
# data[:] = np.frombuffer(fs.cat_file("data/0"), dtype=str)
# data

In [None]:
# Fetch the single chunk and print its value followed by its Python type, one per line.
data = fs.cat_file("data/0")
print(data, type(data), sep="\n")

In [None]:
# View the chunk as an array of 1-byte strings, then print it followed by its shape.
a = np.frombuffer(fs.cat_file("data/0"), dtype="S1")
print(a, a.shape, sep="\n")

In [None]:
# Create an in-memory HDF5 file using h5py (core driver, nothing written to disk)
with h5py.File("in_memory_data.h5", "w", driver="core", backing_store=False) as mem_h5:
    # Store the buffer held in `data`, wrapped in np.void, as a dataset
    dataset = mem_h5.create_dataset("variable_strings", data=np.void(data))
    print(dataset)

    # Read the dataset back; note this rebinds `data` to the value read
    data = dataset[()]
    print(data)

In [1]:
import json
import numpy as np
import os
from linked_arrays import H5ToJson

# Root directory holding the locally downloaded dandisets. Set the NWB_DATA_DIR
# environment variable to point somewhere else instead of editing this cell;
# the default is the original author's location.
nwb_data_dir = os.environ.get("NWB_DATA_DIR", "/Users/rly/Documents/NWB_Data/dandisets")

# Pick ONE file, given relative to nwb_data_dir. Uncomment an alternative to try it.
nwb_rel_path = "000004/sub-P11HMH/sub-P11HMH_ses-20061101_ecephys+image.nwb"
# nwb_rel_path = "000015/sub-an041/sub-an041_ses-20140821_obj-17pzgym.nwb"
# nwb_rel_path = "000017/sub-Cori/sub-Cori_ses-20161214T120000.nwb"
# nwb_rel_path = "000021/sub-699733573/sub-699733573_ses-715093703_probe-810755797_ecephys.nwb"
# # below takes a long time because there is a dataset with 393k chunks (est. time 50 min). set chunk_refs_file_path = None
# nwb_rel_path = "000028/sub-MEAREC-250neuron-Neuropixels/sub-MEAREC-250neuron-Neuropixels_ses-20200727T094620_ecephys.nwb"
# # below takes a long time because there is a dataset with 253k chunks (est. time 18 min). set chunk_refs_file_path = None
# nwb_rel_path = "000028/sub-mouse412804/sub-mouse412804_ses-20200803T115732_ecephys.nwb"
# # below takes a long time because there is a dataset with 65k chunks (est. time 1 min) and one with 262k chunks (est. time 15 min). set chunk_refs_file_path = None
# nwb_rel_path = "000037/sub-408021/sub-408021_ses-758519303_behavior+image+ophys.nwb"
# nwb_rel_path = "000048/sub-222549/sub-fly01_ophys.nwb"
# nwb_rel_path = "000049/sub-661968859/sub-661968859_ses-681698752_behavior+ophys.nwb"
# # below takes a long time because there is a dataset with 196k chunks (est. time 11 min). set chunk_refs_file_path = None
# nwb_rel_path = "000053/sub-npI1/sub-npI1_ses-20190413_behavior+ecephys.nwb"
# # below takes a long time because there is a dataset with 1.2M chunks (est. time 2.5 hours). set chunk_refs_file_path = None
# nwb_rel_path = "000054/sub-F1/sub-F1_ses-20190407T210000_behavior+ophys.nwb"
# # this file ends up pretty big because the voxel mask is a large struct array (int, int, int, float)...
# nwb_rel_path = "000167/sub-163/sub-163_ses-20200212T160655_ophys.nwb"
# # setting chunk_refs_file_path = None from here on out...
# nwb_rel_path = "000223/sub-2282/sub-2282_ses-20190914T145458_ecephys+ophys.nwb"
# nwb_rel_path = "000231/sub-219CR/sub-219CR_ses-20190403T123013_behavior+image.nwb"
# nwb_rel_path = "000296/sub-10002342988018666858/sub-10002342988018666858_ses-20170911T135306_ophys.nwb"
# nwb_rel_path = "000402/sub-17797/sub-17797_ses-4-scan-10_behavior+image+ophys.nwb"
# nwb_rel_path = "000409/sub-CSHL047/sub-CSHL047_ses-b52182e7-39f6-4914-9717-136db589706e_behavior+ecephys+image.nwb"
# nwb_rel_path = "000575/sub-02/sub-02_ses-20171011T152100_behavior+ecephys.nwb"
hdf5_file_path = os.path.join(nwb_data_dir, nwb_rel_path)

json_file_path = "test.json"
chunk_refs_file_path = None  # "test_chunks.json"
translator = H5ToJson(hdf5_file_path, json_file_path, chunk_refs_file_path)
translator.translate()

def _format_file_size(num_bytes):
    """Return ``num_bytes`` as a decimal GB/MB/KB string, e.g. ``"72.628704 MB"``.

    The unit is GB above 1e9 bytes, MB above 1e6 bytes, and KB otherwise. The
    value is not rounded, so the text matches a plain ``{size / 1e6}`` f-string.
    """
    if num_bytes > 1e9:
        return f"{num_bytes / 1e9} GB"
    if num_bytes > 1e6:
        return f"{num_bytes / 1e6} MB"
    return f"{num_bytes / 1000} KB"


# Report both sizes with the same unit rules, so the input file and the
# translated JSON can be compared directly.
hdf5_file_size = os.path.getsize(hdf5_file_path)  # in bytes
print(f"HDF5 file size: {_format_file_size(hdf5_file_size)}")

json_file_size = os.path.getsize(json_file_path)  # in bytes
print(f"JSON file size: {_format_file_size(json_file_size)}")

# with open(json_file_path) as f:
#     json_dict = json.load(f)
# print(json.dumps(json_dict, indent=4))
# with open(chunk_refs_file_path) as f:
#     chunk_refs = json.load(f)
# print(json.dumps(chunk_refs, indent=4))

HDF5 file size: 72.628704 MB
JSON file size: 117.218 KB


In [None]:
# Reload the JSON file named by json_file_path. Without this, json_dict is
# whatever an earlier cell left behind, or undefined on a fresh kernel.
with open(json_file_path) as f:
    json_dict = json.load(f)

# Pull the first "imp" entry of the electrodes table and show its value and Python type.
val = json_dict["refs"]["/"]["groups"]["general"]["groups"]["extracellular_ephys"]["groups"]["electrodes"]["datasets"]["imp"]["data"][0]
print(val, type(val))

In [None]:
import base64
# Decode a base64 payload and reinterpret the resulting bytes as 64-bit integers.
np.frombuffer(
    base64.b64decode("AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA"),
    dtype=np.int64,
)