# Kerchunk the AORC-OWP NetCDF files

In [None]:
import fsspec
import xarray as xr
import ujson   # fast json
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
from pathlib import Path

In [None]:
# Filesystem used to list and open the source NetCDF files
# ('file' is fsspec's local-filesystem protocol).
fs_read = fsspec.filesystem('file')

In [None]:
# Filesystem used to write, and later list, the generated JSON files.
fs_write = fsspec.filesystem('file')

In [None]:
# Collect every .nc4 file in the 200002 directory via the read filesystem.
nc_files = fs_read.glob('/caldera/hytest_scratch/scratch/rsignell/nc_files/200002/*.nc4') 

# Bare expression: shows how many files were matched.
len(nc_files)

In [None]:
# Start a local Dask cluster with one thread per worker and attach a client.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(threads_per_worker=1)
client = Client(cluster)

# Bare expression: the notebook renders the client summary.
client

In [None]:
# Directory where gen_json writes one JSON file per input NetCDF file.
json_dir = '/caldera/hytest_scratch/scratch/rsignell/json_files'

In [None]:
# Show the first matched path as a sanity check on the glob result.
nc_files[0]

In [None]:
# Uncomment to shut down the local Dask cluster when you are done:
#cluster.close()

In [None]:
def gen_json(u):
    """Write a kerchunk reference JSON for one NetCDF/HDF5 file.

    Opens ``u`` through ``fs_read`` using the options in ``so``, lets
    ``SingleHdf5ToZarr`` translate it into a reference dict, and writes that
    dict as JSON to ``{json_dir}/<stem of u>.json`` through ``fs_write``.
    The output path is printed so progress is visible.

    Relies on the notebook globals ``fs_read``, ``fs_write``, ``so`` and
    ``json_dir`` being defined before the function is called.

    Parameters
    ----------
    u : str
        Path of the input file. It is also passed to ``SingleHdf5ToZarr``
        as the file's URL.

    Returns
    -------
    None
        The result is the JSON file written as a side effect.
    """
    outf = f'{json_dir}/{Path(u).stem}.json'
    print(outf)

    # Build and serialize the references BEFORE touching the output file, so
    # a failure while scanning the input cannot leave an empty or truncated
    # JSON behind for the later glob to pick up.
    with fs_read.open(u, **so) as infile:
        # translate() reads from infile, so it must run while it is open.
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        payload = ujson.dumps(h5chunks.translate()).encode()

    # Ensure the destination exists; exist_ok=True keeps parallel workers
    # from failing when another one has already created it.
    fs_write.makedirs(json_dir, exist_ok=True)
    with fs_write.open(outf, 'wb') as f:
        f.write(payload)

In [None]:
# Keyword arguments forwarded to fs_read.open() inside gen_json:
# binary read mode plus two cache-related options.
so = dict(mode='rb', default_fill_cache=False, default_cache_type='first')

#### Test one file

In [None]:
%%time 
# Generate the JSON for the first file only, as a check before the parallel run.
gen_json(nc_files[0])

In [None]:
# Reference JSON to inspect (presumably the one written by the single-file
# test). Built from json_dir so the path follows the output directory
# instead of repeating it as a second hard-coded literal.
single_json = f'{json_dir}/AORC-OWP_2000020100z.json'

In [None]:
%%time
# Open the single-file reference JSON as an fsspec "reference" filesystem.
fs = fsspec.filesystem(
    "reference",
    fo=single_json,
    ref_storage_args={"skip_instance_cache": True},
)

In [None]:
# List what the reference filesystem exposes under 'PRES_surface'.
fs.ls('PRES_surface')

In [None]:
# Wrap the reference filesystem in a key/value mapper rooted at "" and open
# it through xarray's zarr engine. consolidated=False: do not look for
# consolidated metadata; chunks={}: load variables as lazy dask arrays.
m = fs.get_mapper("")
ds = xr.open_dataset(
    m,
    engine="zarr",
    chunks={},
    backend_kwargs={"consolidated": False},
)

In [None]:
# Bare expression: the notebook renders the dataset summary.
ds

#### Use Dask bag to compute the JSONs in parallel

In [None]:
import dask.bag as db

In [None]:
# Inspect the cluster's workers before submitting the parallel job.
cluster.workers

In [None]:
# Put the file list into a Dask bag, requesting 80 partitions.
b = db.from_sequence(nc_files, npartitions=80)

In [None]:
# Lazily apply gen_json to every file; nothing runs until compute() is called.
b1 = b.map(gen_json)

In [None]:
%%time
# Run the bag on the cluster, passing retries=10 to compute(). The result is
# discarded because gen_json's output is the JSON file it writes.
_ = b1.compute(retries=10)

In [None]:
# List the JSON files now present in the output directory.
json_list = fs_write.glob(f'{json_dir}/*.json')  

In [None]:
# Number of JSON files found; compare against the input count from len(nc_files).
print(len(json_list))