# NWM ReferenceFileSystem JSON 
Create a ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3.

In [None]:
import os
import shutil
import zipfile
import fsspec
import ujson
import json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr

In [None]:
# Anonymous (unsigned) S3 filesystem used for listing and opening the source files.
fs = fsspec.filesystem(protocol='s3', anon=True)

In [None]:
# Glob the 2017 folder for objects whose names start with 20170401,
# contain ".CHRTOUT" and end in ".comp".
flist = fs.glob(
    's3://noaa-nwm-retro-v2.0-pds/full_physics/2017/20170401*.CHRTOUT*.comp'
)

In [None]:
# Display the matched object keys as a sanity check before processing them.
flist

In [None]:
# Prefix every globbed key with the protocol so each entry is a full S3 URL.
urls = [f"s3://{key}" for key in flist]

# Keyword arguments that gen_json forwards to fs.open() for each source file.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

If the directory exists, remove it (and all the files), then create it:

In [None]:
# Start from an empty output directory so stale JSON files from an earlier
# run cannot be mixed in with the new ones.
tdir = './jsons'
try:
    shutil.rmtree(tdir)
except FileNotFoundError:
    # Expected on a first run: there is nothing to remove, so stay quiet.
    pass
except OSError as e:
    # Any other failure is reported; os.makedirs below will then raise if the
    # directory is still present.
    print("Error: %s - %s." % (e.filename, e.strerror))
os.makedirs(tdir)

In [None]:
def gen_json(u):
    """Write a reference JSON file for one remote HDF5/NetCDF file.

    Opens ``u`` through the module-level ``fs`` filesystem with the ``so``
    open options, runs ``SingleHdf5ToZarr`` on it, and stores the translated
    references as ``jsons/<last path component of u>.json``.

    Args:
        u: URL of the source file. Its last ``/``-separated component is used
            as the stem of the output file name.

    Returns:
        None. The result is the JSON file written to disk.
    """
    with fs.open(u, **so) as inf:
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        # Translate and serialize while the input is still open, but before
        # the output file is created: if either step raises, no empty JSON is
        # left in jsons/ for the later glob over "jsons/*.json" to pick up.
        payload = ujson.dumps(h5chunks.translate()).encode()
    with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outf:
        outf.write(payload)

In [None]:
import os
import sys

# ebdpy is imported from a shared directory under $HOME, so that directory
# has to be on sys.path before the import.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

# Connection settings used by the credential and cluster calls below.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
worker_max = 10

# NOTE(review): credentials are set twice, first with the profile only and
# then with region and endpoint as well; the first call may be redundant,
# but confirm against ebdpy before removing it.
ebd.set_credentials(profile=profile)
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
# Display the cluster object returned by ebd.start_dask_cluster.
cluster

In [None]:
%%time
# Serial run: generate the reference JSON for each URL one at a time.
# NOTE: the dask cell that follows calls gen_json over the same urls, so
# running both cells does the work twice.
for u in urls:
    gen_json(u)

In [None]:
%%time
dask.compute(*[dask.delayed(gen_json)(u) for u in urls]);

In [None]:
from glob import glob

# Collect the per-file reference JSONs and put them in sorted path order.
json_list = glob("jsons/*.json")
json_list.sort()

In [None]:
# Set up the combiner that merges the per-file reference JSONs in json_list
# into a single reference set, concatenated along the "t" dimension.
mzz = MultiZarrToZarr(
    json_list,
    remote_protocol='s3',
    remote_options={
        # Boolean, matching the anon=True used for `fs` and `so`; the previous
        # string 'True' only worked because a non-empty string is truthy.
        'anon': True,
    },
    # Open each per-file dataset without any xarray decoding, and leave out
    # the 'reference_time' and 'crs' variables.
    xarray_open_kwargs={
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False,
    },
    # Concatenation settings: "minimal" variable/coordinate handling and
    # "override" for compat, join and attribute combination.
    xarray_concat_args={
        "data_vars": "minimal",
        "coords": "minimal",
        "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "t",
    },
)

In [None]:
%%time
%%prun -D multizarr_profile 
# Write the combined references to 'nwm.json'. The cell is timed by %%time
# and profiled by %%prun, whose -D option dumps the profile stats to the
# file 'multizarr_profile'.
mzz.translate('nwm.json')