# NWM ReferenceFileSystem JSON 
Create ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3 

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask

In [None]:
# Anonymous S3 access is used to list and read the source NetCDF files.
fs = fsspec.filesystem('s3', anon=True)
flist = fs.glob('s3://coastalcoupling/usgs/gom1km/*.nc')

# Open every matched file through the S3 filesystem, then stack the
# resulting datasets along the "time" dimension.
fobjs = [xr.open_dataset(fs.open(nc_path)) for nc_path in flist]
ds = xr.concat(
    fobjs,
    dim='time',
    join='override',
    combine_attrs='override',
)
ds.water_u
# Quick-look plot of the slice at index 0 along the first two dimensions.
ds.water_u[0, 0, :, :].plot()

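We need to add the "s3://" prefix to each entry in the list of files so that fsspec will recognize that these NetCDF files are on S3. There is no "storage_options" argument passed along with these URLs, so the protocol must be part of each URL itself.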

In [None]:
# Prepend the protocol so that every entry is a complete S3 URL.
urls = ["s3://" + nc_path for nc_path in flist]

# Keyword options that gen_json passes to fs.open() for each source file.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

If the directory exists, remove it (and all the files), then create it:

In [None]:
import os
import sys

# Make the shared helper library (which provides ebdpy) importable.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

# NOTE(review): this call looks redundant with the fuller set_credentials
# call a few lines below -- confirm before removing it.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Start a new Dask cluster (use_existing_cluster=False) with a fixed worker
# limit (adaptive_scaling=False) and wait until it is ready.
worker_max = 10
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=False,
    adaptive_scaling=False,
    wait_for_cluster=True,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
# Display the cluster object as the cell output.
cluster

In [None]:
#### try xr_concat on ncfiles

We passed the AWS credentials to the Dask workers via environment variables above. The Dask workers do not have an AWS credentials file with profiles defined, so we cannot specify a profile here.

In [None]:
# Second S3 filesystem, with anonymous access disabled; gen_json uses it to
# write the generated JSON files.
fs2 = fsspec.filesystem('s3', anon=False)  

In [None]:
def gen_json(u):
    """Write a reference JSON file to S3 for one source file.

    The file at URL ``u`` is opened with the module-level ``fs`` filesystem
    using the ``so`` options and handed to ``SingleHdf5ToZarr`` (with
    ``inline_threshold=300``). The result of ``translate()`` is serialized
    with ``ujson`` and written through ``fs2`` to
    ``s3://esip-qhub/usgs/gom1km/testing/jsons/<basename of u>.json``.
    The output path is printed before it is written.

    Parameters
    ----------
    u : str
        URL of the source file; its last ``/``-separated component names
        the output JSON file.

    Returns
    -------
    None
    """
    with fs.open(u, **so) as source:
        translator = SingleHdf5ToZarr(source, u, inline_threshold=300)
        # The output name is the source file name with ".json" appended.
        fname = u.rsplit('/', 1)[-1]
        outf = f's3://esip-qhub/usgs/gom1km/testing/jsons/{fname}.json'
        print(outf)
        with fs2.open(outf, 'wb') as sink:
            payload = ujson.dumps(translator.translate()).encode()
            sink.write(payload)

In [None]:
%%time
# Wrap one gen_json call per URL in dask.delayed and compute them all together;
# retries=10 is passed to dask.compute.
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);

#### Try accessing the individual JSON files directly from S3 as file-like objects

In [None]:
# List the generated JSON files; note that this rebinds `flist`, which
# previously held the source NetCDF paths.
flist = fs2.ls('s3://esip-qhub/usgs/gom1km/testing/jsons/')
# NOTE(review): these open handles are not used by the later cells visible in
# this notebook and are never closed -- confirm whether they are still needed.
fobjs = [fs2.open(f) for f in flist]

In [None]:
# Prepend the protocol so that every listed JSON path is a complete S3 URL.
furls = ['s3://' + json_path for json_path in flist]

In [None]:
# Show the first JSON URL as a sanity check.
furls[0]

In [None]:
#fsspec.utils.setup_logging(logger_name='s3fs')
#fsspec.utils.setup_logging(logger_name='fsspec.reference')

In [None]:
import xarray as xr

# Options used by the reference filesystem when reading the remote data.
#r_opts = {'anon': False} # NetCDF files on AWS Open Data public bucket
r_opts = dict(profile='julia')

# Use the first per-file JSON as the reference set.
fo = furls[0]

# Note: this rebinds `fs`, which gen_json also reads as a global.
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    remote_protocol='s3',
    remote_options=r_opts,
)

# Expose the reference filesystem as a mapping and open it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
# List the entries under 'water_u' in the filesystem currently bound to `fs`.
fs.ls('water_u')

In [None]:
# Display the dataset as the cell output.
ds

In [None]:
# Plot the water_u variable of the dataset.
ds.water_u.plot()

In [None]:
# Combine the per-file reference JSONs into a single reference set,
# concatenated along "time".
mzz = MultiZarrToZarr(
    furls,
    # Options for opening the JSON reference files listed in `furls`.
    storage_options={'anon': False},
    remote_protocol='s3',
    # Options for reading the data the references point to. This must be the
    # boolean True, not the string 'True'.
    remote_options={'anon': True},
    # Turn off all decoding when the individual reference sets are opened.
    xarray_open_kwargs={
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
#        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False,
    },
    xarray_concat_args={
#          "data_vars": "minimal",
#          "coords": "minimal",
#          "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time",
    },
)

In [None]:
%%time
#%%prun -D multizarr_profile 
# Write the combined references to the local file 'gom1km.json'.
mzz.translate('gom1km.json')

#### Try opening the consolidated JSON file

In [None]:
import xarray as xr

# Anonymous access for the remote data.
r_opts = dict(anon=True)  # NetCDF files on AWS Open Data public bucket

# The consolidated reference file written by mzz.translate.
fo = 'gom1km.json'

fs = fsspec.filesystem(
    "reference",
    fo=fo,
    remote_protocol='s3',
    remote_options=r_opts,
)

# Expose the reference filesystem as a mapping and open it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
# Display the water_u variable as the cell output.
ds.water_u

In [None]:
# Plot the water_u variable of the dataset.
ds.water_u.plot()

In [None]:
# Look up the 'water_u/.zarray' key in the mapper (presumably the zarr array
# metadata for water_u -- confirm).
m['water_u/.zarray']

In [None]:
# Fetch the value stored under the four-index key 'water_u/0.0.0.0'.
m['water_u/0.0.0.0']

In [None]:
# List the entries under 'water_u' in the filesystem currently bound to `fs`.
fs.ls('water_u')

In [None]:
# Length of the value stored under 'water_u/0.0.0.0.0'.
# NOTE(review): this key has five indices, while the lookup in an earlier cell
# uses four ('water_u/0.0.0.0') -- confirm which one matches the array.
len(m['water_u/0.0.0.0.0'])

In [None]:
# Tear down the Dask cluster first, then close the client connection.
cluster.shutdown()
client.close()