# GOM1KM ReferenceFileSystem JSON 
Create a ReferenceFileSystem JSON file for a collection of GOM1km NetCDF files on S3.

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import numpy as np
import hvplot.xarray

In [None]:
# S3 filesystem authenticated with the 'zarr-collab' credentials profile.
fs = fsspec.filesystem('s3', profile='zarr-collab')
# Every NetCDF (*.nc) file under the GOM1km prefix; these are the inputs
# that gen_json turns into per-file reference JSONs.
flist = fs.glob('s3://zarrcollab/oot/nrl/gom1km/*.nc')
# Bare expression so the notebook displays the list.
flist

We need to add the "s3://" prefix to each entry in the list of files so that fsspec will recognize that these NetCDF files are on S3. There is no "storage_options" argument to convey this, so the protocol has to be part of each URL.

In [None]:
# Full S3 URLs for the NetCDF inputs: the listed paths get the protocol
# prefix put back in front of them.
urls = [f"s3://{f}" for f in flist]

# Keyword arguments unpacked into fs.open() for every NetCDF file in gen_json.
# NOTE(review): 'profile', 'default_fill_cache' and 'default_cache_type' look
# like filesystem-constructor options rather than open() options -- confirm
# that fs.open() actually honors them.
so = {
    'mode': 'rb',
    'profile': 'zarr-collab',
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

Define a function that creates the reference JSON for a single NetCDF file and writes it to S3

In [None]:
def gen_json(u):
    """Build the reference JSON for one NetCDF file and store it on S3.

    The file at URL ``u`` is opened through the module-level ``fs`` with the
    options in ``so``, scanned with ``SingleHdf5ToZarr`` (using
    ``inline_threshold=300``), and the translated references are written as
    UTF-8 encoded JSON to ``s3://zarrcollab/oot/nrl/gom1km/jsons/<name>.json``,
    where ``<name>`` is the last path component of ``u``.

    Parameters
    ----------
    u : str
        URL of the source NetCDF file.

    Returns
    -------
    None
        The output path is printed; the result is the object written to S3.
    """
    with fs.open(u, **so) as source:
        scanner = SingleHdf5ToZarr(source, u, inline_threshold=300)
        # Keep the original file name (including ".nc") and append ".json".
        basename = u.rsplit('/', 1)[-1]
        outf = f's3://zarrcollab/oot/nrl/gom1km/jsons/{basename}.json'
        print(outf)
        # The source stays open while translating, since the scanner reads
        # from it during translate().
        with fs.open(outf, 'wb') as sink:
            references = scanner.translate()
            sink.write(ujson.dumps(references).encode())

In [None]:
from dask.distributed import Client

In [None]:
# Start a dask.distributed client with default settings; the dask.compute
# call that runs gen_json over all URLs is issued after this.
client = Client()

In [None]:
%%time
# Wrap one gen_json call per NetCDF URL in dask.delayed and run them all;
# retries=10 is passed to compute so failed tasks can be retried.
# The trailing ';' suppresses the notebook's display of the returned tuple.
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);

In [None]:
client

#### Try accessing an individual JSON file

In [None]:
# List the per-file reference JSONs written by gen_json.
# Only '.json' entries are kept so that anything else under the prefix
# (e.g. a directory placeholder key) is not passed on to MultiZarrToZarr.
# The files are deliberately NOT opened here: only their paths are needed,
# and opening each one would create S3 file handles that are never closed.
flist = [
    f for f in fs.ls('s3://zarrcollab/oot/nrl/gom1km/jsons/')
    if f.endswith('.json')
]

In [None]:
# Full S3 URLs of the reference JSONs, sorted by name.
# Presumably the file names encode the timestamp, so this is also time
# order for the concatenation along "time" -- TODO confirm.
furls = sorted(['s3://'+f for f in flist])

In [None]:
furls[0]

In [None]:
#r_opts = {'anon': False} # NetCDF files on AWS Open Data public bucket
# The same S3 options are used twice below: to read the reference JSON itself
# (ref_storage_args) and to read the NetCDF bytes it points to (remote_options).
r_opts = dict(profile='zarr-collab')
fo = 's3://zarrcollab/oot/nrl/gom1km/jsons/gom1km_2021_03_01_00.nc.json'
# NOTE: this rebinds `fs`, replacing the S3 filesystem that gen_json relies
# on; re-create the S3 filesystem before calling gen_json again.
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    ref_storage_args=r_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
# Expose the reference filesystem as a key/value mapping and read it as Zarr.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
#fsspec.utils.setup_logging(logger_name='s3fs')
#fsspec.utils.setup_logging(logger_name='fsspec.reference')

In [None]:
fs.ls('water_u')

In [None]:
ds

In [None]:
ds.water_u.plot()

In [None]:
# Combine the per-file reference JSONs into a single reference set,
# concatenated along "time".
# - storage_options / remote_options: the same S3 options are used for the
#   JSONs and for the NetCDF files they reference.
# - xarray_open_kwargs: every decoding step is switched off when each
#   per-file dataset is opened.
# - xarray_concat_args: "override" is used for both join and combine_attrs.
# Options tried and left disabled: 'drop_variables': ['reference_time', 'crs']
# on open; "data_vars": "minimal", "coords": "minimal", "compat": "override"
# on concat.
mzz = MultiZarrToZarr(
    furls,
    storage_options=r_opts,
    remote_protocol='s3',
    remote_options=r_opts,
    xarray_open_kwargs=dict(
        decode_cf=False,
        mask_and_scale=False,
        decode_times=False,
        use_cftime=False,
        decode_coords=False,
    ),
    xarray_concat_args=dict(
        join="override",
        combine_attrs="override",
        dim="time",
    ),
)

In [None]:
%%time
# Alternative cell magic for profiling this step instead of timing it:
#%%prun -D multizarr_profile
# Write the combined references to 'gom1km.json' (a relative, i.e. local, path).
mzz.translate('gom1km.json')

#### Try opening the consolidated JSON file

In [None]:
# Open the combined reference file produced by mzz.translate. The JSON is
# given as a local path, so only the referenced NetCDF data needs S3 options.
fo = 'gom1km.json'
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    remote_protocol='s3',
    remote_options=r_opts,
)

# Mapping view of the reference filesystem, read by xarray's zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
ds.water_u

In [None]:
ds.water_u[0,0,:,:].plot()

In [None]:
# fs.ls('water_u')

#### Let's make a fancy plot of surface speed using hvplot
hvplot expects longitudes in the range [-180, 180], so we wrap the `lon` coordinate into that range first.

In [None]:
# Wrap longitudes into [-180, 180): shift by 180, take modulo 360, shift back.
ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180))

In [None]:
# Speed as the magnitude of the (water_u, water_v) vector, stored as a new
# variable on the dataset.
ds['speed'] = np.sqrt(ds['water_u']**2 + ds['water_v']**2)

In [None]:
# Interactive quadmesh of speed on lon/lat, rasterized, drawn as a geographic
# plot over OSM tiles with the 'turbo' colormap.
ds['speed'].hvplot.quadmesh(x='lon', y='lat', rasterize=True, geo=True, cmap='turbo', tiles='OSM')

In [None]:
# Separate S3 filesystem object, kept apart from `fs` (which by now refers to
# a reference filesystem). skip_instance_cache=True requests a fresh instance
# instead of one reused from fsspec's instance cache.
fs2 = fsspec.filesystem('s3', profile='zarr-collab', skip_instance_cache=True )

In [None]:
# NOTE(review): if the cells ran top to bottom, `fo` is still the local path
# 'gom1km.json' here while fs2 is an S3 filesystem -- confirm whether the S3
# location of the consolidated JSON was intended.
fs2.info(fo)

In [None]:
# Open the consolidated reference JSON from its S3 location. The same options
# (credentials profile plus skip_instance_cache) are used for reading the
# JSON and for reading the NetCDF data it references.
r_opts = dict(profile='zarr-collab', skip_instance_cache=True)
fo = 's3://zarrcollab/oot/nrl/gom1km/gom1km.json'
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    ref_storage_args=r_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
# Mapping view of the reference filesystem, read by xarray's zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
fs.clear_instance_cache()

In [None]:
fs2 = fsspec.filesystem('s3', anon=False, profile='zarr-collab', skip_instance_cache=True)

In [None]:
fs2.ls('zarrcollab/oot/nrl/gom1km/')

In [None]:
fs2.ls("")