# GOM1KM ReferenceFileSystem JSON 
Create a ReferenceFileSystem JSON file for a collection of GOM1KM NetCDF files on S3.

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask

In [None]:
# Anonymous S3 filesystem used to find the source NetCDF files.
fs = fsspec.filesystem('s3', anon=True)
# Collect every .nc object under the gom1km prefix.
flist = fs.glob('s3://coastalcoupling/usgs/gom1km/*.nc')
flist  # display the matched paths

#### `xr.concat` works on the remote NetCDF files:

In [None]:
%%time
# Open each remote NetCDF file as its own dataset (one S3 file handle per path).
fobjs = [
    xr.open_dataset(fs.open(nc_path))
    for nc_path in flist
]

# Stitch the per-file datasets together along the time dimension.
ds = xr.concat(
    fobjs,
    dim='time',
    join='override',
    combine_attrs='override',
)

In [None]:
# Shape of water_u after concatenating the NetCDF files along time.
ds.water_u.shape

In [None]:
# Plot the 2D slice of water_u at index 0 of the first two dimensions
# (presumably the first time step and first vertical level; confirm dimension order).
ds.water_u[0,0,:,:].plot()

#### The individual JSON files also work fine

In [None]:
# Second S3 filesystem, opened with requester_pays=True, used for the
# bucket that holds the per-file reference JSONs.
fs2 = fsspec.filesystem('s3', requester_pays=True)

In [None]:
# List the per-file reference JSONs (this rebinds flist, which held the NetCDF paths).
flist = fs2.ls('s3://esip-qhub/usgs/gom1km/testing/jsons/')

# Open a file handle for each JSON.
# NOTE(review): fobjs is not used by any later cell visible in this notebook,
# and these handles are never closed; confirm whether this line is still needed.
fobjs = list(map(fs2.open, flist))

In [None]:
# Prefix each listed path with the s3:// scheme to form a full URL.
furls = [f's3://{json_key}' for json_key in flist]

In [None]:
# Take the first per-file reference JSON for the single-file check.
fo = furls[0]

In [None]:
#fsspec.utils.setup_logging(logger_name='s3fs')
#fsspec.utils.setup_logging(logger_name='fsspec.reference')

In [None]:
# Storage options used when reading the data the references point to.
# Alternative kept from the original notebook:
#r_opts = {'anon': False} # NetCDF files on AWS Open Data public bucket
r_opts = {'requester_pays': True}

# Build a reference filesystem from the single JSON selected in `fo`.
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    remote_protocol='s3',
    remote_options=r_opts,
)

# Expose the reference filesystem as a mapping and open it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
# Shape of water_u as read through the single-file references.
ds.water_u.shape

In [None]:
# Plot water_u from the single-file references.
# NOTE(review): no index is applied here, unlike the [0,0,:,:] slice plotted
# elsewhere in this notebook; confirm that plotting the whole variable is intended.
ds.water_u.plot()

In [None]:
# Show the data variables in the dataset opened from the single-file references.
ds.data_vars

In [None]:
# List the entries stored under water_u in the reference filesystem.
fs.ls('water_u')

In [None]:
# List the entries stored under surf_wnd_stress_e in the reference filesystem.
fs.ls('surf_wnd_stress_e')

#### Create consolidated JSON

In [None]:
# Combine the per-file reference JSONs listed in `furls` into one reference
# set, concatenated along "time".
mzz = MultiZarrToZarr(
    furls,
    # Options for reading the JSON reference files.
    storage_options={'requester_pays': True},
    remote_protocol='s3',
    # Options for reading the NetCDF files. `anon` is a boolean flag: pass
    # True, not the string 'True' (which only worked because a non-empty
    # string is truthy), matching the other anon settings in this notebook.
    remote_options={'anon': True},
    # Switch off xarray's decoding steps when each input is opened.
    xarray_open_kwargs={
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
#        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False,
    },
    # Arguments forwarded to the concatenation step; these mirror the
    # xr.concat call used earlier on the raw NetCDF files.
    xarray_concat_args={
#          "data_vars": "minimal",
#          "coords": "minimal",
#          "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time",
    },
)

In [None]:
%%time
# Optional profiling magic, kept disabled:
#%%prun -D multizarr_profile 
# Run the combine step; 'gom1km.json' is the file name opened later in this notebook.
mzz.translate('gom1km.json')

#### Try opening the consolidated JSON

In [None]:
import xarray as xr

# Local consolidated reference file to open.
fo = 'gom1km.json'

# NetCDF files on AWS Open Data public bucket, so anonymous access is used.
r_opts = {'anon': True}

# Build a reference filesystem on top of the consolidated JSON.
fs = fsspec.filesystem(
    "reference",
    fo=fo,
    remote_protocol='s3',
    remote_options=r_opts,
)

# Expose the reference filesystem as a mapping and open it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
# Display water_u as read through the consolidated references.
ds.water_u

In [None]:
# Plot the 2D slice of water_u at index 0 of the first two dimensions
# (presumably the first time step and first vertical level; confirm dimension order).
ds.water_u[0,0,:,:].plot()

In [None]:
# Fetch the raw .zarray metadata entry for water_u from the mapper.
m['water_u/.zarray']

In [None]:
# List the entries stored under surf_wnd_stress_e in the consolidated references.
fs.ls('surf_wnd_stress_e')

So `water_u` is 4D, but `MultiZarrToZarr` is creating 5D references for it instead of 4D references:

In [None]:
# List the entries stored under water_u to inspect the keys in the consolidated references.
fs.ls('water_u')

As a result, looking up a chunk with a 4D key fails:

In [None]:
# Request the water_u chunk at index 0 on all four axes, using a 4-index key.
m['water_u/0.0.0.0']