# NWM ReferenceFileSystem JSON 
Create ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3 

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask

In [None]:
fs = fsspec.filesystem('s3', profile='julia')

In [None]:
flist = fs.glob('coawst-public/rsignell/testing/gom1km/ncom_1_*.nc')
flist

In [None]:
urls = ["s3://" + f for f in flist]

In [None]:
ds = xr.open_dataset(fs.open(flist[0]))

In [None]:
ds.water_u.plot()

We need to add the "s3://" prefix to each path in the list of files so that fsspec will recognize that these NetCDF files are on S3.   There is no "storage_

In [None]:
urls = ["s3://" + f for f in flist]

so = dict(mode='rb', profile='julia', default_fill_cache=False, default_cache_type='first')

If the directory exists, remove it (and all the files), then create it:

In [None]:
from dask_gateway import Gateway
from dask.distributed import Client

gateway = Gateway()
# Query the gateway once and reuse the result, so the emptiness check and the
# loop below report on the same snapshot instead of issuing a separate
# request for each.
running_clusters = gateway.list_clusters()
if running_clusters:
    print('Existing Dask clusters:')
    for c in running_clusters:
        print('Cluster Name:',c.name,c.status)
else:
    print('No Cluster running.')

In [None]:
# New or connect:
# If no cluster is running, create a new one, else connect to the first one found (idx=0, change if other cluster should be running)
idx=0
# Ask the gateway for the cluster list once: the decision and the lookup by
# index must use the same list, otherwise a cluster that stops between two
# separate queries would make the index lookup fail.
existing_clusters = gateway.list_clusters()
if not existing_clusters:
    cluster = gateway.new_cluster(environment='pangeo', profile='Small Worker')
else:
    cluster = gateway.connect(existing_clusters[idx].name)

In [None]:
client = Client(cluster)

In [None]:
cluster.scale(5)

In [None]:
import os
from dask.distributed import WorkerPlugin

class UploadFile(WorkerPlugin):
    """A WorkerPlugin to upload a local file to workers.

    The file is read into memory when the plugin is constructed and is
    written back out, under the same directory and file name, when the
    plugin's ``setup`` runs on a worker.

    Parameters
    ----------
    filepath: str
        A path to the file to upload

    Examples
    --------
    >>> client.register_worker_plugin(UploadFile(".env"))
    """
    def __init__(self, filepath):
        """
        Initialize the plugin by reading in the data from the given file.
        """
        self.filename = os.path.basename(filepath)
        # Empty string when `filepath` has no directory component (e.g. ".env").
        self.dirname = os.path.dirname(filepath)
        with open(filepath, "rb") as f:
            self.data = f.read()

    async def setup(self, worker):
        """
        Write the stored file contents to the same path on the worker.

        Returns the listing of the directory the file was written to.
        """
        if self.dirname:
            # makedirs also creates missing parents and tolerates the
            # directory already existing (no exists()/mkdir() race).
            os.makedirs(self.dirname, exist_ok=True)
        # Write via the full path rather than os.chdir(), so the worker
        # process's working directory is left untouched. With an empty
        # dirname the file lands in the current working directory.
        target = os.path.join(self.dirname, self.filename)
        with open(target, "wb") as f:
            f.write(self.data)
        return os.listdir(self.dirname or os.curdir)

In [None]:
client.register_worker_plugin(UploadFile('/home/jovyan/.aws/credentials'))
client.register_worker_plugin(UploadFile('/home/jovyan/.aws/config'))

In [None]:
#client.close; cluster.shutdown()

In [None]:
cluster

We uploaded the AWS credentials and config files to the Dask workers with the `UploadFile` plugin above, so the workers can use the named profile given in `so` when they open the files on S3.

In [None]:
def gen_json(u):
    """Write the reference JSON for a single file.

    Opens ``u`` through the notebook-level filesystem ``fs`` using the
    options in ``so``, builds a ``SingleHdf5ToZarr`` translator for it, and
    writes the JSON-encoded result of ``translate()`` to
    ``coawst-public/rsignell/testing/gom1km/jsons/<file name>.json``.

    Parameters
    ----------
    u: str
        URL of the input file; the text after the last ``/`` is used as the
        output file name.
    """
    with fs.open(u, **so) as source:
        translator = SingleHdf5ToZarr(source, u, inline_threshold=300)
        # Output name is derived from the last path component of the URL.
        fname = u.rsplit('/', 1)[-1]
        outf = f'coawst-public/rsignell/testing/gom1km/jsons/{fname}.json'
        print(outf)
        with fs.open(outf, 'wb') as sink:
            references = translator.translate()
            sink.write(ujson.dumps(references).encode())

In [None]:
%%time
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);

#### Try accessing the individual JSON files directly from S3 as file-like objects

In [None]:
# `fs2` is not defined anywhere earlier in this notebook, so create it here;
# without this the cell fails with a NameError. The options mirror the S3
# filesystem that was used to write the JSON files.
# NOTE(review): confirm this is the intended profile for reading the JSONs.
fs2 = fsspec.filesystem('s3', profile='julia')
flist = fs2.ls('coawst-public/rsignell/testing/gom1km/jsons/')
# NOTE(review): `fobjs` is not used by the cells visible below, and these
# handles are never explicitly closed.
fobjs = [fs2.open(f) for f in flist]

In [None]:
furls = sorted(['s3://'+f for f in flist])

In [None]:
furls

In [None]:
# Build the combiner over the sorted per-file reference JSON URLs (`furls`).
# xarray decoding is switched off for every input and the datasets are
# concatenated along the "time" dimension.
# NOTE(review): presumably `storage_options` is used to read the JSON files
# themselves and `remote_options` to read the data they reference — confirm
# against the MultiZarrToZarr documentation.
mzz = MultiZarrToZarr(furls, 
    storage_options={'anon':False},
    remote_protocol='s3',                  
    remote_options={'anon' : False},   # presumably for the referenced data files, not the JSONs — TODO confirm
    xarray_open_kwargs={
        'decode_cf' : False,
        'mask_and_scale' : False,
        'decode_times' : False,
        'use_cftime' : False,
#        'drop_variables': ['reference_time', 'crs'],
        'decode_coords' : False
    },
    xarray_concat_args={
        # The three options below are deliberately left disabled.
#          "data_vars": "minimal",
#          "coords": "minimal",
#          "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time"
    }
)

Download the individual JSON files from S3:

In [None]:
#fs2.download('s3://esip-qhub/usgs/testing/jsons/', './jsons', recursive=True)

In [None]:
##from glob import glob
#json_list = sorted(glob("jsons/*.json"))

In [None]:
# mzz = MultiZarrToZarr(json_list, 
#     remote_protocol='s3',    #
#     remote_options={'anon' : 'True'},    
#     xarray_open_kwargs={
#         'decode_cf' : False,
#         'mask_and_scale' : False,
#         'decode_times' : False,
#         'use_cftime' : False,
#         'drop_variables': ['reference_time', 'crs'],
#         'decode_coords' : False
#     },
#     xarray_concat_args={
# #          "data_vars": "minimal",
# #          "coords": "minimal",
# #          "compat": "override",
#         "join": "override",
#         "combine_attrs": "override",
#         "dim": "time"
#     }
# )

In [None]:
%%time
#%%prun -D multizarr_profile 
mzz.translate('gom1km.json')

#### Try opening the consolidated JSON file

In [None]:
import xarray as xr

# Open the consolidated reference file 'gom1km.json' as a dataset.
r_opts = {'anon': False} # anon=False: non-anonymous S3 access for the referenced files
#r_opts = {'profile':'julia'}
fo = 'gom1km.json'
# NOTE(review): this rebinds the notebook-level name `fs`, which `gen_json`
# and the earlier glob cell use as the S3 filesystem; re-running those cells
# after this one would pick up the reference filesystem instead.
fs = fsspec.filesystem("reference", fo=fo, 
                       remote_protocol='s3', remote_options=r_opts)

# Map the root of the reference filesystem and read it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
ds.water_u.plot()

In [None]:
import xarray as xr

# Open one of the per-file reference JSONs directly from S3 as a dataset.
r_opts = {'anon': False} # anon=False: non-anonymous S3 access for the referenced files

fo = 's3://coawst-public/rsignell/testing/gom1km/jsons/ncom_1_2021032200_00780000.nc.json'
# NOTE(review): `remote_options` is the only S3 option passed here; how the
# JSON at `fo` itself is fetched is not visible in this cell — confirm.
# NOTE(review): this rebinds the notebook-level name `fs`, which `gen_json`
# and the earlier glob cell use as the S3 filesystem.
fs = fsspec.filesystem("reference", fo=fo, 
                       remote_protocol='s3', remote_options=r_opts)

# Map the root of the reference filesystem and read it with the zarr engine.
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
ds.water_u.plot()

In [None]:
import hvplot.xarray
import numpy as np

In [None]:
ds['speed'] = np.sqrt(ds.water_u**2 + ds.water_v**2)

In [None]:
ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180))

In [None]:
ds.water_u[-1,-1,:,:].plot()

In [None]:
ds.speed[-1,:,:].hvplot.quadmesh(x='lon', y='lat', 
                                 rasterize=True, cmap='turbo', 
                                 geo=True, tiles='OSM')

In [None]:
ds.surf_wnd_stress_e.hvplot.image(x='lon', y='lat', rasterize=True)

In [None]:
# Disconnect the client before shutting the cluster down, so the client is
# not left attached to a scheduler that has already been stopped.
client.close()
cluster.shutdown()