# NCOM ReferenceFileSystem JSON
Create a ReferenceFileSystem JSON file for a collection of NCOM NetCDF files on S3

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask

In [None]:
fs = fsspec.filesystem('s3', profile='julia')

In [None]:
flist = fs.glob('coawst-public/rsignell/testing/gom1km/ncom_1_*.nc')
flist

In [None]:
urls = ["s3://" + f for f in flist]

In [None]:
ds = xr.open_dataset(fs.open(flist[0]))

In [None]:
ds.water_u.plot()

We need to include the "s3://" prefix to the list of files so that fsspec will recognize that these JSON files are on S3.   There is no "storage_

In [None]:
# Prefix every bucket key with the protocol so each entry is a full S3 URL.
urls = [f"s3://{key}" for key in flist]

# Keyword arguments unpacked into ``fs.open`` when the NetCDF files are read.
so = {
    'mode': 'rb',
    'anon': False,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

If the directory exists, remove it (and all the files), then create it:

In [None]:
fs2 = fsspec.filesystem('s3', anon=False)  

In [None]:
import os
import sys

# Extend the import path first; presumably this is where ebdpy lives
# (TODO confirm), so the import below has to come after the append.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

#ebd.set_credentials(profile='julia')

profile, region = 'julia', 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

worker_max = 10
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=True,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
#client.close(); cluster.shutdown()

In [None]:
import configparser
import os
def set_aws_credentials(
    cfile=os.path.join(os.path.expanduser('~'), '.aws', 'credentials'),
    profile_name='default',
    region_name='us-east-1',
    endpoint='s3.amazonaws.com',
    verbose=False,
):
    """Export one profile of an AWS credentials file as environment variables.

    The variables are always overwritten, whether or not they were already
    set: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_PROFILE,
    AWS_DEFAULT_PROFILE, AWS_S3_REGION, AWS_S3_ENDPOINT and
    AWS_DEFAULT_REGION.

    Parameters
    ----------
    cfile : str
        Path of the INI-style credentials file.
    profile_name : str
        Section of ``cfile`` to read the key pair from.
    region_name : str
        Value stored in AWS_S3_REGION and AWS_DEFAULT_REGION.
    endpoint : str
        Value stored in AWS_S3_ENDPOINT.
    verbose : bool
        When true, print ``export`` lines for the key pair. This writes the
        secret key to stdout, so avoid it in shared notebooks.

    Raises
    ------
    KeyError
        If ``cfile`` could not be read, ``profile_name`` is not a section of
        it, or the section lacks one of the two key entries. Nothing is
        written to the environment in that case.
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open instead of raising, so
    # without this check a wrong path surfaces only as KeyError(profile_name).
    loaded = parser.read(cfile)
    if profile_name not in parser:
        reason = 'profile not defined in file' if loaded else 'file could not be read'
        raise KeyError(
            f'AWS profile {profile_name!r} not found in {cfile!r} ({reason})'
        )

    # Read both keys before touching the environment so a missing entry
    # cannot leave a half-updated set of variables behind.
    section = parser[profile_name]
    access_key = section['aws_access_key_id']
    secret_key = section['aws_secret_access_key']

    os.environ.update({
        'AWS_ACCESS_KEY_ID': access_key,
        'AWS_SECRET_ACCESS_KEY': secret_key,
        'AWS_PROFILE': profile_name,
        'AWS_DEFAULT_PROFILE': profile_name,
        'AWS_S3_REGION': region_name,
        'AWS_S3_ENDPOINT': endpoint,
        'AWS_DEFAULT_REGION': region_name,
    })

    if verbose:
        print('export {}={}'.format('AWS_ACCESS_KEY_ID', access_key))
        print('export {}={}'.format('AWS_SECRET_ACCESS_KEY', secret_key))

In [None]:
set_aws_credentials(profile_name='julia')

In [None]:
from dask.distributed import WorkerPlugin
import os
import uuid
import asyncio

class InitWorker(WorkerPlugin):
    """Worker plugin that uploads files and/or an inline script to a worker.

    Payloads are collected in ``self.data`` (file name -> bytes) when the
    plugin is built, and handed to ``worker.upload_file(..., load=True)``
    from ``setup``.

    Parameters
    ----------
    filepath : str or iterable of str, optional
        Local file(s) to read; each is stored under its base name.
    script : str or bytes, optional
        Source text stored under a generated ``<uuid1>.py`` name.
    """

    name = "init_worker"

    def __init__(self, filepath=None, script=None):
        self.data = {}
        if filepath:
            if isinstance(filepath, str):
                filepath = [filepath]
            for file_ in filepath:
                with open(file_, "rb") as f:
                    self.data[os.path.basename(file_)] = f.read()
        if script:
            # Keep every payload as bytes. setup() compares len(data) with
            # the "nbytes" the worker reports, and for a str containing
            # non-ASCII characters the character count differs from the
            # encoded byte count.
            if isinstance(script, str):
                script = script.encode()
            self.data[f"{uuid.uuid1()}.py"] = script

    async def setup(self, worker):
        """Upload all payloads concurrently and check the reported sizes."""
        responses = await asyncio.gather(
            *[
                worker.upload_file(
                    comm=None, filename=filename, data=data, load=True
                )
                for filename, data in self.data.items()
            ]
        )
        # Responses come back in the same order as self.data was iterated.
        assert all(
            len(data) == r["nbytes"]
            for r, data in zip(responses, self.data.values())
        ), "worker reported a different byte count than was sent"


In [None]:
# Source of a tiny module that re-creates the local AWS environment variables
# when it is imported on a worker.
# - Values are embedded with repr(), so quotes or backslashes in a value
#   cannot break the generated code.
# - Variables that are unset locally are skipped rather than exported as the
#   literal string "None".
script = "\n".join(
    ["", "import os"]
    + [
        f'os.environ["{name}"] = {os.environ[name]!r}'
        for name in (
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_DEFAULT_REGION",
        )
        if name in os.environ
    ]
    + [""]
)

In [None]:
plugin = InitWorker(script=script)
client.register_worker_plugin(plugin)

In [None]:
# client.run expects a callable plus its arguments. The previous form called
# print() locally and handed its None result to client.run. This returns the
# value seen by each worker (note: the access key id is displayed in the output).
client.run(os.getenv, "AWS_ACCESS_KEY_ID")

In [None]:
cluster

In [None]:
# NOTE(review): if the cells are run top to bottom, closing the client here
# leaves the later dask.compute(...) cell and the final shutdown cell without
# an open client -- confirm whether this cell is meant to be run at this point.
client.close()

We passed AWS credentials to the Dask workers via environment variables above. The Dask workers don't have the AWS credentials file with profiles defined, so we can't define a profile here.

In [None]:
def gen_json(u):
    """Write the reference JSON for one HDF5/NetCDF file.

    Opens ``u`` through the module-level ``fs`` using the ``so`` options,
    wraps it in ``SingleHdf5ToZarr`` and stores the translated references as
    ``<file name>.json`` under the ``gom1km/jsons/`` prefix. The output path
    is printed before it is written.
    """
    with fs.open(u, **so) as src:
        translator = SingleHdf5ToZarr(src, u, inline_threshold=300)
        basename = u.rsplit('/', 1)[-1]
        outf = 'coawst-public/rsignell/testing/gom1km/jsons/' + basename + '.json'
        print(outf)
        with fs.open(outf, 'wb') as dst:
            references = translator.translate()
            dst.write(ujson.dumps(references).encode())

In [None]:
%%time
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);

#### Try accessing the individual JSON files directly from S3 as file-like objects

In [None]:
# List everything under the prefix that gen_json writes its JSON files to.
flist = fs.ls('coawst-public/rsignell/testing/gom1km/jsons/')
# NOTE(review): these handles are opened but never used or closed in the
# visible cells (later cells work from the URL list) -- confirm they are needed.
fobjs = [fs.open(f) for f in flist]

In [None]:
furls = sorted(['s3://'+f for f in flist])

In [None]:
furls

#### Try opening one of the single JSONs

In [None]:
import xarray as xr

#r_opts = {'anon': False} # NetCDF files on AWS Open Data public bucket
r_opts = {'profile': 'julia'}
fo = 's3://coawst-public/rsignell/testing/gom1km/jsons/ncom_1_2021032200_00780000.nc.json'

# Keep the reference filesystem under its own name. Rebinding the global
# ``fs`` would replace the S3 filesystem that gen_json and the fs.ls cell
# rely on, so those cells would break when re-run.
ref_fs = fsspec.filesystem("reference", fo=fo,
                           remote_protocol='s3', remote_options=r_opts)

m = ref_fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
ds.water_u.plot()

#### Now create a single JSON for the whole dataset

In [None]:
furls

In [None]:
# Combine the per-file reference JSONs listed in furls into one reference set,
# concatenated along "time" with all xarray decoding switched off.
mzz = MultiZarrToZarr(furls, 
    storage_options={'profile':'julia'},  # presumably used to open the JSON files in furls -- TODO confirm
    remote_protocol='s3',                  
    remote_options={'profile':'julia'},   # presumably used for the data the JSONs reference, not the JSON files -- TODO confirm
    xarray_open_kwargs={
        'decode_cf' : False,
        'mask_and_scale' : False,
        'decode_times' : False,
        'use_cftime' : False,
#        'drop_variables': ['reference_time', 'crs'],
        'decode_coords' : False
    },
    xarray_concat_args={
#          "data_vars": "minimal",
#          "coords": "minimal",
#          "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time"
    }
)

Download the individual JSON files from S3:

In [None]:
#fs2.download('s3://esip-qhub/usgs/testing/jsons/', './jsons', recursive=True)

In [None]:
##from glob import glob
#json_list = sorted(glob("jsons/*.json"))

In [None]:
# mzz = MultiZarrToZarr(json_list, 
#     remote_protocol='s3',    #
#     remote_options={'anon' : 'True'},    
#     xarray_open_kwargs={
#         'decode_cf' : False,
#         'mask_and_scale' : False,
#         'decode_times' : False,
#         'use_cftime' : False,
#         'drop_variables': ['reference_time', 'crs'],
#         'decode_coords' : False
#     },
#     xarray_concat_args={
# #          "data_vars": "minimal",
# #          "coords": "minimal",
# #          "compat": "override",
#         "join": "override",
#         "combine_attrs": "override",
#         "dim": "time"
#     }
# )

In [None]:
%%time
#%%prun -D multizarr_profile 
mzz.translate('gom1km.json')

#### Try opening the consolidated JSON file

In [None]:
import xarray as xr

#r_opts = {'anon': False} # NetCDF files on AWS Open Data public bucket
r_opts = {'profile':'julia'}
fo = 'gom1km.json'

# Keep the reference filesystem under its own name. Rebinding the global
# ``fs`` would replace the S3 filesystem that gen_json and the fs.ls cell
# rely on, so those cells would break when re-run.
ref_fs = fsspec.filesystem("reference", fo=fo,
                           remote_protocol='s3', remote_options=r_opts)

m = ref_fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
ds.water_u

In [None]:
ds.water_u.plot()

In [None]:
import hvplot.xarray
import numpy as np

In [None]:
ds['speed'] = np.sqrt(ds.water_u**2 + ds.water_v**2)

In [None]:
ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180))

In [None]:
ds.water_u[-1,-1,:,:].plot()

In [None]:
ds.speed[-1,:,:].hvplot.quadmesh(x='lon', y='lat', 
                                 rasterize=True, cmap='turbo', 
                                 geo=True, tiles='OSM')

In [None]:
ds.surf_wnd_stress_e.hvplot.image(x='lon', y='lat', rasterize=True)

In [None]:
cluster.shutdown(); client.close()