# NWM ReferenceFileSystem JSON 
Create ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3 

In [None]:
# this PR uses only the first two files to figure things out
# !pip install git+https://github.com/martindurant/fsspec-reference-maker@self_combine

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray

In [None]:
import fsspec_reference_maker
print(fsspec_reference_maker.__version__)

In [None]:
fs = fsspec.filesystem('s3', anon=True)

#### Look at the first few files

In [None]:
fs.ls('noaa-nwm-retrospective-2-1-pds/model_output/1979/')[:5]

#### Look at the last few files

In [None]:
fs.ls('noaa-nwm-retrospective-2-1-pds/model_output/2020/')[-5:]

#### Use the first and last file to create the list of dates

In [None]:
import pandas as pd
dates = pd.date_range(start='1979-02-01 01:00',end='2020-12-31 23:00', freq='1h')

In [None]:
len(dates)

In [None]:
def date2cfile(date):
    """Return the S3 URL of the CHRTOUT file for ``date``.

    ``date`` must provide ``strftime``. The URL is made of the four-digit
    year directory followed by a ``YYYYMMDDHH`` stamp with a literal ``00``
    appended.
    """
    year_dir = date.strftime('%Y')
    hour_stamp = date.strftime('%Y%m%d%H')
    return f's3://noaa-nwm-retrospective-2-1-pds/model_output/{year_dir}/{hour_stamp}00.CHRTOUT_DOMAIN1.comp'

We need to add the "s3://" prefix to each file path so that fsspec will recognize that these NetCDF files are on S3:

In [None]:
# One S3 URL per timestamp in `dates`.
urls = list(map(date2cfile, dates))

# Keyword arguments used when opening each remote input file.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
print(urls[0])
print(urls[-1])

#### Start a Dask Gateway cluster

In [None]:
# NOTE(review): `os` is already imported in the first import cell; this
# repeat is harmless but redundant.
import os
import sys
# Add $HOME/shared/users/lib to the import path (presumably where `ebdpy`
# lives -- confirm).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# NOTE(review): set_credentials is called again below with region and
# endpoint added; this first call looks redundant -- confirm before removing.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
# Upper bound on workers requested from the cluster helper.
worker_max = 30
# `client` and `cluster` are used by later cells (client.close(),
# cluster.shutdown()).
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Pangeo Worker', 
                                      propagate_env=True)

#### Create the individual JSON files directly on S3 

We passed AWS credentials to the Dask workers via environment variables above. The Dask workers don't have the AWS credentials file with profiles defined, so we don't specify a profile here; we just set `anon=False` and let the workers find the credentials via the environment variables:

In [None]:
fs2 = fsspec.filesystem('s3', anon=False)  

If the directory exists, remove it (and all the files)

In [None]:
json_dir = 's3://esip-qhub/usgs/nwm_reanalysis_v21/jsons/'

In [None]:
# Start from a clean slate by removing JSONs left over from a previous run.
# A missing directory is the expected first-run case and is ignored; any
# other failure (credentials, permissions, network) is allowed to surface
# instead of being silently swallowed.
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    pass

In [None]:
urls[0].split('/')[-1]

In [None]:
def gen_json(u):
    """Create the reference JSON for one input file and write it under ``json_dir``.

    The input ``u`` is opened through the module-level ``fs`` with the ``so``
    open options and translated with ``SingleHdf5ToZarr``. The result is
    written through ``fs2`` to ``<json_dir><last path component of u>.json``.

    Args:
        u: URL of the source file; its final ``/``-separated component is
            used as the base name of the output JSON.
    """
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        fname = u.split('/')[-1]
        outf = f'{json_dir}{fname}.json'
        print(outf)
        # Build the full payload while the input is still open, and before
        # the output object is opened, so a failure in translate() cannot
        # leave an empty or partial JSON behind for later listing.
        payload = ujson.dumps(h5chunks.translate()).encode()
    with fs2.open(outf, 'wb') as f:
        f.write(payload)

#### Parallel creation of JSON for each file using Dask Bag

In [None]:
import dask.bag as db

In [None]:
%time b = db.from_sequence(urls, npartitions=900)

In [None]:
b1 = b.map(gen_json)

In [None]:
%%time
from dask.distributed import performance_report
with performance_report(filename="dask-report-whole.html"):
    b1.compute(retries=10)

In [None]:
client

#### Parallel creation of JSONS for each file using Dask Delayed (Deprecated): 
(we tried this before Dask Bag)

In [None]:
#%%time
# _ = dask.compute(*[dask.delayed(gen_json)(u) for u in urls[:1000]], retries=10);

In [None]:
%%time
flist2 = fs2.ls(json_dir)
furls = sorted(['s3://'+f for f in flist2])
furls[0]

In [None]:
len(furls)

#### Switch to a Dask LocalCluster
Since MultiZarrToZarr only writes local files, we close the client and create a local cluster instead.

In [None]:
client.close()

In [None]:
from dask.distributed import Client

In [None]:
client = Client(n_workers=1)

In [None]:
client

In [None]:
# Combine the per-file reference JSONs along the "time" dimension.
mzz = MultiZarrToZarr(furls,
    # Options for reading the reference JSONs listed in `furls`
    # (written above with authenticated access).
    storage_options={'anon': False},
    remote_protocol='s3',
    # Options for the data the references point to (read anonymously above).
    # Must be the boolean True: the string 'True' is only accidentally truthy,
    # and 'False' would be truthy as well.
    remote_options={'anon': True},
    # Turn off xarray decoding and drop variables that are not wanted
    # in the combined dataset.
    xarray_open_kwargs={
        'decode_cf' : False,
        'mask_and_scale' : False,
        'decode_times' : False,
        'use_cftime' : False,
        'drop_variables': ['reference_time', 'crs'],
        'decode_coords' : False
    },
    xarray_concat_args={
#        "data_vars": "minimal",
#        "coords": "minimal",
#        "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time"
    }
)

In [None]:
%%time
#%%prun -D multizarr_profile 
local_json = 'nwm_reanalysis_v21.json'
mzz.translate(local_json)

#### Copy the local consolidated JSON file to S3

In [None]:
rpath = 's3://esip-qhub-public/noaa/nwm/nwm_reanalysis_v21.json'

fs2.put_file(lpath=local_json, rpath=rpath)

#### Try a single json

In [None]:
f = fs2.glob(f'{json_dir}1979020101*.json')[0]

In [None]:
# Open the single-file reference JSON through a "reference" filesystem.
# It gets its own name: rebinding `fs` here would clobber the anonymous S3
# filesystem that gen_json() and the earlier fs.ls() cells depend on, and
# break them if re-run.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'anon':True}
ref_fs = fsspec.filesystem("reference", fo=f's3://{f}', ref_storage_args=s_opts,
                           remote_protocol='s3', remote_options=r_opts)
m = ref_fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
# The original call was left unterminated (a syntax error). It is completed
# here with chunks={}, the same option used for the final dataset in this
# notebook.
ds = xr.open_dataset(m, engine="zarr", chunks={})

In [None]:
ds.streamflow.encoding

#### Try opening the consolidated JSON file from S3

In [None]:
# Open the consolidated reference JSON at `rpath` through a "reference"
# filesystem. It gets its own name: rebinding `fs` here would clobber the
# anonymous S3 filesystem that gen_json() and the earlier fs.ls() cells
# depend on.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'anon':True}
ref_fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
                           remote_protocol='s3', remote_options=r_opts)
m = ref_fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")

In [None]:
%%time
ds.streamflow[:,1000].hvplot(x='time', grid=True)

In [None]:
cluster.shutdown(); client.close()

In [None]:
# Open the reference file '1000.json' through a "reference" filesystem.
# It gets its own name: rebinding `fs` here would clobber the anonymous S3
# filesystem that gen_json() and the earlier fs.ls() cells depend on.
# NOTE(review): '1000.json' is not created anywhere in this notebook --
# confirm where it comes from.
s_opts = {'requester_pays':True, 'skip_instance_cache':True}
r_opts = {'anon':True}
ref_fs = fsspec.filesystem("reference", fo='1000.json', ref_storage_args=s_opts,
                           remote_protocol='s3', remote_options=r_opts)
m = ref_fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", chunks={})

In [None]:
ds

In [None]:
%%time
ds.streamflow[:10,1000].hvplot(x='time')

In [None]:
ds