# NWM ReferenceFileSystem JSON 
Create a ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3.

In [None]:
# !pip install git+https://github.com/martindurant/fsspec-reference-maker@self_combine

In [None]:
import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray

In [None]:
import fsspec_reference_maker

In [None]:
fsspec_reference_maker.__version__

In [None]:
# S3 filesystem opened with the 'esip-qhub' profile; the notebook uses it to
# list the input reference JSONs and to upload/glob the combined outputs.
fs2 = fsspec.filesystem('s3', profile='esip-qhub')  

In [None]:
# S3 prefix holding the input reference JSONs
# (presumably one per NWM NetCDF file -- confirm against the producer notebook).
json_dir = 's3://esip-qhub/usgs/nwm_reanalysis_v21/jsons/'

In [None]:
%%time
flist2 = fs2.ls(json_dir)
furls = sorted(['s3://'+f for f in flist2])
furls[0]

In [None]:
len(furls)

In [None]:
# NOTE(review): `client` is only created in later cells (`Client()` and
# `ebd.start_dask_cluster(...)`), so a top-to-bottom run raises NameError here.
client

In [None]:
def preprocess(ds):
    """Return ``ds`` with ``latitude`` and ``longitude`` set as coordinates.

    Handed to ``MultiZarrToZarr`` through its ``preprocess`` argument.

    Parameters
    ----------
    ds
        Object exposing ``set_coords`` (presumably an ``xarray.Dataset``).

    Returns
    -------
    Whatever ``ds.set_coords`` returns for the two coordinate names.
    """
    coord_names = ['latitude', 'longitude']
    return ds.set_coords(coord_names)

In [None]:
import tempfile

In [None]:
def tasks(furls):
    """Combine a batch of reference JSONs into one file and upload it to S3.

    The batch is combined with ``MultiZarrToZarr`` (concatenating along
    ``time``), written to a scratch directory, and uploaded with ``fs2`` to
    ``s3://esip-qhub-public/noaa/nwm/<name>``, where ``<name>`` is the file
    name of the first URL in the batch.

    Parameters
    ----------
    furls : list of str
        ``s3://`` URLs of the reference JSONs to combine. Must be non-empty,
        because the output name is derived from ``furls[0]``.

    Returns
    -------
    None
    """
    mzz = MultiZarrToZarr(
        furls,
        remote_protocol="s3",
        remote_options={'anon': True},
        xarray_open_kwargs={
            "decode_cf": False,
            "mask_and_scale": False,
            "drop_variables": ["crs", "reference_time"],
        },
        xarray_concat_args={'dim': 'time'},
        preprocess=preprocess,
    )

    local_name = furls[0].split('/')[-1]
    rpath = f's3://esip-qhub-public/noaa/nwm/{local_name}'

    # Use a self-cleaning scratch directory: this function runs once per
    # partition on long-lived workers, and mkdtemp() would leave one directory
    # plus one combined JSON behind on every call (including failed retries).
    with tempfile.TemporaryDirectory() as wd:
        local_json = os.path.join(wd, local_name)
        mzz.translate(local_json)
        fs2.put_file(lpath=local_json, rpath=rpath)

In [None]:
from dask.distributed import Client

In [None]:
#%%time
#_ = dask.compute(*[dask.delayed(tasks)(u) for u in [furls[:1000],furls[1000:2000]]], retries=10);

In [None]:
%%time
# Direct (non-Dask) call: combine the first 1000 reference JSONs in one batch.
tasks(furls[:1000])

In [None]:
#shut everything down
#client.close();cluster.shutdown()

#### Strategy to parallelize MultiZarrToZarr:
* Use Dask Bag with partitions to create a collection of consolidated files, each built from 1000 or so JSONs
* Then consolidate the consolidated files

In [None]:
# test first 400 jsons with local client (only 4 workers, each with only 1.75GB)
# dask.bag is imported here so this cell does not depend on a later cell for `db`.
import dask.bag as db
from dask.distributed import Client, performance_report

client = Client()

# 4 partitions of the first 400 URLs; `tasks` receives one partition per call.
b = db.from_sequence(furls[:400], npartitions=4)
b1 = b.map_partitions(tasks)

# Run the partitions (retrying failed tasks) and record a performance report.
with performance_report(filename="dask-report-whole.html"):
    b1.compute(retries=10)

In [None]:
#### Let's see the collection of consolidated JSONs we made

In [None]:
# Objects matching *comp.json under the output prefix, as s3:// URLs.
f = ['s3://' + a for a in fs2.glob('s3://esip-qhub-public/noaa/nwm/*comp.json')]
f

### We would like to process the whole list of furls with a distributed cluster
(I haven't been successful with this yet.)

In [None]:
import os
import sys

# Make the shared user library directory importable so `ebdpy` can be found.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
worker_max = 30

# Credentials are set twice: first with the profile alone, then again with the
# explicit region and endpoint.
ebd.set_credentials(profile=profile)
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Start the cluster (or attach to an existing one) without adaptive scaling and
# without waiting for it to be ready.
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
import dask.bag as db

In [None]:
# Dask Bag over every reference-JSON URL, requesting 400 partitions.
b = db.from_sequence(furls, npartitions=400)

In [None]:
# Apply `tasks` to each partition of the bag; work is triggered by compute().
b1 = b.map_partitions(tasks)

In [None]:
%%time
from dask.distributed import performance_report
# Run every partition with up to 10 retries and record a Dask performance report.
# NOTE(review): this is the same report filename as the 400-JSON test cell, so
# that earlier report is presumably overwritten -- confirm this is intended.
with performance_report(filename="dask-report-whole.html"):
    b1.compute(retries=10)

In [None]:
%%time
#%%prun -D multizarr_profile 

# NOTE(review): `mzz` is never assigned at notebook scope -- it is a local
# variable inside `tasks` -- so this cell raises NameError as written.
# Also note that `mzz.translate(local_json)` is evaluated first, and only its
# return value is handed to `dask.compute`.
local_json = 'nwm_reanalysis_v21.json'
dask.compute(mzz.translate(local_json))

#### Try opening one of the consolidated JSONs from S3

In [None]:
f[-1]

In [None]:
# Open the last combined reference JSON through fsspec's "reference"
# filesystem (anonymous S3 access) and read it with xarray's zarr engine.
s_opts = dict(anon=True, skip_instance_cache=True)
r_opts = dict(anon=True)
fs = fsspec.filesystem(
    "reference",
    fo=f[-1],
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
m = fs.get_mapper("")
ds = xr.open_dataset(
    m,
    engine="zarr",
    mask_and_scale=False,
    backend_kwargs=dict(consolidated=False),
)

In [None]:
ds

In [None]:
ds.streamflow

In [None]:
# First time step of streamflow as a one-column DataFrame.
first_step = ds['streamflow'].isel(time=0)
df = first_step.to_pandas().to_frame()

In [None]:
# Attach the coordinates as columns and give the value column (labelled 0)
# the name used by the plot.
df = df.assign(latitude=ds.latitude, longitude=ds.longitude).rename(
    columns={0: "transport"}
)

In [None]:
import hvplot.pandas
import geoviews as gv
from holoviews.operation.datashader import rasterize
import cartopy.crs as ccrs

In [None]:
# numpy is not imported anywhere else in the notebook; it is needed for the
# NaN upper colour limit passed to clim below.
import numpy as np

# Geographic scatter of the points, coloured by the 'transport' column.
p = df.hvplot.points('longitude', 'latitude', geo=True,
                     c='transport', colorbar=True, size=14)

# Rasterize with a per-pixel mean and a log colour scale. The lower colour
# limit is fixed at 1e-2; the NaN upper limit presumably leaves that bound to
# be derived from the data -- confirm against the HoloViews docs.
g = rasterize(p, aggregator='mean', x_sampling=0.02, y_sampling=0.02, width=500).opts(
    tools=['hover'], aspect='equal', logz=True, cmap='viridis', clim=(1e-2, np.nan))

# Overlay on OpenStreetMap tiles.
g * gv.tile_sources.OSM