# Explore the National Water Model Reanalysis v2.1 
Explore the NWM Reanalysis (1979-2020) NetCDF files (all 367,439 of them) on AWS as a single xarray dataset! 
The only new file we created is a JSON file that points to the data chunks in the original NetCDF files; that JSON file is then read with the [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) and [zarr](https://zarr.readthedocs.io/en/stable/) packages. 

See this [blog post](https://medium.com/pangeo/cloud-performant-netcdf4-hdf5-with-zarr-fsspec-and-intake-3d3a3e7cb935) for how this works. 

**Important note on performance**: The data in the original NetCDF files is chunked as the entire spatial domain and a single time step. Reading a long time series is therefore very slow -- extracting a time series at a single location for the entire time period requires reading and uncompressing 8TB of data! Extracting a few days or weeks of data, however, is relatively fast. 


In [None]:
import intake
import fsspec

#### Use Intake to load the consolidated NWM dataset
The Intake catalog, the consolidated JSON file it accesses, and the NetCDF files the JSON file references are all on public S3 buckets that do not require an AWS account, so no credentials are required!

In [None]:
%%time
# Open the Intake catalog (a YAML file on a public S3 bucket) that describes the NWM datasets.
cat = intake.open_catalog('s3://esip-qhub-public/noaa/nwm/nwm_catalog.yml')

In [None]:
# Show the names of the entries defined in the catalog.
list(cat)

In [None]:
%%time
# Load the 'nwm-reanalysis' catalog entry as a Dask-backed dataset (nothing is read eagerly here,
# presumably only metadata -- confirm against the catalog driver).
ds = cat['nwm-reanalysis'].to_dask()

In [None]:
import os
import sys

# Make the shared `ebdpy` helper module under $HOME/shared/users/lib importable.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

# Connection settings used for both the credentials and the Dask cluster.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
worker_max = 30

# Credentials are set twice on purpose to mirror the original cell: first with the
# profile only, then again with the explicit region and endpoint.
ebd.set_credentials(profile=profile)
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Start (or attach to an existing) Dask cluster with a fixed number of workers.
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
#client.close(); cluster.shutdown()

In [None]:
# Display a summary of the streamflow variable of the full dataset.
ds.streamflow

In [None]:
from rechunker import rechunk
import numpy as np

In [None]:
# Number of time steps handled per rechunking pass (one pass per iteration of the loop below).
# NOTE(review): 672 = 28 * 24, presumably 28 days of hourly steps -- confirm.
time_chunk_size = 672   
# Number of feature_id entries per chunk; in the visible cells it is only used to compute nh_chunks.
feature_chunk_size = 30000

In [None]:
# Number of chunks along feature_id, rounded up so that a partial trailing chunk
# is counted (a chunk count is an integer, matching how nt_chunks is computed).
nh_chunks = int(np.ceil(len(ds.feature_id) / feature_chunk_size))
nh_chunks

In [None]:
# Number of time batches: integer ceiling division, so a short final batch is included.
nt_chunks = -(-len(ds.time) // time_chunk_size)
nt_chunks

In [None]:
def delete_s3(url):
    """Recursively delete `url` if it exists; do nothing otherwise.

    The filesystem is resolved from the URL itself, using non-anonymous
    (credentialed) access.
    """
    s3 = fsspec.open(url, anon=False).fs
    if not s3.exists(url):
        return
    s3.rm(url, recursive=True)

In [None]:
# Final rechunked output store that every batch is written/appended to.
chunked_url = 's3://esip-qhub/usgs/zarr/nwm/chunked.zarr'
# Per-batch target store that rechunker writes into on each loop iteration.
step_url = 's3://esip-qhub/usgs/zarr/step/step.zarr'
# Intermediate (temp) store handed to rechunker.
temp_url = 's3://esip-qhub/usgs/zarr/tmp/temp.zarr'

# Credentialed S3 filesystem used for the mappers below; instance caching is skipped.
fs2 = fsspec.filesystem('s3', anon=False, default_fill_cache=False, skip_instance_cache=True)

# WARNING: this removes any existing output at chunked_url before a new run starts.
delete_s3(chunked_url)
zarr_chunked = fs2.get_mapper(chunked_url)

In [None]:
# List what currently exists under the temp-store prefix.
fs2.ls('s3://esip-qhub/usgs/zarr/tmp/')

In [None]:
# Memory limit passed to rechunker's `rechunk` call in the loop below.
max_mem='1.8GB'

In [None]:
# NOTE: in the visible cells `ds2` is only assigned inside the rechunking loop further
# down, so this cell raises NameError unless that loop cell has already been run.
ds2.streamflow

In [None]:
# Target chunk size per dimension; used to build `group_chunks` below.
# NOTE(review): the time chunk (720) differs from time_chunk_size (672), the number of
# time steps rechunked per loop iteration -- confirm this mismatch is intended.
chunks={'time':720, 'feature_id':30000}

In [None]:
# Inspect the on-disk encoding of streamflow for the current time slice.
# NOTE: requires `ds2`, which in the visible cells is only assigned by the loop cell further down.
ds2.streamflow.encoding

In [None]:
# Build the per-variable target chunk tuples that are passed to rechunker.
# For every dimension of a variable: use the size given in `chunks`, capped at the
# dimension length; dimensions not listed in `chunks` get one full-length chunk.
group_chunks = {}
for var in ds.variables:
    group_chunks[var] = tuple(
        min(chunks[dim], len(ds[dim])) if dim in chunks else len(ds[dim])
        for dim in ds[var].dims
    )

In [None]:
# Display the computed per-variable target chunks.
group_chunks

In [None]:
%%time
# Rechunk the dataset in batches of `time_chunk_size` time steps: each batch is
# rechunked into the step store, read back, and written/appended to the final store.
#
# NOTE(review): `performance_report` and `xr` are not imported in any visible cell --
# presumably `from dask.distributed import performance_report` and
# `import xarray as xr` are required before this cell runs; confirm.
#
# Only the first 3 batches are processed here; the commented-out line below is the
# full-record variant.
#for i in range(nt_chunks):
for i in range(3):
    print(i)
    istart = i * time_chunk_size
    # clamp so the final batch stops at the end of the time axis
    istop = min((i + 1) * time_chunk_size, len(ds.time))

    # Earlier approach (opening the NetCDF files directly), kept for reference:
#    ds = xr.open_mfdataset(files[istart:istop], parallel=True, 
#                           preprocess=drop_coords, combine='by_coords', 
#                       concat_dim='time', coords='minimal', compat='override')

    ds2 = ds.isel(time=slice(istart, istop))

    # remove the temp and step zarr datasets left from the previous batch,
    # then rechunk this batch to zarr using rechunker
    delete_s3(step_url)
    delete_s3(temp_url)
    zarr_step = fs2.get_mapper(step_url)
    zarr_temp = fs2.get_mapper(temp_url)

    for var in ds2.data_vars:
        if len(ds2[var].dims) == 2:
            # NOTE(review): -999900 is outside the int16 range (-32768..32767), so this
            # fill value cannot be represented with dtype 'int16' -- confirm whether a
            # wider integer dtype was intended.
            ds2[var].encoding.update({
                '_FillValue': -999900,
                'missing_value': -999900,
                'dtype': 'int16',
            })
            # Target chunking is supplied to rechunk() through `group_chunks`, so no
            # 'chunks' entry is set in the encoding here.

    array_plan = rechunk(ds2, group_chunks, max_mem, zarr_step,
                         temp_store=zarr_temp)

    print(f'Executing rechunk for {i}')
    # the report file name is fixed, so it is overwritten on every iteration
    with performance_report(filename="dask-report.html"):
        result = array_plan.execute(retries=10)

    print(f'Finished rechunk for {i}')
    # read back in the zarr chunk rechunker wrote
    ds3 = xr.open_zarr(zarr_step)

    # the first batch creates (overwrites) the final store; later batches append in time
    if i == 0:
        ds3.to_zarr(zarr_chunked, consolidated=True, mode='w')
    else:
        ds3.to_zarr(zarr_chunked, consolidated=True, append_dim='time')

In [None]:
# Inspect the last time slice processed by the loop above.
ds2

In [None]:
# Inspect the streamflow encoding after the loop above has modified it.
ds2['streamflow'].encoding