# Processing NWM data
Read 200,000+ NetCDF files (each about 40 MB) from the National Water Model version 2 and create a rechunked Zarr dataset.

In [None]:
import xarray as xr
import pandas as pd
import fsspec
import dask

from dask.distributed import Client, LocalCluster, performance_report
import xarray
xarray.set_options(display_style="text")   # html repr is 14mb!

#### Create Dask Cluster

In [None]:
#cluster_type = 'Fargate'
cluster_type = 'Gateway'

In [None]:
# Start (or attach to) a Dask cluster according to `cluster_type` and bind a
# distributed `client` to it.
if cluster_type == 'Fargate':
    import boto3
    ecs = boto3.client('ecs')
    resp = ecs.list_clusters()
    clusters = resp['clusterArns']
    if len(clusters) > 1:
        print("Please manually select your cluster")
    # NOTE(review): the first ARN is used even when the message above was
    # printed, and an empty list raises IndexError on this line.
    cluster = clusters[0]
    numWorkers=70
    # Request `numWorkers` tasks for the 'Dask-Worker' service, then wait on
    # the 'services_stable' waiter before connecting.
    ecs.update_service(cluster=cluster, service='Dask-Worker', desiredCount=numWorkers)
    ecs.get_waiter('services_stable').wait(cluster=cluster, services=['Dask-Worker'])
    client = Client('Dask-Scheduler.local-dask:8786')

elif cluster_type == 'Gateway':
    import dask_gateway
    gateway = dask_gateway.Gateway()
    cluster = gateway.new_cluster(environment='pangeo', profile='Medium Worker')
    # Adaptive scaling between 2 and 18 workers.
    cluster.adapt(minimum=2, maximum=18)
    client = Client(cluster)

In [None]:
cluster

In [None]:
#client.close();cluster.close()

## Generate the list of NWM Streamflow files on AWS 

In [None]:
import pandas as pd

# Bucket and product type that make up every object URL.
nwm_bucket = 's3://noaa-nwm-retro-v2.0-pds'
nwm_type = 'CHRTOUT'

# One timestamp per hourly file, first and last hour inclusive.
dates = pd.date_range(start='1993-01-01 00:00', end='2018-12-31 23:00', freq='1h')

# URL layout: <bucket>/full_physics/<YYYY>/<YYYYmmddHHMM>.<type>_DOMAIN1.comp
files = [
    f'{nwm_bucket}/full_physics/{date:%Y}/{date:%Y%m%d%H%M}.{nwm_type}_DOMAIN1.comp'
    for date in dates
]

# Show both ends of the list as a quick sanity check.
print(files[0])
print(files[-1])

#### Open an example file and check the native chunking
We want to chunk in a similar way for maximum performance

In [None]:
# Open the first file in the list as a reference dataset (`ds0`).
url = files[0]
ncfile = fsspec.open(url)
ds0 = xr.open_dataset(ncfile.open())
# Kept so it can serve as the feature_id coordinate of the combined array.
feature_id = ds0.feature_id
# Last expression of the cell: displays the encoding of the streamflow
# variable, which is where the native chunking shows up.
ds0.streamflow.encoding

In [None]:
ds0.streamflow

#### Open all the data as a single dataset (avoiding open_mfdataset)

In [None]:
@dask.delayed
def s3open_data(path):
    """Read the ``streamflow`` variable of one NWM file into memory.

    The function is wrapped in ``dask.delayed``, so calling it only records
    a task; the file is read when that task is computed.

    Parameters
    ----------
    path : str
        S3 URL of a single NetCDF file.

    Returns
    -------
    numpy.ndarray
        The values of ``ds['streamflow']`` for that file.
    """
    fs = fsspec.filesystem('s3', anon=True, default_fill_cache=False)
    # Release the S3 file object and the dataset as soon as the values are
    # loaded; with one task per input file, unclosed handles add up.
    with fs.open(path) as f, xr.open_dataset(f) as ds:
        return ds['streamflow'].values


# One lazy task per input file; nothing is read from S3 here.
files_mapper = [s3open_data(file) for file in files]

In [None]:
len(files_mapper)

In [None]:
%%time
# `import dask` does not load the `dask.array` submodule by itself; import it
# explicitly instead of relying on another library having done so.
import dask.array

# Every file is assumed to share the sample file's shape and dtype.
shape = ds0.streamflow.shape
dtype = ds0.streamflow.dtype
# Wrap each delayed read as a lazy array, then stack them along a new
# leading axis (one entry per file).
data_mapper = [dask.array.from_delayed(f, shape, dtype=dtype) for f in files_mapper]
all_data = dask.array.stack(data_mapper)
print(all_data)

#### Convert Dask array to Xarray dataset, copying attributes from sample file

In [None]:
# Label the stacked array with the hourly timestamps and the sample file's
# feature_id coordinate.
da = xr.DataArray(
    all_data,
    dims=["time", "feature_id"],
    coords=[dates, feature_id],
)

# Promote to a Dataset and copy the global attributes of the sample file.
ds = da.to_dataset(name='streamflow').assign_attrs(ds0.attrs)

# Copy per-variable attributes for the two coordinates and the data variable.
for _var_name in ('time', 'feature_id', 'streamflow'):
    ds[_var_name] = ds[_var_name].assign_attrs(ds0[_var_name].attrs)

In [None]:
print(ds)

In [None]:
print(ds.streamflow)

#### Make sure we can access the private bucket we wish to write to

In [None]:
fs2 = fsspec.filesystem('s3',anon=False)

In [None]:
fs2.ls('/esip-qhub/noaa/NWM2')

#### Try writing 6 days (144 time steps) to Zarr

In [None]:
from rechunker import rechunk
import zarr

In [None]:
ds_test = ds.sel(time=slice('2015-01-01 00:00','2015-01-06 23:00'))

Write to Zarr (no rechunking)

In [None]:
%%time 
# Baseline write: store the test subset to S3 as Zarr with its existing
# chunking, recording a Dask performance report to dask-report.html.
with performance_report(filename="dask-report.html"):
    ds_test.to_zarr(fsspec.get_mapper('s3://esip-qhub/noaa/NWM2/test_zarr2'), mode='w', consolidated=True)

Write to Zarr using rechunker

In [None]:
ds.streamflow

In [None]:
# Target chunk lengths along the time and feature_id dimensions, and the
# memory budget handed to rechunker.
time_chunk = 72
feature_chunk = 30000
max_mem = '1.5GB'    # about 75% of a Dask worker's memory

Chunk only 2D variables

In [None]:
# Map every two-dimensional data variable to the target chunk shape;
# variables with any other number of dimensions are left out of the plan.
chunk_plan = {
    name: (time_chunk, feature_chunk)
    for name in ds.data_vars
    if len(ds[name].dims) == 2
}
print(chunk_plan)

In [None]:
#cluster.close();client.close();
#cluster.shutdown()

In [None]:
#cluster = LocalCluster(); client=Client(cluster); cluster

In [None]:
# List the methods available on the authenticated S3 filesystem. `fs` only
# exists inside s3open_data, so the notebook-level object is `fs2`.
dir(fs2)

In [None]:
# S3 locations for the rechunker temporary store and the final output store.
ztemp = 's3://esip-qhub/noaa/NWM2/tmp4'
zf = 's3://esip-qhub/noaa/NWM2/zarr4'

# Uncomment to clear the temporary store before a re-run (fs2 is the
# authenticated filesystem created earlier).
#fs2.rm(ztemp, recursive=True)

In [None]:
#fs2.rm(zf, recursive=True)

In [None]:
# S3-backed stores for rechunker. The following cell rebinds both names to
# local directories, so run only the cell for the backend you want.
zarr_temp = fsspec.get_mapper(ztemp)
zarr_chunked = fsspec.get_mapper(zf)

In [None]:
import shutil

# Local-directory stores for the rechunk: temporary store and final output.
zarr_temp = './tmp3'
zarr_chunked = './zarr3'

# Start from empty stores by removing whatever a previous run left behind.
# shutil.rmtree is synchronous, so no polling is needed afterwards.
for _store_dir in (zarr_temp, zarr_chunked):
    try:
        shutil.rmtree(_store_dir)
    except FileNotFoundError:
        # Nothing to clean up, e.g. on a first run.
        pass
    except OSError as err:
        # Stay best-effort, but make the failure visible instead of hiding it.
        print(f"Could not remove {_store_dir}: {err}")

In [None]:
# Build (but do not run) the rechunking plan for the test subset: target
# chunks from chunk_plan, memory budget max_mem, output in zarr_chunked and
# intermediate data in zarr_temp.
array_plan = rechunk(ds_test, chunk_plan, max_mem, zarr_chunked, 
                     temp_store=zarr_temp)

In [None]:
%%time
# Run the rechunking plan while recording a Dask performance report.
# NOTE(review): this reuses the filename of the earlier to_zarr report, so
# that report is presumably overwritten — confirm this is intended.
with performance_report(filename="dask-report.html"):
    result = array_plan.execute(retries=10)

Read resulting Zarr file

In [None]:
#ds_test = zarr.open_consolidated(fsspec.get_mapper('s3://coastalcoupling/noaa/NWM2/test_zarr'),                                        mode='r')
#ds_chunk = xr.open_zarr(fsspec.get_mapper('s3://coastalcoupling/noaa/NWM2/zarr3'))
# Read back the store the rechunk plan wrote to, whether that was the local
# directory or the S3 mapper, instead of a hard-coded local path.
ds_chunk = xr.open_zarr(zarr_chunked)

In [None]:
ds_chunk.streamflow

## Cluster scale down

When we are temporarily done with the cluster, we can scale it down to save on costs.

In [None]:
# Fargate: set the 'Dask-Worker' service's desired count to zero and wait on
# the 'services_stable' waiter.
if cluster_type == 'Fargate':
    numWorkers=0
    ecs.update_service(cluster=cluster, service='Dask-Worker', desiredCount=numWorkers)
    ecs.get_waiter('services_stable').wait(cluster=cluster, services=['Dask-Worker'])

In [None]:
# Gateway: scale the cluster to zero workers; the cluster object itself is
# not closed here.
if cluster_type == 'Gateway':
    cluster.scale(0)