# Convert lots of small NetCDFs to one big Zarr
The National Water Model writes a new NetCDF file for each hour, resulting in 8760 files for a year, and 227904 files for the entire 26 year reanalysis (1993-01-01 00:00 - 2018-12-31 23:00).  

For small datasets, rechunking the data to Zarr would be as simple as:

```
import xarray as xr
ds = xr.open_mfdataset('*.nc')
ds = ds.chunk({'time':672, 'feature_id':30000})
ds.to_zarr('all_nc.zarr', consolidated=True)
```
For large datasets, this approach is slow and uses too much memory.  Here we instead process the data in batches of `time_chunk_size` hourly files (one time chunk per batch; 672 in the example above, 288 in the code below).   

For each batch, we create an xarray dataset with open_mfdataset, then use [rechunker](https://github.com/pangeo-data/rechunker), which creates a rechunked Zarr dataset for that batch.  We then append each batch (each time chunk) along the time dimension, building up our overall dataset.   

The nice part of this approach is that if something goes wrong with the batch, we can fix the problem and just carry on appending.

In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import numcodecs
import dask
from dask.distributed import Client, progress, performance_report
import zarr
import time
import fsspec
import s3fs

In [None]:
print(fsspec.__version__)
print(s3fs.__version__)

In [None]:
import warnings
# Silence every warning for the rest of the session; note this also hides
# deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Full 1993-2018 reanalysis period (disabled):
#dates = pd.date_range(start='1993-01-01 00:00',end='2018-12-31 23:00', freq='1h')
# Hourly time stamps for January 2018 only; one file path is built per time stamp.
dates = pd.date_range(start='2018-01-01 00:00',end='2018-01-31 23:00', freq='1h')

Build a list of filenames for open_mfdataset

In [None]:
@dask.delayed
def s3open(path):
    """Open ``path`` on S3 with anonymous access and return the file object.

    Wrapped in ``dask.delayed``, so calling it only builds a lazy task; the
    file is opened when that task is computed.
    """
    anon_s3 = fsspec.filesystem('s3', anon=True, default_fill_cache=False)
    handle = anon_s3.open(path)
    return handle

In [None]:
# URL template for one hourly file: the first placeholder is filled with the
# year (%Y), the second with the full time stamp (%Y%m%d%H%M).
s3path = 's3://noaa-nwm-retro-v2.0-pds/full_physics/{}/{}.CHRTOUT_DOMAIN1.comp'

In [None]:
# Anonymous S3 filesystem, used here to open the first hourly file directly.
fs = fsspec.filesystem('s3', anon=True, default_fill_cache=False)
# Path of the file for the first time stamp in `dates`.
first_file = s3path.format(dates[0].strftime('%Y'),dates[0].strftime('%Y%m%d%H%M'))
ncfile = fs.open(first_file)

In [None]:
first_file

In [None]:
# Open the first file on its own; its feature_id coordinate is reused later
# when the per-batch datasets are assembled.
dset = xr.open_dataset(ncfile, engine='h5netcdf')

In [None]:
dset.streamflow

In [None]:
%%time
# One lazy (dask.delayed) S3 file handle per hourly time stamp in `dates`.
files = [
    s3open(s3path.format(stamp.strftime('%Y'), stamp.strftime('%Y%m%d%H%M')))
    for stamp in dates
]

In [None]:
len(files)

A nice chunk size for object storage is on the order of 100 MB.   

In [None]:
# Target Zarr chunk shape for 2-D variables: time steps per chunk ...
time_chunk_size = 288
# ... and features per chunk.
feature_chunk_size = 30000

In [None]:
# Number of chunks along feature_id. True division, so this is a float and is
# not rounded up; it is only displayed here for information.
nh_chunks = len(dset.feature_id)/feature_chunk_size
nh_chunks

In [None]:
# Number of batches along time, rounded up so that a final partial batch is
# counted as well.
nt_chunks = int(np.ceil(len(files)/time_chunk_size))
nt_chunks

In [None]:
# Uncompressed size of one chunk in MB, assuming 8 bytes per value
# (presumably float64 -- TODO confirm against the variable dtype).
(time_chunk_size * feature_chunk_size )*8 / 1e6

... about 69 MB, which is close enough to 100 MB

Create a function to drop stuff that messes up `open_mfdataset`

In [None]:
def drop_coords(ds):
    """Remove the variables and coordinates that get in the way of ``open_mfdataset``.

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``: it is
    applied to each file's dataset before the files are combined.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset for a single input file.

    Returns
    -------
    xarray.Dataset
        ``ds`` without 'reference_time', 'feature_id' and 'crs', and with all
        remaining non-index coordinates dropped.
    """
    # Dataset.drop is deprecated; drop_vars is its replacement for removing
    # variables/coordinates by name.
    ds = ds.drop_vars(['reference_time', 'feature_id', 'crs'])
    return ds.reset_coords(drop=True)

In [None]:
#cluster.close(); client.close()

In [None]:
#gateway.stop_cluster(cluster_name='932673464290474fb39b0b518655ac63')

In [None]:
import os
import sys
# Add the shared user library directory under $HOME to the import path
# (presumably where ebdpy is installed -- verify on your deployment).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

?ebd.start_dask_cluster

In [None]:
# Credentials profile and region, passed both to the credential helper and to
# the cluster start-up call.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 20
# Request a new cluster (use_existing_cluster=False) with adaptive scaling off
# and without blocking until it is ready (wait_for_cluster=False).
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=False,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='default', worker_profile='Pangeo Worker', propagate_env=True)

In [None]:
cluster

Tell blosc not to use threads since we are using dask to parallelize

In [None]:
# Module-level numcodecs switch: turn off Blosc's own threading because dask
# already provides the parallelism.
numcodecs.blosc.use_threads = False

In [None]:
def delete_s3(url):
    """Recursively delete ``url`` if it exists; do nothing otherwise.

    The filesystem is resolved from ``url`` with authenticated access
    (``anon=False``).
    """
    target_fs = fsspec.open(url, anon=False).fs
    if not target_fs.exists(url):
        return
    target_fs.rm(url, recursive=True)

In [None]:
# Final output store, built up by appending one time chunk per batch.
chunked_url = 's3://esip-qhub/usgs/zarr/nwm/chunked.zarr'
# Per-batch rechunker target store.
step_url = 's3://esip-qhub/usgs/zarr/step/step.zarr'
# Per-batch rechunker temporary store.
temp_url = 's3://esip-qhub/usgs/zarr/tmp/temp.zarr'

# Authenticated S3 filesystem. NOTE: this rebinds `fs`, which earlier in the
# notebook referred to the anonymous filesystem.
fs = fsspec.filesystem('s3', anon=False)

zarr_chunked = fs.get_mapper(chunked_url)

In [None]:
#client.close; cluster.shutdown()
#client = Client()

In [None]:
#client.close()

In [None]:
dset.nbytes/1e6

In [None]:
dset.streamflow

In [None]:
#client.run(os.path.expanduser, "~")

In [None]:
# Don't run this if you have already started your zarr and are happy with it!!!
# It recursively deletes everything stored under chunked_url.
delete_s3(chunked_url)

In [None]:
#%%time
#dset.to_zarr(zarr_chunked, mode='w')

In [None]:
#client.retire_workers([k for k, v in client.processing().items() if v])

In [None]:
fs.ls('esip-qhub/usgs/zarr/step/')

In [None]:
#ds = xr.open_zarr(zarr_chunked, consolidated=True)

In [None]:
#fs.ls(chunked_url)

Step our way through the dataset, reading one chunk along the time dimension at a time, to avoid dask reading too many chunks before writing and blowing out memory.  First time chunk is written to zarr, then others are appended. 

#### Set up Rechunker

In [None]:
from rechunker import rechunk

In [None]:
# Memory budget handed to rechunker, expressed as a share of one worker's memory.
chunk_mem_factor = 0.8  # fraction of worker memory for each chunk (seems to be the max possible)
# Worker memory in GB, hard-coded; could instead be read from
# cluster.worker_spec[0]['options']['memory_limit']/1e9
worker_mem = 5e9 / 1e9
max_mem = f'{chunk_mem_factor * worker_mem:.2f}GB'
print(max_mem)

In [None]:
# Manual override: replaces whatever max_mem was computed before this cell.
max_mem = '1.6GB'

In [None]:
client

In [None]:
# Remove leftovers of earlier runs from the per-batch target and temporary stores.
delete_s3(step_url)
delete_s3(temp_url)

#### Do the big loop

In [None]:
#### %%time
# Process the input files one time chunk (batch) at a time:
#   1. open the batch lazily with open_mfdataset,
#   2. rechunk it into the temporary "step" Zarr store with rechunker,
#   3. write (first batch) or append (later batches) that store to the final Zarr.
for i in range(nt_chunks):
    print(i)
    # Half-open range [istart, istop) of files in this batch; the final batch
    # may contain fewer than time_chunk_size files.
    istart = i * time_chunk_size
    istop = min((i + 1) * time_chunk_size, len(files))

    # Alternative kept for reference (passes decode_vlen_strings to h5netcdf):
    #    ds = xr.open_mfdataset(files[istart:istop], parallel=True, preprocess=drop_coords, combine='by_coords',
    #                       join='override', engine='h5netcdf', backend_kwargs={'decode_vlen_strings':True})

    # combine='by_coords' infers the concatenation order from coordinate values.
    # `concat_dim` only applies to combine='nested' (recent xarray versions
    # reject the combination), so it is not passed here.
    ds = xr.open_mfdataset(files[istart:istop], parallel=True, preprocess=drop_coords,
                           combine='by_coords', join='override', engine='h5netcdf')

    print(f'Finished opening for {i}')
    # add back in the 'feature_id' coordinate removed by preprocessing
    ds.coords['feature_id'] = dset.coords['feature_id']

    # chunk this step to zarr using rechunker; both stores are recreated from
    # scratch for every batch
    delete_s3(step_url)
    delete_s3(temp_url)
    zarr_step = fsspec.get_mapper(step_url, anon=False)
    zarr_temp = fsspec.get_mapper(temp_url, anon=False)

    # Target chunks for every 2-D variable, assumed to be laid out as
    # (time, feature_id). Each chunk length is clipped to the variable's
    # actual shape so that a short final batch does not request a time chunk
    # longer than the data it holds. Full batches get exactly
    # (time_chunk_size, feature_chunk_size), as before.
    target_chunks = (time_chunk_size, feature_chunk_size)
    chunk_plan = {
        var: tuple(min(chunk, size) for chunk, size in zip(target_chunks, ds[var].shape))
        for var in ds.data_vars
        if ds[var].ndim == 2
    }

    array_plan = rechunk(ds, chunk_plan, max_mem, zarr_step,
                         temp_store=zarr_temp)

    print(f'Executing rechunk for {i}')
    # NOTE: the report file name is fixed, so each batch overwrites the
    # previous batch's report.
    with performance_report(filename="dask-report.html"):
        result = array_plan.execute(retries=10)

    print(f'Finished rechunk for {i}')
    # read back in the zarr chunk rechunker wrote
    ds = xr.open_zarr(zarr_step)

    if i == 0:
        # Alternative kept for reference (explicit compressor and fill value):
        #        compressor = zarr.Blosc(cname='zstd', clevel=5, shuffle=zarr.Blosc.AUTOSHUFFLE)
        #        encoding = {v: {'compressor': compressor, '_FillValue': -9999.0 } for v in ds.data_vars}
        #        ds.to_zarr(zarr_chunked, consolidated=True, mode='w', encoding=encoding)
        # The first batch creates the final store.
        ds.to_zarr(zarr_chunked, consolidated=True, mode='w')
    else:
        # Later batches are appended along the time dimension.
        ds.to_zarr(zarr_chunked, consolidated=True, append_dim='time')

    print(f'Finished writing for {i}')


In [None]:
ds2 = xr.open_zarr(zarr_chunked)

The workflow threw an error on the last partial chunk because Rechunker doesn't think the chunk_plan is valid.  But it is valid to have a partial last chunk.   Here we just rechunk the last partial chunk without rechunker and append it to the overall Zarr dataset. 

In [None]:
ds2

In [None]:
test = ds2['streamflow'][:,100]

In [None]:
test.plot()

In [None]:
# NOTE(review): ds2 was opened from zarr_chunked itself, so as written this
# appends the whole dataset to its own store along 'time'. The accompanying text
# describes appending only the rechunked last partial batch -- confirm which
# dataset is meant to be appended before running this cell.
ds2.to_zarr(zarr_chunked, consolidated=True, append_dim='time')

Check the resulting chunked dataset for correct start time, stop time and for any gaps.  If there are no gaps we should get just a single unique value of 3600s for the difference between the hourly time steps. 

In [None]:
# NOTE(review): this opens a local filesystem path, not the zarr_chunked S3
# store written earlier in the notebook -- confirm this is the dataset to check.
ds1 = xr.open_zarr('/usgs/gamone/data2/rsignell/data/NWM2/zarr/nwm', consolidated=True)
# First and last time stamps in the store.
print(ds1.time[0].values)
print(ds1.time[-1].values)

In [None]:
# Lengths of the time steps in seconds, as plain floats. Dividing by
# np.timedelta64(1, 's') performs a real unit conversion; dividing the
# timedelta64 array by 1e9 would only rescale the stored integers and leave the
# original time unit attached to the result.
d1 = ds1.time.diff(dim='time').values / np.timedelta64(1, 's')

In [None]:
np.unique(d1)

In [None]:
#cluster.close();  client.close()

In [None]:
import hvplot.xarray
# Time series of streamflow at feature index 1000.
# NOTE(review): .plot() is xarray's built-in plotting method; the hvplot import
# above presumably only matters if .hvplot() is used instead -- confirm intent.
ds1.streamflow[:,1000].plot()

In [None]:
cluster