### Convert HRRR to Zarr
#### Step 3/3: Aggregate NetCDF files, rechunk and store as Zarr
Rechunk a collection of HRRR NetCDF files (converted from GRIB2 using `wgrib2`) and convert them to Zarr using xarray and rechunker. We process the first time chunk and write it to Zarr, then repeat the process for each remaining time chunk, appending each one along the time dimension. Each time chunk is opened with `xr.open_mfdataset` and rechunked with rechunker, except for the last, partial time chunk, on which rechunker fails. That last partial chunk is therefore rechunked without rechunker.

In [None]:
from dask.distributed import Client, performance_report
import xarray as xr
import numpy as np
import shutil
import numpy as np

In [None]:
from dask_jobqueue import SLURMCluster
import os
# According to Rich Brey at WHOI, there are 160GB available on each 36-core
# node on poseidon, but some must be left for system memory (here we leave 16GB).
cluster = SLURMCluster(
    processes=1,
    cores=36,
    memory='144GB',
    walltime='02:00:00',
    queue='compute',
)

# Show the job script generated for this cluster configuration.
print(cluster.job_script())

In [None]:
# Connect a dask.distributed client to the SLURM cluster created above.
client = Client(cluster)

In [None]:
# Ask the cluster to scale to 4 workers, then display the cluster object.
cluster.scale(4)
cluster

In [None]:
#client = Client()
#client

In [None]:
%%time
ds = xr.open_mfdataset('./nc/hrrr.20190101*.nc', chunks={'time':1}, concat_dim='time', 
combine='nested', coords='minimal', compat='override', parallel=True)

In [None]:
# Distinct spacings between consecutive time values, divided by 1e6.
# Presumably a sanity check that the time step is uniform — TODO confirm units.
np.unique(ds.time.diff(dim='time'))/1e6

In [None]:
from rechunker import rechunk

In [None]:
# Memory limit passed to rechunker's rechunk() as its max_mem argument.
max_mem = '2.5GB'

In [None]:
# Target chunk shape applied to every 3-D variable, in (time, y, x) order.
# time_chunk_size is also the number of files processed per loop iteration.
time_chunk_size = 144
x_chunk_size = 300
y_chunk_size = 300

In [None]:
# Size of one target chunk in MB, computed from the chunk-size constants so
# the estimate cannot drift from them. The factor 4 assumes 4-byte values
# (e.g. float32) — TODO confirm against the variables' dtype.
time_chunk_size * y_chunk_size * x_chunk_size * 4 / 1e6

In [None]:
#cluster.close(); client.close()

In [None]:
# Per-iteration rechunker target store; removed and rewritten for every time chunk.
zarr_step = '/vortexfs1/usgs/rsignell/HRRR/zarr/step'
# Rechunker temp_store; removed before every time chunk.
zarr_temp = '/vortexfs1/usgs/rsignell/HRRR/zarr/tmp'
# Final Zarr store that each rechunked time chunk is appended to.
zarr_chunked = '/vortexfs1/usgs/rsignell/HRRR/zarr/hrrr'

In [None]:
# Display the client summary before starting the long-running conversion.
client

In [None]:
%%time
chunk_plan={}
for var in ds.data_vars:
    if len(ds[var].dims)==3:
        var_chunk = (time_chunk_size, y_chunk_size, x_chunk_size)
        chunk_plan[var] = var_chunk

In [None]:
import glob
# Collect every 2019 NetCDF file, sorted by file name. sorted() keeps the
# result a plain list of str; np.sort() turned it into a NumPy array (and
# into an empty float64 array when the pattern matched nothing).
files = sorted(glob.glob("./nc/hrrr.2019*.nc"))

In [None]:
# Number of time chunks needed to cover all files (ceiling division);
# the last chunk is partial when len(files) is not a multiple of time_chunk_size.
nt_chunks = (len(files) + time_chunk_size - 1) // time_chunk_size
nt_chunks

In [None]:
# Peek at the first entry to check the glob pattern and sort order.
files[0]

In [None]:
%%time
for i in range(60,nt_chunks):
    print(i)
    istart = i * time_chunk_size
    istop = int(np.min([(i+1) * time_chunk_size, len(files)]))
    
    ds = xr.open_mfdataset(files[istart:istop], concat_dim='time', 
                           combine='by_coords', coords='minimal', 
                           compat='override', parallel=True)
       
    # chunk this step to zarr using rechunker

    # remote the temp and step zarr datasets
    try:
        shutil.rmtree(zarr_temp, ignore_errors=False, onerror=None)
    except:
        pass
    try:
        shutil.rmtree(zarr_step, ignore_errors=False, onerror=None)
    except:
        pass

    chunk_plan={}
    for var in ds.data_vars:
        if len(ds[var].dims)==3:
            var_chunk = (time_chunk_size, y_chunk_size, x_chunk_size)
            chunk_plan[var] = var_chunk

    array_plan = rechunk(ds, chunk_plan, max_mem, zarr_step, 
                     temp_store=zarr_temp)
    
    with performance_report(filename="dask-report.html"):
        result = array_plan.execute(retries=10)

    # read back in the zarr chunk rechunker wrote
    ds = xr.open_zarr(zarr_step)

    if i==0:
        ds.to_zarr(zarr_chunked, consolidated=True, mode='w')
    else:
        ds.to_zarr(zarr_chunked, consolidated=True, append_dim='time')

In [None]:
# Inspect the dataset left in `ds` by the loop above.
ds

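Write the last partial chunk without using rechunker. At this point `ds` holds the last, partial time chunk as opened with `xr.open_mfdataset` in the loop above; it is rechunked with `Dataset.chunk` instead and appended to the final Zarr store.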

In [None]:
# Rechunk the remaining dataset with dask to the same (time, y, x) chunk
# sizes used for the rechunker output.
ds1 = ds.chunk({
    'time': time_chunk_size,
    'y': y_chunk_size,
    'x': x_chunk_size,
})

In [None]:
# Write the rechunked last chunk to a scratch Zarr store, overwriting any
# previous copy (mode='w').
ds1.to_zarr('./zarr/last_step', consolidated=True, mode='w')

In [None]:
# Read the scratch store back in.
ds2 = xr.open_zarr('./zarr/last_step', consolidated=True)

In [None]:
# Append the last chunk to the final Zarr store along the time dimension.
ds2.to_zarr(zarr_chunked, consolidated=True, append_dim='time')

In [None]:
#client.close(); cluster.close()