### Convert HRRR to Zarr
#### Step 1/3: Download GRIB2 files and convert to NetCDF
The goal of this workflow is to create a single "best time series" cloud-optimized HRRR dataset in Zarr.  HRRR issues 36-hour forecasts at the 00, 06, 12 and 18 UTC cycles, and 18-hour forecasts at every hourly cycle in between.  To create our "best time series" we use the tau=1 (1-hour forecast) data from each hourly cycle, which allows a bit of dynamic adjustment away from the analysis time.   

The workflow is: 
1. Download the HRRR GRIB2 file corresponding to the tau=1 forecast for each hour
2. Convert to NetCDF using "wgrib2"
3. Fill in gaps with tau>1 from previous forecast cycles (e.g. fill a gap at tau=1 in the forecast run at 06:00 with tau=7 in the forecast run at 00:00)
4. Rechunk the data using rechunker (producing Zarr format)  

In [None]:
import pandas as pd
import fsspec
import dask
from dask.distributed import Client
import subprocess
import os
import xarray as xr

In [None]:
from dask.distributed import Client

In [None]:
from dask_jobqueue import SLURMCluster
import os
# Describe the SLURM jobs that will host the Dask workers; the keyword
# arguments set the per-job process/core/memory/walltime/queue request.
# NOTE(review): 3GB of memory for a 36-core job looks small -- confirm it is
# enough for the wgrib2 conversions run by the workers.
cluster = SLURMCluster(processes=1, cores=36, memory='3GB',
                    walltime='23:00:00', queue='compute')

# Print the generated batch script so the request can be inspected.
print(cluster.job_script())

In [None]:
client=Client(cluster)

In [None]:
cluster

In [None]:
cluster.scale(4)

In [None]:
fs = fsspec.filesystem('s3', anon=True)

In [None]:
bucket = 'noaa-hrrr-bdp-pds'  # S3 bucket holding the HRRR GRIB2 files
ldir = '/vortexfs1/usgs/rsignell/HRRR/nc2'  # local directory for the downloaded GRIB2 files and their converted output

In [None]:
flist = fs.glob(f'{bucket}/hrrr.20190101/conus/*sfcf01*.grib2')
flist

In [None]:
#tmp_file = fsspec.open_local(f'simplecache::s3://{flist[0]}', 
#                              s3=dict(anon=True), simplecache={'cache_storage': '/tmp'})

In [None]:
#ds = xr.open_dataset(tmp_file, engine='cfgrib', 
#                       backend_kwargs=dict(filter_by_keys={'typeOfLevel': 'heightAboveGround', 'level': 2}))

In [None]:
# `ds` is only created by the cfgrib cell above, which is commented out.
# Keep this inspection disabled as well so that running the notebook from
# top to bottom does not stop here with a NameError.
# ds

In [None]:
# Hourly timestamps spanning all of 2019 (first and last hour inclusive);
# one conversion task is created per timestamp further down.
dates = pd.date_range('2019-01-01 00:00', '2019-12-31 23:00', freq='1h')

In [None]:
dates[0]

'noaa-hrrr-bdp-pds/hrrr.20190101/conus/hrrr.t00z.wrfnatf01.grib2'
'noaa-hrrr-bdp-pds/hrrr.20190101/conus/hrrr.t01z.wrfnatf01.grib2',

In [None]:
# Regular expression handed to wgrib2's "-match" option: a record is kept
# when its inventory line contains any one of these field/level patterns.
vmatch = "(" + "|".join([
    ":TMP:2 m above ground:",
    ":RH:2 m above ground:",
    ":UGRD:10 m above ground:",
    ":VGRD:10 m above ground:",
    ":PRATE:surface:",
    ":DSWRF:surface:",
    ":DLWRF:surface:",
    ":USWRF:surface:",
]) + ")"

In [None]:
fmissing=[]

In [None]:
def _remove_quietly(path):
    """Delete ``path`` if it exists, ignoring any OS-level failure."""
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError:
        pass


@dask.delayed
def hrrr_grib2nc(date, verbose=False):
    """Download one tau=1 HRRR surface GRIB2 file and convert it with wgrib2.

    The ``wrfsfcf01`` file of the forecast cycle at ``date`` is copied from
    the ``noaa-hrrr-bdp-pds`` S3 bucket into ``ldir`` using the module-level
    ``fs`` filesystem, and the records selected by ``vmatch`` are written by
    ``wgrib2 -netcdf`` to a file with the same name and a ``.nc2`` suffix.

    The function is best-effort: every problem is reported with ``print``
    and nothing is raised, so a single bad hour does not abort the whole
    ``dask.compute`` call.  It is also safe to re-run:

    * if the converted file already exists, nothing is done;
    * if only the GRIB2 file exists (an earlier conversion failed), the
      download is skipped and the conversion is retried;
    * partial files left by a failed download or conversion are removed so
      they cannot be mistaken for finished work on the next run.

    Parameters
    ----------
    date : datetime-like
        Forecast cycle time; only its ``strftime`` method is used.
    verbose : bool, optional
        If true, print the standard output of ``wgrib2``.

    Returns
    -------
    None
    """
    yyyymmdd = date.strftime('%Y%m%d')
    hh = date.strftime('%H')
    cfile = f's3://noaa-hrrr-bdp-pds/hrrr.{yyyymmdd}/conus/hrrr.t{hh}z.wrfsfcf01.grib2'
    fname = f'{ldir}/hrrr.{yyyymmdd}{hh}.wrfsfcf01.grib2'
    output = fname.replace(".grib2", ".nc2")

    # Already converted on an earlier run: no download or conversion needed.
    if os.path.exists(output):
        return

    if not os.path.exists(fname):
        try:
            fs.download(cfile, fname)
        except Exception as err:
            if isinstance(err, FileNotFoundError):
                print(f'{cfile} not found')
            else:
                print(f'Download of {cfile} failed: {err!r}')
            # A truncated GRIB2 file would make later runs skip the download.
            _remove_quietly(fname)
            return

    call = ["wgrib2", fname, "-match", vmatch, "-netcdf", output]
    try:
        ret = subprocess.run(call, capture_output=True)
    except OSError as err:
        # Typically wgrib2 is not installed or not on PATH on the worker.
        print(f'Could not run wgrib2 on {fname}: {err!r}')
        return

    if verbose:
        print(ret.stdout.decode(errors='replace'))

    if ret.returncode == 0:
        print(f"Converted {fname} to {output}.")
    else:
        stderr = ret.stderr.decode(errors='replace').strip()
        print(f'wgrib2 failed on {fname} (exit code {ret.returncode}): {stderr}')
        # The output did not exist before this call, so anything present now
        # is a partial result; drop it so a re-run retries the conversion.
        _remove_quietly(output)

    return

In [None]:
# hrrr_grib2nc is wrapped in dask.delayed, so this only builds one lazy task
# per timestamp; the downloads/conversions run when dask.compute is called.
tasks = [hrrr_grib2nc(date) for date in dates]

In [None]:
%%time
dask.compute(tasks);

In [None]:
# client.close(); cluster.close()