### Convert HRRR to Zarr
#### Step 2/3: Fill missing data gaps
Here we use data from previous forecast cycles to fill any gaps that remain after trying to download the `tau=1 hour` data from each hourly forecast. 

In [None]:
import pandas as pd
import fsspec
import dask
from dask.distributed import Client
import subprocess
import os
import xarray as xr
import numpy as np
import datetime

In [None]:
from dask.distributed import Client

In [None]:
from dask_jobqueue import SLURMCluster
import os
# Resources requested for each SLURM job that backs the Dask cluster.
slurm_job_options = {
    'processes': 1,
    'cores': 36,
    'memory': '3GB',
    'walltime': '23:00:00',
    'queue': 'compute',
}
cluster = SLURMCluster(**slurm_job_options)

# Show the batch script that will be submitted, so the settings can be checked.
job_script_text = cluster.job_script()
print(job_script_text)

In [None]:
# Attach a Dask client to the SLURM-backed cluster created in the previous cell.
client=Client(cluster)

In [None]:
# Alternative to the previous cell: a client created with no arguments
# (per the dask.distributed docs this starts a local cluster).
# NOTE(review): this rebinds `client`, so if both cells are run this one
# wins -- run only the cell that matches the environment.
client = Client()

In [None]:
# Anonymous S3 filesystem handle; used below to glob, list and download files.
fs = fsspec.filesystem('s3', anon=True)

Use a different directory (here "./nc2") to store the grib and netcdf files used to fill the gaps

In [None]:
# S3 bucket that holds the HRRR GRIB2 files.
bucket = 'noaa-hrrr-bdp-pds'
# Local directory that receives the gap-filling GRIB2 and NetCDF files.
ldir = '/vortexfs1/usgs/rsignell/HRRR/nc2'

In [None]:
# Sanity check: list every wrfsfcf* GRIB2 file of one sample cycle
# (hrrr.20190301, t00z) to see which forecast hours are present.
fs.glob('s3://noaa-hrrr-bdp-pds/hrrr.20190301/conus/hrrr.t00z.wrfsfcf*.grib2')

In [None]:
#tmp_file = fsspec.open_local(f'simplecache::s3://{flist[0]}', 
#                              s3=dict(anon=True), simplecache={'cache_storage': '/tmp'})

In [None]:
#ds = xr.open_dataset(tmp_file, engine='cfgrib', 
#                       backend_kwargs=dict(filter_by_keys={'typeOfLevel': 'heightAboveGround', 'level': 2}))

In [None]:
# Hourly timestamps covering all of 2019; each one is checked for a local file below.
dates = pd.date_range(start='2019-01-01 00:00',end='2019-12-31 23:00', freq='1h')

Check every hour in the date range; if the local GRIB file for that hour is not found, append its S3 path and its time to the missing-data lists.

In [None]:
%%time
# Directory scanned for existing tau=1 GRIB2 files (presumably filled by the
# earlier download step; it is distinct from `ldir`, which receives the gap fillers).
ldir0 = '/vortexfs1/usgs/rsignell/HRRR/nc'
fmissing = []  # S3 paths of the tau=1 files that are absent locally
dmissing = []  # matching times, index-aligned with fmissing
for date in dates:
    yyyymmdd = date.strftime('%Y%m%d')
    hh = date.strftime('%H')
    s3file = f'{bucket}/hrrr.{yyyymmdd}/conus/hrrr.t{hh}z.wrfsfcf01.grib2'
    fname = f'{ldir0}/hrrr.{yyyymmdd}{hh}.wrfsfcf01.grib2'
    # An hour counts as missing when its local GRIB2 file does not exist.
    if not os.path.exists(fname):
        fmissing.append(s3file)
        dmissing.append(date)

In [None]:
# Number of hours whose local GRIB2 file was not found.
len(dmissing)

This function takes a date and a forecast offset as input and tries to find a GRIB2 file that supplies the requested forecast from one of the four preceding extended forecasts. (Extended 36-hour HRRR forecasts are made at hours 0, 6, 12 and 18; the regular hourly forecasts only run out to 18 hours.)

In [None]:
def hrrr_aws(date, foff):
    """Return the S3 URL of an older 6-hourly forecast file that can fill a gap.

    The field being replaced is the one valid at ``date + foff`` hours. The
    search starts from the latest 6-hourly cycle (hours 0, 6, 12, 18) at or
    before that valid time, then steps back 6, 12, 18 and 24 hours. For each
    older cycle it builds the file name whose forecast lead lands on the same
    valid time, and it stops at the first one that exists on S3.

    Parameters
    ----------
    date : datetime-like
        Cycle time of the missing file; must provide ``.hour`` and support
        ``timedelta`` arithmetic and ``strftime``.
    foff : int
        Forecast offset in hours of the missing file (the notebook uses 1).

    Returns
    -------
    str
        URL of the first existing candidate. If none of the four candidates
        exists, the last one tried (the cycle 24 hours back) is still
        returned, so the caller must be prepared for a missing file.
    """
    # Hours elapsed, at the valid time, since the latest 6-hourly cycle.
    foff6 = int((date.hour + foff) % 6)
    # Latest 6-hourly cycle at or before the valid time.
    date6 = date + datetime.timedelta(hours=(foff - foff6))

    for thour in [6, 12, 18, 24]:
        date_t = date6 - datetime.timedelta(hours=thour)
        # Lead time that makes this older cycle land on date + foff.
        # (The earlier expression, foff + foff6 - 1 + thour, was only
        # correct for foff == 1; for that value the result is unchanged.)
        foff_t = foff6 + thour
        yyyymmdd = date_t.strftime('%Y%m%d')
        hh = date_t.strftime('%H')
        cfile = (f's3://noaa-hrrr-bdp-pds/hrrr.{yyyymmdd}/conus/'
                 f'hrrr.t{hh}z.wrfsfcf{foff_t:02d}.grib2')
        # Depending on the fsspec/s3fs version, listing a missing key either
        # returns an empty list or raises FileNotFoundError; treat both as
        # "this candidate is not available".
        try:
            flist = fs.ls(cfile)
        except FileNotFoundError:
            flist = []
        if flist:
            break
    return cfile

Download a specific grib file and convert to NetCDF using wgrib2, saving only certain variables. Here "cfile" is the actual GRIB file we download to fill the gap (as determined by the hrrr_aws function), but we save that grib file with the same name it would have had if it were not missing. 

In [None]:
# Alternation of inventory patterns handed to wgrib2's -match option; only
# records matching one of these are written to the NetCDF output.
vmatch = (
    "("
    ":TMP:2 m above ground:|"
    ":RH:2 m above ground:|"
    ":UGRD:10 m above ground:|"
    ":VGRD:10 m above ground:|"
    ":PRATE:surface:|"
    ":DSWRF:surface:|"
    ":DLWRF:surface:|"
    ":USWRF:surface:"
    ")"
)

In [None]:
@dask.delayed
def hrrr_grib2nc(cfile, date, verbose=False):
    """Download one gap-filling GRIB2 file and convert it to NetCDF with wgrib2.

    The file is stored under ``ldir`` with the name the missing tau=1 file
    would have had (``hrrr.<YYYYmmddHH>.wrfsfcf01.grib2``), regardless of
    which cycle ``cfile`` actually comes from. Download and conversion are
    checked independently, so a re-run retries whichever step did not
    complete. All failures are reported with ``print`` instead of being
    raised, so one bad file does not abort the whole batch.

    Parameters
    ----------
    cfile : str
        S3 URL of the GRIB2 file to fetch (as returned by ``hrrr_aws``).
    date : datetime-like
        Time of the gap being filled; only used to build the local file name.
    verbose : bool, optional
        When true, echo wgrib2's standard output.

    Returns
    -------
    None
    """
    stamp = date.strftime('%Y%m%d%H')
    fname = f'{ldir}/hrrr.{stamp}.wrfsfcf01.grib2'
    output = fname.replace(".grib2", ".nc")
    print(fname)

    # Step 1: fetch the GRIB2 file unless an earlier run already did.
    if not os.path.exists(fname):
        try:
            fs.download(cfile, fname)
        except Exception as err:  # best effort: report and skip this file
            print(f'Could not download {cfile}: {err!r}')
            # The file did not exist before this attempt, so anything left
            # behind is partial; remove it so that a re-run tries again.
            try:
                if os.path.exists(fname):
                    os.remove(fname)
            except OSError:
                pass
            return

    # Step 2: convert unless the NetCDF file is already there. This check is
    # deliberately outside the download guard so that a failed conversion is
    # retried on the next run.
    if os.path.exists(output):
        return

    call = ["wgrib2", fname, "-match", vmatch, "-netcdf", output]
    try:
        ret = subprocess.run(call, capture_output=True)
    except OSError as err:  # e.g. wgrib2 is not installed or not on PATH
        print(f'Could not run wgrib2 on {fname}: {err!r}')
        return

    if verbose:
        print(ret.stdout.decode(errors='replace'))
    if ret.returncode == 0:
        print(f"Converted {fname} to {output}.")
    else:
        print(f"wgrib2 failed for {fname} (exit code {ret.returncode}): "
              f"{ret.stderr.decode(errors='replace')}")
    return

Take data from the first forecast hour rather than the 00 analysis time, to allow time for dynamic adjustment.

In [None]:
%%time
# Forecast offset (hours) of the files being replaced.
tau = 1
# One delayed task per missing hour. hrrr_aws is not a delayed function, so
# its S3 lookups run right here while the list is built; only the
# download/convert step (hrrr_grib2nc) is deferred.
tasks = [hrrr_grib2nc(hrrr_aws(date, tau), date) for date in dmissing]

In [None]:
%%time
# Run all delayed tasks; the trailing semicolon keeps the notebook from
# echoing the returned tuple.
dask.compute(tasks);

In [None]:
# client.close(); cluster.close()