# CONUS404 Temporal Aggregation
Create daily averages from hourly data, write to a zarr dataset


In [None]:
import fsspec
import xarray as xr
import hvplot.xarray
import intake
import os
import warnings
from dask.distributed import LocalCluster, Client
warnings.filterwarnings('ignore')

#### Open dataset from Intake Catalog
* Automatically select on-prem dataset from /caldera if running on prem (Denali/Tallgrass)
* Automatically select cloud data on S3 if not running on prem 

To test whether we are on-prem, we check whether the `SLURM_CLUSTER_NAME` environment variable is defined. If `SLURM_CLUSTER_NAME` is not defined, the user is either not on Denali/Tallgrass, or is on the main node, which they should not be using.

In [None]:
# Location of the HyTEST intake catalog (a YAML file in the hytest GitHub repository)
url = 'https://raw.githubusercontent.com/hytest-org/hytest/main/dataset_catalog/hytest_intake_catalog.yml'

In [None]:
# open the hytest data intake catalog
hytest_cat = intake.open_catalog(url)
# show the entries available in the top-level catalog
list(hytest_cat)

In [None]:
# open the conus404 sub-catalog
cat = hytest_cat['conus404-catalog']
# show the entries in the sub-catalog; the dataset opened later is looked up in `cat`
list(cat)

#### Start a Dask client using an appropriate Dask Cluster
This is an optional step, but can speed up data loading significantly, especially when accessing data from the Cloud

In [None]:
def configure_cluster(machine):
    """Start a Dask cluster appropriate for ``machine`` and connect a client.

    Parameters
    ----------
    machine : str
        Name of the compute environment. Supported values are
        ``'denali'``, ``'tallgrass'``, ``'local'`` and
        ``'esip-qhub-gateway-v0.4'``.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the cluster that was started.

    Raises
    ------
    ValueError
        If ``machine`` is not one of the supported names. Without this
        check an unknown name would fall through every branch and fail
        at the ``return`` with an ``UnboundLocalError``.
    """
    if machine == 'denali':
        from dask.distributed import LocalCluster, Client
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif machine == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        cluster = SLURMCluster(queue='cpu', cores=1,
                               walltime="01:00:00", account="woodshole",
                               interface='ib0', memory='6GB')
        # scale adaptively, capped at 10
        cluster.adapt(maximum=10)
        client = Client(cluster)

    elif machine == 'local':
        import os
        import warnings
        # imported here so this branch does not depend on module-level imports
        from dask.distributed import LocalCluster, Client
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count() # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif machine in ['esip-qhub-gateway-v0.4']:
        import sys, os
        # ebdpy is loaded from a shared directory under $HOME
        sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
        import ebdpy as ebd
        aws_profile = 'esip-qhub'
        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 10
        client,cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                              region=aws_region, use_existing_cluster=True,
                                              adaptive_scaling=False, wait_for_cluster=False,
                                              worker_profile='Medium Worker', propagate_env=True)

    else:
        raise ValueError(
            f"Unknown machine {machine!r}; expected one of "
            "'denali', 'tallgrass', 'local', 'esip-qhub-gateway-v0.4'"
        )

    return client, cluster

In [None]:
# The presence of SLURM_CLUSTER_NAME is used as the on-prem test: when it is
# set, its value names the machine and the on-prem copy of the data is used;
# otherwise fall back to the cloud dataset and the gateway cluster.
machine = os.environ.get('SLURM_CLUSTER_NAME')
if machine is not None:
    dataset = 'conus404-hourly-onprem'
else:
    dataset = 'conus404-hourly-cloud'
    machine = 'esip-qhub-gateway-v0.4'

client, cluster = configure_cluster(machine)

In [None]:
# Open the catalog entry selected above (on-prem or cloud) with intake's to_dask();
# presumably this returns a lazily loaded, dask-backed xarray Dataset -- confirm against the catalog driver
ds = cat[dataset].to_dask()

In [None]:
# display the dataset summary in the notebook
ds

In [None]:
# display the SNOW variable of the dataset
ds.SNOW

### Daily averages
Time averages of any type are easy to do with xarray.   Here we do 24 hour averages, and set the time offset to 12 hours, so that the time values are in the middle of the averaging period.   

Digital Earth Africa has a great [Working with Time in Xarray](https://docs.digitalearthafrica.org/fr/latest/sandbox/notebooks/Frequently_used_code/Working_with_time.html) tutorial.

In the example below we just do a few days with a few variables as a quick demo.   

In [None]:
%%time
# Keep the demo small: two variables over a short date range
demo_vars = ['T2', 'U10']
demo_period = slice('2017-01-02', '2017-01-13')
ds_subset = ds[demo_vars].sel(time=demo_period)

In [None]:
import pandas as pd

# Average the hourly data into 24-hour (daily) bins. "1D" is used instead of
# "24H" because the uppercase "H" alias is deprecated in recent pandas releases.
ds_subset_daily = ds_subset.resample(time="1D").mean()

# The `loffset` argument of resample() is deprecated in xarray (since 2023.03),
# so shift the time labels by 12 hours explicitly; this puts each label in the
# middle of its averaging period.
ds_subset_daily = ds_subset_daily.assign_coords(
    time=ds_subset_daily.get_index("time") + pd.Timedelta(hours=12)
)

In [None]:
# display the daily-averaged dataset summary in the notebook
ds_subset_daily

In [None]:
# Plot options for the daily means: lon/lat quadmesh, rasterized,
# semi-transparent, drawn over OSM tiles
map_opts = dict(
    x='lon',
    y='lat',
    rasterize=True,
    geo=True,
    tiles='OSM',
    alpha=0.7,
    cmap='turbo',
)
ds_subset_daily.hvplot.quadmesh(**map_opts)

#### Write daily values as a Zarr dataset (to onprem or cloud)

In [None]:
%%time
# Choose the output store first, then write once.
if 'SLURM_CLUSTER_NAME' in os.environ:
    # on prem: a plain path on the Caldera filesystem
    zarr_store = '/caldera/usgs/change-me/conus_subset_daily.zarr'
else:
    # cloud: a mapper onto the esip-qhub S3 bucket, using non-anonymous access
    fs_s3 = fsspec.filesystem('s3', anon=False)
    zarr_store = fs_s3.get_mapper('s3://esip-qhub/testing/conus_subset_daily.zarr')

# mode='w' with consolidated=True, as in both original branches
ds_subset_daily.to_zarr(zarr_store, mode='w', consolidated=True)

#### Shutdown cluster

In [None]:
# Shut down the Dask cluster returned by configure_cluster().
# NOTE(review): the client is not closed here, and whether every cluster type
# returned by configure_cluster() provides shutdown() should be confirmed.
cluster.shutdown()