# Read hourly data, write daily data

In [None]:
import xarray as xr
import dask.distributed
from dask.distributed import Client
import fsspec

In [None]:
import os
import sys

# ebdpy is loaded from a shared directory under $HOME, so that directory has to
# be on sys.path before the import below.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

# NOTE(review): this call looks superseded by the fuller set_credentials call
# a few lines down — confirm before removing it.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Start the Dask cluster (use_existing_cluster=True is passed); both the client
# and the cluster handle are kept so they can be closed at the end of the notebook.
worker_max = 10
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)

In [None]:
# Bare expression as the last line of the cell: displays the `client` object
# returned by `ebd.start_dask_cluster`.
client

In [None]:
# Source side: S3 filesystem created with anon=True (no credentials supplied).
fs1 = fsspec.filesystem('s3', anon=True)
read_url = 's3://noaa-nwm-retro-v2-zarr-pds'
# Mapper over the source location; it is handed to xr.open_zarr further down.
read_mapper = fs1.get_mapper(read_url)

In [None]:
# Destination side: S3 filesystem created with anon=False and the 'esip-qhub'
# profile (the same profile string used in the cluster-setup cell).
fs2 = fsspec.filesystem('s3', anon=False, profile='esip-qhub')
write_url = 's3://esip-qhub/usgs/rsignell/zarr/zarr_daily'
# Mapper over the output location; used as the target of to_zarr later on.
write_mapper = fs2.get_mapper(write_url)

In [None]:
%%time
# Open the source store through the mapper, asking for consolidated metadata.
ds = xr.open_zarr(read_mapper, consolidated=True)

In [None]:
# Display the `streamflow` variable of the opened dataset.
ds.streamflow

In [None]:
# Work on a subset: only `streamflow`, the first 672 time steps (672 / 24 = 28
# windows of 24 steps) and the first 30000 * 90 = 2,700,000 feature_ids.
# NOTE(review): 30000 presumably matches the store's feature_id chunk length
# (the same number is hard-coded in func/func2 and in the encoding cell) — confirm.
ds2 = ds[['streamflow']].isel(time=slice(0,672), feature_id=slice(0,30000*90))

In [None]:
def daily_mean(da):
    """Average ``da`` over consecutive windows of 24 steps along ``time``.

    Args:
        da: Object exposing ``coarsen(time=...)``, e.g. the streamflow array
            used in this notebook.

    Returns:
        Whatever ``da.coarsen(time=24).mean()`` produces: one value per
        24-step window.
    """
    windows = da.coarsen(time=24)
    return windows.mean()

In [None]:
%%time
# Apply daily_mean block by block and compute the result right away.
# The template (a lazily built coarsened mean of the same variable) describes
# the expected output to map_blocks.
(
    ds2.streamflow
    .unify_chunks()
    .map_blocks(daily_mean, template=ds2.streamflow.coarsen(time=24).mean())
    .compute()
)

In [None]:
%%time
# Daily means via resampling on time='D', computed eagerly for the whole subset.
# NOTE(review): this rebinds the name `daily_mean`, replacing the function
# defined in an earlier cell; re-running the map_blocks cell after this one would
# pass the computed result instead of the function — consider another name.
daily_mean = ds2.resample(time='D').mean().compute()

In [None]:
# Display the `client` object again.
client

In [None]:
# Display the `streamflow` variable of the subset `ds2`.
ds2.streamflow

In [None]:
def delete_s3(fs, url):
    """Recursively delete ``url`` on filesystem ``fs`` if it exists.

    Args:
        fs: fsspec-style filesystem object providing ``exists`` and ``rm``.
        url: Path or URL of the object or prefix to remove.

    Returns:
        None. Nothing is done when ``url`` does not exist.
    """
    # Use the filesystem that was passed in directly. Opening ``url`` only to
    # reach the handle's ``.fs`` attribute leaves a file handle that is never
    # closed, and opening a missing path can fail before the existence check
    # below gets a chance to run.
    if fs.exists(url):
        fs.rm(url, recursive=True)

In [None]:
# Remove anything already stored at write_url, using the authenticated filesystem.
delete_s3(fs2, write_url)

In [None]:
%%time
#a = daily_mean[['streamflow']].to_zarr(write_mapper, compute=False, mode='w', consolidated=True)

In [None]:
# Shorthand for the `streamflow` variable of the subset.
da = ds2.streamflow

In [None]:
# Display `da`.
da

In [None]:
%%time
def func(block, hours_per_day=24):
    """Return the mean of ``block`` over consecutive windows along ``time``.

    The block size is read from ``block.data`` instead of being hard-coded,
    so any block whose time length is a multiple of ``hours_per_day`` works
    (the original only accepted blocks of exactly 672 x 30000 values).

    Args:
        block: Array-like object with ``data`` (NumPy array), ``isel`` and
            ``copy(data=...)``. Time is assumed to be the leading axis of
            ``block.data``, as in the original reshape.
        hours_per_day: Number of consecutive time steps averaged into one
            output step. Defaults to 24.

    Returns:
        A copy of every ``hours_per_day``-th time step of ``block`` whose
        values are replaced by the window means.

    Raises:
        ValueError: If the time length is not a multiple of ``hours_per_day``.
    """
    values = block.data
    n_time = values.shape[0]
    if n_time % hours_per_day:
        raise ValueError(
            f"time length {n_time} is not a multiple of {hours_per_day}"
        )
    # One template entry per window: the first time step of each window.
    template = block.isel(time=slice(0, -1, hours_per_day))
    # Split the time axis into (window, step-in-window) and average the steps.
    windowed = values.reshape(
        n_time // hours_per_day, hours_per_day, *values.shape[1:]
    )
    return template.copy(data=windowed.mean(axis=1))

#b = ds2.unify_chunks().map_blocks(func, template=ds2.unify_chunks().isel(time=slice(0, -1, 24)))

In [None]:
%%time
def func2(block, hours_per_day=24):
    """Return the window means of ``block.data`` as a bare array.

    The block size is read from ``block.data`` instead of being hard-coded,
    so any block whose time length is a multiple of ``hours_per_day`` works
    (the original only accepted blocks of exactly 672 x 30000 values).

    Args:
        block: Object whose ``data`` attribute is a NumPy array with time as
            the leading axis (the same assumption as the original reshape).
        hours_per_day: Number of consecutive time steps averaged into one
            output step. Defaults to 24.

    Returns:
        Array with the leading axis reduced to one entry per window.

    Raises:
        ValueError: If the time length is not a multiple of ``hours_per_day``.
    """
    # NOTE(review): the return value is a plain array, not an xarray object;
    # xarray's map_blocks documents that its function must return a DataArray
    # or Dataset — confirm how this helper is meant to be used.
    values = block.data
    n_time = values.shape[0]
    if n_time % hours_per_day:
        raise ValueError(
            f"time length {n_time} is not a multiple of {hours_per_day}"
        )
    # Split the time axis into (window, step-in-window) and average the steps.
    windowed = values.reshape(
        n_time // hours_per_day, hours_per_day, *values.shape[1:]
    )
    return windowed.mean(axis=1)

In [None]:
# Rebind `da` to the subset's `streamflow` variable (same assignment as in an
# earlier cell).
da = ds2.streamflow

In [None]:
%%time
# Apply `func` to every block of `da`. The template passed to map_blocks is
# every 24th time step of the input.
b = da.unify_chunks().map_blocks(
    func,
    template=da.unify_chunks().isel(time=slice(0, -1, 24)),
)

In [None]:
%%time 
# Same construction as the previous cell, but on the Dataset `ds2` with `func2`.
# This overwrites the `b` built from `func`.
# NOTE(review): `func2` reads `block.data` and returns a bare array, while
# xarray's map_blocks documents that the function must return a DataArray or
# Dataset, and the blocks here come from a Dataset — verify this cell works
# before relying on `b` in the cells below.
b = ds2.unify_chunks().map_blocks(func2, template=ds2.unify_chunks().isel(time=slice(0, -1, 24)))

In [None]:
%%time
# Evaluate `b` and display the result (timed by the cell magic above).
b.compute()

In [None]:
# Record a chunk length of 30000 in the encoding of latitude, longitude and
# feature_id.
# NOTE(review): `b` is built from `ds2` before these encodings are set —
# confirm they actually reach the store written from `b`.
for _name in ('latitude', 'longitude', 'feature_id'):
    ds2[_name].encoding['chunks'] = (30000,)

In [None]:
%%time
# Prepare the write to the output mapper without executing it: compute=False
# defers the work, so `c` is what gets computed in the next cell. mode='w' and
# consolidated=True are passed through to to_zarr.
c = b.to_zarr(write_mapper, compute=False, mode='w', consolidated=True)

In [None]:
%%time
from dask.distributed import performance_report

# Run the deferred write inside performance_report, which is given the output
# file name "dask-report.html". The name `dask` is available through the
# `import dask.distributed` at the top of the notebook.
# NOTE(review): retries=10 presumably lets the scheduler retry failed tasks — confirm.
with performance_report(filename="dask-report.html"):
    dask.compute(c, retries=10)

In [None]:
# Close the client first, then shut the cluster down.
client.close()
cluster.shutdown()