In [None]:
from nwm_reanalysis import NWM
from dask.distributed import Client, LocalCluster
# import asyncio

In [None]:
# Request window (inclusive ISO dates) and the reach identifiers to fetch.
start_date, end_date = "2000-01-01", "2010-12-31"
comids = [2043493, 20873280, 7590453]

# Build the project's NWM request object from the settings above.
nwm = NWM(
    start_date=start_date,
    end_date=end_date,
    comids=comids,
)

# In-process Dask cluster (processes=False); it is handed to the request cells
# below under the name `scheduler`.
scheduler = LocalCluster(
    n_workers=10,
    threads_per_worker=2,
    processes=False,
)
scheduler

In [None]:
%%time
# %%prun
# Request the time series with optimize=True and the nwm_21 flag set
# (presumably selects the NWM v2.1 archive - confirm against nwm_reanalysis).
nwm.request_timeseries(
    scheduler=scheduler,
    optimize=True,
    nwm_21=True,
)
nwm.data
# Timings noted by the author for this cell:
# 10y, 1comid: 14min CPU, 19min
# 10y, 2comid: 21min CPU, 30min
# 10y, 3comid: 32min CPU, 46min

In [None]:
%%time
# Build the formatted output on the NWM object without asking for a DataFrame.
# NOTE(review): `df` holds whatever set_output returns when
# return_dataframe=False - presumably None; confirm before using it.
df = nwm.set_output(return_dataframe=False)
# Display the output stored on the object.
nwm.output

In [None]:
%%time
# Same request as the earlier cell, but with optimize=False and the nwm_2 flag
# (presumably the NWM v2.0 archive - confirm against nwm_reanalysis).
nwm.request_timeseries(
    scheduler=scheduler,
    optimize=False,
    nwm_2=True,
)
nwm.data

In [None]:
%%time
# Build the formatted output with the method's default arguments, then show it.
nwm.set_output()
nwm.output


-----------------------------------

In [1]:
import xarray as xr
import fsspec
import dask
import s3fs
from datetime import datetime
from rechunker import rechunk

from dask.distributed import Client, LocalCluster

In [2]:
# Worker count is kept in a variable because the rechunking cell reuses it to
# size its per-worker memory budget.
n_workers = 10

# In-process (threads only) local Dask cluster, plus a client attached to it.
scheduler = LocalCluster(
    n_workers=n_workers,
    threads_per_worker=2,
    processes=False,
)
client = Client(scheduler)

# URL of the Dask dashboard for monitoring the runs below.
client.dashboard_link

'http://10.104.50.239:8787/status'

In [3]:
# Zarr store holding the channel-routing output (chrtout) in the public
# NOAA NWM retrospective 2.1 bucket.
nwm_url = 's3://noaa-nwm-retrospective-2-1-zarr-pds/chrtout.zarr'

# Inclusive request window.
start_date = datetime.fromisoformat("2000-01-01")
end_date = datetime.fromisoformat("2010-12-31")

# comids = [2043493, 20873280, 7590453]
comids = [2043493]
variables = ["streamflow", "velocity"]

# Zero-padded ISO dates (YYYY-MM-DD) used for label-based time slicing and for
# the progress messages; the previous hand-built f-strings produced
# non-standard strings such as "2000-1-1".
start_string = start_date.date().isoformat()
end_string = end_date.date().isoformat()

In [4]:
%%time
# drop_variables = ["elevation", "order", "qBtmVertRunoff", "qBucket", "qSfcLatRunoff", "q_lateral"]
# Map the S3 location to a key/value store with anonymous access, then open it
# lazily using the consolidated metadata.
zarr_store = fsspec.get_mapper(nwm_url, anon=True)
ds = xr.open_zarr(zarr_store, consolidated=True)

CPU times: total: 11.2 s
Wall time: 55.7 s


In [None]:
%%time
# Option one, default with comid selection and time slice happening at the same time with dask
time_window = slice(start_string, end_string)
with dask.config.set({'array.slicing.split_large_chunks': True}):
    # Select by feature first, then by time, and pull the result into memory
    # while the slicing option is active.
    selection = ds[variables].sel(feature_id=comids).sel(time=time_window)
    ds_data = selection.load(optimize_graph=False, traverse=False)
data = ds_data
print(f"Start: {start_string}, End: {end_string}")
data

In [None]:
%%time
# Option two, rechunk the data
#
# The source dataset `ds` is deliberately left untouched. The earlier version
# of this cell rebound `ds` twice (to the time window, then to the rechunked
# local store), which made the cell fail when re-run (the 'crs' variable had
# already been dropped) and made every later cell read the local copy instead
# of the remote store.
ds_window = ds.sel(time=slice(start_string, end_string))

# Keep only the requested feature_ids.
ds_features_0 = ds_window.where(ds_window.feature_id.isin(comids), drop=True)
# `drop_vars` returns a new dataset, so no explicit copy is needed;
# `Dataset.drop` is deprecated in favour of it.
ds_features = ds_features_0.drop_vars('crs')

# Target layout: a single chunk spanning every requested feature and the whole
# time window.
dim_chunk_sizes = {'feature_id': len(comids), 'time': len(ds_features.time)}
ds_features = ds_features.chunk(chunks=dim_chunk_sizes)

chunk_plan = {}
for vv in ds_features.variables:
    if vv in ['streamflow', 'velocity']:
        # Order the chunk sizes to match this variable's own dimension order.
        chunk_plan[vv] = tuple(dim_chunk_sizes[tt] for tt in ds_features[vv].dims)
    else:
        # Everything else is stored as one chunk covering its full shape.
        chunk_plan[vv] = ds_features[vv].shape
    # Clear the chunk encoding carried over from the source store.
    ds_features[vv].encoding['chunks'] = None

# Per-worker memory budget: 90% of 56 GB divided among the workers.
# NOTE(review): 56 is presumably the machine's RAM in GB - confirm.
max_mem = f"{0.9 * 56 / n_workers:.2f}GB"
intermediate = "intermediate.zarr"
target = "target.zarr"

rechunked = rechunk(
    ds_features,
    target_chunks=chunk_plan,
    target_store=target,
    max_mem=max_mem,
    temp_store=intermediate,
)
result = rechunked.execute(retries=10)

# Read the rechunked copy back under its own name and load the variables.
ds_rechunked = xr.open_zarr(target)
ds_data = ds_rechunked[variables].load(optimize_graph=True, traverse=False)
data = ds_data
print(f"Start: {start_string}, End: {end_string}")
data

_copy_chunk((slice(0, 1, None),))
_copy_chunk((slice(0, 1, None),))
_copy_chunk((slice(0, 96432, None),))
_copy_chunk((slice(0, 96432, None), slice(0, 1, None)))
_copy_chunk((slice(0, 1, None),))
_copy_chunk((slice(0, 1, None),))
_copy_chunk((slice(0, 96432, None), slice(0, 1, None)))
_copy_chunk((slice(0, 1, None),))
_copy_chunk((slice(0, 1, None),))


In [None]:
%%time
# Option three, no dask slicing
time_window = slice(start_string, end_string)
# Same feature-then-time selection as option one, without overriding any dask
# configuration.
selection = ds[variables].sel(feature_id=comids).sel(time=time_window)
ds_data = selection.load(optimize_graph=False, traverse=False)
data = ds_data
print(f"Start: {start_string}, End: {end_string}")
data

In [None]:
%%time
# Restrict the already-selected data to the request window and display it.
time_window = slice(start_string, end_string)
data = data.sel(time=time_window)
data

In [None]:
%%time
# zarr_file = nwm_url
# s3 = s3fs.S3FileSystem(anon=True)
# store = s3fs.S3Map(root=zarr_file, s3=s3, check=False)
# ds_nwm_chrtout = xr.open_zarr(store=store, consolidated=True)
# ds_nwm_chrtout

In [None]:
%%time
# Select the requested reaches by label from the opened dataset. The previous
# expression referenced names that do not exist in this notebook
# (`ds_nwm_chrtout`, a bare `feature_id`, and `self`) and passed a comparison
# result to `isel`, so the cell could not run.
ds_nwm_feature = ds.sel(feature_id=comids)

# Pull the streamflow variable into memory.
streamflow_nwm = ds_nwm_feature.streamflow.load()

# With a single reach, drop the length-1 feature_id dimension so the frame is
# indexed by time only; with several reaches keep the dimension, because
# squeezing a dimension longer than one raises an error.
if streamflow_nwm.sizes['feature_id'] == 1:
    streamflow_nwm = streamflow_nwm.squeeze('feature_id')
streamflow_nwm_df = streamflow_nwm.to_dataframe()
streamflow_nwm_df

In [None]:
%%time
variables = ["streamflow", "velocity"]

# Date strings are rebuilt from the window stored on the NWM object.
start_string = f"{nwm.start_date.year}-{nwm.start_date.month}-{nwm.start_date.day}"
end_string = f"{nwm.end_date.year}-{nwm.end_date.month}-{nwm.end_date.day}"
time_window = slice(start_string, end_string)

# with dask.config.set(**{'array.slicing.split_large_chunks': True}):
# Feature selection first, then the time window, then load into memory.
selection = ds[variables].sel(feature_id=nwm.comids).sel(time=time_window)
ds_data = selection.load(optimize_graph=False, traverse=False)
data = ds_data
print(f"Start: {start_string}, End: {end_string}")
data

In [None]:
%%time
# Date strings are rebuilt from the window stored on the NWM object.
start_string = f"{nwm.start_date.year}-{nwm.start_date.month}-{nwm.start_date.day}"
end_string = f"{nwm.end_date.year}-{nwm.end_date.month}-{nwm.end_date.day}"

# Select every variable for the requested reaches, then the time window.
ds_data = ds.sel(feature_id=nwm.comids)
# A single load() is enough: it brings the data into memory, so the second
# chained load() that used to follow had nothing left to do.
ds_data = ds_data.sel(time=slice(start_string, end_string)).load(optimize_graph=True, traverse=False)
ds_data

In [None]:
# Re-apply the time window to the dataset from the previous cell. One load()
# suffices; the extra chained load() on the already-loaded result was a no-op.
ds_data = ds_data.sel(time=slice(start_string, end_string)).load(optimize_graph=True, traverse=False)
print(f"Start: {start_string}, End: {end_string}")
ds_data

In [None]:
# Every variable name considered for this dataset.
all_variables = ["elevation", "order", "qBtmVertRunoff", "qBucket", "qSfcLatRunoff", "q_lateral", "streamflow", "velocity"]

# Names that were not requested. An ordered filter is used instead of a set
# difference so the list comes out in the same order on every run (set
# iteration order for strings varies between interpreter sessions).
requested = set(variables)
d_v = [name for name in all_variables if name not in requested]
d_v