In [None]:
import xarray as xr
import numpy as np
from datetime import datetime
# Last expression of the cell: shows which xarray version is in use.
xr.__version__

## Set up Dask Cluster

In [None]:
from dask_kubernetes import KubeCluster
from dask.distributed import Client

# Begin with a single worker.
cluster = KubeCluster(n_workers=1)

# Attach a client to the cluster. The helper functions in this notebook
# obtain their client through distributed.get_client().
client = Client(cluster)

# Last expression of the cell: display the cluster object.
cluster

In [None]:
import distributed
from time import sleep, time

def get_nworkers(cores_per_worker=2):
    """Estimate the number of workers from the client's total core count.

    Parameters
    ----------
    cores_per_worker : int, optional
        Divisor applied to the total core count (default 2).

    Returns
    -------
    int
        Total cores reported by ``ncores()`` floor-divided by
        ``cores_per_worker``.
    """
    cores_by_worker = distributed.get_client().ncores()
    total_cores = 0
    for cores in cores_by_worker.values():
        total_cores += cores
    return total_cores // cores_per_worker

def block_until_scaled(desired_workers, timeout=None, poll_interval=5):
    """Restart the client, request a cluster size, and wait until it is reached.

    Parameters
    ----------
    desired_workers : int
        Worker count passed to ``cluster.scale`` and compared against
        ``get_nworkers()``.
    timeout : float or None, optional
        Maximum number of seconds to wait. ``None`` (the default) waits
        indefinitely, which is the original behaviour.
    poll_interval : float, optional
        Seconds to sleep between checks (default 5).

    Raises
    ------
    TimeoutError
        If ``timeout`` is given and the worker count has not matched
        ``desired_workers`` before it elapses.
    """
    # Local import keeps this block self-contained; a monotonic clock is
    # unaffected by wall-clock adjustments while we wait.
    from time import monotonic

    cl = distributed.get_client()
    cl.restart()
    cl.cluster.scale(desired_workers)

    deadline = None if timeout is None else monotonic() + timeout
    # get_nworkers() is called with its default cores_per_worker.
    while get_nworkers() != desired_workers:
        if deadline is not None and monotonic() >= deadline:
            raise TimeoutError(
                f'cluster did not reach {desired_workers} workers '
                f'within {timeout} seconds'
            )
        sleep(poll_interval)

## Manually Build OPeNDAP URLs

In [None]:
# Common URL prefix shared by every file; only the time-range suffix differs.
base = 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1'

# Time-range suffixes, one per file.
ranges = [
    '015101-025012',
    '025101-035012',
    '035101-045012',
    '045101-055012',
    '055101-065012',
]

# One URL per time range, in the same order as `ranges`.
urls = [f'{base}_{time_range}.nc' for time_range in ranges]
urls

### Functions for Loading Data

In [None]:
def drop_bounds(ds):
    """Return ``ds`` without the data variables whose name contains 'bnds'.

    Used as the ``preprocess`` callback when opening the multi-file dataset.
    """
    bounds_vars = []
    for name in ds.data_vars:
        if 'bnds' in name:
            bounds_vars.append(name)
    return ds.drop(bounds_vars)

def load_ds_with_chunks(time_chunks):
    """Open every URL in ``urls`` as one dataset, chunked along 'time'.

    Parameters
    ----------
    time_chunks : int
        Chunk size for the 'time' dimension.

    Returns
    -------
    The object returned by ``xr.open_mfdataset``, opened with
    ``drop_bounds`` as the preprocess step and time decoding disabled.
    """
    open_kwargs = dict(
        preprocess=drop_bounds,
        decode_times=False,
        chunks={'time': time_chunks},
    )
    return xr.open_mfdataset(urls, **open_kwargs)

In [None]:
# Quick check: open the dataset with a 'time' chunk size of 12 and display it.
time_chunks = 12
load_ds_with_chunks(time_chunks)

## Benchmark Loading Speed

In [None]:
# Benchmark grid: every worker count is timed against every 'time' chunk size.
nworkers = [1, 2, 4, 8]
time_chunks = [3, 6, 12, 24, 48]

# One tuple per measurement:
# (timestamp, nworkers, chunksize, runtime, datasize).
rows = []
for nw in nworkers:
    for tc in time_chunks:
        # Rescale the cluster and reopen the dataset before each measurement.
        block_until_scaled(nw)
        ds = load_ds_with_chunks(tc)
        total_data_size = ds.pr.nbytes / 1e6  # bytes -> megabytes

        failed = False
        tic = time()
        try:
            pr_mean = ds.pr.mean(dim='time').load()
            runtime = time() - tic
        except RuntimeError:
            # Record the failure as NaN. The original assigned to a misspelled
            # name (`runTime`) and broke out before the row was appended, so a
            # failed run left no trace in the results.
            runtime = np.nan
            failed = True

        row = (datetime.now(), nw, tc, runtime, total_data_size)
        rows.append(row)
        print(', '.join([repr(r) for r in row]))

        if failed:
            # As before, skip the remaining chunk sizes for this worker count.
            break

In [None]:
import pandas as pd

# Column names, in the same order as the fields of each tuple in `rows`.
columns = [
    'timestamp',
    'nworkers',
    'chunksize',
    'runtime',
    'datasize',
]

# Build the results table from the collected benchmark rows and display it.
df = pd.DataFrame(data=rows, columns=columns)
df

In [None]:
# Save the benchmark table to disk; all to_csv options are left at their defaults.
df.to_csv(path_or_buf='benchmark.csv')