In [1]:
import xarray as xr
import gcsfs
xr.__version__

'0.11.0'

## Manually Build OPeNDAP URLs

In [2]:
# Suffixes that distinguish the individual remote files.
ranges = [
    '015101-025012',
    '025101-035012',
    '035101-045012',
    '045101-055012',
    '055101-065012',
]

# Common prefix shared by every dataset URL.
base = 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1'

# One URL per suffix, shaped as "<base>_<suffix>.nc".
urls = ['{}_{}.nc'.format(base, suffix) for suffix in ranges]
urls

['https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1_015101-025012.nc',
 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1_025101-035012.nc',
 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1_035101-045012.nc',
 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1_045101-055012.nc',
 'https://35.188.100.90.xip.io/thredds/dodsC/test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1_055101-065012.nc']

## Load with Xarray

In [3]:
# Chunk size used along the time dimension when the files are opened.
time_chunks = 120

def drop_bounds(ds):
    """Convenience helper that strips the annoying bounds variables.

    Every data variable of ``ds`` whose name contains ``'bnds'`` is
    collected and removed with ``ds.drop``; the result of that call is
    returned.
    """
    bounds_names = []
    for name in ds.data_vars:
        if 'bnds' in name:
            bounds_names.append(name)
    return ds.drop(bounds_names)
# Open all URLs as one dask-backed dataset, running drop_bounds as the
# preprocess step and chunking along the time dimension.
ds = xr.open_mfdataset(
    urls,
    chunks=dict(time=time_chunks),
    preprocess=drop_bounds,
)
ds

  result = decode_cf_datetime(example_value, units, calendar)


<xarray.Dataset>
Dimensions:  (bnds: 2, lat: 180, lon: 288, time: 6000)
Coordinates:
  * bnds     (bnds) float64 1.0 2.0
  * lat      (lat) float64 -89.5 -88.5 -87.5 -86.5 -85.5 ... 86.5 87.5 88.5 89.5
  * lon      (lon) float64 0.625 1.875 3.125 4.375 ... 355.6 356.9 358.1 359.4
  * time     (time) object 0151-01-16 12:00:00 ... 0650-12-16 12:00:00
Data variables:
    pr       (time, lat, lon) float32 dask.array<shape=(6000, 180, 288), chunksize=(120, 180, 288)>
Attributes:
    external_variables:              areacella
    table_id:                        Amon
    history:                         File was processed by fremetar (GFDL ana...
    contact:                         gfdl.climate.model.info@noaa.gov
    comment:                         <null ref>
    tracking_id:                     hdl:21.14100/cdcd7052-c5b4-4e8f-83fc-fe1...
    further_info_url:                https://furtherinfo.es-doc.org/CMIP6.NOA...
    branch_time_in_child:            0.0
    branch_method:           

In [3]:
# Path handed to gcsfs.GCSMap; the Zarr store is written to and later
# re-opened from this location.
gc_path = 'pangeo-data/esgf_test/pr_Amon_GFDL-CM4_piControl_r1i1p1f1_gr1'

In [4]:
# Authenticate through the interactive browser flow, then wrap the target
# path as a mapping that the Zarr writer can use as its store.
gcs = gcsfs.GCSFileSystem(project='pangeo-181919', token='browser')
gcsmap = gcsfs.GCSMap(gc_path, gcs=gcs)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=586241054156-ls4nduknhnelm2u6jtdgii15gsa3iv4v.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdevstorage.full_control&state=iUEBAK4C9NYySYLbdjETEup3oLrIQX&access_type=offline&prompt=consent


Enter the authorization code:  4/oAB2UWEO8nVlXZQKp6swAvoUVvHe_jB1FHmS-VUN89VSNl51hFgdbbI


In [8]:
import zarr

# Per-variable encoding: compress every data variable with Zstd at level 3.
encoding = {
    name: dict(compressor=zarr.Zstd(level=3))
    for name in ds.data_vars
}
encoding

{'pr': {'compressor': Zstd(level=3)}}

In [9]:
from dask.diagnostics import ProgressBar

# Write the dataset into the GCS-backed store with the encoding defined
# above, showing a progress bar while the dask graph executes.
# NOTE(review): mode='w' is expected to overwrite an existing store — confirm.
with ProgressBar():
    ds.to_zarr(store=gcsmap, mode='w', encoding=encoding)

[########################################] | 100% Completed | 40.7s


In [4]:
# Re-open the Zarr store from gc_path; this mapping is built without the
# explicit `gcs` filesystem object used for writing.
dsz = xr.open_zarr(store=gcsfs.GCSMap(gc_path))
dsz

<xarray.Dataset>
Dimensions:  (bnds: 2, lat: 180, lon: 288, time: 6000)
Coordinates:
  * bnds     (bnds) float64 1.0 2.0
  * lat      (lat) float64 -89.5 -88.5 -87.5 -86.5 -85.5 ... 86.5 87.5 88.5 89.5
  * lon      (lon) float64 0.625 1.875 3.125 4.375 ... 355.6 356.9 358.1 359.4
  * time     (time) object 0151-01-16 12:00:00 ... 0650-12-16 12:00:00
Data variables:
    pr       (time, lat, lon) float32 dask.array<shape=(6000, 180, 288), chunksize=(120, 180, 288)>
Attributes:
    Conventions:                     CF-1.7 CMIP-6.0 UGRID-1.0
    DODS_EXTRA.Unlimited_Dimension:  time
    activity_id:                     CMIP
    branch_method:                   standard
    branch_time_in_child:            0.0
    branch_time_in_parent:           54750.0
    comment:                         <null ref>
    contact:                         gfdl.climate.model.info@noaa.gov
    creation_date:                   2018-09-13T23:51:53Z
    data_specs_version:              01.00.27
    experiment:    

In [5]:
from dask.distributed import Client
from dask_kubernetes import KubeCluster

# Start a Kubernetes-backed dask cluster with four workers and connect a
# client to it; the trailing expression displays the cluster widget.
cluster = KubeCluster(n_workers=4)
client = Client(address=cluster)
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [6]:
# Time the mean of pr over the time dimension, loading the result.
%time pr_mean = dsz.pr.mean(dim='time').load()

CPU times: user 762 ms, sys: 76 ms, total: 838 ms
Wall time: 6.58 s


In [7]:
# Same computation as the previous cell, run again to compare wall times.
%time pr_mean = dsz.pr.mean(dim='time').load()

CPU times: user 550 ms, sys: 44 ms, total: 594 ms
Wall time: 2.81 s


In [9]:
# Size of the pr variable in units of 1e9 bytes.
dsz.pr.nbytes/1e9

1.24416