In [1]:
import numpy as np
import xarray as xr
import intake
import gcsfs
import matplotlib.pyplot as plt
import cartopy.crs as ccrs



In [2]:
url = "https://raw.githubusercontent.com/andrewpauling/cmip6hack-so-project/master/catalogs/pangeo-cmip6.json"
col = intake.open_esm_datastore(url)
col.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,pr,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1...,
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,prsn,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1...,
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,tas,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1...,
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,tasmax,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1...,
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,tasmin,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1...,


In [3]:
cat = col.search(experiment_id=['historical'], table_id=['SImon', 'Omon'],
                 grid_label='gn')
cat.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
163,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,hfds,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
164,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,mlotst,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
165,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,so,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
166,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,sos,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
167,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,tauuo,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,


In [4]:
import pprint 
uni_dict = col.unique(['source_id', 'experiment_id', 'table_id', 'member_id'])
pprint.pprint(uni_dict, compact=True)

{'experiment_id': {'count': 29,
                   'values': ['ssp370', 'esm-ssp585', '1pctCO2-bgc', 'hist-bgc',
                              '1pctCO2', 'abrupt-4xCO2', 'historical',
                              'piControl', 'amip', 'esm-hist', 'esm-piControl',
                              'hist-GHG', 'hist-aer', 'hist-nat', 'dcppA-assim',
                              'dcppA-hindcast', 'dcppC-hindcast-noAgung',
                              'dcppC-hindcast-noElChichon',
                              'dcppC-hindcast-noPinatubo', 'highresSST-present',
                              'control-1950', 'hist-1950', 'deforest-globe',
                              'esm-ssp585-ssp126Lu', 'omip1', 'lgm', 'ssp126',
                              'ssp245', 'ssp585']},
 'member_id': {'count': 86,
               'values': ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1', 'r1i1p2f1',
                          'r1i1p1f2', 'r4i1p1f2', 'r101i1p1f1', 'r4i1p1f1',
                          'r5i1p1f1', 'r2i1p2f1', 'r3i

Find all the models that have monthly siconc and monthly thetao and so

In [5]:
cat = col.search(experiment_id=['historical'], table_id=['SImon', 'Omon'],
                 grid_label='gn', variable_id=['siconc', 'thetao', 'so'])
cat.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
165,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,so,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
169,CMIP,AWI,AWI-CM-1-1-MR,historical,r1i1p1f1,Omon,thetao,gn,gs://cmip6/CMIP/AWI/AWI-CM-1-1-MR/historical/r...,
323,CMIP,BCC,BCC-CSM2-MR,historical,r1i1p1f1,Omon,so,gn,gs://cmip6/CMIP/BCC/BCC-CSM2-MR/historical/r1i...,
325,CMIP,BCC,BCC-CSM2-MR,historical,r1i1p1f1,Omon,thetao,gn,gs://cmip6/CMIP/BCC/BCC-CSM2-MR/historical/r1i...,
586,CMIP,BCC,BCC-ESM1,historical,r1i1p1f1,Omon,so,gn,gs://cmip6/CMIP/BCC/BCC-ESM1/historical/r1i1p1...,


# Get models that have monthly sea ice concentration, thetao and so

In [6]:
models = set(uni_dict['source_id']['values']) # all the models

for table_id in ['SImon', 'Omon']:
    if table_id == 'SImon':
        query = dict(experiment_id='historical', table_id=table_id, 
                     variable_id='siconc', grid_label='gn')  
        cat = col.search(**query)
        models = models.intersection({model for model in cat.df.source_id.unique().tolist()})
    else:
        for variable_id in ['thetao', 'so']:
            query = dict(experiment_id='historical', table_id=table_id, 
                         variable_id=variable_id, grid_label='gn')  
            cat = col.search(**query)
            models = models.intersection({model for model in cat.df.source_id.unique().tolist()})
        

models = list(models)
models

['CNRM-CM6-1',
 'HadGEM3-GC31-LL',
 'GFDL-CM4',
 'MIROC-ES2L',
 'CanESM5',
 'CNRM-ESM2-1',
 'EC-Earth3-Veg',
 'CAMS-CSM1-0',
 'SAM0-UNICON',
 'NESM3',
 'UKESM1-0-LL',
 'IPSL-CM6A-LR',
 'CESM2',
 'MIROC6']

In [7]:
cat = col.search(experiment_id='historical', table_id=['Omon', 'SImon'], 
                 variable_id=['siconc', 'thetao', 'so'], grid_label='gn', source_id=models)
cat.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
838,CMIP,CAMS,CAMS-CSM1-0,historical,r1i1p1f1,Omon,so,gn,gs://cmip6/CMIP/CAMS/CAMS-CSM1-0/historical/r1...,
841,CMIP,CAMS,CAMS-CSM1-0,historical,r1i1p1f1,Omon,thetao,gn,gs://cmip6/CMIP/CAMS/CAMS-CSM1-0/historical/r1...,
847,CMIP,CAMS,CAMS-CSM1-0,historical,r1i1p1f1,SImon,siconc,gn,gs://cmip6/CMIP/CAMS/CAMS-CSM1-0/historical/r1...,
863,CMIP,CAMS,CAMS-CSM1-0,historical,r2i1p1f1,SImon,siconc,gn,gs://cmip6/CMIP/CAMS/CAMS-CSM1-0/historical/r2...,
1455,CMIP,CCCma,CanESM5,historical,r10i1p1f1,Omon,so,gn,gs://cmip6/CMIP/CCCma/CanESM5/historical/r10i1...,


# Make sure each entry has the same ensemble member

In [8]:
filt_dict = dict()

for model in models:
    tmp2 = cat.search(source_id=model)
    tmp2.df.head()
    members = tmp2.df['member_id']
    memlist = list()
    for member in list(members):
        a = tmp2.search(member_id=member, variable_id='siconc').df['activity_id'].empty
        b = tmp2.search(member_id=member, variable_id='thetao').df['activity_id'].empty
        c = tmp2.search(member_id=member, variable_id='so').df['activity_id'].empty
        if not a and not b and not c and member not in memlist:
            memlist.append(member)
    filt_dict[model] =  memlist
            


In [9]:
filt_dict

{'CNRM-CM6-1': ['r10i1p1f2'],
 'HadGEM3-GC31-LL': ['r2i1p1f3'],
 'GFDL-CM4': ['r1i1p1f1'],
 'MIROC-ES2L': ['r1i1p1f2'],
 'CanESM5': ['r10i1p1f1'],
 'CNRM-ESM2-1': ['r2i1p1f2'],
 'EC-Earth3-Veg': ['r1i1p1f1'],
 'CAMS-CSM1-0': ['r1i1p1f1'],
 'SAM0-UNICON': ['r1i1p1f1'],
 'NESM3': ['r1i1p1f1'],
 'UKESM1-0-LL': ['r1i1p1f2'],
 'IPSL-CM6A-LR': ['r10i1p1f1'],
 'CESM2': ['r10i1p1f1',
  'r11i1p1f1',
  'r1i1p1f1',
  'r2i1p1f1',
  'r3i1p1f1',
  'r4i1p1f1',
  'r5i1p1f1',
  'r6i1p1f1',
  'r7i1p1f1',
  'r8i1p1f1',
  'r9i1p1f1'],
 'MIROC6': ['r10i1p1f1']}

# Get the datasets out

In [None]:
bigdict = dict()
for model in models[:3]:
    tmp = cat.search(source_id=model, member_id=filt_dict[model])
    tmpdict = tmp.to_dataset_dict(zarr_kwargs={'consolidated': True},
                                  cdf_kwargs={'chunks': {}})
    bigdict.update(tmpdict)
    tmp = None
    tmpdict = None

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 2 group(s)
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 2 group(s)


In [12]:
tmpdict.keys()

dict_keys(['CMIP.NOAA-GFDL.GFDL-CM4.historical.Omon.gn', 'CMIP.NOAA-GFDL.GFDL-CM4.historical.SImon.gn'])

In [13]:
tmpdict['CMIP.NOAA-GFDL.GFDL-CM4.historical.Omon.gn']

<xarray.Dataset>
Dimensions:    (bnds: 2, lev: 35, member_id: 1, time: 1980, vertex: 4, x: 1440, y: 1080)
Coordinates:
  * bnds       (bnds) float64 1.0 2.0
  * time       (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00
  * lev        (lev) float64 2.5 10.0 20.0 32.5 ... 5e+03 5.5e+03 6e+03 6.5e+03
  * y          (y) float64 -80.39 -80.31 -80.23 -80.15 ... 89.73 89.84 89.95
  * x          (x) float64 -299.7 -299.5 -299.2 -299.0 ... 59.53 59.78 60.03
  * member_id  (member_id) <U8 'r1i1p1f1'
Dimensions without coordinates: vertex
Data variables:
    lon_bnds   (y, x, vertex) float32 dask.array<chunksize=(1080, 1440, 4), meta=np.ndarray>
    time_bnds  (time, bnds) object dask.array<chunksize=(1980, 2), meta=np.ndarray>
    lon        (y, x) float32 dask.array<chunksize=(1080, 1440), meta=np.ndarray>
    lat        (y, x) float32 dask.array<chunksize=(1080, 1440), meta=np.ndarray>
    lat_bnds   (y, x, vertex) float32 dask.array<chunksize=(1080, 1440, 4), meta=np.ndarray>
    l