In [8]:
import numpy as np
import xarray as xr
import dask
import intake
import gcsfs
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

In [13]:
def get_dictionary():
    """
    Find the historical-run models and ensemble members that provide all of
    siconc, thetao and so on the native grid.

    The intake-esm catalog is opened from a fixed URL and narrowed in two steps:

    1. keep only the models (``source_id``) that have ``siconc`` in ``SImon``
       and both ``thetao`` and ``so`` in ``Omon`` (``grid_label='gn'``);
    2. for each remaining model, keep only the ensemble members (``member_id``)
       for which a search returns entries for every one of the three variables.

    Returns
    -------
    filt_dict : dict
        Maps each model name to the list of its member ids that have all three
        variables, in order of first appearance in the catalog.
    cat : intake-esm catalog
        The catalog restricted to the historical experiment, the selected
        models and the three variables.
    models : list
        The selected model names. Built from a set, so the order is arbitrary.

    All three values are meant to be passed on to ``get_datasets``.
    """
    # (table_id, variable_id) pairs that every selected model must provide.
    required = (('SImon', 'siconc'), ('Omon', 'thetao'), ('Omon', 'so'))

    print('opening intake-esm catalog...')
    url = "https://raw.githubusercontent.com/andrewpauling/cmip6hack-so-project/master/catalogs/pangeo-cmip6.json"
    col = intake.open_esm_datastore(url)
    print('done')

    cat = col.search(experiment_id=['historical'], table_id=['SImon', 'Omon'],
                     grid_label='gn')
    uni_dict = cat.unique(['source_id', 'experiment_id', 'table_id', 'member_id'])

    print('Find the models that have all three variables...')
    # Start from every model and drop those missing any required variable.
    models = set(uni_dict['source_id']['values'])
    for table_id, variable_id in required:
        cat = col.search(experiment_id='historical', table_id=table_id,
                         variable_id=variable_id, grid_label='gn')
        models &= set(cat.df.source_id.unique().tolist())

    models = list(models)
    print('Done')

    cat = col.search(experiment_id='historical', table_id=['Omon', 'SImon'],
                     variable_id=['siconc', 'thetao', 'so'], grid_label='gn',
                     source_id=models)

    print('Make sure all three variables have the same ensemble member...')
    filt_dict = dict()

    for model in models:
        model_cat = cat.search(source_id=model)
        # A member id appears once per catalog row (variable, table, ...).
        # De-duplicate it up front so each member is checked exactly once;
        # unique() keeps the order of first appearance.
        members = model_cat.df['member_id'].unique().tolist()
        filt_dict[model] = [
            member for member in members
            if all(not model_cat.search(member_id=member,
                                        variable_id=variable_id).df.empty
                   for _, variable_id in required)
        ]

    print('Done')

    return filt_dict, cat, models
        
def get_datasets(filt_dict, cat, models):
    """
    Load the dataset dictionaries for siconc, so and thetao.

    Takes the three values returned by ``get_dictionary()``. For every model
    the catalog is searched once per variable, restricted to the member ids
    listed for that model in ``filt_dict``, and the result of
    ``to_dataset_dict`` is merged into that variable's dictionary.

    Parameters
    ----------
    filt_dict : dict
        Maps each model name to the list of member ids to load.
    cat : intake-esm catalog
        Catalog to search; must offer ``search`` and the search result must
        offer ``to_dataset_dict``.
    models : list
        Model names to load; each must be a key of ``filt_dict``.

    Returns
    -------
    icedict, sodict, thetaodict : dict
        One dataset dictionary per variable (siconc, so, thetao). They are kept
        separate, one per variable, due to problems with intake-esm for some
        models.
    """
    # One accumulator per variable, in the order they are loaded and returned.
    datasets = {'siconc': dict(), 'so': dict(), 'thetao': dict()}

    for model in models:
        print(model)
        for variable_id, merged in datasets.items():
            subset = cat.search(source_id=model, member_id=filt_dict[model],
                                variable_id=variable_id)
            # Fresh keyword dictionaries on every call, so nothing is shared
            # between loads. Times are left undecoded.
            merged.update(subset.to_dataset_dict(
                zarr_kwargs={'consolidated': True, 'decode_times': False},
                cdf_kwargs={'chunks': {}, 'decode_times': False}))

    return datasets['siconc'], datasets['so'], datasets['thetao']
    

In [14]:
# Build the model/member selection in this cell so it runs on a fresh kernel
# (Restart & Run All) instead of relying on variables left over from an
# earlier execution.
filt_dict, cat, models = get_dictionary()
icedict, sodict, thetaodict = get_datasets(filt_dict, cat, models)

CESM2
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
EC-Earth3-Veg
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
--> The keys in the returned dictionary of datasets are cons