In [1]:
from easy_coloc import lib_easy_coloc
import xarray as xr
import pandas as pd
import cartopy as cart
import matplotlib.pylab as plt
from matplotlib import cm
import datetime
import cmocean
import numpy as np
import dateutil
import intake
import dask

def model_to_line(ovar_name=None,
                model=None,
                cruise_line=None,
                catalog_path='../catalogs/pangeo-cmip6.json',
                qc_path='../qc',
                output_path='../../sections/'):
    '''
    model_to_line(ovar_name, model, cruise_line)

    Select, from model output already sampled at the GLODAP stations, the
    stations (and matching sampling months) that belong to one cruise line.

    Input
    ==========
    ovar_name : variable name (eg 'dissic')
    model : model name (eg CanESM5)
    cruise_line : text looked for in the LINE column of
                  FILTERED_GLODAP_EXPOCODE.csv (eg 'A16'). It is passed to
                  pandas `str.contains`, so it is a substring / regular
                  expression match, not an exact match.
    catalog_path : not used by this function; kept so the signature
                   matches model_to_glodap
    qc_path : directory holding GLODAPv2.2019_COORDS.csv and
              FILTERED_GLODAP_EXPOCODE.csv
    output_path : prefix of the sampled files; the files opened are
                  '{output_path}{ovar_name}_{model}_r*f1.nc'

    Output
    ===========
    section : dataset of the sampled variable restricted to the stations of
              the cruise line, with one (station, time) pair per entry of
              the new 'station' dimension

    Raises
    ===========
    ValueError : if ovar_name, model or cruise_line is not given

    Example
    ============
    ds = model_to_line(ovar_name='dissic',
                   model='CanESM5',
                   cruise_line='A16')
    '''

    # Fail early, before touching any file, when a required argument is missing
    missing = [name for name, value in (('ovar_name', ovar_name),
                                        ('model', model),
                                        ('cruise_line', cruise_line))
               if value is None]
    if missing:
        raise ValueError(f"model_to_line is missing required argument(s): {', '.join(missing)}")

    # NOTE: no `combine=` is passed, so xarray's default way of merging the
    # matched files applies (recent xarray versions warn about this default).
    sampled_var = xr.open_mfdataset(f'{output_path}{ovar_name}_{model}_r*f1.nc')

    # load GLODAP station information from csv file
    # drop nans, reset index, and drop unneeded variable
    df = pd.read_csv(f'{qc_path}/GLODAPv2.2019_COORDS.csv')
    df = df.dropna()
    df = df.reset_index().drop('Unnamed: 0', axis=1)

    # one 'YYYY-MM-01' date string per station
    df['dates'] = [f'{int(year)}-{int(month):02d}-01' for year, month in zip(df.year, df.month)]

    # Glodap expo codes
    expc = pd.read_csv(f'{qc_path}/FILTERED_GLODAP_EXPOCODE.csv')

    # stations of every cruise whose LINE entry matches cruise_line;
    # na=False makes rows with a missing LINE count as "no match" instead of
    # putting NaN into the boolean mask
    on_line = expc.LINE.str.contains(cruise_line, na=False)
    cruise_x = df[df.cruise.isin(expc['ID'][on_line])]

    # Indexers sharing the new 'station' dimension make .sel pick one
    # (all_stations, time) pair per station instead of the outer product.
    section_dates = xr.DataArray(pd.to_datetime(cruise_x.dates).values, dims='station')
    stations = xr.DataArray(cruise_x.index, dims='station')

    section = sampled_var.sel(all_stations=stations, time=section_dates)

    return section




# new function

In [1]:
!pip install git+https://github.com/raphaeldussin/easy_coloc.git

Collecting git+https://github.com/raphaeldussin/easy_coloc.git
  Cloning https://github.com/raphaeldussin/easy_coloc.git to /tmp/pip-req-build-rxp3wv19
  Running command git clone -q https://github.com/raphaeldussin/easy_coloc.git /tmp/pip-req-build-rxp3wv19
Building wheels for collected packages: easy-coloc
  Building wheel for easy-coloc (setup.py) ... [?25ldone
[?25h  Created wheel for easy-coloc: filename=easy_coloc-1.2-cp37-none-any.whl size=15236 sha256=fbd5c585f01675f1a9e2f8b56140a7d1899f539f8ef02dc71d8eac9b197a03f5
  Stored in directory: /tmp/pip-ephem-wheel-cache-bklulcy3/wheels/d2/7d/b4/b59fd9036952a1fd5fd53be0197d0765da76ff584b04961e7d
Successfully built easy-coloc
Installing collected packages: easy-coloc
Successfully installed easy-coloc-1.2


In [5]:
from easy_coloc import lib_easy_coloc
import xarray as xr
import pandas as pd
import cartopy as cart
import matplotlib.pylab as plt
from matplotlib import cm
import datetime
import cmocean
import numpy as np
import dateutil
import intake
import dask

def model_to_glodap(ovar_name=None,
                model=None,
                catalog_path='../catalogs/pangeo-cmip6.json',
                qc_path='../qc',
                output_path='../../../sections/',
                n_sample_dates=125,
                batch_size=5):
    '''
    model_to_glodap(ovar_name, model)

    Sample one CMIP6 historical ocean variable at every GLODAP station and
    write the result to a netcdf file.

    ** THIS IS SLOW **

    Input
    ==========
    ovar_name : variable name (eg 'dissic')
    model : model name (eg CanESM5); must be one of the models listed in
            the `institutes` table below
    catalog_path : intake-esm catalog describing the CMIP6 archive
    qc_path : directory holding GLODAPv2.2019_COORDS.csv
    output_path : prefix of the output file. This argument used to be
                  ignored in favour of a hard-coded '../../../sections/';
                  it is now honoured and defaults to that same location.
    n_sample_dates : number of unique (sorted) station months to sample;
                     the default of 125 is what was previously hard-coded
                     as "the historical period"
    batch_size : number of sampling dates interpolated per call to
                 easy_coloc (previously hard-coded to 5)

    Output
    ===========
    ds : dataset of the sampled variable with dims (time, lev, all_stations).
         The same dataset is written to
         '{output_path}{ovar_name}_{model}_{member}.nc'

    Raises
    ===========
    ValueError : if ovar_name or model is not given, or batch_size < 1
    KeyError : if model is not a supported model

    Example
    ============
    ds = model_to_glodap(ovar_name='dissic',
                     model='CanESM5')
    '''

    # institution_id of each supported source_id; needed to build the key of
    # the dataset dictionary returned by intake-esm
    institutes = {'CanESM5':'CCCma',
                  'MIROC-ES2L':'MIROC',
                  'UKESM1-0-LL':'MOHC',
                  'GISS-E2-1-G-CC':'NASA-GISS',
                  'CESM2':'NCAR'
                 }

    # Validate before the (slow) catalog access
    if ovar_name is None or model is None:
        raise ValueError('model_to_glodap needs both ovar_name and model')
    if model not in institutes:
        raise KeyError(f'unsupported model {model!r}; expected one of {sorted(institutes)}')
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Get CMIP6 output from intake_esm
    col = intake.open_esm_datastore(catalog_path)
    cat = col.search(experiment_id='historical',
                     table_id='Omon',
                     variable_id=ovar_name,
                     grid_label='gn')

    # dictionary of subset data
    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True},
                                    cdf_kwargs={'chunks': {}})

    # Put data into dataset
    ds = dset_dict[f'CMIP.{institutes[model]}.{model}.historical.Omon.gn']

    # Rename olevel to lev
    if 'olevel' in ds.dims:
        ds = ds.rename({'olevel': 'lev'})

    # load GLODAP station information from csv file
    # drop nans, reset index, and drop unneeded variable
    df = pd.read_csv(f'{qc_path}/GLODAPv2.2019_COORDS.csv')
    df = df.dropna()
    df = df.reset_index().drop('Unnamed: 0', axis=1)

    # Generate one 'YYYY-MM-01' date string per station
    df['dates'] = [f'{int(year)}-{int(month):02d}-01' for year, month in zip(df.year, df.month)]

    # Unique dates are the sample dates; keep only the first n_sample_dates
    sample_dates = df['dates'].sort_values().unique()
    sample_dates = pd.to_datetime(sample_dates[:n_sample_dates])

    # Replace the model time axis by month-start stamps (freq='MS') so that
    # it can be indexed with the 'YYYY-MM-01' sample dates
    first_year, first_month = int(ds.time.dt.year[0]), int(ds.time.dt.month[0])
    last_year, last_month = int(ds.time.dt.year[-1]), int(ds.time.dt.month[-1])
    ds['time'] = pd.date_range(start=f'{first_year}-{first_month:02}',
                               end=f'{last_year}-{last_month:02}',
                               freq='MS')

    # ==========================================
    # Here we start making the ovar dataset
    # ==========================================
    # Trim the dates to sample_dates
    ovar = ds[ovar_name].sel(time=sample_dates)

    # easy_coloc is given the grid through 'lat'/'lon'; CESM2 and GISS
    # already name them that way, the other models use latitude/longitude.
    # (The previous `!= ... or != ...` test was always true and so touched
    # ds.lat / ds.lon for every model.)
    if model in ('CESM2', 'GISS-E2-1-G-CC'):
        ovar['lat'] = ds.lat
        ovar['lon'] = ds.lon
    else:
        ovar['lat'] = ds.latitude
        ovar['lon'] = ds.longitude

    # create source grid and target section objects
    # this requires lon,lat from stations and the source grid dataset containing lon,lat
    proj = lib_easy_coloc.projection(df['longitude'].values,df['latitude'].values,grid=ovar,
                                     from_global=True)

    realizations = cat.df[cat.df['source_id']==model].member_id.values

    # Only one ensemble member is sampled. With several members the first
    # entry along the leading dimension is taken.
    # NOTE(review): assumes the leading dimension is the member dimension and
    # that its first entry corresponds to realizations[0] - confirm.
    # (Exactly two members previously matched neither `< 2` nor `> 2` and
    # silently produced no output.)
    if len(realizations) > 1:
        ovar = ovar[0,]
    ovar = ovar.squeeze()

    n_dates = len(sample_dates)
    fld = np.zeros((n_dates, len(ovar.lev), len(df)))

    # interpolate a few dates at a time
    for start in range(0, n_dates, batch_size):
        batch = sample_dates[start:start + batch_size]
        fld[start:start + len(batch), :, :] = proj.run(ovar.sel(time=batch))

    # create datarray with sampling information
    sampled_var = xr.DataArray(fld,
                               dims=['time','lev','all_stations'],
                               coords={'time':ovar['time'],
                                       'lev':ovar['lev'],
                                       'all_stations':df.index.values,
                                       'dx':('all_stations',df.dx.values),
                                       'bearing':('all_stations',df.bearing.values),
                                       'lat':('all_stations',df.latitude.values),
                                       'lon':('all_stations',df.longitude.values),
                                      },
                               attrs={'units':ovar.units,
                                      'long_name':ovar.long_name
                                     }
                              )

    # Full member id in the file name (the multi-member branch used to keep
    # only its first character, giving '..._r.nc').
    sampled_ds = sampled_var.to_dataset(name=ovar.name)
    sampled_ds.to_netcdf(f'{output_path}{ovar.name}_{model}_{realizations[0]}.nc')

    return sampled_ds
        
#         fld = np.zeros((len(ovar.member_id),len(sample_dates),len(ovar.lev),len(df)))

#         ovar = ovar.squeeze()

#         for member_ind,realization in enumerate(realizations):
            
#             ovar = ovar.sel(member_ind=member_ind)
            
#             for ind in range(5, 130, 5):
#                 print(ind)
#                 dates = sample_dates[ind-5:ind]
#                 fld_tem = proj.run(ovar.sel(time=dates)[:])
#                 fld[ind-5:ind,:,:] = fld_tem

#             # create datarray with sampling information
#             sampled_var = xr.DataArray(fld,
#                                        dims=['time','lev','all_stations'],
#                                        coords={'time':ovar['time'],
#                                                'lev':ovar['lev'],
#                                                'all_stations':df.index.values,
#                                                'dx':('all_stations',df.dx.values),
#                                                'bearing':('all_stations',df.bearing.values),
#                                                'lat':('all_stations',df.latitude.values),
#                                                'lon':('all_stations',df.longitude.values),
#                                               },
#                                        attrs={'units':ovar.units,
#                                               'long_name':ovar.long_name
#                                              }
#                                       )

#             ds = sampled_var.to_dataset(name=ovar.name)
#             ds.to_netcdf(f'{output_path}{ovar.name}_{model}_{realization[0]}.nc')

In [3]:
# model_to_glodap(model='CanESM5',ovar_name='dissic',catalog_path='../../catalogs/pangeo-cmip6.json',qc_path='../../qc')

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 8 group(s)


In [26]:
# CanESM5 'dissic' at the stations of the cruises whose LINE entry
# contains 'A01W OVIDE'; paths are relative to this notebook.
ds = model_to_line(
    ovar_name='dissic',
    model='CanESM5',
    cruise_line='A01W OVIDE',
    catalog_path='../../catalogs/pangeo-cmip6.json',
    qc_path='../../qc',
    output_path='../../../sections/',
)

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).
  from_openmfds=True,


In [None]:
# 'CanESM5',

In [None]:
# Variables to sample, and the models to sample them from.
# Commented-out entries are currently disabled.
ovar_names = ['thetao', 'so']
models = [
    # 'MIROC-ES2L',
    # 'UKESM1-0-LL',
    'CESM2',
    'GISS-E2-1-G-CC',
]

# Sample every variable for every model; the prints are progress markers
# because each call is slow.
for ovar_name in ovar_names:
    print(ovar_name)
    for model in models:
        print(model)
        model_to_glodap(
            model=model,
            ovar_name=ovar_name,
            catalog_path='../../catalogs/pangeo-cmip6.json',
            qc_path='../../qc',
        )

thetao
CESM2
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 23 group(s)
