### Directly Download from GC and save as netcdf files
- This is for those who cannot use zarr/python for processing the CMIP6 datasets
- Please note that the netcdf files have CF-compliant time grids, but might not be what you are used to

In [59]:
import numpy as np
import pandas as pd
import os
import gcsfs #google cloud file system. 
import xarray as xr
import warnings
from glob import glob # use * !
import scipy.io as sio

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (8,5)

In [60]:
from utilities import search_df, add_time_info, get_zdict #extra functions

### Initialization

In [61]:
def compute_area_and_seasonal_mean(danom, xlim, ylim, slim, mask=1):
    """
    Weights each grid point by the cos(latitude), computes area mean, normalizing by areaa mean of the weights
    returns:
        DataArray:  global mean for each model
    """  
    xlim = np.array(xlim)
    xlim += (xlim<0 )*360
    if xlim[0]>xlim[1]:
        lon_sel = (danom.lon>xlim[0])+(danom.lon<xlim[1])
    else:
        lon_sel = (danom.lon>xlim[0])*danom.lon<xlim[1]
    
    if type(mask)!=int:
        mask = (xr.ones_like(danom)*mask).isel({'lat': (danom.lat>ylim[0])*(danom.lat<ylim[1]), 'lon': lon_sel})
    
    danom = danom.isel({'lat': (danom.lat>ylim[0])*(danom.lat<ylim[1]), 'lon': lon_sel, 'time': (danom['time'].dt.month >= slim[0])*(danom['time'].dt.month <= slim[1])})
    coslat = np.cos(np.deg2rad(danom.lat))
    weights = xr.ones_like(danom)*coslat*mask
    weight_mean = weights.mean(['lat','lon'], keep_attrs=True)
    area_mean = (danom * weights).mean(['lat','lon'], keep_attrs=True)/weight_mean
    if area_mean.time.dtype!='datetime64[ns]':
        area_mean['time'] = area_mean.indexes['time'].year
        return area_mean.groupby('time').mean(dim='time', keep_attrs=True)
        #area_mean.indexes['time'].to_datetimeindex()
        #there are lots of dftime.DatetimeNoLeap. this would matter a touch if I used a weighted average.
    else:   
        #month_length = danom.time.dt.days_in_month
        #this actually isn't right! Let's just do an unweighted mean for now :(
        #weights = month_length/sum(month_length[slim[0]:slim[1]+1])
        #Sm = (weights*area_mean)
        #return Sm.groupby(grp).sum(dim='time', keep_attrs=True)
        return area_mean.groupby(area_mean.time.dt.year).mean(dim='time', keep_attrs=True)
    
    #technically this is inconsistent handling; one makes a dimension called "time" and the other makes "years". I have to deal with that in my matlab code

In [62]:
# Where to write local netcdf files:
username = os.environ['JUPYTERHUB_USER']
mach = os.uname()[1]

zarr_local = f'/home/{username}/netcdf/cmip6/preprocessed'
if not os.path.exists(zarr_local):
    print(f'Please create the directory {zarr_local}')
    

In [63]:
# This is the master CMIP6 Google Cloud catalog
df_cloud = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')
df_cloud.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version,status,severity,issue_url
0,AerChemMIP,AS-RCEC,TaiESM1,histSST,r1i1p1f1,AERmon,od550aer,gn,gs://cmip6/AerChemMIP/AS-RCEC/TaiESM1/histSST/...,,20200310,good,none,none
1,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrbc,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,,20190718,good,none,none
2,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrdust,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,,20191127,good,none,none
3,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmroa,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,,20190809,good,none,none
4,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,mmrso4,gn,gs://cmip6/AerChemMIP/BCC/BCC-ESM1/histSST/r1i...,,20191127,good,none,none


### Choose basic configuration parameters

In [140]:
# Here we search the CMIP6 data for the datasets you need - using the same keywords as at the ESGF sites
#       https://esgf-node.llnl.gov/search/cmip6/

debug = False

# must choose ONE table_id  (only works for *mon or *day)
table_id = 'Amon'

#must choose LIST of experiments, variables
experiments = ['historical']#,'hist-aer', , 'hist-nat', 'hist-GHG', 'piControl' 'amip-hist',
variables = ['hus']

location = 'Sahel' #Ocean

# can specify 'All' or give a list or string
sources = ['ACCESS-ESM1-5']#'CNRM-ESM2-1']#'NorESM2-LM']# 'GFDL-ESM4'['CanESM5-CanOE']#CMCC-CM2-SR5']#'CIESM']#MCM-UA-1-0']  #AWI-CM-1-1-MR']#SAM0-UNICON']# omit the [] to get all models with CESM2 in their name
#sources = 'All'
members = ['r10i1p1f1']#'r9i1p1f2']#
#members = 'All'

In [141]:
search = {'table_id':table_id}
search['experiment_id'] = experiments
search['variable_id'] = variables
if sources != 'All':
    search['source_id'] = sources
if members != 'All':
    search['member_id'] = members
    
df_available = search_df(df_cloud, **search)

print('number of matching datasets',len(df_available))

#523 historical simulations < 536 on the cite directly. Do I want to figure out which simulations are missing? 
# Or do I trust that they are missing for a reason?

number of matching datasets 1


In [142]:
df_available

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version,status,severity,issue_url
43839,CMIP,CSIRO,ACCESS-ESM1-5,historical,r10i1p1f1,Amon,hus,gn,gs://cmip6/CMIP/CSIRO/ACCESS-ESM1-5/historical...,,20200605,good,none,none


In [143]:
# For proper debugging, it is helpful to add time grid information to dataframe:
if debug:
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        dfa = add_time_info(df_available)
else:
    dfa = df_available.copy()

In [144]:
#only use the MASK code for TS!
if location=='Ocean':
    search_mask = {'table_id':'fx'}
    search_mask['experiment_id'] = ['historical', 'piControl', '1pctCO2','hist-resIPO','hist-1950HC']
    search_mask['variable_id'] = ['sftlf']
    if sources != 'All':
        search_mask['source_id'] = sources
    if members != 'All':
        search_mask['member_id'] = members
    historical_mask = search_df(df_cloud, **search_mask)

    print('number of mask datasets',len(historical_mask))

In [145]:
#only use MASK code for TS!
if location=='Ocean':
    def get_ids(dfa, id_name):
        zdicts = list(map(get_zdict, list(dfa.zstore.values)))
        return set(map(lambda x: x[id_name], zdicts))

    id_name = 'source_id'
    historical_mask_models = get_ids(historical_mask, id_name)
    sst_models = get_ids(dfa, id_name)

    maskable_models = sst_models.intersection(historical_mask_models)

    gsurls = np.array([gsurl for gsurl in dfa.zstore.values if get_zdict(gsurl)['source_id'] in maskable_models])
    missing = np.array([gsurl for gsurl in dfa.zstore.values if not get_zdict(gsurl)['source_id'] in maskable_models])

    masks = historical_mask.groupby('source_id').first()

    gsurls


In [146]:
if location=='Ocean':
    missing

In [147]:
# for pr:
if location=='Sahel':
    gsurls = np.array(dfa.zstore.values)

In [148]:
if debug:
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    dm = dfa[['experiment_id','source_id','member_id','variable_id','start','stop']].groupby([
             'experiment_id','start','stop','source_id']).nunique()[['member_id']]

    table = pd.DataFrame.pivot_table(dm,
                                     values='member_id',
                                     index=['source_id','start','stop'],
                                     columns=['experiment_id'],
                                     aggfunc=np.sum,
                                     fill_value=0)
    print(table)

In [149]:
dfa.issue_url.unique()

array(['none'], dtype=object)

In [150]:
fs = gcsfs.GCSFileSystem(token='anon', access='read_only') #the actual files, not the list of files woohoo FILE SYSTEM
#fs.get_mapper()

In [151]:
#gsurls = dfa.zstore.values #zstore is the url where the data is stored

ds_list = []
ds_failed_list = []

def update_vars(ds):
    for var in [var for var in ds.coords]:
        if 'bounds' in var:
            nvar = var.replace('bounds','bnds')
            #print(var,nvar)
            ds = ds.rename({var:nvar})
        if 'latitude' in var:
            nvar = var.replace('latitude','lat')
            #print(var,nvar)
            ds = ds.rename({var:nvar})
        if 'longitude' in var:
            nvar = var.replace('longitude','lon')
            #print(var,nvar)
            ds = ds.rename({var:nvar})
    return ds        
    
    
for gsurl in gsurls:
    zdict = get_zdict(gsurl) #naomi func for metadata
    institution = zdict['institution_id']
    model = zdict['source_id']
    run = zdict['member_id']
    variable = zdict['variable_id']
    expt = zdict['experiment_id']
    filename = f'{variable}_{institution}_{model}_{run}'
    ncdir = f'{zarr_local}/{expt}'
    ncfile = f'{ncdir}/{filename}.nc'
    
    replace = False
    if(not replace):
        ncfiles = glob(ncfile) #check not to double-download files
        if len(ncfiles) > 0:
            print(ncfiles, 'already exists')
            continue
        
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        ds0 = xr.open_zarr(fs.get_mapper(gsurl),consolidated=True) #gets info about the file. get_mapper! always use consolidated=True
        if variable=='ts':
            mask_ds = xr.open_zarr(fs.get_mapper(masks['zstore'][model]),consolidated=True) 
        
    #month_length = ds.time.dt.days_in_month #for some reason the first one doesn't have this...
        
    ds0 = update_vars(ds0)
    
    if location=='Ocean':
        mask_ds = update_vars(mask_ds)
        ls_mask = np.floor(1-mask_ds.sftlf.values/100)
    
    try:
        if variable=='hus':
            ds = ds0.isel({'plev':ds0.plev==8.5e4})
        else:
            ds = ds0
        if location=='Ocean':
            NA = compute_area_and_seasonal_mean(ds, [-75,-15], [10,40], [7,9])
            GT = compute_area_and_seasonal_mean(ds, [0,360],[-20,20],[7,9], mask=ls_mask)
            Sm = NA
            Sm = Sm.rename_vars({'ts':'NA'})
            Sm['GT'] = GT.ts
            Sm['NARI'] = NA.ts - GT.ts
        else:
            Sm = compute_area_and_seasonal_mean(ds, [-20,40], [12,18], [7,9])
  
    except ValueError:
        print(f'value error for {ncfile}')
        ds_failed_list += [ds]
        continue
        
    #have to customize this to the variable I'm using!
    if variable=='ts':
        if not ds.ts.attrs['units']=='K':
            print("cannot comprehend units ({}), skipping model {}".format(ds.ts.attrs['units'], model))
            continue
    elif variable=='pr':
        if ds.pr.attrs['units'] == 'kg m-2 s-1':
            Sm *= 86400
        else:
            print("cannot comprehend units ({}), skipping model {}".format(ds.pr.attrs['units'], model))
            continue
    elif (variable=='huss'):
        Sm *= float(ds.huss.attrs['units'])*1000 #convert to g/kg
    elif (variable=='hus'):
        Sm *= float(ds.hus.attrs['units'])*1000 #convert to g/kg
    else:
        print("need to make new units case for variable {}".format(variable))

    os.system(f'mkdir -p {ncdir}')
    try:
        Sm.to_netcdf(ncfile,mode='w',unlimited_dims=['time','year'])  #saves the file. Don't have to do this before I'm ready! But ds is replaced each time...
        ds_list += [Sm]
        print(f'saving file {ncfile}')
    except ValueError:
        print(f'value error for {ncfile}')
        ds_failed_list += [Sm]
        continue    
        
    #ok I got an error for a model which uses i/j coordinates instead of lat lon! OY VEY...
    

['/home/rebecca/netcdf/cmip6/preprocessed/historical/hus_CSIRO_ACCESS-ESM1-5_r10i1p1f1.nc'] already exists


In [152]:
ds0 = update_vars(ds0)
ds_problem = ds0.isel({'plev':ds0.plev==8.5e4})

In [159]:
Sm = compute_area_and_seasonal_mean(ds, [-20,40], [12,18], [7,9])
Sm_problem = compute_area_and_seasonal_mean(ds_problem, [-20,40], [12,18], [7,9])

In [162]:
Sm_problem.hus.values



array([[0.00901301],
       [0.00894047],
       [0.00863619],
       [0.00843658],
       [0.00841754],
       [0.00851721],
       [0.00836657],
       [0.00911873],
       [0.00872886],
       [0.00879034],
       [0.00791434],
       [0.00909695],
       [0.00846504],
       [0.00862799],
       [0.00822676],
       [0.00810332],
       [0.00886661],
       [0.00813118],
       [0.00801133],
       [0.00872469],
       [0.00844564],
       [0.00787069],
       [0.00828803],
       [0.00820704],
       [0.00794621],
       [0.00812895],
       [0.00823282],
       [0.00834062],
       [0.0085458 ],
       [0.00846531],
       [0.00845579],
       [0.00835821],
       [0.00757988],
       [0.00846941],
       [0.00838911],
       [0.00803835],
       [0.00772928],
       [0.007924  ],
       [0.00890369],
       [0.00809326],
       [0.00776524],
       [0.00824889],
       [0.00858564],
       [0.0085038 ],
       [0.0082632 ],
       [0.00808441],
       [0.00853819],
       [0.008

In [158]:
ds.hus.values



array([[[[       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         [       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         [       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         ...,
         [0.00040919, 0.00040919, 0.00040919, ..., 0.00040296,
          0.00040919, 0.00040919],
         [0.00039056, 0.00039056, 0.00039056, ..., 0.00039056,
          0.00039056, 0.00039056],
         [0.00035301, 0.00035301, 0.00035301, ..., 0.00035301,
          0.00035301, 0.00035301]]],


       [[[       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         [       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         [       nan,        nan,        nan, ...,        nan,
                 nan,        nan],
         ...,
         [0.00039921, 0.00039921, 0.00039921, ..., 0.00039685,
          0.00039921, 0

In [None]:
xr.concat(ds_list, dim='dataset') #this isn't working bc different variables. Exist other ways to combine.

In [None]:
#regridding: xesmf (earth system modelling federation). works for lat-lon, but not time.

In [92]:
np.array_equal(ds2.sftlf, ds1.sftlf)

True

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (8,5)

ds.ts[0].plot()
(ds*xr.ones_like(ds)*np.cos(np.deg2rad(ds.lat))*ls_mask).ts[0].plot()

In [None]:
! tree -L 9 ~/CMIP6-downloads #unix tree of created files; I didn't create any.

In [None]:
! du -sh ~/CMIP6-downloads/*/*/*/*

In [None]:
ds = xr.open_dataset('/home/naomi/CMIP6-downloads/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/sfcWind/gn/sfcWind.nc')

In [None]:
ds.sfcWind.plot(vmin=0,vmax=12)

In [None]:
#Convert longitude coordinates from 0-359 to -180-179:

ds2 = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180)).sortby('lon')

#or

ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
ds = ds.sortby(ds.lon)

In [None]:
ds2.sfcWind[0].plot()