In [1]:
# Install a blanket "ignore" filter so that no warnings are shown by later cells.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import xarray as xr
import pandas as pd

import os
import glob
import pickle

In [3]:
# Open the ESM catalogue described by the pangeo-cmip6.json file on Google Cloud
# Storage; `col` is the object every later `col.search(...)` call queries.
from intake import open_esm_datastore
col = open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")

In [4]:
# Table of models to process, one row per model. Later cells read its
# 'source_id', 'member_id', 'grid_label', 'xname' and 'zstore_hfds' columns and
# address rows by position, so the default integer index is relied upon.
datapd = pd.read_csv('all_new.csv')

In [30]:
# Classify every model listed in `datapd` by whether its piControl `hfds` (Omon)
# output can be found in the catalogue `col`, and collect the names of the models
# that have no exact (member_id + grid_label) match in `list_not_on_cloud`.
list_not_on_cloud = []
for i in range(len(datapd)):
    name = datapd.at[i, 'source_id']
    print("{} {}".format(i, name))

    # Exact query: model, ensemble member and grid label must all match.
    exact = col.search(
        variable_id=['hfds'],
        table_id='Omon',
        source_id=name,
        member_id=datapd.at[i, 'member_id'],
        experiment_id='piControl',
        grid_label=datapd.at[i, 'grid_label'],
    )
    n_exact = len(exact.df)

    if n_exact == 1:
        print("    [v] data on cloud.")
        continue
    if n_exact > 1:
        print("    [ ] more than 1 file.")
        continue

    # No exact match. A relaxed query (any member / grid) only decides which
    # message is printed; the model is recorded as "not on cloud" either way.
    relaxed = col.search(
        variable_id=['hfds'],
        table_id='Omon',
        source_id=name,
        experiment_id='piControl',
    )
    list_not_on_cloud.append(name)
    if len(relaxed.df) == 0:
        print("    [ ] not on cloud.")
    else:
        print("    [ ] different grid/member id.")

0 BCC-CSM2-MR
    [ ] not on cloud.
1 BCC-ESM1
    [ ] not on cloud.
2 CAMS-CSM1-0
    [v] data on cloud.
3 ACCESS-ESM1-5
    [v] data on cloud.
4 ACCESS-CM2
    [v] data on cloud.
5 GFDL-CM4
    [v] data on cloud.
6 GFDL-ESM4
    [v] data on cloud.
7 KIOST-ESM
    [ ] not on cloud.
8 NESM3
    [v] data on cloud.
9 CanESM5
    [v] data on cloud.
10 CanESM5-1
    [ ] not on cloud.
11 CanESM5-CanOE
    [v] data on cloud.
12 CMCC-CM2-SR5
    [v] data on cloud.
13 CMCC-ESM2
    [v] data on cloud.
14 EC-Earth3
    [v] data on cloud.
15 EC-Earth3-CC
    [ ] not on cloud.
16 EC-Earth3-LR
    [v] data on cloud.
17 EC-Earth3-Veg
    [v] data on cloud.
18 EC-Earth3-Veg-LR
    [v] data on cloud.
19 EC-Earth3-AerChem
    [ ] not on cloud.
20 CNRM-CM6-1
    [v] data on cloud.
21 CNRM-CM6-1-HR
    [v] data on cloud.
22 CNRM-ESM2-1
    [v] data on cloud.
23 HadGEM3-GC31-LL
    [v] data on cloud.
24 HadGEM3-GC31-MM
    [v] data on cloud.
25 UKESM1-0-LL
    [v] data on cloud.
26 UKESM1-1-LL
    [ ] not

In [32]:
list_not_on_cloud

['BCC-CSM2-MR',
 'BCC-ESM1',
 'KIOST-ESM',
 'CanESM5-1',
 'EC-Earth3-CC',
 'EC-Earth3-AerChem',
 'UKESM1-1-LL',
 'IPSL-CM5A2-INCA',
 'IPSL-CM6A-MR1',
 'MIROC6',
 'MIROC-ES2L',
 'MIROC-ES2H',
 'CAS-ESM2-0',
 'GISS-E2-1-H',
 'GISS-E2-2-H',
 'INM-CM4-8',
 'NorESM2-MM',
 'MPI-ESM-1-2-HAM',
 'ICON-ESM-LR',
 'E3SM-2-0',
 'E3SM-2-0-NARRM']

In [5]:
def ispickleexists(n, p0):
    """Tell whether the pickle file for `n` is present on disk.

    The path tested is the plain string concatenation ``p0 + n + '.pickle'``,
    so `p0` must already end with a path separator when it names a directory.

    Returns:
        True if that path exists, False otherwise.
    """
    pickle_path = p0 + n + '.pickle'
    return os.path.exists(pickle_path)

def openpickle(n, p0):
    """Load and return the object stored at ``p0 + n + '.pickle'``.

    Reading is delegated to ``pd.read_pickle``. As with any unpickling, only
    files from a trusted source should be opened this way.
    """
    return pd.read_pickle(p0 + n + '.pickle')

def savepickle(n, p0, sf):
    """Pickle `sf` to ``p0 + n + '.pickle'`` with the highest pickle protocol.

    The target file is opened in binary write mode, so an existing file at
    that path is overwritten. Nothing is returned.
    """
    target = p0 + n + '.pickle'
    with open(target, mode='wb') as sink:
        pickle.dump(sf, sink, protocol=pickle.HIGHEST_PROTOCOL)

def open_from_cloud(dslist):
    """Open the first entry of a catalogue search result as a dataset.

    `dslist` is iterated to obtain its keys; the entry under the first key is
    opened with ``.to_dask()``. If the resulting dataset carries a
    'member_id' or 'dcpp_init_year' coordinate, it is squeezed along that name
    and the coordinate is then dropped.

    Returns:
        The opened dataset without those two coordinates.
    """
    first_key = list(dslist)[0]
    ds = dslist[first_key].to_dask()

    # presumably both are length-1 dimensions added by the catalogue — TODO confirm
    for coord in ('member_id', 'dcpp_init_year'):
        if coord not in ds.coords:
            continue
        squeezed = ds.squeeze(coord)
        ds = squeezed.reset_coords(coord, drop=True)
    return ds

def open_hf_nc(mf):
    """Open the netCDF file(s) `mf` as one dataset via ``xr.open_mfdataset``.

    `mf` is passed straight through; ``use_cftime=True`` is requested for
    time decoding.
    """
    dataset = xr.open_mfdataset(mf, use_cftime=True)
    return dataset

def select_month(da, n):
    """Return the time steps of `da` whose month (``da.time.dt.month``) equals `n`."""
    in_month = da.time.dt.month == n
    return da.isel(time=in_month)

def skip_unstructure_grid(datapd, i):
    """Decide whether row `i` of `datapd` should be skipped.

    A row is skipped when its 'xname' cell is missing (NaN/None); in that case
    a notice is printed and True is returned. Otherwise nothing is printed and
    False is returned.
    """
    has_xname = not pd.isna(datapd.at[i, 'xname'])
    if has_xname:
        return False
    print("    [x] doesn't have regular grid.")
    return True



In [None]:
# Directory searched for locally stored CMIP6 netCDF files.
p_nc = '../../../data/model/CMIP6/'

# For every model with a regular grid: open piControl `hfds` (from local netCDF
# when the table lists no 'zstore_hfds', otherwise from the cloud catalogue) and
# check that it can be masked with the stored `mld` field (presumably mixed-layer
# depth — TODO confirm) and that both have the same number of time steps.
for i in range(0, len(datapd)):
    name = datapd.at[i, 'source_id']
    print("{} {}".format(i, name))
    if skip_unstructure_grid(datapd, i):
        continue

    if pd.isna(datapd.at[i, 'zstore_hfds']):
        # No cloud store listed for this row: look for local files instead.
        data_path = p_nc + 'hfds_Omon_' + name + '_piControl_' + '*' + '.nc'
        matching_files = glob.glob(data_path)
        if len(matching_files) == 0:
            print("    [x] no data.")
            continue
        ds_hf = open_hf_nc(matching_files)
    else:
        dslist = col.search(
            variable_id=['hfds'],
            table_id='Omon',
            source_id=name,
            member_id=datapd.at[i, 'member_id'],
            experiment_id='piControl',
            grid_label=datapd.at[i, 'grid_label'],
        )
        if len(dslist.df) == 0:
            print("    [ ] error.")
            continue
        ds_hf = open_from_cloud(dslist)

    # A missing pickle used to raise out of the loop and abort the whole scan;
    # report it and move on, like every other per-model problem in this cell.
    if not ispickleexists(name, 'data_mld/'):
        print("    [x] no mld pickle.")
        continue
    mld = openpickle(name, 'data_mld/')

    # Best-effort check: failures (e.g. mismatched grids) are printed, not raised.
    try:
        hf_mld = ds_hf.hfds.where(mld >= 2000)
        if len(hf_mld.time) != len(mld.time):
            print("    [!] time length not the same.")
    except Exception as e:
        print("    An exception occurred:", e)

In [54]:
# Same consistency check as above, but the local/cloud decision is driven by
# `list_not_on_cloud` (built by the availability scan) plus one hand-picked model.
for i in range(0, len(datapd)):
    name = datapd.at[i, 'source_id']
    print("{} {}".format(i, name))
    if skip_unstructure_grid(datapd, i):
        continue

    if (name in list_not_on_cloud) or (name in ['CAMS-CSM1-0']):
        # NOTE(review): 'mnt/d/CMIP6/' is a *relative* path; it looks like the
        # absolute mount '/mnt/d/CMIP6/' may have been intended — confirm.
        data_path = (
            'mnt/d/CMIP6/hfds_Omon_' + name + '_piControl_'
            + datapd.at[i, 'member_id'] + '*' + datapd.at[i, 'grid_label'] + '*' + '.nc'
        )
        matching_files = glob.glob(data_path)
        if len(matching_files) == 0:
            # Models in this list are reported as having no data at all; for any
            # other model an absent file is flagged as a missing download.
            if name not in ['BCC-CSM2-MR', 'BCC-ESM1', 'GISS-E2-1-H', 'GISS-E2-2-H', 'INM-CM4-8', 'MIROC-ES2L', 'MIROC-ES2H']:
                print("    [!] nc file missing")
            else:
                print("    [x] no data.")
            continue
        ds_hf = open_hf_nc(matching_files)
    else:
        dslist = col.search(
            variable_id=['hfds'],
            table_id='Omon',
            source_id=name,
            member_id=datapd.at[i, 'member_id'],
            experiment_id='piControl',
            grid_label=datapd.at[i, 'grid_label'],
        )
        if len(dslist.df) == 0:
            print("    [ ] error.")
            continue
        ds_hf = open_from_cloud(dslist)

    # A missing pickle used to raise out of the loop and abort the whole scan;
    # report it and move on, like every other per-model problem in this cell.
    if not ispickleexists(name, 'data_mld/'):
        print("    [x] no mld pickle.")
        continue
    mld = openpickle(name, 'data_mld/')

    # Best-effort check: failures (e.g. mismatched grids) are printed, not raised.
    try:
        hf_mld = ds_hf.hfds.where(mld >= 2000)
        if len(hf_mld.time) != len(mld.time):
            print("    [!] time length not the same.")
    except Exception as e:
        print("    An exception occurred:", e)

0 BCC-CSM2-MR
    [ ] not on cloud.
1 BCC-ESM1
    [ ] not on cloud.
2 CAMS-CSM1-0
    [ ] not on cloud.
3 ACCESS-ESM1-5
4 ACCESS-CM2
5 GFDL-CM4
6 GFDL-ESM4
7 KIOST-ESM
    [ ] not on cloud.
8 NESM3
9 CanESM5
10 CanESM5-1
    [ ] not on cloud.
11 CanESM5-CanOE
12 CMCC-CM2-SR5
13 CMCC-ESM2
14 EC-Earth3
15 EC-Earth3-CC
    [ ] not on cloud.
16 EC-Earth3-LR
17 EC-Earth3-Veg
    [!] time length not the same.
18 EC-Earth3-Veg-LR
19 EC-Earth3-AerChem
    [ ] not on cloud.
20 CNRM-CM6-1
    An exception occurred: cannot reindex or align along dimension 'y' because of conflicting dimension sizes: {76, 294}
21 CNRM-CM6-1-HR
    An exception occurred: cannot reindex or align along dimension 'y' because of conflicting dimension sizes: {296, 1050}
22 CNRM-ESM2-1
    An exception occurred: cannot reindex or align along dimension 'y' because of conflicting dimension sizes: {76, 294}
23 HadGEM3-GC31-LL
24 HadGEM3-GC31-MM
25 UKESM1-0-LL
26 UKESM1-1-LL
    [ ] not on cloud.
27 IPSL-CM5A2-INCA
    [ ] n

In [76]:
# Debug run of the hfds/mld consistency check for a single row of `datapd`.
for i in [42]:
    name = datapd.at[i, 'source_id']
    print("{} {}".format(i, name))
    if skip_unstructure_grid(datapd, i):
        continue

    # Models without an exact cloud match, plus two hand-picked ones, are skipped
    # here. The local-netCDF loading that used to follow this `continue` could
    # never execute and has been removed.
    # NOTE(review): the output recorded for this cell shows SAM0-UNICON reaching
    # the time-length check, so it was produced by a different version of this
    # cell — confirm whether skipping or local loading is the intended behaviour.
    if (name in list_not_on_cloud) or (name in ['CAMS-CSM1-0', 'SAM0-UNICON']):
        continue

    dslist = col.search(
        variable_id=['hfds'],
        table_id='Omon',
        source_id=name,
        member_id=datapd.at[i, 'member_id'],
        experiment_id='piControl',
        grid_label=datapd.at[i, 'grid_label'],
    )
    if len(dslist.df) == 0:
        print("    [ ] error.")
        continue
    ds_hf = open_from_cloud(dslist)

    hf = ds_hf.hfds

    # A missing pickle would otherwise raise out of the loop; report it instead.
    if not ispickleexists(name, 'data_mld/'):
        print("    [x] no mld pickle.")
        continue
    mld = openpickle(name, 'data_mld/')

    # Best-effort check: failures (e.g. mismatched grids) are printed, not raised.
    try:
        hf_mld = hf.where(mld >= 2000)
        if len(hf_mld.time) != len(mld.time):
            print("    [!] time length not the same.")
    except Exception as e:
        print("    An exception occurred:", e)

42 SAM0-UNICON
    [!] time length not the same.
