# Test CMIP6



In [1]:
import xarray as xr
from clisops.core import subset
from clisops.ops import subset as subset_op
from clisops.utils.dataset_utils import open_xr_dataset

import time
import os

In [2]:
# Directory of the CMIP6 input data set (absolute path on the local file system).
basedir_cmip6 = "/mnt/lustre/work/ik1017/CMIP6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/day/ta/gn/v20190710"

# Full path of the NetCDF file used as input by every section of this notebook.
cmip6_nc = "/".join(
    [basedir_cmip6, "ta_day_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_20100101-20141231.nc"]
)
cmip6_nc

'/mnt/lustre/work/ik1017/CMIP6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/day/ta/gn/v20190710/ta_day_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_20100101-20141231.nc'

## ncdump - CMIP6

In [3]:
! du -sh {cmip6_nc}

1.6G	/mnt/lustre/work/ik1017/CMIP6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/day/ta/gn/v20190710/ta_day_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_20100101-20141231.nc


In [4]:
! ncdump -sh {cmip6_nc}

netcdf ta_day_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_20100101-20141231 {
dimensions:
	time = UNLIMITED ; // (1826 currently)
	plev = 8 ;
	lat = 192 ;
	lon = 384 ;
	bnds = 2 ;
variables:
	double time(time) ;
		time:bounds = "time_bnds" ;
		time:units = "days since 1850-1-1 00:00:00" ;
		time:calendar = "proleptic_gregorian" ;
		time:axis = "T" ;
		time:long_name = "time" ;
		time:standard_name = "time" ;
		time:_Storage = "chunked" ;
		time:_ChunkSizes = 1 ;
		time:_Endianness = "little" ;
	double time_bnds(time, bnds) ;
		time_bnds:_Storage = "chunked" ;
		time_bnds:_ChunkSizes = 1, 2 ;
		time_bnds:_Shuffle = "true" ;
		time_bnds:_DeflateLevel = 1 ;
		time_bnds:_Endianness = "little" ;
	double plev(plev) ;
		plev:units = "Pa" ;
		plev:axis = "Z" ;
		plev:positive = "down" ;
		plev:long_name = "pressure" ;
		plev:standard_name = "air_pressure" ;
		plev:_Storage = "contiguous" ;
		plev:_Endianness = "little" ;
	double lat(lat) ;
		lat:bounds = "lat_bnds" ;
		lat:units = "degrees_north" ;
	

## helper functions

In [5]:
def ds_info(ds):
    """Print a short report about the encoding settings of a dataset.

    Three sections are printed:

    1. the ``complevel`` encoding entry of every data variable (0 if unset),
    2. the ``_FillValue`` encoding entry of every coordinate and data variable
       (``None`` if unset),
    3. the ``zlib``/``shuffle``/``complevel`` encoding entries of a fixed list
       of string variables; variables or entries that do not exist are
       silently skipped.

    Parameters
    ----------
    ds
        Object exposing ``data_vars``, ``coords`` and ``ds[name].encoding``
        (used here with xarray datasets).
    """
    # Section 1: compression level per data variable
    print("data vars with compression:")
    for name in ds.data_vars:
        level = ds[name].encoding.get("complevel", 0)
        print(name, "compression level =", level)

    # Section 2: fill value per coordinate and data variable
    print("\nfill values:")
    for name in [*ds.coords, *ds.data_vars]:
        fill = ds[name].encoding.get("_FillValue")
        print(name, "fill value =", fill)

    # Section 3: compression settings of the string variables
    print("\nstring attributes with compression:")
    string_vars = (
        "member_id",
        "gcm_variant",
        "gcm_model",
        "gcm_institution",
        "rcm_variant",
        "rcm_model",
        "rcm_institution",
    )
    encoding_keys = ("zlib", "shuffle", "complevel")
    for name, key in ((v, k) for v in string_vars for k in encoding_keys):
        try:
            setting = ds[name].encoding[key]
        except KeyError:
            # Either the variable or the encoding entry is missing.
            continue
        print(name, key, setting)

## xarray - CMIP6



In [6]:
ds_cmip6 = xr.open_dataset(cmip6_nc)
ds_cmip6

In [7]:
ds_info(ds_cmip6)

data vars with compression:
time_bnds compression level = 1
lat_bnds compression level = 1
lon_bnds compression level = 1
ta compression level = 1

fill values:
time fill value = None
plev fill value = None
lat fill value = None
lon fill value = None
time_bnds fill value = None
lat_bnds fill value = None
lon_bnds fill value = None
ta fill value = 1e+20

string attributes with compression:


In [8]:
ds = ds_cmip6.isel(time=slice(0, 12), lon=slice(30, 50), lat=slice(50, 70))
ds

In [9]:
# clean up outputs

! rm /tmp/output_*

In [10]:
ds.to_netcdf("/tmp/output_cmip6_xarray.nc")


## clisops-core - cmip6



In [11]:
ds_cmip6 = open_xr_dataset(cmip6_nc)
ds_cmip6

In [12]:
ds = subset.subset_bbox(
    ds_cmip6, lat_bnds=[45, 50], lon_bnds=[-60, -55],
    start_date='2013-01-01', end_date='2013-01-30')
ds

In [13]:
ds.to_netcdf("/tmp/output_cmip6_clisops_core.nc")


## clisops-ops - cmip6 - subset

TODO: the subset operation takes too long: about 21 seconds when the time subset is combined with a bounding box (`area`).

TODO: a patch is needed to convert the `start_time`/`end_time` parameters to `start_date`/`end_date`.



In [14]:
# monkeypatch for subset time parameters
# convert start/end_time to start/end_date

from clisops.ops.subset import Subset
from clisops.parameter import parameterise
from dateutil.parser import parse

def convert_to_date(dt):
    """Return the date part of the date/time string ``dt`` formatted as ``YYYY-MM-DD``.

    ``dt`` is parsed with ``dateutil.parser.parse``; any time-of-day component
    is dropped.
    """
    parsed = parse(dt)
    return parsed.date().strftime("%Y-%m-%d")

# Define the new monkeypatched method
def new_resolve_params(self, **params):
    """Generate the dictionary of subset parameters and store it in ``self.params``.

    Intended as a replacement for ``Subset._resolve_params``. The ``time``,
    ``area``, ``level`` and ``time_components`` keyword arguments are passed
    through ``parameterise``; every resulting parameter whose value is not
    ``None`` contributes its dictionary representation to the arguments.

    In addition, ``start_time``/``end_time`` entries are replaced by
    ``start_date``/``end_date`` entries holding only the date
    (``YYYY-MM-DD``), to match the ``clisops.core.subset`` function
    parameters.

    Parameters
    ----------
    **params
        Subset parameters; only ``time``, ``area``, ``level`` and
        ``time_components`` are read, each defaulting to ``None``.

    Returns
    -------
    None
        The result is stored in ``self.params``.
    """
    print("Monkeypatched _resolve_params called")

    parameters = parameterise(
        collection=self.ds,
        time=params.get("time", None),
        area=params.get("area", None),
        level=params.get("level", None),
        time_components=params.get("time_components", None),
    )

    # Arguments dictionary to be used by `self._calculate()`.
    args = dict()

    # Merge the dictionary representation of every parameter that was set.
    for param_name in ["time", "area", "level", "time_components"]:
        param_value = parameters.get(param_name)
        if param_value.value is not None:
            args.update(param_value.asdict())

    # Rename start_time and end_time to start_date and end_date to
    # match clisops.core.subset function parameters.
    for prefix in ("start", "end"):
        time_key = f"{prefix}_time"
        date_key = f"{prefix}_date"
        if time_key in args:
            time_value = args.pop(time_key)
            date_value = convert_to_date(time_value)
            args[date_key] = date_value
            print(f"params use {date_key} {date_value} instead of {time_key} {time_value}")

    self.params = args
    

# Apply the monkeypatch
Subset._resolve_params = new_resolve_params



In [15]:
# clean up outputs

! rm /tmp/output_*

In [16]:
ds_cmip6 = open_xr_dataset(cmip6_nc)
ds_cmip6

In [17]:
# check that monkeypatch works

from clisops.ops.subset import Subset

# Only used to resolve the parameters (see `op.params` below); the operation
# itself is not executed here.
op = Subset(
    ds=ds_cmip6,
    time="2013-01-01/2013-01-30",
    # area=(0.0, 49.0, 10.0, 65.0),
    output_type="nc",
    # output_type="xarray",
    output_dir="/tmp",
    split_method="time:auto",
    file_namer="simple"
)

kwargs = op.params
print(kwargs)

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is the wall clock and can jump when the system time is adjusted.
start = time.perf_counter()

ds = subset.subset_bbox(ds_cmip6, **kwargs)
# ds = subset.subset_bbox(ds_cmip6, start_date="2013-01-01", end_date="2013-01-30")

duration = time.perf_counter() - start
print(f"duration: {duration} secs")

ds

Monkeypatched _resolve_params called
params use start_date 2013-01-01 instead of start_time 2013-01-01T00:00:00
params use end_date 2013-01-30 instead of end_time 2013-01-30T23:59:59
{'start_date': '2013-01-01', 'end_date': '2013-01-30'}
duration: 0.019814491271972656 secs


In [18]:
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is the wall clock and can jump when the system time is adjusted.
start = time.perf_counter()

outputs = subset_op(
    ds=ds_cmip6,
    time="2013-01/2013-01",
    # time_components="year:2013",
    area=(0.0, 49.0, 10.0, 65.0),
    output_type="nc",
    # output_type="xarray",
    output_dir="/tmp",
    split_method="time:auto",
    file_namer="simple"
)

duration = time.perf_counter() - start
print(f"duration: {duration} secs")

print("Number of output files: ", len(outputs))
outputs[0]

Monkeypatched _resolve_params called
params use start_date 2013-01-01 instead of start_time 2013-01-01T00:00:00
params use end_date 2013-01-31 instead of end_time 2013-01-31T23:59:59
duration: 21.24138617515564 secs
Number of output files:  1


'/tmp/output_001.nc'

In [19]:
file_size = os.path.getsize(outputs[0])
print("File Size is :", file_size/(1024*1024), "MB")

File Size is : 0.14179420471191406 MB


In [20]:
ds = xr.open_dataset(outputs[0])
ds

In [21]:
ds_info(ds)

data vars with compression:
time_bnds compression level = 1
lat_bnds compression level = 1
lon_bnds compression level = 1
ta compression level = 1

fill values:
time fill value = None
plev fill value = None
lat fill value = None
lon fill value = None
time_bnds fill value = None
lat_bnds fill value = None
lon_bnds fill value = None
ta fill value = 1e+20

string attributes with compression:


## clisops-ops - cmip6 - subset with another patch

TODO: the subsetting takes too long. The following monkey patch of `is_time` is a workaround for this.

In [22]:
# monkey patch for clisops

from clisops.utils import dataset_utils

def custom_is_time(coord):
    """Return True if ``coord`` is identified as a time coordinate.

    Intended as a replacement for ``clisops.utils.dataset_utils.is_time``
    that never reads ``coord.values``. The checks, in order, are:

    1. ``coord.name`` is listed under ``"time"`` in ``coord.cf.coordinates``,
    2. ``coord.name`` is listed under ``"time"`` in ``coord.cf.standard_names``,
    3. ``coord.dtype`` is a ``numpy.datetime64`` dtype,
    4. ``coord`` has an ``axis`` attribute equal to ``"T"``.

    The value-based check
    ``isinstance(np.atleast_1d(coord.values)[0], cftime.datetime)`` is left
    out on purpose: it leads to memory overflow when applied on a data
    variable. A message is printed whenever that check would have been
    reached.

    Parameters
    ----------
    coord
        Object exposing ``name``, ``dtype``, the ``cf`` accessor and
        optionally ``axis`` (used here with xarray variables).

    Returns
    -------
    bool
    """
    print(f"Custom behavior for is_time with input: {coord.name}")

    # Local import: numpy is not imported at notebook level before this cell.
    import numpy as np

    cf = coord.cf

    if "time" in cf.coordinates and coord.name in cf.coordinates["time"]:
        return True

    if "time" in cf.standard_names and coord.name in cf.standard_names["time"]:
        return True

    if np.issubdtype(coord.dtype, np.datetime64):
        return True

    # TODO: the skipped value-based check leads to memory overflow when applied
    # on a data variable, so it is only reported here.
    print(f"skip np.atleast_1d(coord.values) on: {coord.name}")

    return hasattr(coord, "axis") and coord.axis == "T"


# Monkey patch the function
dataset_utils.is_time = custom_is_time


In [23]:
# clean up outputs

! rm /tmp/output_*

In [24]:
ds_cmip6 = open_xr_dataset(cmip6_nc)
ds_cmip6

In [25]:
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is the wall clock and can jump when the system time is adjusted.
start = time.perf_counter()

outputs = subset_op(
    ds=ds_cmip6,
    time="2013-01/2013-01",
    area=(0.0, 50.0, 10.0, 60.0),
    output_type="nc",
    # output_type="xarray",
    output_dir="/tmp",
    split_method="time:auto",
    file_namer="simple"
)

duration = time.perf_counter() - start
print(f"duration: {duration} secs")

outputs[0]

Monkeypatched _resolve_params called
params use start_date 2013-01-01 instead of start_time 2013-01-01T00:00:00
params use end_date 2013-01-31 instead of end_time 2013-01-31T23:59:59
Custom behavior for is_time with input: time
Custom behavior for is_time with input: time_bnds
skip np.atleast_1d(coord.values) on: time_bnds
Custom behavior for is_time with input: lat_bnds
skip np.atleast_1d(coord.values) on: lat_bnds
Custom behavior for is_time with input: lon_bnds
skip np.atleast_1d(coord.values) on: lon_bnds
Custom behavior for is_time with input: ta
skip np.atleast_1d(coord.values) on: ta
Custom behavior for is_time with input: time
Custom behavior for is_time with input: time_bnds
skip np.atleast_1d(coord.values) on: time_bnds
Custom behavior for is_time with input: lat_bnds
skip np.atleast_1d(coord.values) on: lat_bnds
Custom behavior for is_time with input: lon_bnds
skip np.atleast_1d(coord.values) on: lon_bnds
Custom behavior for is_time with input: ta
skip np.atleast_1d(coord.va

'/tmp/output_001.nc'

In [26]:
file_size = os.path.getsize(outputs[0])
print("File Size is :", file_size/(1024*1024), "MB")

File Size is : 0.10758399963378906 MB


In [27]:
ds = xr.open_dataset(outputs[0])
ds

## check numpy function

This line of code causes the problem:

`if isinstance(np.atleast_1d(coord.values)[0], cftime.datetime):`

In [28]:
import numpy as np

In [29]:
ds = open_xr_dataset(cmip6_nc)
ds

### check time coord

In [30]:
time_coord = ds.coords["time"]
time_coord

In [31]:
time_coord.name

'time'

In [32]:
len(time_coord.values)

1826

In [33]:
np.atleast_1d(time_coord.values)[0]

cftime.DatetimeProlepticGregorian(2010, 1, 1, 12, 0, 0, 0, has_year_zero=True)

### check also ta data variable

Accessing the values of the `ta` data variable makes the memory usage explode; in this case it grows "only" up to 9 GB.

In [34]:
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() is the wall clock and can jump when the system time is adjusted.
start = time.perf_counter()

# Accessing `.values` of the data variable is the step being timed here.
print("numer of values", len(ds.ta.values))

duration = time.perf_counter() - start
print(f"duration: {duration} secs")

numer of values 1826
duration: 21.325840711593628 secs


In [35]:
np.atleast_1d(ds.ta.values)[0]

array([[[260.1153 , 260.10944, 260.10553, ..., 260.13287, 260.127  ,
         260.12115],
        [259.75983, 259.72467, 259.69147, ..., 259.8614 , 259.8282 ,
         259.79303],
        [260.09772, 260.05865, 260.0196 , ..., 260.2149 , 260.17584,
         260.13678],
        ...,
        [248.84186, 249.04108, 249.26959, ..., 248.43756, 248.53717,
         248.67194],
        [250.836  , 250.88483, 250.94147, ..., 250.74225, 250.76569,
         250.79694],
        [251.05865, 251.08014, 251.10358, ..., 250.99811, 251.01764,
         251.03717]],

       [[252.19507, 252.19116, 252.1853 , ..., 252.21265, 252.20679,
         252.20093],
        [251.84937, 251.81616, 251.78296, ..., 251.94897, 251.91577,
         251.88257],
        [252.17749, 252.14038, 252.10327, ..., 252.29077, 252.25366,
         252.2146 ],
        ...,
        [241.81812, 241.9685 , 242.12476, ..., 241.37866, 241.5232 ,
         241.66772],
        [244.10132, 244.19702, 244.29468, ..., 243.82593, 243.91577,
   

In [36]:
ds.dims

Frozen({'time': 1826, 'bnds': 2, 'plev': 8, 'lat': 192, 'lon': 384})

In [37]:
ds.ta.cf.coordinates

{'longitude': ['lon'],
 'latitude': ['lat'],
 'vertical': ['plev'],
 'time': ['time']}

In [38]:
ds.ta.dims

('time', 'plev', 'lat', 'lon')

In [39]:
ds.coords["time"].dims

('time',)