# CMIP6 Consistent Time Coordinate

**This notebook performs the following steps:**

1. Load netCDF files
2. Create a consistent time coordinate
3. Save and replace netCDF files

In [1]:
# ========== Packages ==========
import xarray as xr
import pandas as pd
import numpy as np
import dask
import os
import glob

### Functions

In [2]:
def consis_time(ds_dict, ref_ds):
    """
    Replace the time coordinate of every dataset with that of a reference dataset.

    Datasets whose time coordinate already equals the reference are left
    untouched and a message is printed instead. All other datasets are
    modified in place: their ``time`` entry is overwritten and a note is
    written to the ``log`` attribute (in front of any existing log text).

    Args:
        ds_dict (dict): A dictionary of xarray datasets, where each key is the name of the dataset
                        and each value is the dataset itself.
        ref_ds (xarray): A xarray dataset as reference for the consistent time coordinate

    Returns:
        dict: The same dictionary object that was passed in, with the time coordinate
              of its datasets taken from the reference dataset.
    """
    time = ref_ds.time
    # NOTE(review): the note hard-codes one cftime value instead of describing
    # ref_ds.time -- confirm that it matches the reference actually used.
    note = 'Time coordinate changed to format cftime.DatetimeNoLeap(1850, 1, 16, 12, 0, 0, 0, has_year_zero=True).'

    for ds in ds_dict.values():
        if ds['time'].equals(time):
            print('Time variable is already in the requested format')
            continue

        ds['time'] = time
        # Record the change, keeping earlier log entries after the new note
        if 'log' in ds.attrs:
            ds.attrs['log'] = f"{note} // {ds.attrs['log']}"
        else:
            ds.attrs['log'] = note

    return ds_dict

In [3]:
def save_file(save_file, folder, save_var=True):
    """
    Save datasets as netCDF files, replacing files that already exist.

    Args:
        save_file (dict or dataset): Dictionary of xarray datasets, or a single dataset.
        folder (string): Name of folder data is saved in.
        save_var (boolean): If True, data is saved separately for each variable (an existing
                            file is removed without asking). If False, one file is saved per
                            dataset with all variables (the user is asked before an existing
                            file is removed; on refusal the data goes to ``temp_file.nc``).

    Returns:
        nc_out: Path of the last file that was written, or None if there was nothing to save.
    """
    # An xarray Dataset is mapping-like itself, so a single dataset is
    # recognised by its ``data_vars`` attribute rather than by lacking ``values()``.
    if hasattr(save_file, 'data_vars'):
        datasets = [save_file]
    else:
        datasets = list(save_file.values())

    nc_out = None  # stays None when there is nothing to write

    if save_var:
        for ds in datasets:
            coordinates_to_keep = {'time', 'lat', 'lon'}
            # Checked once per dataset: does any variable use a depth dimension?
            if any('depth' in ds[name].dims for name in ds.variables):
                coordinates_to_keep.add('depth')

            for var in ds:
                # New dataset holding only this variable
                ds_var = ds[[var]]

                # Mark the wanted variables that are present as coordinates
                coords_to_set = set(ds_var.variables).intersection(coordinates_to_keep)
                ds_var = ds_var.set_coords(list(coords_to_set))

                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/{folder}/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}.nc'
                nc_out = os.path.join(savepath, filename)
                os.makedirs(savepath, exist_ok=True)
                # Remove an existing file first so that it is replaced
                if os.path.exists(nc_out):
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")

    else:
        for ds_in in datasets:
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            # NOTE(review): this branch writes below '../data' while the
            # per-variable branch uses '../../data' -- confirm the intended root.
            savepath = f'../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out = os.path.join(savepath, filename)
            os.makedirs(savepath, exist_ok=True)
            if os.path.exists(nc_out):
                inp = input(f"Delete old file {filename} (y/n):")
                if inp.lower() == "y":
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")
                else:
                    # Keep the old file and write to a fallback name instead
                    filename = "temp_file.nc"
                    nc_out = os.path.join(savepath, filename)
                    print(f"Filename change to {filename}")

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

In [4]:
# ========= Create a helper function to open the dataset ========
def open_dataset(filename):
    """Open ``filename`` with ``xr.open_dataset`` and return the result."""
    return xr.open_dataset(filename)

# Define a helper function to open and merge datasets
def open_and_merge_datasets(folder, model, experiment_id, variables):
    """
    Open the per-variable files of one model and merge them into one dataset.

    For every entry of ``variables`` the file
    ``../../data/CMIP6/<experiment_id>/<folder>/<var>/CMIP.<model>.<experiment_id>.<var>.nc``
    is looked up. Variables without a file are reported on stdout and left out.

    Args:
        folder (string): Name of the data folder below the experiment directory.
        model (string): Model name used in the file name.
        experiment_id (string): Experiment name used in the path and the file name.
        variables (list): Names of the variables to load.

    Returns:
        The result of ``xr.merge`` applied to all datasets that were found.
    """
    found = []
    for name in variables:
        directory = f'../../data/CMIP6/{experiment_id}/{folder}/{name}'
        pattern = os.path.join(directory, f'CMIP.{model}.{experiment_id}.{name}.nc')
        matches = glob.glob(pattern)
        if not matches:
            print(f"No file found for variable '{name}' in model '{model}'.")
            print(matches)
            continue
        found.append(matches[0])

    return xr.merge([xr.open_dataset(path) for path in found])

### 1. Load netCDF files

In [10]:
# ========= Define variables, experiment, models and folder ==============
variable = ['mrso_1m']
experiment_id = 'ssp370'
source_id = [
    'TaiESM1', 'BCC-CSM2-MR', 'CanESM5', 'CNRM-CM6-1', 'CNRM-ESM2-1',
    'IPSL-CM6A-LR', 'UKESM1-0-LL', 'MPI-ESM1-2-LR', 'CESM2-WACCM', 'NorESM2-MM',
]
folder = 'raw'

# ========= Set the default Dask scheduler ==========
dask.config.set(scheduler='processes')

# ========= Open the files of every model ==========
# Each model is opened in turn while the dictionary is built; no Dask task
# graph is involved at this point.
ds_dict = {
    model: open_and_merge_datasets(folder, model, experiment_id, variable)
    for model in source_id
}

No file found for variable 'mrso_1m' in model 'TaiESM1'.
[]
No file found for variable 'mrso_1m' in model 'BCC-CSM2-MR'.
[]
No file found for variable 'mrso_1m' in model 'CanESM5'.
[]
No file found for variable 'mrso_1m' in model 'CNRM-CM6-1'.
[]
No file found for variable 'mrso_1m' in model 'CNRM-ESM2-1'.
[]
No file found for variable 'mrso_1m' in model 'IPSL-CM6A-LR'.
[]
No file found for variable 'mrso_1m' in model 'UKESM1-0-LL'.
[]
No file found for variable 'mrso_1m' in model 'MPI-ESM1-2-LR'.
[]
No file found for variable 'mrso_1m' in model 'CESM2-WACCM'.
[]
No file found for variable 'mrso_1m' in model 'NorESM2-MM'.
[]


In [6]:
# ========= Inspect the loaded dictionary =======
# Print all model names, then display the dataset of the second model
print(list(ds_dict))
ds_dict[list(ds_dict)[1]]

['TaiESM1', 'BCC-CSM2-MR', 'CanESM5', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'IPSL-CM6A-LR', 'UKESM1-0-LL', 'MPI-ESM1-2-LR', 'CESM2-WACCM', 'NorESM2-MM']


### 2. Create consistent time coordinates

In [7]:
# =========== Create consistent time coordinate ==========
# Reference dataset that provides the desired time coordinate: the preprocessed
# mrsol file of NorESM2-MM for the selected experiment.
# One fixed reference is used because some variables seem to have different
# time coordinates even within the same model.
ref_ds = xr.open_dataset(f'../../data/CMIP6/{experiment_id}/preprocessed/mrsol/CMIP.NorESM2-MM.{experiment_id}.mrsol.nc')

In [8]:
# Display the reference dataset
ref_ds

In [9]:
# Apply the time coordinate of the reference dataset to every dataset in the dictionary
ds_dict = consis_time(ds_dict, ref_ds)

Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format
Time variable is already in the requested format


### 3. Save and replace netCDF files

In [17]:
# =========== Store file and remove any former one ==========
# Write one file per variable into the 'preprocessed' folder; nc_out receives
# the path returned by save_file.
folder='preprocessed'
nc_out = save_file(ds_dict, folder=folder)

File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.TaiESM1.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.BCC-CSM2-MR.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.CanESM5.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.CNRM-CM6-1.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.CNRM-ESM2-1.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.IPSL-CM6A-LR.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.UKESM1-0-LL.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.MPI-ESM1-2-LR.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.CESM2-WACCM.ssp370.hurs.nc saved
File with path: ../../data/CMIP6/ssp370/preprocessed/hurs/CMIP.NorESM2-MM.ssp370.hurs.nc saved


In [55]:
# =========== Check stored file ==============
# Re-open the file at the path returned by save_file
xr.open_dataset(nc_out)