# CMIP6 Drop Redundant Coordinates or Dimensions

**This script includes the following steps:**

1. Open files
2. Drop redundant coordinates and variables
3. Save data to netcdf files

In [1]:
# ========== Packages ==========
import xarray as xr
import dask
import os
import pandas as pd
import numpy as np

### Functions

In [23]:
def _prepend_log(ds, message):
    """Prepend ``message`` to the ``log`` attribute of ``ds``, creating it if missing."""
    if 'log' in ds.attrs:
        ds.attrs['log'] = f"{message} // {ds.attrs['log']}"
    else:
        ds.attrs['log'] = message


def drop_redundant(ds_dict, drop_list):
    """
    Remove redundant coordinates and variables from datasets in a dictionary.

    For every dataset the following steps are applied in order:

    1. A coordinate named ``sdepth`` or ``solth`` is renamed to ``depth``
       (an already existing ``depth`` coordinate/dimension is removed first).
    2. Every name in ``drop_list`` that is a coordinate or variable of the
       dataset is dropped, followed by a ``squeeze()`` of all length-1 dimensions.
       ``depth`` is kept for datasets that contain ``mrsol`` or ``tsl``.
    3. Names from ``drop_list`` that still exist as a dimension are reported and,
       if their length is 1, squeezed away.

    Each change is printed and prepended to the dataset's ``log`` attribute.

    Parameters:
    ds_dict (dict): Dictionary containing dataset names as keys and xarray.Dataset objects as values.
        The dictionary is updated in place.
    drop_list (list): List of redundant coordinate or variable names to be removed from the datasets.
        The list itself is never modified.

    Returns:
    dict: The same dictionary object as ``ds_dict``, holding the modified xarray.Dataset objects.
    """
    for ds_name, ds_data in ds_dict.items():
        # Fall back to the dictionary key if the dataset carries no source_id attribute.
        model = ds_data.attrs.get('source_id', ds_name)

        # Harmonise alternative names of the soil depth coordinate.
        for alias in ('sdepth', 'solth'):
            if alias not in ds_data.coords:
                continue
            if 'depth' in ds_data.coords:
                ds_data = ds_data.drop_vars('depth')
            if 'depth' in ds_data.dims:
                ds_data = ds_data.drop_dims('depth')
            ds_data = ds_data.rename({alias: 'depth'})
            print(f'{alias} changed to depth for model {model}')
            _prepend_log(ds_data, f'Coordinate name changed from {alias} to depth.')

        # Decide per dataset whether 'depth' must be kept. Building a new list
        # (instead of calling drop_list.remove) keeps the decision from leaking
        # into later datasets and leaves the caller's list untouched.
        keep_depth = 'mrsol' in ds_data or 'tsl' in ds_data
        names_to_drop = [
            name for name in drop_list
            if not (keep_depth and name == 'depth')
        ]

        for name in names_to_drop:
            if name in ds_data.coords:
                ds_data = ds_data.drop_vars(name).squeeze()
                print(f'Dropped coordinate: {name}')
                _prepend_log(ds_data, f'Dropped: {name}.')
            # Only reached with a hit when the name is a data variable: a
            # coordinate dropped above is no longer part of ds_data.variables.
            if name in ds_data.variables:
                ds_data = ds_data.drop_vars(name).squeeze()
                print(f'Dropped variable: {name}')
                _prepend_log(ds_data, f'Dropped: {name}.')

        # Check if the names were dropped successfully and use squeeze if their length is 1
        for name in names_to_drop:
            if name in ds_data.dims:
                print(f"Coordinate {name} was not dropped.")
                if ds_data.sizes[name] == 1:
                    ds_data = ds_data.squeeze(name, drop=True)
                    print(f"Squeezed coordinate: {name}")
                    _prepend_log(ds_data, f'Dropped: {name}.')

        # Update the dictionary with the modified dataset
        ds_dict[ds_name] = ds_data

    return ds_dict

In [3]:
def save_file(save_file, folder, save_var=True):
    """
    Save files as netCDF.

    Args:
        save_file (dict or dataset): Dictionary of xarray datasets or a single dataset.
        folder (string): Name of folder data is saved in. Only used when ``save_var`` is False;
            with ``save_var=True`` files are always written below ``raw/<variable>/``.
        save_var (boolean): If True, data is saved separately for each variable
            (an existing file is replaced without asking). If False, one file is saved
            per dataset with all variables (the user is asked before an existing file is
            deleted; on refusal the data is written to ``temp_file.nc`` instead).

    Returns:
        nc_out: Path of the last file that was written, or None if nothing was saved.
    """
    # A bare dataset is wrapped so that both input kinds share one code path.
    datasets = {None: save_file} if hasattr(save_file, 'data_vars') else save_file
    nc_out = None

    if save_var:
        # Iterate over the argument, not over a notebook-level global.
        for ds in datasets.values():
            for var in ds:
                coordinates_to_keep = {'time', 'lat', 'lon'}
                if any('depth' in ds[name].dims for name in ds.variables):
                    coordinates_to_keep.add('depth')

                # Create a new dataset with only the desired variable
                ds_var = ds[[var]]

                # Set the desired coordinates
                coords_to_set = coordinates_to_keep.intersection(ds_var.variables)
                ds_var = ds_var.set_coords(list(coords_to_set))

                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/raw/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}.nc'
                nc_out = os.path.join(savepath, filename)
                os.makedirs(savepath, exist_ok=True)
                if os.path.exists(nc_out):
                    # An existing file is removed before the new one is written.
                    os.remove(nc_out)
                    print(f"File with path: {nc_out} removed")

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")

    else:
        for ds_in in datasets.values():
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            savepath = f'../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out = os.path.join(savepath, filename)
            os.makedirs(savepath, exist_ok=True)
            if os.path.exists(nc_out):
                inp = input(f"Delete old file {filename} (y/n):")
                if inp.lower() in ["y"]:
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")
                else:
                    filename = "temp_file.nc"
                    nc_out = os.path.join(savepath, filename)
                    print(f"Filename change to {filename}")

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

### 1. Open files

In [15]:
# ========= Define variable, experiment, models and path ==============
variable = 'tsl'
experiment_id = 'historical'
# 'SAM0-UNICON' is currently not part of the selection.
source_id = [
    'AWI-ESM-1-1-LR',
    'BCC-CSM2-MR',
    'BCC-ESM1',
    'CanESM5',
    'CESM2-FV2',
    'CESM2-WACCM-FV2',
    'CESM2-WACCM',
    'CESM2',
    'CNRM-CM6-1-HR',
    'CNRM-CM6-1',
    'CNRM-ESM2-1',
    'IPSL-CM6A-LR',
    'NorESM2-MM',
    'TaiESM1',
    'UKESM1-0-LL',
]
savepath = f'../../data/CMIP6/{experiment_id}/raw/{variable}/'

# ========= Select the Dask scheduler ==========
dask.config.set(scheduler='processes')


# ========= Helper that opens a single dataset ========
def open_dataset(filename):
    """Open the netCDF file at ``filename`` with xarray and return the dataset."""
    return xr.open_dataset(filename)


# ========= Build the model -> dataset dictionary =======
# NOTE: open_dataset is called directly inside the comprehension, so every file
# is opened before the dictionary is handed to dask.compute.
(ds_dict,) = dask.compute(
    {
        model: open_dataset(
            os.path.join(savepath, f'CMIP.{model}.{experiment_id}.{variable}.nc')
        )
        for model in source_id
    }
)

### 2. Drop redundant coordinates and variables

In [24]:
# =========== Drop redundant coordinates and variables ================

# Names of the coordinates and variables that are considered redundant.
# drop_redundant keeps 'depth' for datasets that contain mrsol or tsl.
drop_list = [
    'member_id',
    'nbnd',
    'bnds',
    'height',
    'depth',
    'lat_bnds',
    'lon_bnds',
    'time_bnds',
    'time_bounds',
    'depth_bnds',
    'sdepth_bounds',
    'depth_bounds',
    'hist_interval',
    'axis_nbounds',
]

# Apply the clean-up to every dataset in the dictionary
ds_dict = drop_redundant(ds_dict=ds_dict, drop_list=drop_list)

solth changed to depth for model IPSL-CM6A-LR


In [26]:
# =========== Check dictionary for consistency =============
# List the available models, then display the dataset stored at position 11
print(ds_dict.keys())
ds_dict[list(ds_dict)[11]]

dict_keys(['AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CanESM5', 'CESM2-FV2', 'CESM2-WACCM-FV2', 'CESM2-WACCM', 'CESM2', 'CNRM-CM6-1-HR', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'IPSL-CM6A-LR', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL'])


### 3. Save data to netcdf files

In [27]:
# =========== Store one file per variable, replacing any former one ==========
nc_out = save_file(ds_dict, folder='raw', save_var=True)

File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.AWI-ESM-1-1-LR.historical.tsl.nc removed
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.AWI-ESM-1-1-LR.historical.tsl.nc saved
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.BCC-CSM2-MR.historical.tsl.nc removed
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.BCC-CSM2-MR.historical.tsl.nc saved
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.BCC-ESM1.historical.tsl.nc removed
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.BCC-ESM1.historical.tsl.nc saved
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.CanESM5.historical.tsl.nc removed
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.CanESM5.historical.tsl.nc saved
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.CESM2-FV2.historical.tsl.nc removed
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.CESM2-FV2.historical.tsl.nc saved
File with path: ../../data/CMIP6/historical/raw/tsl/CMIP.CESM2-WACCM-FV2

In [None]:
# =========== Check stored file ==============
# Re-open the file at the path returned by save_file (the last file it wrote) to inspect the result
xr.open_dataset(nc_out)