# CMIP6 Split Data into Variables

**The following steps are included in this script:**

1. Load Data
2. Split and Save New Data Files

This is necessary if you saved all your variables in one data file and now want to store them separately. Please drop all redundant dimensions and coordinates beforehand, and be careful: only the dimensions time, lat, lon and depth are considered here.

In [1]:
# ========== Packages ==========
import xarray as xr
import dask
import os

### Functions

In [2]:
def split_and_save(ds_dict):
    """
    Split and save xarray datasets in a dictionary into separate netCDF files.

    Every data variable of every dataset is written to its own file
    ``../../data/CMIP6/<experiment_id>/raw/<var>/CMIP.<source_id>.<experiment_id>.<var>.nc``.
    ``experiment_id`` and ``source_id`` are read from the dataset itself.
    A file that already exists at the target path is removed before writing.

    No dimensions are dropped here. Only 'time', 'lat', 'lon' (and 'depth'
    for variables that have a depth dimension) are enforced as coordinates.

    Args:
    ds_dict (dict): A dictionary containing xarray datasets.

    Returns:
    None
    """
    # Only the datasets are needed; the dictionary keys are not used
    for ds in ds_dict.values():
        for var in ds:
            # Coordinates that must remain coordinates in the output file
            coordinates_to_keep = {'time', 'lat', 'lon'}

            # Decide on depth from the variable being exported, not from
            # whatever other variables happen to be in the dataset
            if 'depth' in ds[var].dims:
                coordinates_to_keep.add('depth')

            # Create a new dataset with only the desired variable
            ds_var = ds[[var]]

            # Set the desired coordinates
            coords_to_set = coordinates_to_keep.intersection(ds_var.variables)
            ds_var = ds_var.set_coords(list(coords_to_set))

            experiment_id = ds_var.experiment_id
            savepath = f'../../data/CMIP6/{experiment_id}/raw/{var}/'
            filename = f'CMIP.{ds_var.source_id}.{experiment_id}.{var}.nc'
            nc_out = os.path.join(savepath, filename)
            os.makedirs(savepath, exist_ok=True)

            # Remove an existing file first so the write starts from a clean path
            if os.path.exists(nc_out):
                os.remove(nc_out)
                print(f"File  with path: {nc_out} removed")

            # Save to netcdf file; the threaded scheduler is used only while writing
            with dask.config.set(scheduler='threads'):
                ds_var.to_netcdf(nc_out)
                print(f"File with path: {nc_out} saved")

### 1. Load Data

In [23]:
# ========= Define period, models and path ==============
# CMIP6 experiment to process; it is part of the input directory and file name
experiment_id = 'historical'
# Models to load. Other model names (not active):
# 'TaiESM1', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CanESM5', 'CNRM-CM6-1',
# 'CNRM-CM6-1-HR', 'UKESM1-0-LL', 'CESM2', 'CESM2-FV2', 'CESM2-WACCM', 'NorESM2-MM'
source_id = ['SAM0-UNICON']
# Directory holding the combined per-model files CMIP.<model>.<experiment_id>.nc
savepath = f'../../data/CMIP6/{experiment_id}/raw/'

# ========= Use Dask to parallelize computations ==========
# Called outside a `with` block, so the scheduler setting stays active afterwards
dask.config.set(scheduler='processes')

# ========= Create a helper function to open the dataset ========
def open_dataset(filename):
    """Open the file at ``filename`` with ``xr.open_dataset`` and return the dataset."""
    return xr.open_dataset(filename)

# ========= Create dictionary using a dictionary comprehension and Dask =======
# Maps each model name in source_id to the dataset opened from
# <savepath>/CMIP.<model>.<experiment_id>.nc. dask.compute returns a tuple with
# one entry per argument, hence the single-element unpacking `ds_dict, =`.
# NOTE(review): open_dataset is called directly inside the comprehension, so the
# files are already opened before dask.compute runs — confirm whether dask adds
# any parallelism here.
ds_dict, = dask.compute({model: open_dataset(os.path.join(savepath, f'CMIP.{model}.{experiment_id}.nc'))
                        for model in source_id})

In [24]:
# ========= Have a look into the dictionary =======
# Iterating a dict yields its keys, i.e. the names of the loaded models
model_names = list(ds_dict)
print(model_names)
# Display the dataset of the first model as the cell output
ds_dict[model_names[0]]

['SAM0-UNICON']


### 2. Split and Save New Data Files

In [25]:
# Writes one netCDF file per variable for every dataset in ds_dict.
# split_and_save has no return value, so nc_out is None after this call.
nc_out = split_and_save(ds_dict)

File with path: ../../data/CMIP6/historical/raw/mrsol/CMIP.SAM0-UNICON.historical.mrsol.nc saved
File with path: ../../data/CMIP6/historical/raw/evspsbl/CMIP.SAM0-UNICON.historical.evspsbl.nc saved
File with path: ../../data/CMIP6/historical/raw/hurs/CMIP.SAM0-UNICON.historical.hurs.nc saved
File with path: ../../data/CMIP6/historical/raw/huss/CMIP.SAM0-UNICON.historical.huss.nc saved
File with path: ../../data/CMIP6/historical/raw/pr/CMIP.SAM0-UNICON.historical.pr.nc saved
File with path: ../../data/CMIP6/historical/raw/evspsblsoi/CMIP.SAM0-UNICON.historical.evspsblsoi.nc saved
File with path: ../../data/CMIP6/historical/raw/evspsblveg/CMIP.SAM0-UNICON.historical.evspsblveg.nc saved
File with path: ../../data/CMIP6/historical/raw/gpp/CMIP.SAM0-UNICON.historical.gpp.nc saved
File with path: ../../data/CMIP6/historical/raw/lai/CMIP.SAM0-UNICON.historical.lai.nc saved
File with path: ../../data/CMIP6/historical/raw/mrro/CMIP.SAM0-UNICON.historical.mrro.nc saved
File with path: ../../data

In [10]:
# ========= Have a look into the new data =======
var = 'mrsol'
# Take source_id and experiment_id from the first loaded dataset
first_ds = ds_dict[list(ds_dict.keys())[0]]
filename = f'CMIP.{first_ds.source_id}.{first_ds.experiment_id}.{var}.nc'
# Re-open the per-variable file and display it as the cell output
open_dataset(os.path.join(savepath, var, filename))