# CMIP6 Add Variables to Dataset

**This notebook performs the following steps:**

1. Open file
2. Add Variables to Dataset
3. Save data to netcdf files

In [1]:
# ========== Packages ==========
import xarray as xr
import dask
import os
import pandas as pd
import numpy as np

### Functions

In [None]:
def _resolve_output_path(savepath, filename):
    """
    Create the target folder and decide which file path to write to.

    If a file called ``filename`` already exists in ``savepath`` the user is
    asked whether it should be deleted. On "y" the old file is removed and the
    same path is reused; on any other answer the data goes to
    ``temp_file.nc`` in the same folder instead.

    Args:
        savepath (string): Folder the file is written to (created if missing).
        filename (string): Preferred file name.

    Returns:
        tuple: ``(nc_out, removed)`` with the path to write to and a flag
        telling whether an existing file was deleted.
    """
    os.makedirs(savepath, exist_ok=True)
    nc_out = os.path.join(savepath, filename)
    if not os.path.exists(nc_out):
        return nc_out, False

    inp = input(f"Delete old file {filename} (y/n):")
    if inp.lower() == "y":
        os.remove(nc_out)
        return nc_out, True

    # Keep the old file untouched and write to a fallback name instead.
    filename = "temp_file.nc"
    nc_out = os.path.join(savepath, filename)
    print(f"Filename change to {filename}")
    return nc_out, False


def save_file(save_file, folder, save_var=True):
    """
    Save xarray datasets as netCDF files.

    Args:
        save_file (dict or dataset): Dictionary of xarray datasets or a single
            dataset. Dictionary keys are not used for naming; file names are
            built from each dataset's ``source_id`` and ``experiment_id``
            attributes.
        folder (string): Name of folder data is saved in. Only used when
            ``save_var`` is False; the per-variable branch always writes to
            ``raw/<variable>/``.
        save_var (boolean): If True, data is saved separately for each variable.
            If False, one file is saved per dataset with all variables.

    Returns:
        nc_out: Path of the last file that was written, or None if there was
        nothing to save.
    """
    # NOTE: the first parameter shares its name with the function and shadows
    # it inside this body; the name is kept so keyword callers keep working.
    if isinstance(save_file, xr.Dataset):
        # Accept a bare dataset as promised by the docstring.
        datasets = {'dataset': save_file}
    else:
        datasets = save_file

    # Stays None when no dataset / variable is written.
    nc_out = None

    if save_var:
        # Iterate the datasets that were passed in (not a notebook global).
        for ds in datasets.values():
            # Dataset-wide check, so it is evaluated once per dataset.
            has_depth = any('depth' in ds[name].dims for name in ds.variables)

            for var in ds:
                coordinates_to_keep = {'time', 'lat', 'lon'}
                if has_depth:
                    coordinates_to_keep.add('depth')

                # Create a new dataset with only the desired variable
                ds_var = ds[[var]]

                # Set the desired coordinates
                coords_to_set = set(ds_var.variables).intersection(coordinates_to_keep)
                ds_var = ds_var.set_coords(list(coords_to_set))

                # NOTE(review): this branch writes below '../../data' and
                # hard-codes 'raw', while the other branch uses '../data' and
                # `folder` - confirm which location is intended.
                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/raw/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}.nc'
                nc_out, removed = _resolve_output_path(savepath, filename)
                if removed:
                    print(f"File with path: {nc_out} removed")

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")

    else:
        for ds_in in datasets.values():
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            savepath = f'../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out, removed = _resolve_output_path(savepath, filename)
            if removed:
                print(f"File  with path: {nc_out} removed")

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

### 1. Load data

In [None]:
# ========= Model whose stored dataset is to be extended ==============
experiment_id = 'historical'
source_id = ['BCC-CSM2-MR']
savepath = f'../../data/CMIP6/{experiment_id}/preprocessed'

# ========= Set the Dask scheduler to 'processes' ==========
dask.config.set(scheduler='processes')

# ========= Helper that opens a single netCDF file ========
def open_dataset(filename):
    """Open ``filename`` with xarray and return the dataset."""
    return xr.open_dataset(filename)

# ========= Build {model name: dataset} for every entry in source_id =======
# NOTE: open_dataset is called directly while the dictionary is built, so the
# files are already opened by the time dask.compute receives the dictionary.
ds_dict, = dask.compute({
    model: open_dataset(os.path.join(savepath, f'CMIP.{model}.{experiment_id}.nc'))
    for model in source_id
})

In [None]:
# =========== Check dictionary =============
print(ds_dict.keys())
# Display the first dataset; a fixed high index fails when only a few
# models are loaded.
ds_dict[list(ds_dict.keys())[0]]

### 2. Add new variable to dataset 

In [None]:
# ========= Collect the datasets that are to be combined ==========
# 'dataset_one' is the first loaded model dataset from ds_dict.
# NOTE(review): `ds_dict_` (trailing underscore) is not defined in the visible
# part of this notebook - presumably it holds the computed monthly mean that
# should be added; confirm it exists before running this cell.
ds_dict_all = {}
ds_dict_all['dataset_one'] = ds_dict[list(ds_dict)[0]]
ds_dict_all['dataset_two'] = ds_dict_[list(ds_dict_)[0]]

In [None]:
# ========= Merge data ======================
# Replace ds_dict_all with whatever merge_source_id_data returns for it.
# NOTE(review): merge_source_id_data is not defined or imported in the visible
# part of this notebook - make sure it is available before running this cell.
ds_dict_all = merge_source_id_data(ds_dict_all)

In [None]:
# =========== Check dictionary =============
print(ds_dict_all.keys())
# Display the first entry; a fixed high index fails when the dictionary
# holds only a few datasets.
ds_dict_all[list(ds_dict_all.keys())[0]]

### 3. Store netcdf files

In [None]:
# =========== Store file and remove any former one ==========
# save_var is left at its default (True), so save_file writes one file per
# variable; if a target file already exists the user is prompted before it is
# deleted. nc_out receives the path returned by save_file.
nc_out = save_file(ds_dict_all, folder='raw')

In [None]:
# =========== Check stored file ==============
# Re-open the path returned by save_file to inspect what was written.
xr.open_dataset(nc_out)