# CMIP6 Convert Units

**Following steps are included in this script:**

1. Load netCDF files
2. Convert Units to Specified Format
3. Save and replace netcdf files

In [1]:
# ========== Packages ==========
import xarray as xr
import dask
import os
from dask.delayed import delayed
import pandas as pd
import glob

### Functions

In [2]:
def create_log(ds, var, old_unit):
        if 'log' in ds.attrs:
            log_old = ds.attrs['log']
            ds.attrs['log'] = f'Unit of {var} converted from {old_unit} to {ds[var].units}. // {log_old}'
        else:
            ds.attrs['log'] = f'Unit of {var} converted from {old_unit} to {ds[var].units}.'

        print(f"Unit of {var} converted from {old_unit} to {ds[var].units}.")
        return ds

In [3]:
def set_units(ds_dict, conv_units):
    """
     Convert units for specified variables
    """
    
    for i, (name, ds) in enumerate(ds_dict.items()):

        for var in list(conv_units.keys()):

            if var in ds.variables:
                old_unit = ds[var].units
                
                if conv_units[var] == ds[var].units:
                    print('Unit already in the requested format')
                    
                elif var == 'lai':
                    # Keep existing attributes and only modify the units attribute
                    attrs = ds[var].attrs
                    attrs['units'] = conv_units[var]
                    attrs['equation'] = 'leaf area / ground area'
                    ds[var].attrs = attrs
                    ds = create_log(ds, var, old_unit)


                elif conv_units[var] == 'gC/m²/day':
                    if ds[var].units == 'kg/m²/s' or 'kg m-2 s-1':
                    
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        ds[var] = ds[var] * 1000 * 60 * 60 * 24 
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)
                        
                elif conv_units[var] == 'mm/day':
                    if ds[var].units == 'kg/m²/s' or ds[var].units =='kg m-2 s-1':
    
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        ds[var] = ds[var] * 60 * 60 * 24 
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)
                
                elif conv_units[var] == 'hPa':
                    if ds[var].units == 'Pa':
    
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        ds[var] = ds[var] / 100 
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)
                        
                elif conv_units[var] == 'ppm':
                    if ds[var].units == 'kg':
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        attrs['long_name'] = 'CO2 concentration'
                        # Constants
                        molar_mass_co2 = 44.01  # Molar mass of CO2 in grams per mole (g/mol)
                        moles_of_air = 2.13e20  # Volume of the atmosphere in moles (may vary, check CMIP6 documentation)
                        # Convert co2mass from kg to moles
                        ds[var] = ((ds[var] / molar_mass_co2) / moles_of_air) * 1e6
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)
                            
                elif conv_units[var] == '°C':
                    if ds[var].units == 'K':
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        # Convert co2mass from kg to moles
                        ds[var] = ds[var] - 273.15
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)
                
                elif conv_units[var] == 'mm':
                    if ds[var].units == 'kg/m²':
                        # Keep existing attributes and only modify the units attribute
                        attrs = ds[var].attrs
                        attrs['units'] = conv_units[var]
                        # Convert co2mass from kg to moles
                        ds[var] = ds[var] / 1e3  # This now represents mm of water
                        ds[var].attrs = attrs
                        ds = create_log(ds, var, old_unit)

                else: 
                    raise ValueError(f"No unit conversion for variable '{var}' specified.")

            else:
                raise ValueError(f"No variable '{var}' in ds_dict.")
        
        ds_dict[name] = ds
                
    return ds_dict

In [4]:
def save_file(save_file, folder, save_var=True):
    """
    Save files as netCDF.

    Args:
        savefile (dict or dataset): Dictionary of xarray datasets or dataset.
        folder (string): Name of folder data is saved in.
        save_var (boolean): If True, data is saved separately for each variable. If false, one file is saved with all variables.
        

    Returns:
        nc_out: Path were data is saved in.
    """
    
    if save_var:
        for key, ds in ds_dict.items():
            for var in ds:
                # Variable to keep
                variable_to_keep = var
                dimensions_to_keep = {'time', 'lat', 'lon'}
                coordinates_to_keep = {'time', 'lat', 'lon'}

                if any('depth' in ds[var].dims for var in ds.variables):
                    dimensions_to_keep.add('depth')
                    coordinates_to_keep.add('depth')

                # Create a new dataset with only the desired variable
                ds_var = ds[[variable_to_keep]]

                # Keep only the desired dimensions
                ds_var = ds_var.isel({dim: slice(None) for dim in dimensions_to_keep.intersection(ds_var.dims)})

                # Set the desired coordinates
                coords_to_set = set(ds_var.variables).intersection(coordinates_to_keep)
                ds_var = ds_var.set_coords(list(coords_to_set))

                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/{folder}/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}_regridded.nc'
                nc_out = os.path.join(savepath, filename)
                os.makedirs(savepath, exist_ok=True) 
                if os.path.exists(nc_out):
                       # inp = input(f"Delete old file {filename} (y/n):")
                       # if inp.lower() in ["y"]:
                            os.remove(nc_out)
                            print(f"File  with path: {nc_out} removed")
                        #else:
                        #    filename = "temp_file.nc"
                        #    nc_out = os.path.join(savepath, filename)
                        #    print(f"Filename change to {filename}")

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")
       
    else:
        for key in save_file.keys():
            ds_in = save_file[key]
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            savepath = f'../../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out = os.path.join(savepath, filename)
            os.makedirs(savepath, exist_ok=True) 
            if os.path.exists(nc_out):
                inp = input(f"Delete old file {filename} (y/n):")
                if inp.lower() in ["y"]:
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")
                else:
                    filename = "temp_file.nc"
                    nc_out = os.path.join(savepath, filename)
                    print(f"Filename change to {filename}")

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

In [5]:
# ========= Helper function to open the dataset ========
def open_dataset(filename):
    ds = xr.open_dataset(filename)
    return ds

In [6]:
# Define a helper function to open and merge datasets
def open_and_merge_datasets(folder, model, experiment_id, variables):
    filepaths = []
    for var in variables:
        path = f'../../data/CMIP6/{experiment_id}/{folder}/{var}'
        fp = glob.glob(os.path.join(path, f'CMIP.{model}.{experiment_id}.{var}_regridded.nc'))
        if fp:
            filepaths.append(fp[0])
        else:
            #print(f"No file found for variable '{var}' in model '{model}'.")
            print(fp)

    datasets = [xr.open_dataset(fp) for fp in filepaths]
    ds = xr.merge(datasets)
    return ds

### 1. Load netCDF files

In [7]:
# ========= Define period, models and path ==============
variables=['vpd', 'lmrso_1m', 'lmrso_2m']
experiment_id = 'historical'
source_id = ['CESM2-WACCM', 'NorESM2-MM', 'Ensemble mean', 'Ensemble median'] #
#source_id = ['TaiESM1', 'BCC-CSM2-MR',  'CanESM5', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'IPSL-CM6A-LR', 'UKESM1-0-LL', 'MPI-ESM1-2-LR', 'CESM2-WACCM', 'NorESM2-MM', 'Ensemble mean', 'Ensemble median'] #

folder='preprocessed'

# ========= Use Dask to parallelize computations ==========
dask.config.set(scheduler='processes')

# Create dictionary using a dictionary comprehension and Dask
ds_dict = dask.compute({model: open_and_merge_datasets(folder, model, experiment_id, variables) for model in source_id})[0]

In [8]:
# ========= Have a look into the dictionary =======
print(list(ds_dict.keys()))
ds_dict[list(ds_dict.keys())[3]][variable]

['CESM2-WACCM', 'NorESM2-MM', 'Ensemble mean', 'Ensemble median']


NameError: name 'variable' is not defined

### 2. Convert units

In [9]:
# ========== Convert units ============

# New unit conversion must be defined in function
conv_units = {#'pr': 'mm/day',
            #'evspsbl': 'mm/day',
            #'evspsblsoi': 'mm/day', 
            #'evspsblveg': 'mm/day', 
            #'mrro': 'mm/day', 
            #'mrros': 'mm/day',
            #'gpp': 'gC/m²/day', 
            #'npp': 'gC/m²/day',
            #'tran': 'mm/day'
            #'lai': ''
            'vpd': 'hPa',
            #'tas': '°C'
            'lmrso_1m': 'mm',
            'lmrso_2m': 'mm'
            }

In [10]:
ds_dict = set_units(ds_dict, conv_units)

Unit of vpd converted from Pa to hPa.
Unit already in the requested format
Unit of lmrso_2m converted from kg/m² to mm.
Unit of vpd converted from Pa to hPa.
Unit already in the requested format
Unit of lmrso_2m converted from kg/m² to mm.
Unit of vpd converted from Pa to hPa.
Unit of lmrso_1m converted from kg/m² to mm.
Unit of lmrso_2m converted from kg/m² to mm.
Unit of vpd converted from Pa to hPa.
Unit of lmrso_1m converted from kg/m² to mm.
Unit of lmrso_2m converted from kg/m² to mm.


### 3. Save and replace netcdf files

In [11]:
# =========== Store file and remove any former one ==========
nc_out = save_file(ds_dict, folder=folder)

File  with path: ../../data/CMIP6/historical/preprocessed/vpd/CMIP.CESM2-WACCM.historical.vpd_regridded.nc removed
File with path: ../../data/CMIP6/historical/preprocessed/vpd/CMIP.CESM2-WACCM.historical.vpd_regridded.nc saved
File  with path: ../../data/CMIP6/historical/preprocessed/lmrso_1m/CMIP.CESM2-WACCM.historical.lmrso_1m_regridded.nc removed
File with path: ../../data/CMIP6/historical/preprocessed/lmrso_1m/CMIP.CESM2-WACCM.historical.lmrso_1m_regridded.nc saved
File  with path: ../../data/CMIP6/historical/preprocessed/lmrso_2m/CMIP.CESM2-WACCM.historical.lmrso_2m_regridded.nc removed
File with path: ../../data/CMIP6/historical/preprocessed/lmrso_2m/CMIP.CESM2-WACCM.historical.lmrso_2m_regridded.nc saved
File  with path: ../../data/CMIP6/historical/preprocessed/vpd/CMIP.NorESM2-MM.historical.vpd_regridded.nc removed
File with path: ../../data/CMIP6/historical/preprocessed/vpd/CMIP.NorESM2-MM.historical.vpd_regridded.nc saved
File  with path: ../../data/CMIP6/historical/preproces

In [None]:
# =========== Check stored file ==============
xr.open_dataset(nc_out)