# CMIP6 Soil Moisture

**This notebook performs the following steps:**

1. Load netCDF files
2. Compute soil moisture and soil temperature for the top 1 and 2 meters by interpolation, and isolate the liquid soil moisture
3. Save and replace netCDF files

In [1]:
# ========== Packages ==========
import xarray as xr
import pandas as pd
import numpy as np
import dask
import os

### Functions

In [2]:
import matplotlib.pyplot as plt
def plot_sm_profile(ds_depth, save_fig=False, xlim_bound=3, ylim_bound=1000):
    """
    Plot one depth profile per entry of ``ds_depth`` on a shared axis.

    Every entry is drawn as a dashed line with scatter markers in the same colour;
    the dictionary key is used as the legend label.

    Args:
        ds_depth (dict): Maps a label to an xarray object that has a ``depth``
            coordinate, e.g. the dictionary returned by ``soil_moisture_profile``.
        save_fig (bool): If True, save the figure to
            ``../results/CMIP6/soil_moisture_profile.png``. Default is False.
        xlim_bound (float): Upper limit of the x-axis (lower limit is 0). Default is 3.
        ylim_bound (float): Upper limit of the y-axis (lower limit is 0). Default is 1000.
    """
    fig, ax = plt.subplots(figsize=(30, 15))

    # Set the limits on the axes object directly instead of via pyplot's current axes.
    ax.set_xlim(0, xlim_bound)
    ax.set_ylim(0, ylim_bound)

    # Marker size shared by all profiles
    marker_size = 150

    for name, ds in ds_depth.items():
        profile = ds.squeeze()
        line = ax.plot(profile['depth'], profile.variable, linestyle='--', label=f"{name}")[0]
        # Re-use the line colour so line and markers of one profile match.
        profile.plot.scatter(x='depth', y='variable', s=marker_size, c=line.get_color(), ax=ax, label=None)

    ax.legend()

    if save_fig:
        out_path = '../results/CMIP6/soil_moisture_profile.png'
        # savefig does not create missing folders, so make sure the target exists.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path, dpi=300)

In [3]:
def soil_moisture_profile(ds_dict, plot_fig=True, save_fig=False, xlim_bound=3, ylim_bound=1000):
    """
    Average ``tsl`` over time, longitude and latitude for each dataset and optionally plot the result.

    NOTE(review): despite the function name, the variable that is averaged is ``tsl``;
    confirm whether ``mrsol`` was intended.

    Args:
        ds_dict (dict): A dictionary of xarray datasets that contain ``tsl`` and a
            ``source_id`` attribute.
        plot_fig (bool): If True, plot the profiles with ``plot_sm_profile``. Default is True.
        save_fig (bool): If True, save the figure to a file. Default is False.
            plot_fig has to be True as well to save the figure.
        xlim_bound (float): A value to set the max for the x-axis. Default is 3.
        ylim_bound (float): A value to set the max for the y-axis. Default is 1000.

    Returns:
        dict: Maps each dataset's ``source_id`` to its averaged ``tsl`` profile.
    """
    ds_depth = {}

    for ds in ds_dict.values():
        mean_time = ds.tsl.mean("time", keep_attrs=True, skipna=True)
        # Plain (unweighted) mean over the horizontal grid.
        ds_depth[ds.source_id] = mean_time.mean(("lon", "lat"), keep_attrs=True, skipna=True)

    if plot_fig:
        # Forward the axis limits; they were previously accepted but silently ignored.
        plot_sm_profile(ds_depth, save_fig=save_fig, xlim_bound=xlim_bound, ylim_bound=ylim_bound)

    return ds_depth

In [4]:
def _rename_variable(ds_dict, old_name, new_name):
    """Store ``old_name`` under ``new_name`` in every dataset of ``ds_dict`` that contains it."""
    for name, ds in ds_dict.items():
        if old_name in ds:
            ds[new_name] = ds[old_name]
            # drop_vars replaces the deprecated Dataset.drop for removing variables.
            ds_dict[name] = ds.drop_vars(old_name)


def _profile_down_to(ds, var_name, depth, new_dim):
    """Return the ``mrsol`` layers shallower than ``depth`` followed by ``ds[var_name]``, along ``new_dim``."""
    # Deepest model level that is still shallower than the target depth
    max_depth_below = ds.depth.where(ds.depth < depth).max().compute().values

    at_depth = ds[var_name]
    if 'depth' not in at_depth.dims:
        # Give the single-level field a depth dimension so it can be concatenated.
        at_depth = at_depth.expand_dims({'depth': [depth]})

    shallower = ds.mrsol.sel(depth=slice(None, max_depth_below))
    profile = dask.compute(xr.concat([shallower, at_depth], dim='depth'))[0]
    return profile.rename({'depth': new_dim})


def ms_1_and_2m(ds_dict):
    """
    Replace ``mrsol1m`` and ``mrsol2m`` in every dataset by a depth profile down to 1 m and 2 m.

    Variables named ``mrsol100cm`` / ``mrsol200cm`` are first renamed to
    ``mrsol1m`` / ``mrsol2m``. For each target depth the ``mrsol`` layers shallower
    than that depth are concatenated with the existing ``mrsol1m`` / ``mrsol2m``
    field, and the result is stored back under the same name on the dimension
    ``depth_1m`` / ``depth_2m``.

    Args:
        ds_dict (dict): A dictionary of xarray datasets containing ``mrsol``, ``depth``
            and ``mrsol1m`` / ``mrsol2m`` (or their ``...cm`` equivalents). The
            dictionary and its datasets are modified in place.

    Returns:
        None
    """
    _rename_variable(ds_dict, 'mrsol100cm', 'mrsol1m')
    _rename_variable(ds_dict, 'mrsol200cm', 'mrsol2m')

    targets = (('mrsol1m', 1.0, 'depth_1m'), ('mrsol2m', 2.0, 'depth_2m'))
    for var_name, depth, new_dim in targets:
        # Build all profiles first, then assign, so no dataset is modified
        # before every profile for this depth has been computed.
        profiles = {
            name: _profile_down_to(ds, var_name, depth, new_dim)
            for name, ds in ds_dict.items()
        }
        for name, ds in ds_dict.items():
            ds[var_name] = profiles[name]

    return None

In [5]:
def plot_diff_mrsol(ds_dict, time_idx=100, lat_idx=90, lon_idx=200):
    """
    Plot ``mrsol``, ``mrsol1m`` and ``mrsol2m`` of the first dataset at one time step and grid cell.

    The three profiles are drawn against their own depth coordinate (``depth``,
    ``depth_1m``, ``depth_2m``) and the figure is saved below ``../results/CMIP6/``.

    Args:
        ds_dict (dict): A dictionary of xarray datasets; only the first entry is plotted.
        time_idx (int): Positional index along ``time``. Default is 100.
        lat_idx (int): Positional index along ``lat``. Default is 90.
        lon_idx (int): Positional index along ``lon``. Default is 200.
    """
    fig, ax = plt.subplots(figsize=(30, 15))
    ax.set_xlim(0, 10)

    # Marker size shared by all profiles
    marker_size = 150
    name = list(ds_dict.keys())[0]
    ds = ds_dict[name]

    profiles = (('mrsol', 'depth'), ('mrsol1m', 'depth_1m'), ('mrsol2m', 'depth_2m'))
    for var_name, depth_dim in profiles:
        data_to_plot = ds[var_name].isel(time=time_idx, lat=lat_idx, lon=lon_idx).squeeze()
        # Label each curve with its variable; previously all three shared one legend entry.
        line = ax.plot(data_to_plot[depth_dim], data_to_plot, linestyle='--', label=f"{name} {var_name}")[0]
        data_to_plot.plot.scatter(x=depth_dim, y='variable', s=marker_size, c=line.get_color(), ax=ax, label=None)

    ax.legend()

    # The file name reflects what was actually plotted; for the first model 'TaiESM1'
    # and the default indices it equals the previously hard-coded name.
    out_path = f'../results/CMIP6/{name}_mrsol_+_1m_+_2m_time{time_idx}_lat{lat_idx}_lon{lon_idx}.png'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, dpi=300)

In [6]:
def save_file(save_file, folder, save_var=True):
    """
    Save datasets as netCDF files below ``../../data/CMIP6/<experiment_id>/<folder>/``.

    ``experiment_id`` and ``source_id`` are read from each dataset's attributes.

    Args:
        save_file (dict or dataset): Dictionary of xarray datasets, or a single dataset.
        folder (string): Name of the folder the data is saved in.
        save_var (boolean): If True, each data variable is written to its own file in a
            sub-folder named after the variable, and an existing file is replaced
            without asking. If False, one file per dataset is written and the user is
            asked before an existing file is deleted.

    Returns:
        nc_out: Path of the last file written, or None if nothing was written.
    """
    # Iterate the argument itself (the per-variable branch used to read the global
    # ``ds_dict``). A single dataset is wrapped so both branches can loop over it.
    datasets = {None: save_file} if hasattr(save_file, 'data_vars') else save_file

    # Stays None for empty input instead of failing at ``return``.
    nc_out = None

    if save_var:
        for ds in datasets.values():
            for var in ds:
                coordinates_to_keep = {'time', 'lat', 'lon'}
                if 'depth' in ds[var].dims:
                    coordinates_to_keep.add('depth')

                # Dataset restricted to this one data variable
                ds_var = ds[[var]]

                # Make sure the kept names are flagged as coordinates
                coords_to_set = set(ds_var.variables).intersection(coordinates_to_keep)
                ds_var = ds_var.set_coords(list(coords_to_set))

                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/{folder}/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}.nc'
                nc_out = os.path.join(savepath, filename)
                os.makedirs(savepath, exist_ok=True)

                # Existing files are replaced without prompting in this branch.
                if os.path.exists(nc_out):
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")

    else:
        for ds_in in datasets.values():
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            savepath = f'../../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out = os.path.join(savepath, filename)
            os.makedirs(savepath, exist_ok=True)

            if os.path.exists(nc_out):
                inp = input(f"Delete old file {filename} (y/n):")
                if inp.lower() in ["y"]:
                    os.remove(nc_out)
                    print(f"File  with path: {nc_out} removed")
                else:
                    # Keep the old file and write to a temporary name instead.
                    filename = "temp_file.nc"
                    nc_out = os.path.join(savepath, filename)
                    print(f"Filename change to {filename}")

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

### 1. Load netCDF files

In [8]:
# Scenario and processing stage; both are interpolated into the input paths built in the loading cells.
experiment_id = 'ssp370'
folder='preprocessed'

In [9]:
# ========= Variable, models and input folder ==============
variable = 'mrsol'
savepath = f'../../data/CMIP6/{experiment_id}/{folder}/{variable}'

# Models to load. Further candidates that are currently switched off:
# 'IPSL-CM6A-LR', 'UKESM1-0-LL', 'MPI-ESM1-2-LR', 'CESM2-WACCM', 'NorESM2-MM'
source_id = ['TaiESM1', 'BCC-CSM2-MR', 'CanESM5', 'CNRM-CM6-1', 'CNRM-ESM2-1']

# ========= Dask scheduler used for subsequent computations ==========
dask.config.set(scheduler='processes')


# ========= Helper that opens a single netCDF file ========
def open_dataset(filename):
    """Open ``filename`` with xarray and return the dataset."""
    return xr.open_dataset(filename)


# ========= One dataset per model, keyed by model name =======
# NOTE(review): open_dataset is called directly while the dictionary is built, so
# dask.compute receives datasets that are already opened.
mrsol_files = {
    model: os.path.join(savepath, f'CMIP.{model}.{experiment_id}.{variable}_regridded.nc')
    for model in source_id
}
ds_dict = dask.compute({model: open_dataset(path) for model, path in mrsol_files.items()})[0]

In [10]:
# ========= Variable and input folder ==============
variable = 'tsl'
savepath = f'../../data/CMIP6/{experiment_id}/{folder}/{variable}'

# ========= Dask scheduler used for subsequent computations ==========
dask.config.set(scheduler='processes')


# ========= Helper that opens a single netCDF file ========
def open_dataset(filename):
    """Open ``filename`` with xarray and return the dataset."""
    return xr.open_dataset(filename)


# ========= One dataset per model, keyed by model name =======
tsl_files = {
    model: os.path.join(savepath, f'CMIP.{model}.{experiment_id}.{variable}_regridded.nc')
    for model in source_id
}
ds_dict_tsl = dask.compute({model: open_dataset(path) for model, path in tsl_files.items()})[0]

FileNotFoundError: [Errno 2] No such file or directory: b'/work/ch0636/g300115/data/CMIP6/ssp370/preprocessed/tsl/CMIP.TaiESM1.ssp370.tsl_regridded.nc'

In [241]:
# Time coordinate of the first tsl dataset
list(ds_dict_tsl.values())[0].time

In [242]:
# Merge each model's tsl dataset with its mrsol dataset.
# The mrsol datasets are the ones the mrsol loading cell stored in `ds_dict`;
# `ds_dict_mrsol` is not assigned by any earlier cell, so reading it raised a NameError.
# The comprehension is evaluated completely before `ds_dict` is rebound.
ds_dict = {
    name: xr.merge([ds_tsl, ds_dict[name]])
    for name, ds_tsl in ds_dict_tsl.items()
}

In [10]:
# ========= List the loaded models and show the depth levels of the third one =======
model_names = list(ds_dict.keys())
print(model_names)
ds_dict[model_names[2]].depth

['IPSL-CM6A-LR', 'UKESM1-0-LL', 'MPI-ESM1-2-LR', 'CESM2-WACCM', 'NorESM2-MM']


In [24]:
# Describe the depth coordinate of the first dataset.
# NOTE(review): only the first dataset is touched, while the interpolation cells read
# attrs["positive"] from every dataset - confirm the others already carry it.
first_model = list(ds_dict.keys())[0]
depth_attrs = {'name': 'depth','units': 'm','positive': 'down'}
ds_dict[first_model].depth.attrs = depth_attrs

### 2. Compute Soil Moisture/Temperature for top 1 and 2 meters by interpolation and isolate liquid soil moisture

#### Compute temperature for top 1 and 2 meters 

In [245]:
# ============ Compute temperature at 1 and 2 meters depth =============
var = 'tsl'        # variable to interpolate (tsl or mrsol)
var_1m = 'tsl1m'   # name of the value interpolated to 1 m
var_2m = 'tsl2m'   # name of the value interpolated to 2 m


for name, ds in ds_dict.items():
    # interp() looks the target up on the dataset's own depth coordinate. A coordinate
    # that is not stored positive-down therefore has to be flipped on the dataset itself;
    # the sign-corrected copy used to be computed but never used.
    if ds["depth"].attrs["positive"] != "down":
        depth_attrs = dict(ds["depth"].attrs, positive="down")
        ds = ds.assign_coords(depth=-ds["depth"].values).sortby("depth")
        ds["depth"].attrs = depth_attrs
        ds_dict[name] = ds

    # Linear interpolation to the target depths in meter (m)
    for target_depth, out_name in ((1, var_1m), (2, var_2m)):
        ds[out_name] = ds[var].interp(depth=target_depth, method="linear")

In [246]:
# Build the profile down to 1 m: all levels shallower than 1 m plus the value at 1 m
dict_1m = {}
depth = 1.0

for name, ds in ds_dict.items():
    # Deepest level that is still shallower than the target depth
    shallower_levels = ds.depth.where(ds.depth < depth)
    max_depth_below = shallower_levels.max().compute().values

    ds1m = ds[var_1m]
    if 'depth' not in ds1m.dims:
        # Give the single-level field a depth dimension so it can be concatenated
        ds1m = ds1m.expand_dims({'depth': [depth]})

    upper_layers = ds[var].sel(depth=slice(None, max_depth_below))
    dict_1m[name] = dask.compute(xr.concat([upper_layers, ds1m], dim='depth'))[0]

# Store the profiles on their own dimension so they do not clash with `depth`
for name, ds in ds_dict.items():
    dict_1m[name] = dict_1m[name].rename({'depth': 'depth_1m'})
    ds[var_1m] = dict_1m[name]

In [247]:
# Build the profile down to 2 m: all levels shallower than 2 m plus the value at 2 m
dict_2m = {}
depth = 2.0

for name, ds in ds_dict.items():
    # Deepest level that is still shallower than the target depth
    shallower_levels = ds.depth.where(ds.depth < depth)
    max_depth_below = shallower_levels.max().compute().values

    ds2m = ds[var_2m]
    if 'depth' not in ds2m.dims:
        # Give the single-level field a depth dimension so it can be concatenated
        ds2m = ds2m.expand_dims({'depth': [depth]})

    upper_layers = ds[var].sel(depth=slice(None, max_depth_below))
    dict_2m[name] = dask.compute(xr.concat([upper_layers, ds2m], dim='depth'))[0]

# Store the profiles on their own dimension so they do not clash with `depth`
for name, ds in ds_dict.items():
    dict_2m[name] = dict_2m[name].rename({'depth': 'depth_2m'})
    ds[var_2m] = dict_2m[name]

#### Compute soil moisture for top 1 and 2 meters 

In [None]:
# ============ Compute soil moisture at 1 and 2 meters depth =============
var = 'mrsol'        # variable to interpolate (tsl or mrsol)
var_1m = 'mrsol1m'   # name of the value interpolated to 1 m
var_2m = 'mrsol2m'   # name of the value interpolated to 2 m


for name, ds in ds_dict.items():
    # interp() looks the target up on the dataset's own depth coordinate. A coordinate
    # that is not stored positive-down therefore has to be flipped on the dataset itself;
    # the sign-corrected copy used to be computed but never used.
    if ds["depth"].attrs["positive"] != "down":
        depth_attrs = dict(ds["depth"].attrs, positive="down")
        ds = ds.assign_coords(depth=-ds["depth"].values).sortby("depth")
        ds["depth"].attrs = depth_attrs
        ds_dict[name] = ds

    # Linear interpolation to the target depths in meter (m)
    for target_depth, out_name in ((1, var_1m), (2, var_2m)):
        ds[out_name] = ds[var].interp(depth=target_depth, method="linear")

In [None]:
# Build the profile down to 1 m: all levels shallower than 1 m plus the value at 1 m
dict_1m = {}
depth = 1.0

for name, ds in ds_dict.items():
    # Deepest level that is still shallower than the target depth
    shallower_levels = ds.depth.where(ds.depth < depth)
    max_depth_below = shallower_levels.max().compute().values

    ds1m = ds[var_1m]
    if 'depth' not in ds1m.dims:
        # Give the single-level field a depth dimension so it can be concatenated
        ds1m = ds1m.expand_dims({'depth': [depth]})

    upper_layers = ds[var].sel(depth=slice(None, max_depth_below))
    dict_1m[name] = dask.compute(xr.concat([upper_layers, ds1m], dim='depth'))[0]

# Store the profiles on their own dimension so they do not clash with `depth`
for name, ds in ds_dict.items():
    dict_1m[name] = dict_1m[name].rename({'depth': 'depth_1m'})
    ds[var_1m] = dict_1m[name]

In [None]:
# Build the profile down to 2 m: all levels shallower than 2 m plus the value at 2 m
dict_2m = {}
depth = 2.0

for name, ds in ds_dict.items():
    # Deepest level that is still shallower than the target depth
    shallower_levels = ds.depth.where(ds.depth < depth)
    max_depth_below = shallower_levels.max().compute().values

    ds2m = ds[var_2m]
    if 'depth' not in ds2m.dims:
        # Give the single-level field a depth dimension so it can be concatenated
        ds2m = ds2m.expand_dims({'depth': [depth]})

    upper_layers = ds[var].sel(depth=slice(None, max_depth_below))
    dict_2m[name] = dask.compute(xr.concat([upper_layers, ds2m], dim='depth'))[0]

# Store the profiles on their own dimension so they do not clash with `depth`
for name, ds in ds_dict.items():
    dict_2m[name] = dict_2m[name].rename({'depth': 'depth_2m'})
    ds[var_2m] = dict_2m[name]

#### Compute column soil moisture (frozen and liquid)

In [None]:
# Column total down to 1 m: sum the 1 m profile over its depth levels
for ds in ds_dict.values():
    column_total = ds['mrsol1m'].sum(dim='depth_1m')
    ds['mrso_1m'] = column_total

In [None]:
# Column total down to 2 m: sum the 2 m profile over its depth levels
for ds in ds_dict.values():
    column_total = ds['mrsol2m'].sum(dim='depth_2m')
    ds['mrso_2m'] = column_total

In [None]:
# Metadata for the 1 m column total. It is attached to every dataset (previously only
# the first one received it), and the doubled underscore in the standard name is fixed.
mrso_1m_attrs = {
    'standard_name': 'mass_content_of_liquid_and_frozen_water_in_1m_soil_column',
    'long_name': 'Total Soil Moisture Content of 1 m Column',
    'comment': 'The mass per unit area  (summed over all soil layers until 1 m depth) of liquid and frozen water.',
    'units': 'kg/m²',
}

for ds in ds_dict.values():
    # Each dataset gets its own copy so later edits do not leak between models.
    ds['mrso_1m'].attrs = dict(mrso_1m_attrs)

In [None]:
# Metadata for the 2 m column total. It is attached to every dataset; previously only
# the first one received it.
mrso_2m_attrs = {
    'standard_name': 'mass_content_of_liquid_and_frozen_water_in_2m_soil_column',
    'long_name': 'Total Soil Moisture Content of 2 m Column',
    'comment': 'The mass per unit area  (summed over all soil layers until 2 m depth) of liquid and frozen water.',
    'units': 'kg/m²',
}

for ds in ds_dict.values():
    # Each dataset gets its own copy so later edits do not leak between models.
    ds['mrso_2m'].attrs = dict(mrso_2m_attrs)

In [None]:
# Show the first dataset
list(ds_dict.values())[0]

In [None]:
# Remove the per-level inputs so only the column totals remain, then drop length-1 dimensions.
# drop_vars replaces the deprecated Dataset.drop for removing variables and coordinates.
intermediate_vars = ['mrsol', 'depth', 'depth_1m', 'mrsol1m', 'depth_2m', 'mrsol2m']

for name, ds in ds_dict.items():
    ds_dict[name] = ds.drop_vars(intermediate_vars).squeeze()

#### Compute column liquid soil moisture

In [251]:
# ============ Requires tsl1m / tsl2m, so run the temperature cells first =============

# Freezing point of water in Kelvin; values at or below it are flagged as frozen in the next cells
freezing_point = 273.15

In [252]:
# Flag every value of the 1 m temperature profile that is at or below the freezing point
for ds in ds_dict.values():
    frozen_soil_mask = ds["tsl1m"] <= freezing_point
    ds["tsl1mfrozen"] = frozen_soil_mask

In [253]:
# Flag every value of the 2 m temperature profile that is at or below the freezing point
for ds in ds_dict.values():
    frozen_soil_mask = ds["tsl2m"] <= freezing_point
    ds["tsl2mfrozen"] = frozen_soil_mask

In [254]:
# Keep the 1 m soil moisture only where the soil is not flagged as frozen
for ds in ds_dict.values():
    not_frozen = ~ds['tsl1mfrozen']
    liquid_soil_moisture_per_layer = ds["mrsol1m"].where(not_frozen)
    ds['mrsol1m_liquid'] = liquid_soil_moisture_per_layer

In [255]:
# Keep the 2 m soil moisture only where the soil is not flagged as frozen
for ds in ds_dict.values():
    not_frozen = ~ds['tsl2mfrozen']
    liquid_soil_moisture_per_layer = ds["mrsol2m"].where(not_frozen)
    ds['mrsol2m_liquid'] = liquid_soil_moisture_per_layer

In [256]:
# Liquid column total down to 1 m: sum the masked 1 m profile over its depth levels
for ds in ds_dict.values():
    liquid_total = ds['mrsol1m_liquid'].sum(dim='depth_1m')
    ds['lmrso_1m'] = liquid_total

In [257]:
# Liquid column total down to 2 m: sum the masked 2 m profile over its depth levels
for ds in ds_dict.values():
    liquid_total = ds['mrsol2m_liquid'].sum(dim='depth_2m')
    ds['lmrso_2m'] = liquid_total

In [258]:
# Metadata for the liquid 1 m column total. It is attached to every dataset; previously
# only the first one received it.
lmrso_1m_attrs = {
    'standard_name': 'mass_content_of_liquid_water_in_1m_soil_column',
    'long_name': 'Total Liquid Soil Moisture Content of 1 m Column',
    'comment': 'The mass per unit area  (summed over all soil layers until 1 m depth) of liquid water.',
    'units': 'kg/m²',
}

for ds in ds_dict.values():
    # Each dataset gets its own copy so later edits do not leak between models.
    ds['lmrso_1m'].attrs = dict(lmrso_1m_attrs)

In [259]:
# Metadata for the liquid 2 m column total. It is attached to every dataset; previously
# only the first one received it.
lmrso_2m_attrs = {
    'standard_name': 'mass_content_of_liquid_water_in_2m_soil_column',
    'long_name': 'Total Liquid Soil Moisture Content of 2 m Column',
    'comment': 'The mass per unit area  (summed over all soil layers until 2 m depth) of liquid water.',
    'units': 'kg/m²',
}

for ds in ds_dict.values():
    # Each dataset gets its own copy so later edits do not leak between models.
    ds['lmrso_2m'].attrs = dict(lmrso_2m_attrs)

In [275]:
# Remove the inputs and intermediate fields from every dataset so only the column
# totals are left, then drop length-1 dimensions. Previously only the first dataset
# was cleaned, which left the intermediates in all other datasets.
# drop_vars replaces the deprecated Dataset.drop.
intermediate_vars = ['tsl', 'mrsol', 'depth', 'depth_1m', 'depth_2m', 'tsl1m', 'tsl2m', 'mrsol1m', 'mrsol2m', 'tsl1mfrozen', 'tsl2mfrozen', 'mrsol1m_liquid','mrsol2m_liquid']

for name, ds in ds_dict.items():
    ds_dict[name] = ds.drop_vars(intermediate_vars).squeeze()


#ds_dict[list(ds_dict.keys())[0]] = ds_dict[list(ds_dict.keys())[0]].drop(['tsl', 'mrsol', 'depth', 'depth_1m', 'tsl1m', 'mrsol1m', 'tsl1mfrozen', 'mrsol1m_liquid']).squeeze()
#ds_dict[list(ds_dict.keys())[0]] = ds_dict[list(ds_dict.keys())[0]].drop(['tsl', 'mrsol', 'depth', 'depth_2m', 'tsl2m', 'mrsol2m', 'tsl2mfrozen','mrsol2m_liquid']).squeeze()

In [276]:
# Show the first dataset after the clean-up
list(ds_dict.values())[0]

### 3. Save and replace netCDF files

In [20]:
# =========== Store file and remove any former one ==========
# save_var is left at its default (True), so save_file writes one file per variable;
# nc_out receives the path that save_file returns.
nc_out = save_file(ds_dict, folder='preprocessed')

File with path: ../../data/CMIP6/historical/preprocessed/mrso_1m/CMIP.IPSL-CM6A-LR.historical.mrso_1m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_2m/CMIP.IPSL-CM6A-LR.historical.mrso_2m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_1m/CMIP.UKESM1-0-LL.historical.mrso_1m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_2m/CMIP.UKESM1-0-LL.historical.mrso_2m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_1m/CMIP.MPI-ESM1-2-LR.historical.mrso_1m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_2m/CMIP.MPI-ESM1-2-LR.historical.mrso_2m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_1m/CMIP.CESM2-WACCM.historical.mrso_1m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_2m/CMIP.CESM2-WACCM.historical.mrso_2m.nc saved
File with path: ../../data/CMIP6/historical/preprocessed/mrso_1m/CMIP.NorESM2-MM.historical.mrso_1m.nc saved
File 

In [None]:
# =========== Check stored file ==============
# Re-open the path returned by save_file to inspect what was written
xr.open_dataset(nc_out)