# CMIP6 Daily to Monthly and merging

**The following steps are included in this script:**

1. Open file
2. Compute monthly mean and merge with rest of the model data
3. Save data to netcdf files

In [1]:
# ========== Packages ==========
import xarray as xr
import dask
import os
import pandas as pd
import numpy as np

### Functions

In [10]:
def daily_to_monthly(ds_dict_daily, ds_dict_merged=None):
    """
    Compute monthly means from daily data and optionally merge them with the rest of the model data.

    Reason for this function is that the TaiESM1 model has no monthly data for the variable hurs.
    It can be applied to any model that has a similar issue.

    Args:
        ds_dict_daily (dict): Dictionary of xarray datasets with the missing variable in daily resolution.
                              Each dataset must provide a 'source_id' attribute and a 'time' coordinate.
        ds_dict_merged (dict, optional): Dictionary, keyed by source_id, of xarray datasets with all variables
                                         in monthly resolution except the missing variable.
                                         If no dict is passed (None, or the legacy placeholder 1),
                                         only the monthly mean is calculated.

    Returns:
        dict: Keyed by the 'source_id' attribute of each daily dataset (not by the keys of ds_dict_daily).
              Without ds_dict_merged the values are the monthly means; with ds_dict_merged each value is
              the monthly mean merged with the entry of ds_dict_merged that has the same source_id.

    Raises:
        KeyError: If a daily dataset has no 'source_id' attribute, or if ds_dict_merged has no entry
                  for one of the source_ids.
    """
    # 1 used to be the "nothing passed" default; keep accepting it so existing calls still work
    merge_requested = not (
        ds_dict_merged is None
        or (isinstance(ds_dict_merged, int) and ds_dict_merged == 1)
    )

    ds_dict_monthly = {}
    for ds_data in ds_dict_daily.values():
        source_id = ds_data.attrs['source_id']
        # Sort along time, then average within bins that start on the first day of each month ('1MS')
        ds_data_mon = ds_data.sortby('time').resample(time='1MS').mean()
        if merge_requested:
            # Replace coordinates of dataset when different to datasets in ds_dict_merged
            # NOTE(review): replace_coordinates is not defined in this part of the file; it has to
            # be available in the namespace before the merge path is used.
            ds_data_mon = replace_coordinates(ds_dict_merged[source_id], ds_data_mon)
        ds_dict_monthly[source_id] = ds_data_mon

    if not merge_requested:
        return ds_dict_monthly

    # Merge computed monthly average with rest of model dict; the dask option only needs to be
    # set once around all merges instead of once per dataset
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ds_dict_all = {
            dataset_name: xr.merge([dataset, ds_dict_merged[dataset_name]])
            for dataset_name, dataset in ds_dict_monthly.items()
        }

    return ds_dict_all

In [None]:
def _resolve_output_path(savepath, filename):
    """Create savepath if needed and return the file path to write to, asking before replacing a file."""
    nc_out = os.path.join(savepath, filename)
    os.makedirs(savepath, exist_ok=True)
    if os.path.exists(nc_out):
        inp = input(f"Delete old file {filename} (y/n):")
        if inp.lower() in ["y"]:
            os.remove(nc_out)
            print(f"File with path: {nc_out} removed")
        else:
            # Keep the old file untouched and write next to it under a fixed fallback name
            filename = "temp_file.nc"
            nc_out = os.path.join(savepath, filename)
            print(f"Filename change to {filename}")
    return nc_out


def save_file(save_file, folder, save_var=True):
    """
    Save files as netCDF.

    Args:
        save_file (dict or dataset): Dictionary of xarray datasets or a single dataset. Every dataset
                                     must provide the attributes 'source_id' and 'experiment_id'.
        folder (string): Name of folder data is saved in (below the experiment directory).
        save_var (boolean): If True, data is saved separately for each variable in
                            '<folder>/<variable>/'. If False, one file is saved with all variables.

    Returns:
        nc_out: Path of the last file that was written, or None if there was nothing to save.
    """
    # Anything that is not a dict is treated as one dataset, so both documented input kinds work
    datasets = save_file if isinstance(save_file, dict) else {None: save_file}

    # Stays None when `datasets` is empty, so the final return never hits an unbound name
    nc_out = None

    if save_var:
        # Iterate the argument that was passed in, not a notebook-level variable
        for ds in datasets.values():
            coordinates_to_keep = {'time', 'lat', 'lon'}
            # Same for every variable of this dataset, therefore checked once per dataset
            if any('depth' in ds[name].dims for name in ds.variables):
                coordinates_to_keep.add('depth')

            for var in ds:
                # Create a new dataset with only the desired variable
                ds_var = ds[[var]]

                # Set the desired coordinates
                coords_to_set = set(ds_var.variables).intersection(coordinates_to_keep)
                ds_var = ds_var.set_coords(list(coords_to_set))

                savepath = f'../../data/CMIP6/{ds_var.experiment_id}/{folder}/{var}/'
                filename = f'CMIP.{ds_var.source_id}.{ds_var.experiment_id}.{var}.nc'
                nc_out = _resolve_output_path(savepath, filename)

                # Save to netcdf file
                with dask.config.set(scheduler='threads'):
                    ds_var.to_netcdf(nc_out)
                    print(f"File with path: {nc_out} saved")

    else:
        for ds_in in datasets.values():
            filename = f'CMIP.{ds_in.source_id}.{ds_in.experiment_id}.nc'
            # NOTE(review): this root is '../data' while the per-variable files go to '../../data'
            # - confirm which relative root is intended.
            savepath = f'../data/CMIP6/{ds_in.experiment_id}/{folder}'
            nc_out = _resolve_output_path(savepath, filename)

            # Save to netcdf file
            with dask.config.set(scheduler='threads'):
                ds_in.to_netcdf(nc_out)

    return nc_out

### 1. Open files

In [None]:
# ========= Define period, models and path ==============
variable = 'mrsol'
experiment_id = 'historical'
# Models to load; remove entries to process only a subset
source_id = [
    'AWI-ESM-1-1-LR',
    'BCC-CSM2-MR',
    'BCC-ESM1',
    'CanESM5',
    'CESM2-FV2',
    'CESM2-WACCM-FV2',
    'CESM2-WACCM',
    'CESM2',
    'CNRM-CM6-1-HR',
    'CNRM-CM6-1',
    'IPSL-CM6A-LR',
    'NorESM2-MM',
    'SAM0-UNICON',
    'TaiESM1',
    'UKESM1-0-LL',
]
# Folder that holds one netCDF file per model for the chosen experiment and variable
savepath = '../../data/CMIP6/{}/raw/{}/'.format(experiment_id, variable)

# ========= Select the process-based Dask scheduler as global default ==========
dask.config.set(scheduler='processes')

# ========= Helper that opens a single dataset ========
def open_dataset(filename):
    """Open the file at `filename` with xarray and return the resulting dataset."""
    return xr.open_dataset(filename)

# ========= Create dictionary using a dictionary comprehension and Dask =======
# Maps every model name in source_id to the dataset opened from
# '<savepath>/CMIP.<model>.<experiment_id>.<variable>.nc'.
# dask.compute returns a tuple; the trailing comma in `ds_dict, =` unpacks its single element.
# NOTE(review): open_dataset is called directly inside the comprehension, so every file is
# already opened before dask.compute receives the dict. If the opening itself is meant to
# run in parallel, the call would have to be wrapped (e.g. with dask.delayed) - confirm intent.
ds_dict, = dask.compute({model: open_dataset(os.path.join(savepath, f'CMIP.{model}.{experiment_id}.{variable}.nc'))
                        for model in source_id})

### 2. Mean and Merge 

In [None]:
# =========== Compute monthly mean for daily datasets (optional: merge with rest of the datasets) =============
# Called with the daily dictionary only, so just the monthly means are computed here.
# NOTE(review): the save cell in section 3 passes ds_dict, not ds_dict_monthly - confirm which one should be written.

ds_dict_monthly = daily_to_monthly(ds_dict) # optionally pass ds_dict_merged as second argument to merge

In [None]:
# =========== Merge data if loading all data of one model is not possible ==============
# The second dictionary is expected to be named ds_dict_

# Only use this command if loading at once is not possible

#ds_dict[f"{list(ds_dict_.keys())[0]}_"]=ds_dict_[list(ds_dict_.keys())[0]]

In [None]:
# =========== Inspect the loaded dictionary =============
print(ds_dict.keys())
# Display the dataset stored under the ninth key (index 8)
ds_dict[list(ds_dict)[8]]

### 3. Save data to netcdf files

In [None]:
# =========== Store file and remove any former one ==========
# save_file asks on the console before deleting an existing file and returns the path of the last file it handled.
# NOTE(review): ds_dict (the data as loaded) is passed here, not ds_dict_monthly - confirm this is intended.
nc_out = save_file(ds_dict, folder='raw')

In [None]:
# =========== Check stored file ==============
# Re-open the file at the path returned by save_file to inspect what was written
xr.open_dataset(nc_out)