# CMIP6 Preprocess Data

**The following steps are included in this script:**

1. Load netCDF files
2. Regrid data to 1x1° format
3. Create consistent time coordinates
4. Convert units
5. Apply landmask
6. Compute Soil Moisture for top 1 and 2 meters by interpolation

Finally, save the results, replacing the existing netCDF files.

In [1]:
# ========== Packages ==========
import xarray as xr
import pandas as pd
import numpy as np
import xesmf as xe
import intake
import dask
import os

import matplotlib.pyplot as plt

%matplotlib inline

### 1. Load netCDF files

In [None]:
# Name of the sub-folder (inside the experiment directory) that holds the
# files to load; it is interpolated into `savepath` and passed to `save_file`.
folder='preprocessed'

In [2]:
# ========= Define period, models and path ==============
# Experiment to process; used in the data path and in the file names.
experiment_id = 'historical'
# Models to load. Other model names that were listed here for reference:
# 'TaiESM1', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CanESM5',
# 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'UKESM1-0-LL', 'CESM2', 'CESM2-FV2',
# 'CESM2-WACCM', 'NorESM2-MM'
source_id = ['CESM2-WACCM']
# Directory the per-model netCDF files are read from.
savepath = f'../data/CMIP6/{experiment_id}/{folder}'

# ========= Use Dask to parallelize computations ==========
# Make Dask's process-based scheduler the default for later compute calls.
dask.config.set(scheduler='processes')

# ========= Helper that opens a single netCDF file ========
def open_dataset(filename):
    """Open the file at ``filename`` with xarray and return the dataset.

    Thin wrapper around ``xr.open_dataset`` called with its default options.
    """
    return xr.open_dataset(filename)

# ========= Build a {model: dataset} dictionary =======
# NOTE: open_dataset() is called directly inside the comprehension, so every
# file is opened while the dict is being built; dask.compute() then receives
# that finished dict and returns a 1-tuple, which is unpacked into ds_dict.
(ds_dict,) = dask.compute(
    {
        model: open_dataset(
            os.path.join(savepath, f'CMIP.{model}.{experiment_id}.nc')
        )
        for model in source_id
    }
)

In [17]:
# ========= Inspect the loaded datasets =======
# Print the model names, then show the first model's dataset as the value
# of the cell's last expression.
print(list(ds_dict))
ds_dict[list(ds_dict)[0]]

['CESM2-WACCM']


### 7. Add new variables

In [None]:
# Water Use Efficiency
# Add a 'wue' variable (ds['gpp'] / ds['tran']) to every loaded dataset; the
# datasets held in ds_dict are modified in place.
# NOTE(review): cells where 'tran' is 0 will presumably produce inf/NaN —
# confirm whether those should be masked before further analysis.
for ds in ds_dict.values():
    ds['wue'] = ds['gpp'] / ds['tran']
    ds['wue'].attrs = {'long_name': 'Water Use Efficiency (GPP/Tr)'}
    


In [None]:
# Hand the datasets to `save_file` (presumably writes them to disk and
# returns the output location — TODO confirm). `save_file` is not defined in
# the cells shown here, so it must be defined or imported before this runs.
nc_out = save_file(ds_dict, folder=folder)

In [None]:
# Sanity check: open whatever `save_file` returned (presumably the path of
# the written netCDF file — TODO confirm) and display it as the cell output.
xr.open_dataset(nc_out)