# Script to Import CMIP6 data using Intake-ESM

This script demonstrates the process of loading, preprocessing, and saving CMIP6 climate model data.
For detailed documentation, refer to: https://intake-esm.readthedocs.io/en/stable/

**This script includes the following steps:**

1. Define data attributes for selection.
2. Open the catalog and filter data.
3. Load datasets into a dictionary.
4. Perform preliminary preprocessing:
    - Drop redundant coordinates and variables.
    - Merge datasets with different `table_id` for the same `source_id`.
5. Define the output file path.
6. Save the processed data to the specified path.

In [None]:
# ========== Import Required Libraries ==========
import dask
import os
import sys

In [None]:
# ========== Configure Paths ==========
# Directories containing the utility scripts and the configuration module.
# Both are relative paths, so they resolve against the current working directory.
data_handling_dir = '../../src/data_handling'
config_file = '../../src'  # NOTE: despite its name, this is a directory, not a file

# Make both directories importable; this must run before the custom imports below.
sys.path.extend([data_handling_dir, config_file])

# Project-local helper modules and default settings
import load_data as ld
import save_data_as_nc as sd
from config import (
    DATA_DIR,
    DEFAULT_EXPERIMENT,
    DEFAULT_ACTIVITY_ID,
    DEFAULT_MEMBER_ID,
    DEFAULT_TABLE_ID,
    DEFAULT_MODEL,
    DEFAULT_VARIABLE,
)

### Step 1: Define Data Selection Attributes

In [None]:
# Attributes used to select data. The values come from the project config;
# the note above each key lists the alternatives used in this study.
attrs = {
    # Use "historical" or "ssp370".
    "experiment_id": DEFAULT_EXPERIMENT,
    # For historical data, use ["CMIP"]. For SSP data, use ["ScenarioMIP"].
    "activity_id": [DEFAULT_ACTIVITY_ID],
    # Member IDs from Supplementary Table S1.
    "member_id": DEFAULT_MEMBER_ID,
    # For monthly data, use ["Amon", "Lmon"]. For daily data, use ["day"].
    "table_id": [DEFAULT_TABLE_ID],
    # Replace with other models as needed.
    "source_id": DEFAULT_MODEL,
    # Variables used in this study:
    # "tas", "ps", "pr", "huss", "evspsbl", "tran", "mrso", "lai", "gpp", "mrro"
    "variable_id": [DEFAULT_VARIABLE],
}

### Step 2: Load Data Catalog 

In [None]:
# Available CMIP6 catalogs:
#   - The DKRZ catalog is accessible only to users with an account on the
#     'levante' system.
#   - The Pangeo catalog is publicly accessible, but some of the BCC-CSM2-MR and
#     MPI-ESM1-2-LR data used in this analysis are not available there.
#   - Some of the CESM2 and CMCC-CM2-SR5 data used in this analysis are not
#     available on the DKRZ catalog.
# If you want to reproduce all of my results, feel free to contact me by e-mail:
# simon.heselschwerdt@hereon.de.
catalog_paths = [
    "https://storage.googleapis.com/cmip6/pangeo-cmip6.json",  # index 0: Pangeo Catalog
    "/work/ik1017/Catalogs/dkrz_cmip6_disk.json",  # index 1: DKRZ Catalog
]

# Pick the catalog to open (0 or 1, depending on your access permissions).
# This example uses the Pangeo Catalog.
catalog_index = 0
cat_cmip6 = ld.open_catalog(catalog_paths[catalog_index])

# Filter the catalog with the attributes defined in `attrs`
selection = cat_cmip6.search(**attrs)

# Show the metadata table of the selection (kept last so the cell displays it)
selection.df

### Step 3: Load Data into a Dictionary

In [None]:
# Load every dataset in the selection while these keys are set in Dask's config.
# NOTE(review): use_cftime / decode_times / consolidated look like xarray open
# options; whether anything reads them from Dask's config is not visible here.
# Confirm they take effect, or pass them through to_dataset_dict's open-kwargs
# argument instead.
dask_settings = {"use_cftime": True, "decode_times": True, "consolidated": True}
with dask.config.set(**dask_settings):
    ds_dict = selection.to_dataset_dict(preprocess=ld.pre_preprocessing)

### Step 4: Preliminary Preprocessing of Data

In [None]:
# Step 4.1: Names of the coordinates and variables treated as redundant
drop_list = [
    "member_id",
    "type",
    "nbnd",
    "bnds",
    "height",
    "depth",
    # bounds-style names
    "lat_bnds",
    "lon_bnds",
    "time_bnds",
    "time_bounds",
    "depth_bnds",
    "sdepth_bounds",
    "depth_bounds",
    # remaining entries
    "hist_interval",
    "axis_nbounds",
    "dcpp_init_year",
]

# Remove everything named in drop_list from the loaded datasets
ds_dict = ld.drop_redundant(ds_dict, drop_list)

# Step 4.2: Combine datasets that share a `source_id` but differ in `table_id`
ds_dict = ld.merge_source_id_data(ds_dict)

### Step 5: Define Output File Path

In [None]:
# Derive the temporal resolution from the selected `table_id` values
data_state = "raw"
table_ids = attrs["table_id"]
if any(monthly_id in table_ids for monthly_id in ("Amon", "Lmon")):
    temp_res = "month"
elif "day" in table_ids:
    temp_res = "day"
else:
    # Neither a monthly nor a daily table was selected
    temp_res = "unknown"

# Build the output directory: <DATA_DIR>/<state>/CMIP6/<experiment>/<resolution>/
data_path = f"{data_state}/CMIP6/{attrs['experiment_id']}/{temp_res}/"
file_path = os.path.join(DATA_DIR, data_path)
print(f"Saving files to: {file_path}")

### Step 6: Save Data

In [None]:
# Write the processed datasets in `ds_dict` to `file_path`.
# NOTE(review): the original note says existing files at the target path are
# removed as part of saving; that logic lives in save_data_as_nc and is not
# visible here. Confirm before pointing this at a directory whose contents
# you want to keep.
sd.save_files(ds_dict, file_path)