# Script to Import CMIP6 data using Intake-ESM

This script demonstrates the process of loading, processing, and saving CMIP6 climate model data.
For detailed documentation, refer to: https://intake-esm.readthedocs.io/en/stable/

**The following steps are included in this script:**

1. Define data attributes for selection.
2. Open the catalog and filter data.
3. Load datasets into a dictionary.
4. Perform first preprocessing:
    - Drop redundant coordinates and variables.
    - Merge datasets with different `table_id` for the same `source_id`.
5. Define the output file path from the selected attributes.
6. Save the processed data to that path.

Dependencies: Ensure `intake-esm`, `xarray`, `dask`, and `gcsfs` are installed.

In [None]:
# ========== Import Required Libraries ==========
import xarray as xr
import intake
import dask
import os
import pandas as pd
import numpy as np
import gcsfs
import sys

In [None]:
# ========== Configure Paths ==========
# Relative locations of the project's helper modules. They are appended to
# sys.path so the plain `import` statements below can find `load_data`,
# `save_data_as_nc` and `config`.
data_handling_dir = '../../src/data_handling'
config_file = '../../src'  # a directory (not a file); presumably contains config.py

# Register both directories, preserving the original order of insertion
for module_dir in (data_handling_dir, config_file):
    sys.path.append(module_dir)

# Project-local helpers and settings (only resolvable after the path setup above)
import load_data as ld
import save_data_as_nc as sd
from config import (
    BASE_DIR,
    DEFAULT_MODEL,
    DEFAULT_VARIABLE,
    DEFAULT_TEMPORAL_RES,
    DEFAULT_EXPERIMENT,
)

### Step 1: Define Data Selection Attributes

In [None]:
# Search facets passed to the catalog query in Step 2.
attrs = dict(
    # Use "ssp370" for future scenarios
    experiment_id="historical",
    # For SSP data, use ["ScenarioMIP"]
    activity_id=["CMIP"],
    # Member IDs from Supplementary Table S1
    member_id="r1i1p1f1",
    # For daily data, use ["day"]
    table_id=["Amon", "Lmon"],
    # Replace with other models as needed
    source_id="BCC-CSM2-MR",
    # Other variables that can be added to the list:
    # "pr", "huss", "evspsbl", "tran", "mrso", "lai", "gpp", "mrro"
    variable_id=["ps"],
)

### Step 2: Load Data Catalog 

In [None]:
# Known CMIP6 catalog locations, kept in a list so one can be picked by index
ncar_catalog = "https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json"
pangeo_catalog = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
dkrz_catalog = "/work/ik1017/Catalogs/dkrz_cmip6_disk.json"
catalog_paths = [
    ncar_catalog,    # index 0: NCAR Catalog
    pangeo_catalog,  # index 1: Pangeo Catalog
    dkrz_catalog,    # index 2: DKRZ Catalog
]

# Open one of the catalogs; index 2 (DKRZ) is used in this example.
# The DKRZ entry is a local filesystem path, so it presumably only resolves
# on a machine where that path exists.
cat_cmip6 = ld.open_catalog(catalog_paths[2])

# Narrow the catalog down to the entries matching the attributes from Step 1
selection = cat_cmip6.search(**attrs)

# Show the metadata table of the matching entries (rendered as the cell output)
selection.df

### Step 3: Load Data into a Dictionary

In [None]:
# Options forwarded to xarray when each catalog asset is opened.
# These were previously set through `dask.config.set(...)`, which only stores
# arbitrary keys in Dask's configuration; xarray never reads them, so the
# cftime decoding was silently not applied. intake-esm forwards
# `xarray_open_kwargs` to the xarray open call instead.
xarray_open_kwargs = {"use_cftime": True, "decode_times": True}

# NOTE(review): `consolidated=True` is a Zarr-only option. Add
# `"consolidated": True` to the dict above when using one of the Zarr-backed
# catalogs (index 0 or 1); it is omitted here because the DKRZ disk catalog
# presumably serves netCDF files, whose backend does not accept it.

# Load every selected asset into a dict of datasets, applying the project's
# pre-processing hook to each dataset as it is opened.
ds_dict = selection.to_dataset_dict(
    xarray_open_kwargs=xarray_open_kwargs,
    preprocess=ld.pre_preprocessing,
)

### Step 4: Preprocess Data

In [None]:
# Step 4.1: Names of coordinates/variables treated as redundant.
# One entry per line, in the original order.
drop_list = [
    "member_id",
    "type",
    "nbnd",
    "bnds",
    "height",
    "depth",
    "lat_bnds",
    "lon_bnds",
    "time_bnds",
    "time_bounds",
    "depth_bnds",
    "sdepth_bounds",
    "depth_bounds",
    "hist_interval",
    "axis_nbounds",
    "dcpp_init_year",
]

# Remove the names listed above from the loaded datasets
ds_dict = ld.drop_redundant(ds_dict, drop_list)

# Step 4.2: Combine datasets that share a `source_id` but come from
# different `table_id`s
ds_dict = ld.merge_source_id_data(ds_dict)

### Step 5: Define Output File Path

In [None]:
# Derive the temporal-resolution folder name from the requested `table_id`s:
# monthly tables take precedence, then daily, otherwise a generic fallback.
data_state = "raw"
requested_tables = attrs["table_id"]
if any(monthly in requested_tables for monthly in ("Amon", "Lmon")):
    temp_res = "month"
elif "day" in requested_tables:
    temp_res = "day"
else:
    temp_res = "unknown"  # no recognised table was requested

# Build the target directory: <BASE_DIR>/<state>/CMIP6/<experiment>/<resolution>/
data_path = f"{data_state}/CMIP6/{attrs['experiment_id']}/{temp_res}/"
file_path = os.path.join(BASE_DIR, data_path)
print(f"Saving files to: {file_path}")

### Step 6: Save Data

In [None]:
# Write the merged datasets from Step 4 to the directory built in Step 5.
# NOTE(review): per the original comment, `save_files` also removes any existing
# files at the target path — confirm in `save_data_as_nc` before re-running
# against a directory that holds data you want to keep.
sd.save_files(ds_dict, file_path)