This notebook filters and selects the required CMIP6 datasets based on specified models, variables, scenarios, and locations. It then downloads the relevant data and saves it locally in .nc (NetCDF) format for further analysis.

In [1]:
from pathlib import Path
import pandas as pd
import xarray as xr
import logging
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Define constants
CATALOG_URL = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
OUTPUT_DIR = Path("datasets")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the CMIP6 data catalog
logging.info("Downloading CMIP6 catalog...")
df = pd.read_csv(CATALOG_URL)

# Convert catalog to dictionary for faster lookups
dataset_lookup = {
    tuple(row[['source_id', 'experiment_id', 'variable_id', 'member_id', 'table_id']]): row['zstore']
    for _, row in df.iterrows()
}

# Define model configurations
model_configs = {
    "UKESM1-0-LL": "r1i1p1f2",
    "CESM2-WACCM": "r1i1p1f1",
    "MIROC6": "r1i1p1f1"
}
experiments = ["historical", "ssp245", "ssp585"]
variables = ["tas", "pr"]
table_id = "Amon"

# Generate full list of datasets
datasets = []
for model, member in model_configs.items():
    for exp in experiments:
        for var in variables:
            datasets.append({
                "source": model,
                "experiment": exp,
                "variable": var,
                "member": member,
                "table": table_id
            })

def download_cmip6_data(dataset):
    """Download CMIP6 data and save as NetCDF."""
    try:
        source, experiment, variable, member, table = dataset.values()
        output_path = OUTPUT_DIR / f"cmip6_{variable}_{experiment}_{source}.nc"
        
        if output_path.exists():
            logging.info(f"File already exists, skipping: {output_path}")
            return

        dataset_key = (source, experiment, variable, member, table)
        zarr_url = dataset_lookup.get(dataset_key)

        if not zarr_url:
            logging.warning(f"No data found for {dataset_key}")
            return

        logging.info(f"Downloading {experiment} {variable} data for {source}...")
        ds = xr.open_zarr(zarr_url, consolidated=True)
        ds.to_netcdf(output_path)
        logging.info(f"Saved dataset to {output_path}")

    except Exception as e:
        logging.error(f"Error downloading {dataset}: {e}")

# Parallel download
with ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(download_cmip6_data, datasets)

logging.info("All downloads complete.")

2025-04-09 23:07:06,518 - INFO - Downloading CMIP6 catalog...
2025-04-09 23:08:07,234 - INFO - Downloading historical tas data for UKESM1-0-LL...
2025-04-09 23:08:07,234 - INFO - Downloading historical pr data for UKESM1-0-LL...
2025-04-09 23:08:07,235 - INFO - Downloading ssp245 tas data for UKESM1-0-LL...
2025-04-09 23:08:07,236 - INFO - Downloading ssp245 pr data for UKESM1-0-LL...
HDF5-DIAG: Error detected in HDF5 (1.14.3) thread 1:
  #000: H5Adeprec.c line 140 in H5Acreate1(): unable to create attribute
    major: Attribute
    minor: Unable to initialize object
  #001: H5VLcallback.c line 1034 in H5VL_attr_create(): attribute create failed
    major: Virtual Object Layer
    minor: Unable to create file
  #002: H5VLcallback.c line 1001 in H5VL__attr_create(): attribute create failed
    major: Virtual Object Layer
    minor: Unable to create file
  #003: H5VLnative_attr.c line 110 in H5VL__native_attr_create(): unable to create attribute
    major: Attribute
    minor: Unable to 