This notebook downloads all the CMIP6 datasets needed.

In [2]:
from pathlib import Path
import pandas as pd
import xarray as xr

# Define constants
CATALOG_URL = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
# Directory that receives the NetCDF files. The previous literal was a garbled
# string; the recorded cell output shows files being written under "cmip6_datasets/".
OUTPUT_DIR = Path("cmip6_datasets")  # Use pathlib for better path handling

# Ensure the output directory exists (no error if it is already there)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the CMIP6 data catalog into a DataFrame
df = pd.read_csv(CATALOG_URL)

def download_cmip6_data(source_id, experiment_id, variable_id, member_id, table_id, output_path):
    """Filter the catalog for one dataset and save it locally as NetCDF.

    The module-level catalog ``df`` is filtered on the five identifier columns.
    If nothing matches, a message is printed and the function returns without
    writing anything. Otherwise the first matching row's ``zstore`` is opened
    with xarray and written to ``output_path``.

    Parameters
    ----------
    source_id, experiment_id, variable_id, member_id, table_id : str
        Values compared for equality against the catalog columns of the same name.
    output_path : path-like
        Destination passed to ``Dataset.to_netcdf``.

    Returns
    -------
    None
    """
    # Compare columns directly instead of interpolating the values into a query
    # string, so values containing quotes cannot break (or alter) the filter.
    matches = (
        (df["source_id"] == source_id)
        & (df["experiment_id"] == experiment_id)
        & (df["variable_id"] == variable_id)
        & (df["member_id"] == member_id)
        & (df["table_id"] == table_id)
    )
    df_filtered = df[matches]

    if df_filtered.empty:
        print(f"No data found for {experiment_id}")
        return

    zarr_url = df_filtered['zstore'].values[0]  # First match wins if several rows qualify

    # Use the dataset as a context manager so the store is closed after writing
    with xr.open_zarr(zarr_url, consolidated=True) as ds:
        # Save dataset as NetCDF
        ds.to_netcdf(output_path)
    print(f"Downloaded {experiment_id} data to {output_path}")

# Download datasets for SSP245 and SSP585
# Each model is paired with the ensemble member used for it; for every model
# both variables are fetched, and for every variable both scenarios.
for source_id, member_id in (
    ("UKESM1-0-LL", "r1i1p1f2"),
    ("CESM2-WACCM", "r1i1p1f1"),
    ("MIROC6", "r1i1p1f1"),
):
    for variable_id in ("tas", "pr"):
        for experiment_id in ("ssp245", "ssp585"):
            download_cmip6_data(
                source_id,
                experiment_id,
                variable_id,
                member_id,
                "Amon",
                OUTPUT_DIR / f"cmip6_{variable_id}_{experiment_id}_{source_id}.nc",
            )

Downloaded ssp245 data to cmip6_datasets/cmip6_tas_ssp245_UKESM1-0-LL.nc
Downloaded ssp585 data to cmip6_datasets/cmip6_tas_ssp585_UKESM1-0-LL.nc
Downloaded ssp245 data to cmip6_datasets/cmip6_pr_ssp245_UKESM1-0-LL.nc
Downloaded ssp585 data to cmip6_datasets/cmip6_pr_ssp585_UKESM1-0-LL.nc
Downloaded ssp245 data to cmip6_datasets/cmip6_tas_ssp245_CESM2-WACCM.nc
Downloaded ssp585 data to cmip6_datasets/cmip6_tas_ssp585_CESM2-WACCM.nc
Downloaded ssp245 data to cmip6_datasets/cmip6_pr_ssp245_CESM2-WACCM.nc


  ds = xr.open_zarr(zarr_url, consolidated=True)  # Open dataset


Downloaded ssp585 data to cmip6_datasets/cmip6_pr_ssp585_CESM2-WACCM.nc
Downloaded ssp245 data to cmip6_datasets/cmip6_tas_ssp245_MIROC6.nc
Downloaded ssp585 data to cmip6_datasets/cmip6_tas_ssp585_MIROC6.nc
Downloaded ssp245 data to cmip6_datasets/cmip6_pr_ssp245_MIROC6.nc
Downloaded ssp585 data to cmip6_datasets/cmip6_pr_ssp585_MIROC6.nc


In [10]:
# Load the CMIP6 catalog from Google Cloud Storage
df = pd.read_csv(
    'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
)

# Keep only the rows for UKESM1-0-LL / historical / tas / r1i1p1f2 / Amon
df_ta = df.query(
    "source_id == 'UKESM1-0-LL' & experiment_id == 'historical' "
    "& variable_id =='tas' & member_id == 'r1i1p1f2' & table_id == 'Amon'"
)


In [12]:
# Display the catalog's column labels
df.columns

Index(['activity_id', 'institution_id', 'source_id', 'experiment_id',
       'member_id', 'table_id', 'variable_id', 'grid_label', 'zstore',
       'dcpp_init_year', 'version'],
      dtype='object')

In [5]:
# Display the filtered catalog rows
df_ta

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
70736,CMIP,MOHC,UKESM1-0-LL,historical,r1i1p1f2,Amon,tas,gn,gs://cmip6/CMIP6/CMIP/MOHC/UKESM1-0-LL/histori...,,20190406


In [6]:
# Extract the URL of the dataset
if df_ta.empty:
    # Fail with a readable message instead of an IndexError on .values[0]
    raise ValueError("No catalog rows in df_ta; cannot open a dataset")
zarr_url = df_ta['zstore'].values[0]  # First match, assuming there is only one

# The zstore shown in the catalog output already begins with "gs://", so only
# add the scheme when it is missing; prepending unconditionally gives "gs://gs://...".
if not zarr_url.startswith("gs://"):
    zarr_url = f"gs://{zarr_url}"

# Open the dataset using xarray
ds = xr.open_zarr(zarr_url, consolidated=True)

# Print dataset info
print(ds)

<xarray.Dataset> Size: 219MB
Dimensions:    (lat: 144, bnds: 2, lon: 192, time: 1980)
Coordinates:
    height     float64 8B ...
  * lat        (lat) float64 1kB -89.38 -88.12 -86.88 ... 86.88 88.12 89.38
    lat_bnds   (lat, bnds) float64 2kB dask.array<chunksize=(144, 2), meta=np.ndarray>
  * lon        (lon) float64 2kB 0.9375 2.812 4.688 6.562 ... 355.3 357.2 359.1
    lon_bnds   (lon, bnds) float64 3kB dask.array<chunksize=(192, 2), meta=np.ndarray>
  * time       (time) object 16kB 1850-01-16 00:00:00 ... 2014-12-16 00:00:00
    time_bnds  (time, bnds) object 32kB dask.array<chunksize=(1980, 2), meta=np.ndarray>
Dimensions without coordinates: bnds
Data variables:
    tas        (time, lat, lon) float32 219MB dask.array<chunksize=(600, 144, 192), meta=np.ndarray>
Attributes: (12/47)
    Conventions:            CF-1.7 CMIP-6.2
    activity_id:            CMIP
    branch_method:          standard
    branch_time_in_child:   0.0
    branch_time_in_parent:  144000.0
    cmor_version:

In [3]:
from pathlib import Path
import pandas as pd
import xarray as xr
import logging
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Define constants
CATALOG_URL = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
OUTPUT_DIR = Path("datasets")  # Use pathlib for better path handling
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

# Read the CMIP6 data catalog
logging.info("Downloading CMIP6 catalog...")
df = pd.read_csv(CATALOG_URL)

# Convert the dataframe into a dictionary for fast lookups.
# Keys are (source_id, experiment_id, variable_id, member_id, table_id) tuples and
# values are the zstore URL. Built column-wise instead of with iterrows(), which
# creates a Series per row. When several rows share a key the last one wins,
# exactly as with the row-by-row dict comprehension.
lookup_columns = ['source_id', 'experiment_id', 'variable_id', 'member_id', 'table_id']
dataset_lookup = dict(
    zip(df[lookup_columns].itertuples(index=False, name=None), df['zstore'])
)

# List of datasets to download: for each model (with its ensemble member),
# both variables, and for each variable both scenarios.
model_members = {
    "UKESM1-0-LL": "r1i1p1f2",
    "CESM2-WACCM": "r1i1p1f1",
    "MIROC6": "r1i1p1f1",
}
datasets = [
    # Key order matters to consumers that unpack dataset.values() positionally
    {"source": source, "experiment": experiment, "variable": variable, "member": member, "table": "Amon"}
    for source, member in model_members.items()
    for variable in ("tas", "pr")
    for experiment in ("ssp245", "ssp585")
]

def download_cmip6_data(dataset):
    """Download one CMIP6 dataset and save it as NetCDF under ``OUTPUT_DIR``.

    Parameters
    ----------
    dataset : dict
        Mapping with the keys ``"source"``, ``"experiment"``, ``"variable"``,
        ``"member"`` and ``"table"``; together they form the key looked up in
        the module-level ``dataset_lookup``.

    Returns
    -------
    None
        Nothing is returned. An existing output file or a missing catalog entry
        is logged and skipped; any exception is logged as an error rather than
        raised, so one failed dataset does not stop the others.
    """
    try:
        # Read fields by name so the result does not depend on dict insertion order
        source = dataset["source"]
        experiment = dataset["experiment"]
        variable = dataset["variable"]
        member = dataset["member"]
        table = dataset["table"]
        output_path = OUTPUT_DIR / f"cmip6_{variable}_{experiment}_{source}.nc"

        # Skip download if the file already exists
        if output_path.exists():
            logging.info(f"File already exists, skipping: {output_path}")
            return

        # Get dataset URL
        dataset_key = (source, experiment, variable, member, table)
        zarr_url = dataset_lookup.get(dataset_key)

        if not zarr_url:
            logging.warning(f"No data found for {dataset_key}")
            return

        logging.info(f"Downloading {experiment} {variable} data for {source}...")

        # Write to a sibling temporary file and rename it only after a complete
        # write. Otherwise an interrupted write leaves a truncated file that the
        # exists() check above would treat as finished on the next run.
        partial_path = output_path.with_name(f"{output_path.stem}.partial{output_path.suffix}")
        try:
            # Context manager closes the store once the data has been written
            with xr.open_zarr(zarr_url, consolidated=True) as ds:
                ds.to_netcdf(partial_path)
            partial_path.replace(output_path)
        finally:
            # No-op after a successful rename; removes the leftover after a failure
            partial_path.unlink(missing_ok=True)
        logging.info(f"Saved dataset to {output_path}")

    except Exception as e:
        logging.error(f"Error downloading {dataset}: {e}")

# Use ThreadPoolExecutor to speed up downloads
# Every dataset is queued up front; leaving the with-block waits for all
# queued downloads to finish. Return values are not inspected.
with ThreadPoolExecutor(max_workers=4) as executor:
    for dataset in datasets:
        executor.submit(download_cmip6_data, dataset)

logging.info("All downloads complete.")

# I also need to download historical data for each model and each variable. Can you modify the code above to do this?

2025-04-04 12:17:14,128 - INFO - Downloading CMIP6 catalog...
2025-04-04 12:18:13,811 - INFO - Downloading ssp245 tas data for UKESM1-0-LL...
2025-04-04 12:18:13,811 - INFO - Downloading ssp585 tas data for UKESM1-0-LL...
2025-04-04 12:18:13,811 - INFO - Downloading ssp245 pr data for UKESM1-0-LL...
2025-04-04 12:18:13,812 - INFO - Downloading ssp585 pr data for UKESM1-0-LL...
2025-04-04 12:18:16,905 - INFO - Saved dataset to datasets/cmip6_tas_ssp585_UKESM1-0-LL.nc
2025-04-04 12:18:16,907 - INFO - Downloading ssp245 tas data for CESM2-WACCM...
2025-04-04 12:18:17,094 - INFO - Saved dataset to datasets/cmip6_tas_ssp245_UKESM1-0-LL.nc
2025-04-04 12:18:17,096 - INFO - Downloading ssp585 tas data for CESM2-WACCM...
2025-04-04 12:18:17,327 - INFO - Saved dataset to datasets/cmip6_pr_ssp585_UKESM1-0-LL.nc
2025-04-04 12:18:17,329 - INFO - Downloading ssp245 pr data for CESM2-WACCM...
2025-04-04 12:18:17,505 - INFO - Saved dataset to datasets/cmip6_pr_ssp245_UKESM1-0-LL.nc
2025-04-04 12:18:17

In [8]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# Analysis parameters
# Region name -> latitude range
LOCATIONS = {"California": (32.5, 42)}
# Selected CMIP6 model
MODELS = ["CESM2-WACCM"]
# Moderate & high emission scenarios
SSP_SCENARIOS = ["ssp245", "ssp585"]
# Temperature & precipitation
VARIABLES = ["tas", "pr"]
# Future projections
YEARS = slice(2015, 2100)

# Regional extraction helper
def extract_cmip6_data(dataset, var, lat_range, lon_range):
    """Average one variable over a latitude/longitude box and return it as a table.

    ``lat_range`` and ``lon_range`` are unpacked into ``slice`` objects and used
    to select along ``lat`` and ``lon``. The selection is reduced with a plain
    mean over both of those dimensions, converted to a DataFrame, and its index
    is reset into ordinary columns.
    """
    lat_window = slice(*lat_range)
    lon_window = slice(*lon_range)
    regional = dataset[var].sel(lat=lat_window, lon=lon_window)
    regional_mean = regional.mean(dim=["lat", "lon"])
    return regional_mean.to_dataframe().reset_index()

# Wildfire risk index
def compute_fire_risk(temp, precip):
    """Return a simple fire risk index from temperature and precipitation.

    Both inputs are turned into anomalies by subtracting their own mean; the
    index is the temperature anomaly divided by (precipitation anomaly + 1).
    Note that the denominator is zero where the precipitation anomaly equals -1
    and negative below that.
    """
    warm_departure = temp - temp.mean()
    wet_departure = precip - precip.mean()
    denominator = wet_departure + 1
    return warm_departure / denominator

# Wildfire risk map
def plot_wildfire_risk_map(data, scenario, year_range):
    """Draw a scatter map of fire risk over California and show it.

    ``data`` must provide ``'lon'``, ``'lat'`` and ``'fire_risk'`` entries;
    ``scenario`` and ``year_range`` are only used in the figure title.
    """
    plt.figure(figsize=(10, 6))
    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.set_extent([-125, -113, 32.5, 42])  # California boundaries

    # Map decorations, added in the same order as before, each with its line style
    decorations = (
        (cfeature.BORDERS, {"linestyle": ':'}),
        (cfeature.COASTLINE, {}),
        (cfeature.STATES, {"linestyle": '--'}),
    )
    for feature, style in decorations:
        ax.add_feature(feature, **style)

    # One marker per row, coloured by the risk value
    sc = ax.scatter(data['lon'], data['lat'], c=data['fire_risk'], cmap='Reds', edgecolors='k')
    plt.colorbar(sc, label="Fire Risk Index")
    ax.set_title(f"Wildfire Risk Projection ({scenario}, {year_range})")
    plt.show()

# Example: Load a dataset (mocked, as real data requires access)
def load_sample_data(seed=None):
    """Build synthetic stand-in data shaped like a small regional CMIP6 extract.

    Parameters
    ----------
    seed : int or None, optional
        Seed for the random generator. ``None`` (the default) gives different
        values on every call; pass an integer for reproducible output.

    Returns
    -------
    tuple
        ``(lat, lon, time, temp_data, pr_data)``: 10 latitudes from 32.5 to 42,
        10 longitudes from -125 to -113, year-end timestamps from 2015 through
        2100, and two normally distributed arrays of shape
        ``(len(lat), len(lon), len(time))`` (mean 25 / std 3 and mean 50 / std 10).
    """
    lat = np.linspace(32.5, 42, 10)
    lon = np.linspace(-125, -113, 10)
    # Use the YearEnd offset object rather than the "A" alias, which newer
    # pandas versions report as deprecated; the generated dates are the same.
    time = pd.date_range("2015-01-01", "2100-12-31", freq=pd.offsets.YearEnd())

    rng = np.random.default_rng(seed)
    shape = (len(lat), len(lon), len(time))
    temp_data = rng.normal(25, 3, shape)  # Fake temperature data
    pr_data = rng.normal(50, 10, shape)  # Fake precipitation data
    return lat, lon, time, temp_data, pr_data

# Main script
if __name__ == "__main__":
    lat, lon, time, temp_data, pr_data = load_sample_data()

    # Collapse the two leading (lat, lon) axes so one value per time step remains
    spatial_axes = (0, 1)
    mean_temp = temp_data.mean(axis=spatial_axes)
    mean_precip = pr_data.mean(axis=spatial_axes)
    fire_risk = compute_fire_risk(mean_temp, mean_precip)

    # Create dataframe
    df = pd.DataFrame({"time": time, "fire_risk": fire_risk})
    print(df)
    # Disabled: df only has 'time' and 'fire_risk', not the 'lon'/'lat' entries the plot reads
    #plot_wildfire_risk_map(df, "ssp245", "2041-2060")


         time  fire_risk
0  2015-12-31   0.129392
1  2016-12-31   0.016361
2  2017-12-31  -0.113677
3  2018-12-31   0.100093
4  2019-12-31   0.096342
..        ...        ...
81 2096-12-31  -0.051751
82 2097-12-31   0.142006
83 2098-12-31  -0.159013
84 2099-12-31   0.037936
85 2100-12-31  -0.672775

[86 rows x 2 columns]


  time = pd.date_range("2015-01-01", "2100-12-31", freq="A")


In [13]:
from pathlib import Path
import pandas as pd
import xarray as xr
import logging
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Define constants
CATALOG_URL = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
OUTPUT_DIR = Path("datasets")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the CMIP6 data catalog
logging.info("Downloading CMIP6 catalog...")
df = pd.read_csv(CATALOG_URL)

# Convert catalog to dictionary for faster lookups.
# Keys are (source_id, experiment_id, variable_id, member_id, table_id) tuples and
# values are the zstore URL. Built column-wise instead of with iterrows(), which
# creates a Series per row. When several rows share a key the last one wins,
# exactly as with the row-by-row dict comprehension.
lookup_columns = ['source_id', 'experiment_id', 'variable_id', 'member_id', 'table_id']
dataset_lookup = dict(
    zip(df[lookup_columns].itertuples(index=False, name=None), df['zstore'])
)

# Define model configurations: model name -> ensemble member to download
model_configs = {
    "UKESM1-0-LL": "r1i1p1f2",
    "CESM2-WACCM": "r1i1p1f1",
    "MIROC6": "r1i1p1f1"
}
experiments = ["historical", "ssp245", "ssp585"]
variables = ["tas", "pr"]
table_id = "Amon"

# Generate full list of datasets: every model x experiment x variable combination
datasets = [
    # Key order matters to consumers that unpack dataset.values() positionally
    {
        "source": model,
        "experiment": exp,
        "variable": var,
        "member": member,
        "table": table_id
    }
    for model, member in model_configs.items()
    for exp in experiments
    for var in variables
]

def download_cmip6_data(dataset):
    """Download one CMIP6 dataset and save it as NetCDF under ``OUTPUT_DIR``.

    Parameters
    ----------
    dataset : dict
        Mapping with the keys ``"source"``, ``"experiment"``, ``"variable"``,
        ``"member"`` and ``"table"``; together they form the key looked up in
        the module-level ``dataset_lookup``.

    Returns
    -------
    None
        Nothing is returned. An existing output file or a missing catalog entry
        is logged and skipped; any exception is logged as an error rather than
        raised, so one failed dataset does not stop the others.
    """
    try:
        # Read fields by name so the result does not depend on dict insertion order
        source = dataset["source"]
        experiment = dataset["experiment"]
        variable = dataset["variable"]
        member = dataset["member"]
        table = dataset["table"]
        output_path = OUTPUT_DIR / f"cmip6_{variable}_{experiment}_{source}.nc"

        if output_path.exists():
            logging.info(f"File already exists, skipping: {output_path}")
            return

        dataset_key = (source, experiment, variable, member, table)
        zarr_url = dataset_lookup.get(dataset_key)

        if not zarr_url:
            logging.warning(f"No data found for {dataset_key}")
            return

        logging.info(f"Downloading {experiment} {variable} data for {source}...")

        # Write to a sibling temporary file and rename it only after a complete
        # write. Otherwise an interrupted write leaves a truncated file that the
        # exists() check above would treat as finished on the next run.
        partial_path = output_path.with_name(f"{output_path.stem}.partial{output_path.suffix}")
        try:
            # Context manager closes the store once the data has been written
            with xr.open_zarr(zarr_url, consolidated=True) as ds:
                ds.to_netcdf(partial_path)
            partial_path.replace(output_path)
        finally:
            # No-op after a successful rename; removes the leftover after a failure
            partial_path.unlink(missing_ok=True)
        logging.info(f"Saved dataset to {output_path}")

    except Exception as e:
        logging.error(f"Error downloading {dataset}: {e}")

# Parallel download
# Every dataset is queued up front; leaving the with-block waits for all
# queued downloads to finish. Return values are not inspected.
with ThreadPoolExecutor(max_workers=4) as executor:
    for dataset in datasets:
        executor.submit(download_cmip6_data, dataset)

logging.info("All downloads complete.")

2025-04-04 17:58:39,315 - INFO - Downloading CMIP6 catalog...
2025-04-04 17:59:38,993 - INFO - Downloading historical tas data for UKESM1-0-LL...
2025-04-04 17:59:38,994 - INFO - File already exists, skipping: datasets/cmip6_tas_ssp245_UKESM1-0-LL.nc
2025-04-04 17:59:38,994 - INFO - File already exists, skipping: datasets/cmip6_pr_ssp245_UKESM1-0-LL.nc
2025-04-04 17:59:38,994 - INFO - Downloading historical pr data for UKESM1-0-LL...
2025-04-04 17:59:38,998 - INFO - File already exists, skipping: datasets/cmip6_tas_ssp585_UKESM1-0-LL.nc
2025-04-04 17:59:38,999 - INFO - File already exists, skipping: datasets/cmip6_pr_ssp585_UKESM1-0-LL.nc
2025-04-04 17:59:39,000 - INFO - Downloading historical tas data for CESM2-WACCM...
2025-04-04 17:59:39,000 - INFO - Downloading historical pr data for CESM2-WACCM...
2025-04-04 17:59:44,065 - INFO - Saved dataset to datasets/cmip6_tas_historical_UKESM1-0-LL.nc
2025-04-04 17:59:44,070 - INFO - File already exists, skipping: datasets/cmip6_tas_ssp245_C