## Data Sources and Import Statements

Data have been downloaded from the Earth System Grid Federation at https://esgf-node.ipsl.upmc.fr/projects/esgf-ipsl/.

Each file has been concatenated to contain ssp119 and ssp126 scenarios and r1-5 ensemble members from 2015 to 2100. Each has also been regridded to 2.5° resolution.

In [5]:
# IMPORT STATEMENTS

# General useful libraries
import math
import os
# Loading in data (netcdf files)
import h5py
# Handling data
import numpy as np
import netCDF4 as nc
# Installing xarray and its dependencies
import xarray as xr
import scipy 
import dask
import bottleneck
# Plotting figures
import matplotlib.pyplot as plt #Main plotting package
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
import cartopy.mpl.ticker as cticker

# Machine Learning package
import tensorflow as tf
tf.compat.v1.disable_v2_behavior() 
print(tf.__version__)

# Interpreting neural networks 
import shap

2.18.0


## Path & Files

In [6]:
base_path = os.getcwd()

data_path = base_path + '/data/'

filenames = [
    "CNRM_ESM2-1_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc",
    "MIROC6_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc",
    "MPI-ESM1-2-LR_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc",
    "MRI-ESM2-0_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc",
    "UKESM1-0-LL_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc",
]

files = [os.path.join(data_path, f) for f in filenames]

## Variable Index Map and Units

In [7]:
# Variable order is explicitly given by metadata:
# "tas, tasmax, tasmin, pr, psl, sfcWind, mrsos"
var_to_index = {
    "tas": 0,
    "tasmax": 1,
    "tasmin": 2,
    "pr": 3,
    "psl": 4,
    "sfcWind": 5,
    "mrsos": 6,
}

# Units are also given:
var_units = {
    "tas": "K",
    "tasmax": "K",
    "tasmin": "K",
    "pr": "kg/m2s",
    "psl": "Pa",
    "sfcWind": "m/s",
    "mrsos": "kg/m2",
}

## Time Handling (Monthly Index → Years)

In [8]:
def get_model_name(path: str) -> str:
    # Everything before "_ssp..."
    return os.path.basename(path).split("_ssp")[0]


def months_to_year_month(time_months: np.ndarray, start_year=2015, start_month=1):
    """
    File says time units are 'months' and it spans 2015-2100.
    This creates year + month arrays assuming the first index corresponds to Jan 2015.

    If my time axis is "month count since 2015-01", this is correct.
    If not, it still gives consistent indexing as long as the file starts at 2015-01.
    """
    # time_months is usually 0..1031 or 1..1032 depending on how the file was written but I handle either by shifting to 0-based.
    t = np.array(time_months, dtype=int)
    if t.min() == 1:
        t = t - 1

    # compute year/month
    year = start_year + (start_month - 1 + t) // 12
    month = (start_month - 1 + t) % 12 + 1
    return year, month


def time_mask_for_year_range(ds: nc.Dataset, start_year: int, end_year: int):
    """
    Create a mask over the monthly time axis using year bounds.
    """
    t = ds["time"][:]
    year, month = months_to_year_month(t, start_year=2015, start_month=1)
    return (year >= start_year) & (year <= end_year)

## Unit Conversions and Standardization

In [9]:
def convert_units(varname: str, x: np.ndarray) -> tuple[np.ndarray, str]:
    """
    Convert raw units into more interpretable and plottable units.
    - tas/tasmax/tasmin: K to C
    - pr: kg/m2s to mm/day  (1 kg/m2 = 1 mm water; multiply by 86400)
    - psl: Pa to hPa
    - sfcWind: keep m/s
    - mrsos: keep kg/m2 
    """
    if varname in {"tas", "tasmax", "tasmin"}:
        return x - 273.15, "°C"
    if varname == "pr":
        return x * 86400.0, "mm/day"
    if varname == "psl":
        return x / 100.0, "hPa"
    if varname == "sfcWind":
        return x, "m/s"
    if varname == "mrsos":
        return x, "kg/m²"
    return x, "unknown"

## Statistics Functions (Mean, Std, Median, Percentiles)

In [10]:
def compute_stat_over_time(x: np.ndarray, stat: str) -> np.ndarray:
    """
    x: (ens, time, lat, lon) after loading and swapping
    Returns: (ens, lat, lon) after aggregating over time
    Supported stats:
      - mean (default)
      - std
      - median
      - percentile_XX  (ex. percentile_95)
    """
    s = stat.lower().strip()

    if s == "mean":
        return np.nanmean(x, axis=1)
    if s == "std":
        return np.nanstd(x, axis=1)
    if s == "median":
        return np.nanmedian(x, axis=1)

    m = re.match(r"percentile[_\s-]?(\d+)", s)
    if m:
        p = float(m.group(1))
        return np.nanpercentile(x, p, axis=1)

    raise ValueError(f"Unknown stat '{stat}'. Use mean/std/median/percentile_XX.")

In [11]:
def manipulate_dataset(
    scenario: str,
    period: tuple[int, int] | None = (2015, 2100),
    region: tuple[int, int, int, int] | None = None,
    model_name: str | list[str] | None = None,
): 

    """
    Opens and manipulates given dataset(s) for a chosen scenario, time period, and region where the inputs are:
      scenario: "ssp119" or "ssp126"
      period: None (default), (start_year, end_year)
      region: entire globe (default), lon/lat range
      model_name: if not None, ignore multimodel and plot only that/those model(s)
    """

    # Opening dataset 
    ds = xr.open_dataset(model_name, engine="netcdf4")

    all_ts = []
    target_var = f"data_{scenario}"
    
    # Variable manipulation (Selecting from var_to_index)
    data_array = ds[target_var].isel(variable=var_to_index[target_var])

    # Subset time
    if period is not None:
        ya, yb = period
        data_array = data_array.sel(time=slice(str(ya), str(yb)))
    else:
        ya, yb = int(data_array.time[0]), int(data_array.time[-1])
        
    # Subset regions
    if region is not None:
        lat1, lat2, lon1, lon2 = region
        data_array = data_array.sel(lat=slice(lat1, lat2), lon=slice(lon1, lon2))
    
    # Cosine logic 
    cosl = np.cos(np.pi * data_array.lat / 180)
    
    # Spatial averaging
    ts = data_array.weighted(cosl).mean(dim=("lat", "lon"))
    all_ts.append(ts.compute())

    return ds, ya, yb, lat1, lat2, lon1, lon2

## Time Series Plot

Produces a plot of a time series from 2015 to 2100 for the selected variable, scenario, and model. Will show all 5 trajectories within a model. 

Inputs: 
- Variable
- Scenario
- Base period (2015-2100 default)
- Region (entire globe default, lon/lat range)
- Statistic
- Number models returned (all 5 default, single model name) 
- File name(s)


In [12]:
def time_series_plot(
    varname: str,
    scenario: str,
    period: tuple[int, int] | None = (2015, 2100),
    region: tuple[int, int, int, int] | None = None,
    stat: str = "mean",
    multimodel: bool = True,
    model_name: list[str] | None = None,
):
    """
    Produces a time series plot where the inputs are:
      varname: one of tas, tasmax, tasmin, pr, psl, sfcWind, mrsos
      scenario: "ssp119" or "ssp126"
      period: None (default), (start_year, end_year)
      region: entire globe (default), lon/lat range
      stat: mean (default), std, median, percentile_XX
      multimodel: True (average across all 5 models) (default)
      model_name: if not None, ignore multimodel and plot only that/those model(s)
    """

    # Variable check
    if varname not in var_to_index:
        raise ValueError(f"varname must be one of {list(var_to_index.keys())}")

    # If model name(s) given, filters to only available files
    if model_name:
        file_list = [m for m in model_name if model_name in files]
        if len(file_list) < len(model_name):
            raise ValueError(f"One or more models not found. Available: {files}.")
        if len(file_list) == 0:
            raise ValueError(f"No models found. Available: {files}.")

    # If multimodel (and no model names given)
    if multimodel:
        pass

    # If a model name or names given (and not multimodel)
    else:
        # Pull variables from manipulate_dataset function
        ds, ya, yb, lat1, lat2, lon1, lon2 = manipulate_dataset(scenario, period, region, model_name)
        
        # For i in the range of the number of model names given
        for i in range(len(file_list)):
            # For years ranging from ya to yb, plot ith element of the list of model names with color pink (C6) and alpha 0.3
            plt.plot(np.arange(ya,yb), file_list[i], 'C6', alpha=0.3)
            # And plot the chosen stat of all models together
            ##computed_stat = compute_stat_over_time(model_name[i], stat)
            #plt.plot(np.arange(ya,yb), computed_stat, 'C6',alpha=0.3)
            
        plt.xlabel('x')
        plt.ylabel('y')
        plt.axis('tight')
        plt.grid(color='0.8')
        plt.legend()
        plt.show()

        ds.close()


In [13]:
if __name__ == "__main__":

   # Example 1: Time-series of tas in ssp119 scenario
    time_series_plot(
        varname="tas",
        scenario="ssp119", 
        period=(2020, 2039), 
        region=(-80, 80, 100, 200),
        stat="mean",
        multimodel=False,
        model_name = files
    )


ValueError: One or more models not found. Available: ['/Users/Caroline/Desktop/school/MamalakisResearch/data/CNRM_ESM2-1_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc', '/Users/Caroline/Desktop/school/MamalakisResearch/data/MIROC6_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc', '/Users/Caroline/Desktop/school/MamalakisResearch/data/MPI-ESM1-2-LR_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc', '/Users/Caroline/Desktop/school/MamalakisResearch/data/MRI-ESM2-0_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc', '/Users/Caroline/Desktop/school/MamalakisResearch/data/UKESM1-0-LL_ssp119_ssp126_201501_210012_r1-5_2pt5degree.nc'].