In [2]:
# First steps for intake-ESGF-Catalog (https://intake-esgf.readthedocs.io/en/latest/quickstart.html)
from intake_esgf import ESGFCatalog
import numpy as np
import xarray as xr
import netCDF4


In [23]:
# Defined all functions used in this script

# Placeholder function to generate a sample of weights
def generate_sample_of_weights(num_models):
    """
    Generate a sample of weights that are non-negative and sum up to 1.
    Args:
    num_models (int): The number of models (and hence the number of weights to generate)
    Returns:
    numpy.ndarray: An array of weights that sum up to 1.
    """
    # Generate uniform samples which naturally fall between 0 and 1
    weights = np.random.uniform(0, 1, size=num_models)
    weights = weights **4
    # Normalize the weights so they sum up to 1
    normalized_weights = weights / np.sum(weights)
    return normalized_weights

# Placeholder for 'models' which would be an array of models.
# Here we assume each model is a function that predicts an output given an input.
# In the actual use case, these should be replaced with real predictive models.
def model_predictions(models_1, models_2, models_3, models_4, models_5, models_6, models_7, models_8, models_9, models_10, weights_norm):
    model_mean_weighted = (weights_norm[0]*models_1) + (weights_norm[1]*models_2) + (weights_norm[2]*models_3) + (weights_norm[3]* models_4) + (weights_norm[4]*models_5) + (weights_norm[5]*models_6) + (weights_norm[6]*models_7) + (weights_norm[7]*models_8) + (weights_norm[8]*models_9) + (weights_norm[9]*models_10)
    return model_mean_weighted

# function for shifiting lon according to Bharat's code
def shift_lon(model):
    # First shift the lon dimension for the model average
    ds_tmp = model.copy(deep=True)
    ds_tmp['lon'] = ds_tmp['lon'] - 180

    gpp_reshape = np.zeros(ds_tmp.shape)
    gpp_reshape[:,ds_tmp['lon'].size//2:] = ds_tmp[:,:ds_tmp['lon'].size//2].data
    gpp_reshape[:,:ds_tmp['lon'].size//2] = ds_tmp[:,ds_tmp['lon'].size//2:].data
    ds_tmp.data = gpp_reshape
    return ds_tmp 

# Create function that will do RMSE for each seperate model

def get_RMSE(model, obs):
    # Step 2: Compute the Difference
    difference = model - obs
    # Step 3: Square the Difference
    squared_difference = difference ** 2
    # Step 4: Compute the Mean Squared Error
    mse = squared_difference.mean(dim=['lat', 'lon'])
    # Step 5: Take the Square Root to get RMSE
    rmse  = np.sqrt(mse)
    rmse_weighted = rmse.values.item()

    return rmse_weighted

In [15]:
#Populate the Catalog - bringing in nothing from the catalog
cat = ESGFCatalog()
print(cat)  # <-- nothing to see here yet

Perform a search() to populate the catalog.


In [16]:
# Import selected models from Intake-ESGF Catalog for selected variable

models = ["ACCESS-ESM1-5","IPSL-CM6A-LR","CESM2", "UKESM1-0-LL","BCC-CSM2-MR","MPI-ESM1-2-HR","CanESM5","GFDL-ESM4","NorESM2-LM", "MIROC-ES2L"]

cat.search(
    experiment_id="historical",
    source_id= models,
    frequency="mon",
    variable_id=["gpp"],
)
cat.remove_ensembles()

   Searching indices:   0%|          |0/2 [       ?index/s]

Summary information for 10 results:
member_id                                      [r1i1p1f1, r1i1p1f2]
experiment_id                                          [historical]
variable_id                                                   [gpp]
mip_era                                                     [CMIP6]
table_id                                                     [Lmon]
institution_id    [NOAA-GFDL, NCC, IPSL, CSIRO, NCAR, BCC, MOHC,...
source_id         [GFDL-ESM4, NorESM2-LM, IPSL-CM6A-LR, ACCESS-E...
grid_label                                            [gr1, gn, gr]
activity_drs                                                 [CMIP]
project                                                     [CMIP6]
dtype: object

In [17]:
#Obtaining the datasets and loading it into a dictionary (putting it in the shopping cart)

dsd = cat.to_dataset_dict()

Get file information:   0%|          |0/2 [       ?index/s]

Adding cell measures:   0%|          |0/10 [     ?dataset/s]

In [18]:
#printing variable keys to see how variable names are set up
print(dsd.keys())

dict_keys(['r1i1p1f1.MPI-M.MPI-ESM1-2-HR.gn', 'r1i1p1f1.BCC.BCC-CSM2-MR.gn', 'r1i1p1f2.MIROC.MIROC-ES2L.gn', 'r1i1p1f1.IPSL.IPSL-CM6A-LR.gr', 'r1i1p1f1.NCC.NorESM2-LM.gn', 'r1i1p1f1.CCCma.CanESM5.gn', 'r1i1p1f2.MOHC.UKESM1-0-LL.gn', 'r1i1p1f1.NCAR.CESM2.gn', 'r1i1p1f1.NOAA-GFDL.GFDL-ESM4.gr1', 'r1i1p1f1.CSIRO.ACCESS-ESM1-5.gn'])


In [19]:
# Define models for selected time periods and fixed units (make sure that model names match the printed dict_key (above))

model_1 = dsd["r1i1p1f1.CSIRO.ACCESS-ESM1-5.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400 * 1000 # .mean(dim="time") * 86400 * 1000
model_2 = dsd["r1i1p1f1.BCC.BCC-CSM2-MR.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400  * 1000# .mean(dim="time") * 86400 * 1000
model_3 = dsd["r1i1p1f1.CCCma.CanESM5.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400  * 1000 # .mean(dim="time") * 86400 * 1000
model_4 = dsd["r1i1p1f1.NCAR.CESM2.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400  * 1000 # .mean(dim="time") * 86400 * 1000
model_5 = dsd["r1i1p1f1.NOAA-GFDL.GFDL-ESM4.gr1"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400  * 1000 # .mean(dim="time") * 86400 * 1000
model_6 = dsd["r1i1p1f1.IPSL.IPSL-CM6A-LR.gr"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400 * 1000 # .mean(dim="time") * 86400 * 1000
model_7 = dsd["r1i1p1f2.MIROC.MIROC-ES2L.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400 * 1000 # .mean(dim="time") * 86400 * 1000
model_8 = dsd["r1i1p1f1.MPI-M.MPI-ESM1-2-HR.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400 * 1000 # .mean(dim="time") * 86400 * 1000
model_9 = dsd["r1i1p1f1.NCC.NorESM2-LM.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400 * 1000 # .mean(dim="time") * 86400 * 1000
model_10 = dsd["r1i1p1f2.MOHC.UKESM1-0-LL.gn"]["gpp"].sel(time=slice('1980-01-01', '2013-12-01'))* 86400* 1000 # .mean(dim="time") * 86400 * 1000

print(model_10["time"])

<xarray.DataArray 'time' (time: 407)> Size: 3kB
array([cftime.Datetime360Day(1980, 1, 16, 0, 0, 0, 0, has_year_zero=True),
       cftime.Datetime360Day(1980, 2, 16, 0, 0, 0, 0, has_year_zero=True),
       cftime.Datetime360Day(1980, 3, 16, 0, 0, 0, 0, has_year_zero=True), ...,
       cftime.Datetime360Day(2013, 9, 16, 0, 0, 0, 0, has_year_zero=True),
       cftime.Datetime360Day(2013, 10, 16, 0, 0, 0, 0, has_year_zero=True),
       cftime.Datetime360Day(2013, 11, 16, 0, 0, 0, 0, has_year_zero=True)],
      dtype=object)
Coordinates:
  * time     (time) object 3kB 1980-01-16 00:00:00 ... 2013-11-16 00:00:00
    type     |S4 4B ...
Attributes:
    bounds:         time_bnds
    axis:           T
    long_name:      time
    standard_name:  time


In [20]:
# Linear Interpolation/Regridding to match lowest resolution model (CanESM5):

#Extract lat/lon grid from CanESM5
lat_target = model_3.lat.values
lon_target = model_3.lon.values

# Regrid each model and take long (not longtitude) term mean 
gpp_model_1_Regridded = model_1.interp(lat=lat_target, lon=lon_target)
gpp_model_1_Regridded_mean = gpp_model_1_Regridded.mean(dim="time")

gpp_model_2_Regridded = model_2.interp(lat=lat_target, lon=lon_target)
gpp_model_2_Regridded_mean = gpp_model_2_Regridded.mean(dim="time")

gpp_model_3_Regridded = model_3.interp(lat=lat_target, lon=lon_target)
gpp_model_3_Regridded_mean = gpp_model_3_Regridded.mean(dim="time")

gpp_model_4_Regridded = model_4.interp(lat=lat_target, lon=lon_target)
gpp_model_4_Regridded_mean = gpp_model_4_Regridded.mean(dim="time")

gpp_model_5_Regridded = model_5.interp(lat=lat_target, lon=lon_target)
gpp_model_5_Regridded_mean = gpp_model_5_Regridded.mean(dim="time")

gpp_model_6_Regridded = model_6.interp(lat=lat_target, lon=lon_target)
gpp_model_6_Regridded_mean = gpp_model_6_Regridded.mean(dim="time")

gpp_model_7_Regridded = model_7.interp(lat=lat_target, lon=lon_target)
gpp_model_7_Regridded_mean = gpp_model_7_Regridded.mean(dim="time")

gpp_model_8_Regridded = model_8.interp(lat=lat_target, lon=lon_target)
gpp_model_8_Regridded_mean = gpp_model_8_Regridded.mean(dim="time")

gpp_model_9_Regridded = model_9.interp(lat=lat_target, lon=lon_target)
gpp_model_9_Regridded_mean = gpp_model_9_Regridded.mean(dim="time")

gpp_model_10_Regridded = model_10.interp(lat=lat_target, lon=lon_target)
gpp_model_10_Regridded_mean = gpp_model_10_Regridded.mean(dim="time")

In [21]:
# Shift every model to proper coordinates
gpp_model_1_Regridded_mean_shifted = shift_lon(gpp_model_1_Regridded_mean)
gpp_model_2_Regridded_mean_shifted = shift_lon(gpp_model_2_Regridded_mean)
gpp_model_3_Regridded_mean_shifted = shift_lon(gpp_model_3_Regridded_mean)
gpp_model_4_Regridded_mean_shifted = shift_lon(gpp_model_4_Regridded_mean)
gpp_model_5_Regridded_mean_shifted = shift_lon(gpp_model_5_Regridded_mean)
gpp_model_6_Regridded_mean_shifted = shift_lon(gpp_model_6_Regridded_mean)
gpp_model_7_Regridded_mean_shifted = shift_lon(gpp_model_7_Regridded_mean)
gpp_model_8_Regridded_mean_shifted = shift_lon(gpp_model_8_Regridded_mean)
gpp_model_9_Regridded_mean_shifted = shift_lon(gpp_model_9_Regridded_mean)
gpp_model_10_Regridded_mean_shifted = shift_lon(gpp_model_10_Regridded_mean)

In [22]:
# Define the observation 

# Open the NetCDF file as an xarray Dataset
ds = xr.open_dataset('/Users/6i0/Documents/Data/gpp_WECANN.nc')

# Access the GPP variable from the Dataset
gpp_data = ds['gpp']
gpp_data_mean = gpp_data.mean(dim="time")
lat = ds['lat']
lon = ds['lon']
time = ds['time']

#gpp_data_Regridded_mean = gpp_data_mean

lat_target = model_3.lat.values
lon_target = model_3.lon.values -180

# Correctly using ds_emean for interpolation
gpp_data_Regridded = gpp_data.interp(lat=lat_target, lon=lon_target)
gpp_data_Regridded_mean = gpp_data_Regridded.mean(dim="time")

#Shift data
gpp_data_Regridded_mean_shifted = shift_lon(gpp_data_Regridded_mean)


In [25]:
# BMA Implementation:

# Placeholder for actual observed data (data_obs)
data_obs = gpp_data_Regridded_mean

# Number of models we are combining
num_models = 10 
# Number of BMA Samples 
n_samples= 100

# Initialize arrays to hold weights, weighted averages and RMSE
new_weights = np.zeros((n_samples, num_models))
weighted_avgs = np.zeros((n_samples, len(data_obs)))
rmse_weighted = np.zeros(n_samples)

# Loop to generate weights, calculate weighted averages and RMSE
for i in range(n_samples):
    weights = generate_sample_of_weights(num_models)
    new_weights[i, :] = weights
    # For this example, we assume that all models have a single input data point, for simplicity.
    # In a real scenario, the input to models would be the data they should predict.
    weighted_avgs = model_predictions(gpp_model_1_Regridded_mean_shifted, gpp_model_2_Regridded_mean_shifted, gpp_model_3_Regridded_mean_shifted, gpp_model_4_Regridded_mean_shifted, gpp_model_5_Regridded_mean_shifted, gpp_model_6_Regridded_mean_shifted, gpp_model_7_Regridded_mean_shifted, gpp_model_8_Regridded_mean_shifted, gpp_model_9_Regridded_mean_shifted, gpp_model_10_Regridded_mean_shifted, new_weights[i, :])
    
    #Estimate RMSE for this BMA samples
    rmse_weighted[i] = get_RMSE(weighted_avgs, data_obs) 

# Sorting RMSE to find the best one 
sorted_indices = np.argsort(rmse_weighted)
best_rmse = rmse_weighted[sorted_indices[0]]
best_location = sorted_indices[0]
best_weights = new_weights[best_location, :]

# Print to screen the best RMSE, Location of that sample, and the corresponding weights
best_rmse, best_location, best_weights


(0.6002035355286313,
 25,
 array([7.49133742e-02, 2.28896631e-02, 2.57622127e-04, 2.90522406e-05,
        1.39155674e-01, 4.21988054e-01, 1.27359052e-01, 1.98152371e-01,
        3.83363315e-03, 1.14215042e-02]))