# Dataset for repository

In [None]:
import os
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import shapely.geometry

from oggm import utils
from glob import glob
from tqdm.notebook import tqdm

# Work from the project root: the relative paths used further down
# ("GIS South/...", "MS2 Results/...") are resolved against this directory.
os.chdir('/home/rooda/Dropbox/Patagonia')

In [None]:
# basins: load the polygons, label them, keep the published attributes and export
basins = gpd.read_file("GIS South/Basins_Patagonia_ice.shp").set_index("ID")

# Names for the basins whose basin_area exceeds 5000.
# NOTE(review): the list is assigned positionally, so it presumably follows the
# row order of the selected basins and must contain exactly as many entries as
# there are selected rows -- confirm against the shapefile.
names = ["Yelcho", "Baker", "Santa Cruz", "Palena", "Grey", "Puelo", "Cisnes", "Aysen", "Pascua"]
is_large = basins.basin_area > 5000
basins.loc[is_large, "Name"] = names

# Numeric zone codes -> short zone labels
zone_labels = {1:'PPY', 2:'PCA', 3:'NPI-E', 4:'NPI-W', 5:'SPI-N', 6:'SPI-C', 7:'SPI-S', 8:'GCN', 9:'CDI'}
basins = basins.replace({"Zone": zone_labels})

# Keep only the exported attributes and give them their final names
basins = basins[["Name", "Zone", "basin_area", "geometry"]].rename(
    columns = {"Name": "basin_name", "Zone": "basin_zone"})
basins.index.name='basin_id'

basins.to_file("MS2 Results/zenodo/basins_boundaries.shp")

In [None]:
# add area, volume and number of glaciers (per basin, for both inventories)
RGI6 = gpd.read_file("GIS South/Glaciers/RGI6_v2.shp")
RGI7 = gpd.read_file("GIS South/Glaciers/RGI7_v2.shp")

# The number of glaciers is the size of each basin group. Summing the
# "O2Region" code (as done previously) only equals a count when that column
# holds 1 for every glacier; the group size is correct in every case.
RGI6_groups = RGI6.groupby("ID_basin")
RGI6_sum = RGI6_groups[["area_km2", "vol_F19", "vol_M22"]].sum()
RGI6_sum = RGI6_sum.rename(columns = {"area_km2": "area_RGI6"})
RGI6_sum.insert(0, "n_RGI6", RGI6_groups.size())

RGI7_groups = RGI7.groupby("ID_basin")
RGI7_sum = RGI7_groups[["area_km2"]].sum()
RGI7_sum = RGI7_sum.rename(columns = {"area_km2": "area_RGI7"})
RGI7_sum.insert(0, "n_RGI7", RGI7_groups.size())

# Align on the basin ID (index) and append the glacier statistics as new columns
basins = pd.concat([basins, RGI6_sum, RGI7_sum], axis=1)

# fill with zeros: basins without any glacier have no entry in the grouped tables
fillc = ["n_RGI6", "area_RGI6", "vol_F19", "vol_M22", "n_RGI7", "area_RGI7"]
basins[fillc] = basins[fillc].fillna(0)

In [None]:
# reference climate: PP_* and T2M_* columns of the four products, indexed by basin ID
reference_vars = ['PP_PMET', 'PP_ERA5','PP_CR2MET', 'PP_MSWEP', 'T2M_PMET', 'T2M_ERA5', 'T2M_CR2MET', 'T2M_MSWEP']
climate_ref = pd.read_csv("MS2 Results/dataset_climate_ref.csv").set_index("ID")[reference_vars]

# Append the reference climate to the basin table (aligned on the index)
basins = pd.concat([basins, climate_ref], axis=1)

# Export of the historical dataset, currently disabled. NOTE(review): as written
# it would rebind `basins` to the None returned by to_csv and break the cells below.
#basins = basins.drop(columns = "geometry").to_csv("MS2 Results/zenodo/dataset_historical.csv",  index_label='basin_id')

In [None]:
# future climate: projected changes per SSP, indexed by basin ID
# Source column -> published column (also defines which columns are kept, and their order)
ssp_columns = {
    'PP_change_126': 'PPc_ssp126', 'PP_change_245': 'PPc_ssp245', 'PP_change_370': 'PPc_ssp370', 'PP_change_585': 'PPc_ssp585', 
    'T2M_change_126': 'T2Mc_ssp126','T2M_change_245': 'T2Mc_ssp245','T2M_change_370': 'T2Mc_ssp370','T2M_change_585': 'T2Mc_ssp585'}

climate_fut = pd.read_csv("MS2 Results/dataset_climate_future.csv").set_index("ID")
climate_fut = climate_fut[list(ssp_columns)].rename(columns = ssp_columns)

# Append the future climate to the basin table (aligned on the index)
basins = pd.concat([basins, climate_fut], axis=1)

In [None]:
# volume change
# Lookup table glacier ID -> basin ID, restricted to glaciers with area_km2 > 1

# RGI6: the shapefile already carries the "RGIId" column
RGI6_ids = gpd.read_file("/home/rooda/Dropbox/Patagonia/GIS South/Glaciers/RGI6_v2.shp")
rgi6_large = RGI6_ids.area_km2 > 1
RGI6_ids = RGI6_ids[rgi6_large][["RGIId", "ID_basin"]]

# RGI7: passed through OGGM's cook_rgidf before selecting "RGIId".
# NOTE(review): presumably this is what generates the RGIId values matching the
# OGGM outputs for this inventory -- confirm against the OGGM documentation.
RGI7_ids = gpd.read_file("/home/rooda/Dropbox/Patagonia/GIS South/Glaciers/RGI7_v2.shp")
rgi7_large = RGI7_ids.area_km2 > 1
RGI7_ids = RGI7_ids[rgi7_large]
RGI7_ids = utils.cook_rgidf(
    RGI7_ids,
    o1_region='17',
    o2_region='02',
    bgndate= RGI7_ids.src_date,
    version = "70",
    assign_column_values= {'ID_basin' : 'ID_basin'},
)
RGI7_ids = RGI7_ids[["RGIId", "ID_basin"]]

# merge both datasets into one table indexed by glacier ID
ids = pd.concat([RGI6_ids, RGI7_ids]).set_index("RGIId")

def preprocess(ds):
    """Return only the ``volume`` variable of *ds*.

    The calendar/hydrological bookkeeping variables are dropped first, so they
    do not travel along with the returned object.
    """
    unused = ['hydro_year', 'hydro_month', 'calendar_year', 'calendar_month']
    trimmed = ds.drop_vars(unused)
    return trimmed['volume']

# Relative glacier volume loss (2015 -> 2100) per basin and SSP, averaged over
# all model runs found below the results folder.
gdirs = glob("/home/rooda/OGGM_results/new/*", recursive = True)

ds    = []
for gdir in tqdm(gdirs):

    # read historical run: glacier volume in 2015. The values are loaded once
    # here (and the files closed) instead of being recomputed for every future run.
    # NOTE(review): this pattern is "run_outputs_*" while the future runs below
    # use "run_output_*ssp*" -- confirm both prefixes exist on disk.
    with xr.open_mfdataset(gdir + "/run_outputs_*.nc", preprocess = preprocess) as hist_files:
        model_hist = hist_files.sel(time=2015).volume.load() # check NAs

    paths = glob(gdir + "/run_output_*ssp*.nc", recursive = True)
    for path in tqdm(paths, leave = False):

        # read future run (volume in 2100) and concatenate with the 2015 state;
        # the file is closed as soon as the values are in memory
        with xr.open_dataset(path) as future_file:
            model_future = preprocess(future_file).sel(time=2100)
            model   = xr.concat([model_hist, model_future], dim = "time").load()

        # add basin ID to each glacier ID (RGI_ID). The lookup is done by label,
        # in the order of model.rgi_id, so every glacier gets its own basin even
        # when `ids` is sorted differently from the model output. A glacier that
        # is missing from `ids` raises a KeyError.
        basin_ids = ids.loc[model.rgi_id.values.tolist(), "ID_basin"]
        model = model.assign_coords(rgi_id = basin_ids.tolist())

        # total volume per basin, then the fraction lost between 2015 and 2100
        model = model.groupby('rgi_id').sum()
        model = 1 - (model.sel(time = 2100) / model.sel(time = 2015))

        # ID of the setup: the fourth "_"-separated token of the file name is the SSP
        experiment_id = pd.Series(data = {'SSP':     os.path.basename(path).split("_")[3]})
        ds_model = pd.DataFrame(pd.concat([experiment_id, model.to_pandas()]), columns=['mass_loss']).transpose()
        ds.append(ds_model)

# one row per run -> mean over the runs of each SSP -> basins as rows, SSPs as columns
ds = pd.concat(ds)
ds = ds.groupby("SSP").mean()
ds = ds.transpose()
ds = ds.rename(columns = {'ssp126': 'mass_loss_ssp126', 
                    'ssp245': 'mass_loss_ssp245',
                    'ssp370': 'mass_loss_ssp370', 
                    'ssp585': 'mass_loss_ssp585'})

# Append the mass loss to the basin table (aligned on the basin ID)
basins = pd.concat([basins, ds], axis=1)

In [None]:
# Future dataset: keep the per-SSP PPc_*, T2Mc_* and mass_loss_* columns
basins = basins[['PPc_ssp126', 'PPc_ssp245', 'PPc_ssp370', 'PPc_ssp585', 
                  'T2Mc_ssp126','T2Mc_ssp245', 'T2Mc_ssp370', 'T2Mc_ssp585',
                  'mass_loss_ssp126', 'mass_loss_ssp245', 'mass_loss_ssp370', 'mass_loss_ssp585']]

# The selection above already leaves "geometry" out, so dropping it again would
# raise a KeyError; the table is written as is. The result of to_csv (None) is
# deliberately not assigned back to `basins`.
basins.to_csv("MS2 Results/zenodo/dataset_future.csv",  index_label='basin_id')

In [None]:
# glacio-hydrological signature
# Columns describing the model setup; the same six are present in both input tables
sources = ["Outline", "Climate", "Volume", "GCM", "SSP", "BCM"]

# Mean of every (metric, variable) pair over all setups; basins end up as rows
metrics = (
    pd.read_csv("MS2 Results/dataset_hydro_signatures.csv")
    .drop(columns = sources)
    .rename(columns = {"Unnamed: 0": "variable", "Variable": "metric"})
    .groupby(["metric", "variable"])
    .mean()
    .transpose()
    .droplevel(0, axis=1)
)
# The first ten names are reused for both prefixes.
# NOTE(review): this assumes 20 columns, i.e. the same ten names under each of
# the two first-level groups, in the same order -- confirm against the CSV.
signature_names = metrics.columns[0:10].values
metrics.columns = np.concatenate(("mg_" + signature_names, "tr_" + signature_names), axis=0)
metrics.index = metrics.index.astype("int64")

# Most important setup column (largest value among `sources`) per basin and signature
metrics_su = pd.read_csv("MS2 Results/feature_importance_rmse.csv", index_col = 0)
metrics_su['Most_important'] = metrics_su[sources].idxmax(axis=1)
metrics_su = (
    metrics_su
    .drop(columns = sources)
    .pivot(columns = ["Variable", "Metric"], values = "Most_important")
    .droplevel(0, axis=1)
)
importance_names = metrics_su.columns[0:10].values
metrics_su.columns = np.concatenate(("SoU_mg_" + importance_names, "SoU_tr_" + importance_names), axis=0)

# Signatures and their main source side by side, one row per basin
metrics_hydro = pd.concat([metrics, metrics_su], axis=1)
metrics_hydro.to_csv("MS2 Results/zenodo/dataset_signatures.csv",  index_label='basin_id')