# Uncertainty analysis

In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold 

from tqdm.notebook import tqdm
import plotly.express as px
import numpy as np
import pandas as pd
import os

# Every relative file name used below (input CSVs, output CSV) is resolved against this directory.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this folder exists.
os.chdir("/home/rooda/Dropbox/Patagonia/MS2 Results/")
# columns selected from every input table and ordinal-encoded as model features
sources = [
    "Outline",
    "Climate",
    "Volume",
    "GGM",
    "SSP",
    "BCM",
]

# labels written to the "Metric" column of the results, one per input file
metrics = [
    "Timing",
    "Rate_change",
    "Duration",
    "Magnitude",
    "Frequency",
]

## Data 

In [2]:
# input tables, one per metric; position i here is labelled with metrics[i]
paths = [
    "data_basin_peak_water_year.csv",
    "data_basin_rate_change.csv",
    "data_basin_duration.csv",
    "data_basin_peak_water_value.csv",
    "data_basin_freq.csv",
]

In [3]:
# Permutation importance of every source column, per metric file and per basin column.
# For each input table one random forest is fitted per basin on the ordinal-encoded
# source columns; the mean permutation importance measured on a held-out 10 % split
# becomes one output row. All rows are written to RF_importance_basin.csv.
importance_rows = []

# metrics[i] labels paths[i]; pairing them removes the hard-coded range(0, 5)
for metric, path in tqdm(list(zip(metrics, paths))):

    # read file from datasets_signature.ipynb
    data = pd.read_csv(path, index_col = 0)

    # drop every column that contains a NaN
    # (original note: keeps only basins with at least one glacier)
    data = data.dropna(axis = 1)

    # categorical variables -> integer codes
    x = OrdinalEncoder().fit_transform(data[sources])

    # assumes the source columns are the first len(sources) columns of the table
    # and every remaining column is one basin - TODO confirm
    for basin in tqdm(data.columns[len(sources):]):

        # target values of the current basin
        # (`x` and `y` keep their names: the cross-validation cell further down reads them)
        y = data[basin].values
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=123)

        # random forest regression model
        rf = RandomForestRegressor(n_estimators = 500, random_state = 123, n_jobs = -1)
        rf.fit(x_train, y_train)

        # permutation importance, evaluated on the held-out split
        perm = permutation_importance(rf, x_test, y_test, random_state=123, n_jobs=-1,
                                      n_repeats=5, scoring = 'neg_root_mean_squared_error')

        # one plain record per (metric, basin) instead of a one-row DataFrame
        row = dict(zip(sources, perm.importances_mean))
        row["ID"] = basin
        row["Metric"] = metric
        importance_rows.append(row)

# build the table once; explicit columns keep the order (and the header when there are no rows)
complete_dataset = pd.DataFrame(importance_rows, columns = sources + ["ID", "Metric"])
complete_dataset.to_csv("RF_importance_basin.csv")

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

In [None]:
# 20-fold shuffled cross-validation of a random forest, scored with negative RMSE.
# NOTE(review): `x` and `y` are not defined in this cell; they are the values left over
# from the last iteration of the importance loop (last file, last basin column).
# Confirm that this is the intended data.
kf = KFold(
    n_splits = 20,
    shuffle = True,
    random_state = 123,
)
rf = RandomForestRegressor(
    n_estimators = 5000,
    min_samples_leaf = 0.12,
    random_state = 123,
)
cross_val_score(
    rf, x, y,
    cv = kf,
    scoring = "neg_root_mean_squared_error",
    n_jobs = -1,
)