# Cross-validation: Hydrological modelling using LSTMs

In [None]:
# general
from tqdm.notebook import tqdm
from pathlib import Path
from glob import glob
import pandas as pd
import shutil
import yaml 
import pickle
import os

# geospatial
import geopandas as gpd

# hydrology
from neuralhydrology.nh_run import start_run
from neuralhydrology.nh_run import eval_run

import torch
# NOTE(review): the return value of this call is discarded here, so the
# statement has no visible effect -- confirm whether it was meant to be shown.
torch.cuda.is_available()

# relative paths used in this file (data/, modelling/, results/) resolve against this directory
os.chdir('/home/rooda/OneDrive/Projects/DeepHydro/')
# base directory used below for the model input data (/data) and the run outputs (/runs)
path_disk = "/home/rooda/Pipeline/DeepHydro/NEURAL"

In [None]:
# basin table; the cells below read its gauge_id, kfold_pub_test and kfold_pur_test columns
selection  = gpd.read_file("data/GIS/Basins_PMETobs_points_subset.gpkg")

In [None]:
def pickle_to_ts(pickle_path):
    """Read a pickled results file and return one simulated-runoff column per basin.

    The pickle is expected to hold a mapping ``basin -> {'1D': {'xr': dataset}}``.
    For every basin the first ``time_step`` entry of the dataset is selected,
    the ``PMET_q_mm_sim`` variable is taken and values below 0 are clipped to 0.

    Parameters
    ----------
    pickle_path
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        One column per basin key of the pickle, in the pickle's key order.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open(pickle_path, 'rb') as handle:
        results = pickle.load(handle)

    series_per_basin = []
    for basin_id in tqdm(results.keys(), leave = False):
        # keep only the first time_step and drop the now-scalar coordinate
        first_step = results[basin_id]['1D']['xr'].isel(time_step=0).drop_vars('time_step')
        simulated = first_step.PMET_q_mm_sim.clip(0)
        series_per_basin.append(simulated.to_pandas())

    table = pd.concat(series_per_basin, axis = 1)
    table.columns = results.keys()
    return table

# Scenario: OGGM on 

## PUB

In [None]:
# 10-fold cross-validation over the `kfold_pub_test` folds, with
# PMET_glacier_melt_mm included among the dynamic inputs ("OGGM on").
# For every fold: write the basin lists, update the run configuration, train,
# evaluate the held-out fold, and collect its metrics and simulated runoff.

# remove previous runs so the run-directory lookup below only sees fresh runs
for old_run in glob(path_disk + '/runs/historical_PMET_OGGM_on_CV_PUB*'):
    shutil.rmtree(old_run)

metrics = []
df_q = []

for kfold in range(1, 11):

    # generate sets: train on all other folds, validate/test on the held-out fold
    # NOTE(review): the validation and test lists are identical -- confirm intended
    in_fold     = selection.kfold_pub_test == kfold
    not_in_fold = selection.kfold_pub_test != kfold
    basin_files = {
        "modelling/basins_id_train.txt":      selection.gauge_id[not_in_fold],
        "modelling/basins_id_validation.txt": selection.gauge_id[in_fold],
        "modelling/basins_id_test.txt":       selection.gauge_id[in_fold],
    }
    for list_path, basins in basin_files.items():
        with open(list_path, "w") as file:
            file.writelines(basin + "\n" for basin in basins)

    # the setup file is read, modified and written back in place on every fold
    with open('modelling/Hydro_NH_setup.yml') as stream:
        data = yaml.safe_load(stream)

    # experiment name + data
    exp_name = "historical_PMET_OGGM_on_CV_PUB_{}".format(str(kfold).zfill(2))
    data['experiment_name'] = exp_name

    data['data_dir']        = path_disk + "/data/historical_PMET"
    data['dynamic_inputs']  = ["PMET_precip_full_mm", "PMET_tmax_full_degC", "PMET_tmin_full_degC", "PMET_pet_full_mm", "PMET_glacier_melt_mm"]

    # dates
    data['train_start_date']      = "01/01/2000"
    data['validation_start_date'] = "01/01/2000"
    data['test_start_date']       = "01/01/2000"
    data['train_end_date']        = "31/12/2019"
    data['validation_end_date']   = "31/12/2019"
    data['test_end_date']         = "31/12/2019"

    # hyperparameters
    data['epochs']          = 5
    data['batch_size']      = 256
    data['hidden_size']     = 128
    data['learning_rate']   = 0.005
    data['output_dropout']  = 0.4
    data['seq_length']      = 365

    with open("modelling/Hydro_NH_setup.yml", 'w') as stream:
        yaml.dump(data, stream, default_flow_style=False)

    torch.cuda.empty_cache()
    start_run(config_file=Path("modelling/Hydro_NH_setup.yml"))

    # locate the run directory created for this fold; fail with a clear message
    # instead of a bare IndexError when nothing matches
    run_dirs = glob(path_disk + "/runs/" + exp_name + "*/")
    if not run_dirs:
        raise FileNotFoundError("no run directory found for experiment '{}' in {}/runs".format(exp_name, path_disk))
    path_base = run_dirs[0]

    # calculate performance for test set, using the epoch numbered data['epochs']
    eval_run(run_dir=Path(path_base), period="test", epoch = data['epochs'])
    epoch_dir = Path(path_base) / "test" / "model_epoch{}".format(str(data['epochs']).zfill(3))

    metric_epoch = pd.read_csv(epoch_dir / "test_metrics.csv")
    metrics.append(metric_epoch)

    df_epoch = pickle_to_ts(epoch_dir / "test_results.p")
    df_q.append(df_epoch)

# save files
metrics = pd.concat(metrics, axis = 0).set_index("basin")
metrics.to_csv("results/performance/Historical_CV_PUB_LSTM_OGGM_on.csv")

df_q = pd.concat(df_q, axis = 1)
df_q.to_csv("results/runoff/total_runoff_historical_CV_PUB_LSTM_OGGM_on.csv")

## PUR

In [None]:
# 10-fold cross-validation over the `kfold_pur_test` folds, with
# PMET_glacier_melt_mm included among the dynamic inputs ("OGGM on").
# For every fold: write the basin lists, update the run configuration, train,
# evaluate the held-out fold, and collect its metrics and simulated runoff.

# remove previous runs so the run-directory lookup below only sees fresh runs
for old_run in glob(path_disk + '/runs/historical_PMET_OGGM_on_CV_PUR*'):
    shutil.rmtree(old_run)

metrics = []
df_q = []

for kfold in range(1, 11):

    # generate sets: train on all other folds, validate/test on the held-out fold
    # NOTE(review): the validation and test lists are identical -- confirm intended
    in_fold     = selection.kfold_pur_test == kfold
    not_in_fold = selection.kfold_pur_test != kfold
    basin_files = {
        "modelling/basins_id_train.txt":      selection.gauge_id[not_in_fold],
        "modelling/basins_id_validation.txt": selection.gauge_id[in_fold],
        "modelling/basins_id_test.txt":       selection.gauge_id[in_fold],
    }
    for list_path, basins in basin_files.items():
        with open(list_path, "w") as file:
            file.writelines(basin + "\n" for basin in basins)

    # the setup file is read, modified and written back in place on every fold
    with open('modelling/Hydro_NH_setup.yml') as stream:
        data = yaml.safe_load(stream)

    # experiment name + data
    exp_name = "historical_PMET_OGGM_on_CV_PUR_{}".format(str(kfold).zfill(2))
    data['experiment_name'] = exp_name

    data['data_dir']        = path_disk + "/data/historical_PMET"
    data['dynamic_inputs']  = ["PMET_precip_full_mm", "PMET_tmax_full_degC", "PMET_tmin_full_degC", "PMET_pet_full_mm", "PMET_glacier_melt_mm"]

    # dates
    data['train_start_date']      = "01/01/2000"
    data['validation_start_date'] = "01/01/2000"
    data['test_start_date']       = "01/01/2000"
    data['train_end_date']        = "31/12/2019"
    data['validation_end_date']   = "31/12/2019"
    data['test_end_date']         = "31/12/2019"

    # hyperparameters
    data['epochs']          = 5
    data['batch_size']      = 256
    data['hidden_size']     = 128
    data['learning_rate']   = 0.005
    data['output_dropout']  = 0.4
    data['seq_length']      = 365

    with open("modelling/Hydro_NH_setup.yml", 'w') as stream:
        yaml.dump(data, stream, default_flow_style=False)

    torch.cuda.empty_cache()
    start_run(config_file=Path("modelling/Hydro_NH_setup.yml"))

    # locate the run directory created for this fold; fail with a clear message
    # instead of a bare IndexError when nothing matches
    run_dirs = glob(path_disk + "/runs/" + exp_name + "*/")
    if not run_dirs:
        raise FileNotFoundError("no run directory found for experiment '{}' in {}/runs".format(exp_name, path_disk))
    path_base = run_dirs[0]

    # calculate performance for test set, using the epoch numbered data['epochs']
    eval_run(run_dir=Path(path_base), period="test", epoch = data['epochs'])
    epoch_dir = Path(path_base) / "test" / "model_epoch{}".format(str(data['epochs']).zfill(3))

    metric_epoch = pd.read_csv(epoch_dir / "test_metrics.csv")
    metrics.append(metric_epoch)

    df_epoch = pickle_to_ts(epoch_dir / "test_results.p")
    df_q.append(df_epoch)

# save files
metrics = pd.concat(metrics, axis = 0).set_index("basin")
metrics.to_csv("results/performance/Historical_CV_PUR_LSTM_OGGM_on.csv")

df_q = pd.concat(df_q, axis = 1)
df_q.to_csv("results/runoff/total_runoff_historical_CV_PUR_LSTM_OGGM_on.csv")

# Scenario: OGGM off

## PUB

In [None]:
# 10-fold cross-validation over the `kfold_pub_test` folds, using only the
# four precipitation / temperature / PET dynamic inputs ("OGGM off").
# For every fold: write the basin lists, update the run configuration, train,
# evaluate the held-out fold, and collect its metrics and simulated runoff.

# remove previous runs so the run-directory lookup below only sees fresh runs
for old_run in glob(path_disk + '/runs/historical_PMET_OGGM_off_CV_PUB*'):
    shutil.rmtree(old_run)

metrics = []
df_q = []

for kfold in range(1, 11):

    # generate sets: train on all other folds, validate/test on the held-out fold
    # NOTE(review): the validation and test lists are identical -- confirm intended
    in_fold     = selection.kfold_pub_test == kfold
    not_in_fold = selection.kfold_pub_test != kfold
    basin_files = {
        "modelling/basins_id_train.txt":      selection.gauge_id[not_in_fold],
        "modelling/basins_id_validation.txt": selection.gauge_id[in_fold],
        "modelling/basins_id_test.txt":       selection.gauge_id[in_fold],
    }
    for list_path, basins in basin_files.items():
        with open(list_path, "w") as file:
            file.writelines(basin + "\n" for basin in basins)

    # the setup file is read, modified and written back in place on every fold
    with open('modelling/Hydro_NH_setup.yml') as stream:
        data = yaml.safe_load(stream)

    # experiment name + data
    exp_name = "historical_PMET_OGGM_off_CV_PUB_{}".format(str(kfold).zfill(2))
    data['experiment_name'] = exp_name

    data['data_dir']        = path_disk + "/data/historical_PMET"
    data['dynamic_inputs']  = ["PMET_precip_full_mm", "PMET_tmax_full_degC", "PMET_tmin_full_degC", "PMET_pet_full_mm"]

    # dates
    data['train_start_date']      = "01/01/2000"
    data['validation_start_date'] = "01/01/2000"
    data['test_start_date']       = "01/01/2000"
    data['train_end_date']        = "31/12/2019"
    data['validation_end_date']   = "31/12/2019"
    data['test_end_date']         = "31/12/2019"

    # hyperparameters
    data['epochs']          = 5
    data['batch_size']      = 256
    data['hidden_size']     = 128
    data['learning_rate']   = 0.005
    data['output_dropout']  = 0.4
    data['seq_length']      = 365

    with open("modelling/Hydro_NH_setup.yml", 'w') as stream:
        yaml.dump(data, stream, default_flow_style=False)

    torch.cuda.empty_cache()
    start_run(config_file=Path("modelling/Hydro_NH_setup.yml"))

    # locate the run directory created for this fold; fail with a clear message
    # instead of a bare IndexError when nothing matches
    run_dirs = glob(path_disk + "/runs/" + exp_name + "*/")
    if not run_dirs:
        raise FileNotFoundError("no run directory found for experiment '{}' in {}/runs".format(exp_name, path_disk))
    path_base = run_dirs[0]

    # calculate performance for test set, using the epoch numbered data['epochs']
    eval_run(run_dir=Path(path_base), period="test", epoch = data['epochs'])
    epoch_dir = Path(path_base) / "test" / "model_epoch{}".format(str(data['epochs']).zfill(3))

    metric_epoch = pd.read_csv(epoch_dir / "test_metrics.csv")
    metrics.append(metric_epoch)

    df_epoch = pickle_to_ts(epoch_dir / "test_results.p")
    df_q.append(df_epoch)

# save files
metrics = pd.concat(metrics, axis = 0).set_index("basin")
metrics.to_csv("results/performance/Historical_CV_PUB_LSTM_OGGM_off.csv")

df_q = pd.concat(df_q, axis = 1)
df_q.to_csv("results/runoff/total_runoff_historical_CV_PUB_LSTM_OGGM_off.csv")

## PUR

In [None]:
# 10-fold cross-validation over the `kfold_pur_test` folds, using only the
# four precipitation / temperature / PET dynamic inputs ("OGGM off").
# For every fold: write the basin lists, update the run configuration, train,
# evaluate the held-out fold, and collect its metrics and simulated runoff.

# remove previous runs so the run-directory lookup below only sees fresh runs
for old_run in glob(path_disk + '/runs/historical_PMET_OGGM_off_CV_PUR*'):
    shutil.rmtree(old_run)

metrics = []
df_q = []

for kfold in range(1, 11):

    # generate sets: train on all other folds, validate/test on the held-out fold
    # NOTE(review): the validation and test lists are identical -- confirm intended
    in_fold     = selection.kfold_pur_test == kfold
    not_in_fold = selection.kfold_pur_test != kfold
    basin_files = {
        "modelling/basins_id_train.txt":      selection.gauge_id[not_in_fold],
        "modelling/basins_id_validation.txt": selection.gauge_id[in_fold],
        "modelling/basins_id_test.txt":       selection.gauge_id[in_fold],
    }
    for list_path, basins in basin_files.items():
        with open(list_path, "w") as file:
            file.writelines(basin + "\n" for basin in basins)

    # the setup file is read, modified and written back in place on every fold
    with open('modelling/Hydro_NH_setup.yml') as stream:
        data = yaml.safe_load(stream)

    # experiment name + data
    exp_name = "historical_PMET_OGGM_off_CV_PUR_{}".format(str(kfold).zfill(2))
    data['experiment_name'] = exp_name

    data['data_dir']        = path_disk + "/data/historical_PMET"
    data['dynamic_inputs']  = ["PMET_precip_full_mm", "PMET_tmax_full_degC", "PMET_tmin_full_degC", "PMET_pet_full_mm"]

    # dates
    data['train_start_date']      = "01/01/2000"
    data['validation_start_date'] = "01/01/2000"
    data['test_start_date']       = "01/01/2000"
    data['train_end_date']        = "31/12/2019"
    data['validation_end_date']   = "31/12/2019"
    data['test_end_date']         = "31/12/2019"

    # hyperparameters
    data['epochs']          = 5
    data['batch_size']      = 256
    data['hidden_size']     = 128
    data['learning_rate']   = 0.005
    data['output_dropout']  = 0.4
    data['seq_length']      = 365

    with open("modelling/Hydro_NH_setup.yml", 'w') as stream:
        yaml.dump(data, stream, default_flow_style=False)

    torch.cuda.empty_cache()
    start_run(config_file=Path("modelling/Hydro_NH_setup.yml"))

    # locate the run directory created for this fold; fail with a clear message
    # instead of a bare IndexError when nothing matches
    run_dirs = glob(path_disk + "/runs/" + exp_name + "*/")
    if not run_dirs:
        raise FileNotFoundError("no run directory found for experiment '{}' in {}/runs".format(exp_name, path_disk))
    path_base = run_dirs[0]

    # calculate performance for test set, using the epoch numbered data['epochs']
    eval_run(run_dir=Path(path_base), period="test", epoch = data['epochs'])
    epoch_dir = Path(path_base) / "test" / "model_epoch{}".format(str(data['epochs']).zfill(3))

    metric_epoch = pd.read_csv(epoch_dir / "test_metrics.csv")
    metrics.append(metric_epoch)

    df_epoch = pickle_to_ts(epoch_dir / "test_results.p")
    df_q.append(df_epoch)

# save files
metrics = pd.concat(metrics, axis = 0).set_index("basin")
metrics.to_csv("results/performance/Historical_CV_PUR_LSTM_OGGM_off.csv")

df_q = pd.concat(df_q, axis = 1)
df_q.to_csv("results/runoff/total_runoff_historical_CV_PUR_LSTM_OGGM_off.csv")