### Partitions

The basins are filtered to the 296 study basins in the `S2-Join_CA_Data` notebook. The partitions for Experiment A and Experiment E are also defined in that notebook. The following code defines the partitions for the remaining Experiments B, C, and D.

In [None]:
import pickle
import pandas as pd
import geopandas as gpd
import xarray as xr
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import datetime
from scipy.stats import pearsonr
import os
import yaml
import io

In [None]:
# Discover which gages have a time-series file on disk. The gage identifier
# is the portion of each file name that precedes its first '.'.
files = os.listdir('Data/basins/time_series')
gages = [file_name.partition('.')[0] for file_name in files]

# Load the basin attribute table that the partitioning cells annotate.
att_df = pd.read_csv('Data/basins/attributes_lat_lon.csv')

In [None]:
# random groups
#
# Assign every basin listed in e_basins.txt to one of three groups (0, 1, 2),
# drawing each basin's group independently with np.random.randint.
# NOTE(review): the draw is not seeded in this cell, so re-running it yields a
# different partition unless NumPy's global random state was seeded earlier.
all_basins = pd.read_csv('Data/basins/e_basins.txt', header=None)
all_basins['group_n'] = np.random.randint(3, size=len(all_basins))

# Basin ids (column 0 of the header-less file) that fell into each group.
group_a = all_basins.loc[all_basins.group_n == 0, 0].tolist()
group_b = all_basins.loc[all_basins.group_n == 1, 0].tolist()
group_c = all_basins.loc[all_basins.group_n == 2, 0].tolist()

# A basin listed twice would have an ambiguous group number; fail explicitly
# instead of letting the lookup below raise an opaque error.
if all_basins[0].duplicated().any():
    raise ValueError('Data/basins/e_basins.txt lists a basin more than once')

# Copy the group number onto the attribute table with a single keyed lookup
# rather than rescanning both tables once per basin. Rows of att_df whose
# 'index' value is not in e_basins.txt stay NaN; the cast keeps the column
# float even when every row is matched.
group_by_basin = all_basins.set_index(0)['group_n']
att_df['random_group'] = att_df['index'].map(group_by_basin).astype(float)

# att_df.dropna(subset=['random_group'], axis=0).to_csv('Data/basins/attributes_lat_lon.csv', index=False)

In [None]:
# groups based on water balance
#
# For every time-series file compute, from the series means,
#     wb = Q/P - (1 - PE/P)
# where Q is q_cms, P is total_precipitation__sum__era5l_daily and PE is
# potential_evaporation__sum__era5l_daily, then bin the basins by wb:
#     'a': wb > -100
#     'b': -2000 <= wb <= -100
#     'c': wb < -2000
# NOTE(review): the thresholds are in whatever units the Q/P and PE/P ratios
# carry for this dataset -- confirm against the data documentation.
att_df['wb'] = np.nan

for f in files:
    # Close each dataset as soon as it has been copied into a DataFrame;
    # otherwise one file handle per basin stays open for the whole session.
    with xr.open_dataset('Data/basins/time_series/' + f) as nc_xr:
        nc_df = nc_xr.to_dataframe()

    # Mean precipitation is the denominator of both ratios, so compute it once.
    p_mean = nc_df.total_precipitation__sum__era5l_daily.mean()
    q_p = nc_df.q_cms.mean() / p_mean
    pe_p = nc_df.potential_evaporation__sum__era5l_daily.mean() / p_mean
    wb = q_p - (1 - pe_p)

    # The file name up to its first '.' is the integer basin id in att_df['index'].
    att_df.loc[att_df['index'] == int(f.split('.')[0]), 'wb'] = wb

# Start from an object-dtype column of NaN so the string labels can be stored
# without pandas having to upcast a float column on assignment. Basins with no
# wb value (NaN) match none of the masks and keep NaN.
att_df['wb_group'] = pd.Series(np.nan, index=att_df.index, dtype=object)
att_df.loc[att_df.wb > -100, 'wb_group'] = 'a'
att_df.loc[(att_df.wb <= -100) & (att_df.wb >= -2000), 'wb_group'] = 'b'
# att_df.loc[(att_df.wb < -2000) & (att_df.wb > -4000), 'wb_group'] = 'c'
att_df.loc[att_df.wb < -2000, 'wb_group'] = 'c'

# Notebook display: number of basins per water-balance group.
att_df.wb_group.value_counts()
# att_df.to_csv('Data/basins/attributes_lat_lon.csv', index=False)

### Hyperparameter Grid Search
The code below generates the `yml` configuration files used for the hyperparameter grid search for each experiment. One file is written for every combination of the following parameter values:

- hidden size: 32, 121, 256
- sequence length: 10, 90, 365

In [None]:
# write hyperparameter ymls
#
# For each basin subgroup, emit one run configuration per
# (sequence length, hidden size) pair, derived from the sample configuration.

subgroups = ['e_group_dam', 'e_group_flashy', 'e_group_natural']
hs = [32, 121, 256]
sl = [10, 90, 365]

for subgroup in subgroups:
    # Reload the template so each subgroup starts from an unmodified config.
    with open("Data/hyperparameters/sample_ca_parameters.yml", 'r') as template:
        run_config = yaml.safe_load(template)

    # Training, validation and testing all point at the same basin list.
    basin_file = 'Data/basins/' + subgroup + '.txt'
    run_config['train_basin_file'] = basin_file
    run_config['validation_basin_file'] = basin_file
    run_config['test_basin_file'] = basin_file

    for seq_len in sl:
        run_config['seq_length'] = seq_len

        for hidden in hs:
            run_config['hidden_size'] = hidden

            # The experiment name doubles as the stem of the output file.
            ex_name = subgroup + '_sl_' + str(seq_len) + '_hs_' + str(hidden)
            sub_name = 'Data/hyperparameters/e_ca_ymls/' + ex_name + '.yml'
            run_config['experiment_name'] = ex_name

            # Persist this grid point's configuration.
            with io.open(sub_name, 'w', encoding='utf8') as yml_out:
                yaml.dump(run_config, yml_out, default_flow_style=False, allow_unicode=True)