### Download Data Using VotE
Data are accessed through the `VotE` Python package. All available gauges in the CRB are downloaded and converted to the `neuralhydrology` format: one NetCDF time-series file per gauge plus a table of static basin attributes.

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from VotE.streamflow import export_streamflow as es
import xarray as xr
import os

In [None]:
# Read the basin boundary shapefile; its first geometry is the spatial filter.
geo_col = gpd.read_file("Data/CRB_basin.shp")

# Selection criteria handed to VotE: gauges within the boundary, with the
# `gap_free` option switched off.
gage_criteria = {"within": geo_col.geometry[0], "gap_free": False}
CRB_gages = es.gage_selector(gage_criteria)

# Fetch the selected gauges, requesting the 'basin_geom_vote' geometry column.
CRB_watersheds = es.get_gages(CRB_gages, geom_col='basin_geom_vote')

In [None]:
# create time series and attributes directories
path_root = 'Data/basins'
path_ts = os.path.join(path_root, 'time_series')
path_attr = os.path.join(path_root, 'attributes')
for p in [path_root, path_ts, path_attr]:
    # makedirs also creates the missing parent ('Data') and is a no-op when
    # the directory already exists, so the cell can be re-run safely.
    os.makedirs(p, exist_ok=True)

# Download every selected gauge from VotE and export it:
#   * one NetCDF file of daily time series per gauge in `path_ts`
#   * one CSV of static attributes for all gauges in `path_attr`
#   * one GeoPackage holding the same attributes plus the basin geometry
vote_gages = CRB_watersheds.id_gage.to_list()
tokens = {}
static_rows = []  # one single-row attribute frame per gauge that returned data

# access VotE data
for id_gage in vote_gages:
    token = es.basin_token(id_gage, normalize_Q=False)
    if token is None:
        # no token was returned for this gauge, so there is nothing to export
        continue
    tokens[id_gage] = token

    # time series to netcdfs
    # (non in-place operations: the frame kept in `tokens` stays untouched)
    sf = token['timeseries'].set_index('date')
    sf.index = pd.DatetimeIndex(sf.index)
    sf = sf.drop(columns='q_quality')
    sf = sf.resample('D').mean()
    # Calendar features are derived AFTER resampling: days that resample()
    # inserts to fill gaps then carry valid date features instead of NaN.
    sf['dayofyear'] = sf.index.dayofyear
    sf['year'] = sf.index.year
    sf['month'] = sf.index.month
    sf['day'] = sf.index.day
    sf.to_xarray().to_netcdf(os.path.join(path_ts, str(id_gage) + '.nc'))

    # store the static attributes
    b_attr = pd.DataFrame(token['static'], index=[0])
    b_attr['index'] = id_gage
    static_rows.append(b_attr)

# Build the attribute table once (concatenating inside the loop is quadratic)
# and take the column names from the downloaded tokens themselves rather than
# from a hard-coded reference gauge.
if static_rows:
    df = pd.concat(static_rows, ignore_index=True)
    df = df[['index'] + [c for c in df.columns if c != 'index']]
else:
    df = pd.DataFrame(columns=['index'])
df.to_csv(os.path.join(path_attr, 'all.csv'), index=False)

# A plain pandas DataFrame has no `to_file`; attach each basin's geometry so
# the attributes can be written as a GeoPackage.
# NOTE(review): assumes `CRB_watersheds` is a GeoDataFrame whose active
# geometry is the basin polygon and whose `id_gage` values are unique - confirm.
if static_rows:
    basin_geoms = CRB_watersheds.set_index('id_gage').geometry
    gdf = gpd.GeoDataFrame(
        df.copy(),
        geometry=basin_geoms.reindex(df['index']).values,
        crs=CRB_watersheds.crs,
    )
    gdf.to_file(os.path.join(path_root, 'attributes.gpkg'), driver='GPKG')

### Combine Data with NID Data
`VotE` data are combined with National Inventory of Dams (NID) data to determine when and where reservoirs are operating. Using the NID reservoir construction (completion) date, basins with reservoirs are separated into post-reservoir periods, with a buffer of 5 years after the construction date to allow for the time needed to fill the reservoir.

In [None]:
# join with NID data
nid_df = gpd.read_file("Data/crb-nid-dams.gpkg")
# The basin attributes GeoPackage is written under Data/basins by the
# download step, so it is read from that same location.
CRB_watersheds = gpd.read_file('Data/basins/attributes.gpkg')

# point within polygon join
# Left join: a basin that contains no dam keeps one row with a missing 'NID ID'.
nid_basins = gpd.sjoin(CRB_watersheds, nid_df, how='left', predicate='contains')
nid_basins = nid_basins.groupby('index')['NID ID'].apply(list).reset_index(name='nid_dam')
# groupby() returns the basins sorted by gauge id, so the lists are matched
# back on the 'index' key; a positional assignment would attach them to the
# wrong basins whenever CRB_watersheds is not in that same order.
CRB_watersheds['nid_dam'] = CRB_watersheds['index'].map(
    nid_basins.set_index('index')['nid_dam']
)

static_df = pd.read_csv("Data/basins/attributes/all.csv")

# list of NID dams per basin
def _valid_dam_ids(dam_ids):
    """Return the dam ids of one basin with missing entries removed.

    The left spatial join leaves a single missing 'NID ID' for basins that
    contain no dam, so those basins end up with an empty list. `pd.isna` is
    used because `np.nan in some_list` only matches by object identity and
    can miss NaN values. Anything that is not a list is treated as "no dams".
    """
    if not isinstance(dam_ids, list):
        return []
    return [dam_id for dam_id in dam_ids if not pd.isna(dam_id)]


# Whole-column assignment instead of `CRB_watersheds.nid_dam[i] = []`, which is
# chained assignment and does not reliably write back to the frame.
CRB_watersheds['nid_dam'] = CRB_watersheds['nid_dam'].apply(_valid_dam_ids)

# add number of NID dams variable
CRB_watersheds['nid_n_dam'] = CRB_watersheds['nid_dam'].apply(len)

# add year reservoir was completed variable
# For every basin: the most recent 'Year Completed' among its NID dams, or NaN
# when the basin has no dams or any of its dams has a blank year.
static_df['yr_completed'] = np.nan
for gauge_id in static_df['index']:
    basin_dams = CRB_watersheds.loc[CRB_watersheds['index'] == gauge_id, 'nid_dam'].item()
    in_basin = nid_df['NID ID'].isin(basin_dams)
    completion_years = nid_df.loc[in_basin, 'Year Completed'].tolist()
    if completion_years != [] and '' not in completion_years:
        latest_year = max(int(year) for year in completion_years)
    else:
        latest_year = np.nan  # later remove dams with missing construction date
    static_df.loc[static_df['index'] == gauge_id, 'yr_completed'] = latest_year

# static_df.dropna(subset=['yr_completed'], inplace=True)

# add reservoir volume variable
def _total_dam_volume(dam_ids):
    """Sum 'Volume (Cubic Yards)' over the NID dams listed in `dam_ids`.

    Returns NaN when any of those dams has a blank volume (filled with the
    column mean afterwards) and 0 when the list is empty.
    """
    volumes = nid_df.loc[nid_df['NID ID'].isin(dam_ids), 'Volume (Cubic Yards)'].tolist()
    if '' in volumes:
        return np.nan  # later add average volume
    return sum(int(v) for v in volumes)


# One gauge-id -> dam-list lookup built up front instead of a boolean-mask
# scan of CRB_watersheds for every basin.
dams_by_basin = dict(zip(CRB_watersheds['index'], CRB_watersheds['nid_dam']))
static_df['nid_vol_c_yds'] = pd.Series(
    [_total_dam_volume(dams_by_basin[d]) for d in static_df['index']],
    index=static_df.index,
    dtype=float,
)

# Assign the result back: `static_df[col].fillna(..., inplace=True)` acts on a
# temporary selection and is not guaranteed to update static_df.
static_df['nid_vol_c_yds'] = static_df['nid_vol_c_yds'].fillna(
    static_df['nid_vol_c_yds'].mean()
)

# add reservoir name variable
def _dam_names(dam_ids):
    """Return the names of the NID dams in `dam_ids` joined by ', '.

    The result is '' when the basin has no dams, and also when any of its dams
    has a blank name.
    NOTE(review): a basin with one unnamed dam is therefore indistinguishable
    from a basin without reservoirs - confirm this is intended.
    """
    names = nid_df.loc[nid_df['NID ID'].isin(dam_ids), 'Dam Name'].tolist()
    if '' in names:
        return ''  # nid_name is '' for no reservoirs
    return ', '.join(names)  # create string of reservoir names


# One gauge-id -> dam-list lookup built up front instead of a boolean-mask
# scan of CRB_watersheds for every basin.
dams_by_basin = dict(zip(CRB_watersheds['index'], CRB_watersheds['nid_dam']))

# Build the column directly from strings; initialising it with NaN makes it a
# float column, and writing strings into that is an incompatible-dtype
# assignment.
static_df['nid_name'] = [_dam_names(dams_by_basin[d]) for d in static_df['index']]

# Assigned back instead of a chained in-place fillna on a column selection.
static_df['nid_name'] = static_df['nid_name'].fillna('None')

# Replace the remaining placeholder years:
#   * basins whose nid_name is '' get 2023.0
#   * other basins with a missing completion year get 1900
# NOTE(review): presumably sentinels for "no reservoir during the record" and
# "reservoir present for the whole record" - confirm against downstream use.
for gauge_id in static_df['index']:
    basin_rows = static_df['index'] == gauge_id
    if static_df.loc[basin_rows, 'nid_name'].values[0] == '':
        static_df.loc[basin_rows, 'yr_completed'] = 2023.0
        continue
    if np.isnan(static_df.loc[basin_rows, 'yr_completed'].values[0]):
        static_df.loc[basin_rows, 'yr_completed'] = 1900

# static_df.to_csv('Data/basins/attributes-nid.csv', index=False)