# HyTest Hydrologic Model Assessment
* Get observed and modeled data via an Intake catalog
* Use Dask to compute metrics in parallel
* Use community tools (Pandas & Xarray)

In [None]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client, LocalCluster
import xarray as xr
import numpy as np
import pandas as pd
import intake
import dask

In [None]:
'''
A selection of traditional statistical metrics for comparing against d-score components
'''

import numpy as np

def nse(obs, mod):
    """
    Compute the Nash-Sutcliffe Efficiency (NSE) of a simulation.
    (https://www.sciencedirect.com/science/article/pii/0022169470902556?via%3Dihub)

    The score is one minus the mean squared error divided by the variance
    of the observations.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Nash-Sutcliffe Efficiency
    """
    error_variance = mse(obs, mod)
    observed_variance = np.var(obs)
    return 1 - (error_variance / observed_variance)


def mse(obs, mod):
    """
    Compute the mean squared error (MSE) between observations and predictions.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        mean squared error
    """
    residuals = obs - mod
    squared_residuals = residuals ** 2
    return np.mean(squared_residuals)


def pbias(obs, mod):
    """
    Compute the percent bias of the predictions relative to the observations.

    The summed error (modeled minus observed) is expressed as a percentage
    of the summed observations.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Percent bias
    """
    total_error = np.sum(mod - obs)
    total_observed = np.sum(obs)
    return 100 * (total_error / total_observed)


def pbias_percentile(obs, model, percentile, fun):
    """
    Compute the percent bias over the subset of values selected by a percentile cut.

    The cut-off is the given percentile of the observations; ``fun`` compares
    each observation with that cut-off to decide which pairs are kept.

    Args:
        obs: numpy array of observed values
        model: numpy array of modeled values
        percentile: float
        fun: comparison function (e.g., np.greater)
    Returns:
        Percent bias for bin
    """
    cutoff = np.percentile(obs, q=percentile)
    selected = fun(obs, cutoff)
    obs_bin = obs[selected]
    model_bin = model[selected]
    return pbias(obs_bin, model_bin)
    


def pearson_r(obs, mod):
    """
    Compute Pearson's correlation coefficient between observations and predictions.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Pearson's r
    """
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is r
    correlation_matrix = np.corrcoef(mod, obs)
    return correlation_matrix[0, 1]


def _average_ranks(values):
    """
    Return the 1-based rank of every element, giving tied values their mean rank.

    Args:
        values: array-like of numeric values
    Returns:
        numpy array of float ranks, in the same order as ``values``
    """
    values = np.asarray(values, dtype=float).ravel()
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    # For each distinct value: last and first rank occupied by its tie group
    last_rank = np.cumsum(counts)
    first_rank = last_rank - counts + 1
    return ((first_rank + last_rank) / 2.0)[inverse]


def spearman_r(obs, mod):
    """
    Calculate Spearman's rank correlation coefficient

    Spearman's r is Pearson's r computed on the ranks of the data.  Ranks are
    used here rather than ``np.argsort`` output: argsort returns the indices
    that would sort the array, which is not the rank of each element.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Spearman's r
    """
    obs_ranks = _average_ranks(obs)
    mod_ranks = _average_ranks(mod)
    return np.corrcoef(mod_ranks, obs_ranks)[0, 1]


def kge(obs, mod):
    """
    Compute the Kling-Gupta Efficiency (KGE).
    (https://www.sciencedirect.com/science/article/pii/S0022169409004843)

    The score is one minus the Euclidean distance of three components from
    their ideal value of 1: the correlation, the standard deviation ratio,
    and the ratio of summed modeled to summed observed values.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Kling-Gupta Efficiency
    """
    correlation = pearson_r(obs, mod)
    variability = sd_ratio(obs, mod)
    bias = np.sum(mod) / np.sum(obs)

    squared_terms = (correlation - 1) ** 2 + (variability - 1) ** 2 + (bias - 1) ** 2
    euclidean_distance = np.sqrt(squared_terms)
    return 1 - euclidean_distance

def sd_ratio(obs, mod):
    """
    Compute the ratio of the standard deviation of the predictions to that
    of the observations.

    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Standard deviation ratio
    """
    modeled_sd = np.std(mod)
    observed_sd = np.std(obs)
    return modeled_sd / observed_sd


In [None]:
# Start cluster

In [None]:
## Denali setup
#cluster = SLURMCluster() #TOH: my config defaults to Denali, but this won't work for other users

# Active configuration: a local cluster with one thread per worker.
cluster = LocalCluster(threads_per_worker=1)
## Tallgrass setup
#cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
#                       job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
#                       scheduler_options={'dashboard_address':36999},
#                       #cores=1, extra=['--resources processes=1']
#                       memory='6GB')

## PC setup
#import os
#n_cores = os.cpu_count() # set to match your machine

# Connect a client to whichever cluster was created above.
client = Client(cluster)
#client

In [None]:
#cluster.adapt(maximum_jobs=100)

In [None]:
# Display the client in the cell output
client

In [None]:
#client.close()

# Intake catalog
We use an Intake catalog to help manage the various datasets that might be used in an evaluation.

In [None]:
# Open the Intake catalog from its YAML definition on GitHub and list the entry names
url = 'https://raw.githubusercontent.com/nhm-usgs/data-pipeline-helpers/main/hytest/hytest_intake_catalog.yml'
cat = intake.open_catalog(url)
list(cat)

In [None]:
# read the observed and modeled streamflow datasets from the Intake catalog
obs_ds = cat['nwis-streamflow-usgs-gages-onprem'].to_dask()
model_ds = cat['nwm21-streamflow-usgs-gages-onprem'].to_dask()


# keep only the streamflow variable; the modeled values are cast to float32
obs = obs_ds['streamflow']
mod = model_ds['streamflow'].astype('float32')

# these names become the column labels ('observed' / 'predicted') that
# compute_metrics reads after merging the two series
obs.name = 'observed'
mod.name = 'predicted'

In [None]:
# Display the observed dataset
obs_ds

In [None]:
# Send the observed and modeled arrays to the cluster.
# NOTE(review): the futures returned by scatter() are not kept, and compute_metrics
# reads the notebook-level `obs`/`mod` directly, so these calls may have no lasting
# effect — confirm whether they are needed.
client.scatter(obs)
client.scatter(mod)
#client.scatter(ds_results)

In [None]:
%%time
# selecting a single gage is fast
gage_id = 'USGS-01030350'
# .load() pulls the selected observations into memory
x = obs.sel(gage_id=gage_id).load()

In [None]:
def compute_metrics(gage_id):
    """
    Compute a suite of goodness-of-fit statistics for a single stream gage.

    Observed and modeled streamflow are taken from the notebook-level ``obs``
    and ``mod`` DataArrays, aligned on daily periods, and compared using the
    metric functions defined earlier in this notebook.

    Args:
        gage_id: value of the ``gage_id`` coordinate to select (e.g. 'USGS-01030350')
    Returns:
        pandas Series of scores indexed by metric name and named after ``gage_id``
    """
    # select the data for the given gage_id
    # TODO the selection may be distributed, but can we force it onto a single node? Maybe by allocating 2 cores?
    obs1 = obs.sel(gage_id=gage_id).to_series()
    # aggregate the modeled series to daily means (resampling could be done in preanalysis)
    # NOTE(review): the 5 h offset presumably shifts the day boundaries to match the observations — confirm
    mod1 = mod.sel(gage_id=gage_id).to_series().resample('1D', offset='5h').mean()
    # make sure the indices match
    obs1.index = obs1.index.to_period('D')
    mod1.index = mod1.index.to_period('D')

    # merge obs and predictions and drop nans.
    df = pd.merge(obs1, mod1, left_index=True, right_index=True).dropna(how='any')
    obs1 = df['observed']
    mod1 = df['predicted']

    # compute log flow for use in log NSE; flows at or below the threshold are
    # replaced by the threshold so the logarithm stays finite
    threshold = 0.01
    log_obs = np.log(obs1.where(obs1 > threshold, threshold))
    log_model = np.log(mod1.where(mod1 > threshold, threshold))

    scores = pd.Series(dtype='float')
    scores['nse'] = nse(obs1, mod1)
    scores['log_nse'] = nse(log_obs, log_model)
    scores['kge'] = kge(obs1, mod1)

    scores['pbias'] = pbias(obs1, mod1)
    scores['pearson_r'] = pearson_r(obs1, mod1)
    scores['spearman_r'] = spearman_r(obs1, mod1)
    scores['sd_ratio'] = sd_ratio(obs1, mod1)

    # compute high flow and low flow bias
    high_percentile = 98
    low_percentile = 30

    # high flows: observations above the high percentile
    scores['pbias_q' + str(high_percentile)] = pbias_percentile(obs1, mod1, high_percentile, np.greater)
    # low flows: observations at or below the low percentile
    scores['pbias_q' + str(low_percentile)] = pbias_percentile(obs1, mod1, low_percentile, np.less_equal)
    scores.name = gage_id

    return scores

In [None]:
%%time
# run for a single site using 1 core
# (the function is called directly here, without a delayed/bag wrapper)
gage_id = 'USGS-01030350'
compute_metrics(gage_id)

In [None]:
# Collect every gage ID present in the observed data
gages = list(obs.gage_id.values)

In [None]:
# Number of gages available
len(gages)

In [None]:
# Inspect the first gage ID
gages[0]

#### Try Dask Delayed, computing a list of dask delayed objects

In [None]:
%%time
# Build one delayed compute_metrics task per gage (first 20 only) and evaluate them
# together; retries=10 lets a failed task be retried.
results = dask.compute(*[dask.delayed(compute_metrics)(str(gage)) for gage in gages[:20]], retries=10);

#### Try Dask Bag

In [None]:
import dask.bag as db

# Spread the first 20 gage IDs over 10 partitions and map compute_metrics over them;
# nothing runs until the bag is computed.
b = db.from_sequence(gages[:20], npartitions=10)
b = b.map(compute_metrics)

In [None]:
%%time
# Evaluate the bag; each element is the score Series returned by compute_metrics
results = b.compute()

In [None]:
# Combine the per-gage score Series column-wise (metrics x gages) ...
df = pd.concat(results, axis=1)
# ... then transpose to one row per gage; reset_index() moves the gage IDs
# (the Series names) into a column called 'index'.
df1 = df.T.reset_index()
# NOTE(review): the later `.sel(index=...)` lookup presumably relies on xarray using
# that 'index' column as the coordinate of the 'index' dimension — confirm.
ds_results = xr.Dataset.from_dataframe(df1)
ds_results

In [None]:
# Look up the scores for a single gage
ds_results.sel(index='USGS-01030350')

In [None]:
# Write the scores to a NetCDF file (relative path 'results.nc')
ds_results.to_netcdf('results.nc')

#### Open CSV file with obs info from Sydney Foks

In [None]:
import fsspec
import pandas as pd

In [None]:
# S3 filesystem opened with anon=True (no credentials supplied)
fs = fsspec.filesystem('s3', anon=True)

In [None]:
# Location of the benchmark site list (CSV)
url = 's3://esip-qhub-public/usgs/hytest/streamflow_benchmark_sites_v09.csv'

In [None]:
# Read the site list, keeping the identifier columns as strings
# (presumably to preserve leading zeros — confirm).
# The remote file is opened in a context manager so the handle is closed
# as soon as parsing finishes instead of being left open.
with fs.open(url) as f:
    df = pd.read_csv(f, dtype={'site_no':str, 'huc_cd':str, 'reachcode':str, 'comid':str})

In [None]:
# Prefix every site number with 'USGS-' to build the gage identifiers
site_ids = [f'USGS-{site_no}' for site_no in df['site_no'].values]

In [None]:
# Number of benchmark sites
len(site_ids)

In [None]:
# Compute metrics for the first 20 benchmark sites as delayed tasks;
# retries=10 lets a failed task be retried.
results = dask.compute(*[dask.delayed(compute_metrics)(site) for site in site_ids[:20]], retries=10);