In [1]:
# my packages
from evaluation_table import EvalTable
from figure_generator import EvalPlot
from model import CustomBiLSTM
from tuning_tools import tuning_game, tune_model 
from data_preprocess import data_prepare, data_split
from final_eval import general_viz, regime_eval, signature_eval, eval_drought

# basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
import joblib

# system packages
from datetime import datetime, date, timedelta
import pickle
import warnings
warnings.filterwarnings("ignore")
import platform
import time
from tqdm import tqdm
import os

# hydrological packages
import hydroeval as he
from hydrotools.nwm_client import utils # I had to pip install this

# data analysis packages
from scipy import optimize
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# deep learning packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# Identify the path
home = os.getcwd()
parent_path = os.path.dirname(home)
input_path = f'{parent_path}/02.input/'
output_path = f'{parent_path}/03.output/'
main_path = home

  _pyproj_global_context_initialize()


In [2]:
params = joblib.load(f'{output_path}best_hyperparameters_lstm.pkl')

# Load the train and test dataset
data_train = pd.read_pickle(f"{output_path}train_dataset.pkl")
data_test = pd.read_pickle(f"{output_path}test_dataset.pkl")
dataset = pd.read_pickle(f"{output_path}dataset.pkl")

station_list = list(data_test.station_id.unique())

length_lookback = 10
x_train_scaled, y_train_scaled, x_test_scaled, y_test_scaled, scaler_x, scaler_y, y_train, x_test, y_test = data_prepare(data_train, data_test, length_lookback=length_lookback)


FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/mydrive/devcon_2025/hydromachine-tutorials/neural_nets/lstm/03.output/best_hyperparameters_lstm.pkl'

### 5. Model Training
#### 5.1 Training
- If we want to use GPUs to run the model before we feed the data to the training fucntion we have to transfer the data and the model to the GPU.
- Then we will use the TensorDataset function as wrapper for our feature and target to combine them together.   
- Next, we should use the DataLoader function of the Pytorch library. DataLoader automatically creates minibatches of the dataset for the training process and speeds up the data loading process by parallelizing the loading of data from disc to the GPU/CPU. 
- We will use Adam optimizer for calculating the weights and biases. 

In [None]:
epochs = 10
batch_size = params['batch_size']
learning_rate = params['learning_rate']
hidden_size = params['hidden_size']
num_layers = params['num_layers']
input_size = x_train_scaled[list(x_train_scaled.keys())[0]].shape[2]
path_model = f'{output_path}best_model.pth'

# Move the model and data to GPU. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

In [None]:
train_loader= {}

for station_id in station_list:

    x_train_tensor = torch.Tensor(x_train_scaled[station_id].astype(float))
    y_train_tensor = torch.Tensor(y_train_scaled[station_id].astype(float))
    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader[station_id] = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [None]:
%%time
# Create the Model
bilstm_model = CustomBiLSTM(input_size, hidden_size, num_layers, 1, device, embedding=False, station_list=station_list)

# Create the Optimizer
bilstm_optimizer = optim.Adam(bilstm_model.parameters(), lr=learning_rate, weight_decay=0)

# Run the training function
model_parameters = bilstm_model.train_model(train_loader, epochs, bilstm_optimizer, early_stopping_patience=0, val_loader=None)

# Save the Model Parameters
bilstm_model.save_model(path_model)


#### 5.2. Training Tricks
* **`early_stop`** Early stopping is a regularization technique that stops training when the model’s performance on a validation set stops improving, helping to prevent overfitting.

* **`weight_decay`** Weight decay is a form of L2 regularization that penalizes large weights by adding a term to the loss function, encouraging the model to keep its parameters small and generalize better.

* **`embedding`** Embedding is a technique used to represent categorical variables or high-dimensional data (like words or spatial identifiers) as dense, lower-dimensional vectors. In neural networks, embeddings help capture relationships or similarities between inputs in a form that models can learn from efficiently.

## 6. Model Evaluation 
#### 6.1. Model Evaluation Metrics
- We fist have to transform the results to their original scale. 
- To evaluate our model we will use KGE, RMSE, and PBias metrics. 
- We will also compare the cumulative streamflow in each year. 

In [None]:
# Initialize empty DataFrames to store evaluation results if not already defined.
EvalDF_all_rf = pd.DataFrame()
SupplyEvalDF_all_rf = pd.DataFrame()
df_eval_rf = pd.DataFrame()
df_result_data= {}


# Iterate over each station name in the list of station IDs.
for station_name in station_list:
    x_test_tensor = torch.Tensor(x_test_scaled[station_name].astype(float))
    y_test_tensor = torch.Tensor(y_test_scaled[station_name].astype(float))
    test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=test_dataset.tensors[0].shape[0], shuffle=False)
    yhat_test_scaled, val_loss = bilstm_model.evaluate_model(test_loader)
    
    # Inverse transform the scaled predictions to their original scale.
    yhat_test = scaler_y.inverse_transform(yhat_test_scaled.reshape(-1, 1))
    
    # Assuming EvalTable is a predefined function that compares predictions to actuals and returns evaluation DataFrames.
    EvalDF_all_rf_temp, SupplyEvalDF_all_rf_temp, df_eval_rf_temp = EvalTable(yhat_test.reshape(-1), data_test[data_test.station_id == station_name][length_lookback:], 'lstm')

    df_result_data[station_name] = data_test[data_test.station_id == station_name][length_lookback:].copy()

    df_result_data[station_name]['lstm_flow'] = yhat_test

    # Append the results from each station to the respective DataFrame.
    EvalDF_all_rf = pd.concat([EvalDF_all_rf, EvalDF_all_rf_temp], ignore_index=True)
    SupplyEvalDF_all_rf = pd.concat([SupplyEvalDF_all_rf, SupplyEvalDF_all_rf_temp], ignore_index=True)
    df_eval_rf_ = pd.concat([df_eval_rf, df_eval_rf_temp], ignore_index=True)

print("Model Performance for Daily cfs")
display(EvalDF_all_rf)   
print("Model Performance for Daily Accumulated Supply (Acre-Feet)")
display(SupplyEvalDF_all_rf)

In [None]:
general_viz(df_result_data, station_list, 0)
general_viz(df_result_data, station_list, 1)


#### 6.2. Evaluation of different flow regimes

In [None]:
regime_eval(df_result_data, station_list)

#### 6.3. Hydrological Signatures


In [None]:
signature_eval(df_result_data, station_list, output_path)

#### 6.4. Hydrological Drought

In [3]:
duration = 2
eval_drought(df_result_data, station_list, duration, dataset, EvalDF_all_rf)

NameError: name 'df_result_data' is not defined