In [1]:
import math
import os
import time

import HydroErr
import joblib
import numpy as np
import optuna
import pygad
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

import dataloader
import models
import training_fun

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer CUDA when PyTorch can see a GPU; fall back to the CPU otherwise.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence window sizes handed to the dataloader. BASE_LENGTH is whatever part
# of the full window is left once the target part has been taken out.
TARGET_SEQ_LENGTH = 365
SEQ_LENGTH = 365 * 2
BASE_LENGTH = SEQ_LENGTH - TARGET_SEQ_LENGTH

FORCING_DIM = 3

# training hyperparameters
use_amp = True
compile_model = False
if compile_model:
    torch.set_float32_matmul_precision("high")

# Computation always happens on DEVICE. Only the place where the data set is
# kept changes: the CPU in memory-saving mode, DEVICE otherwise.
# NOTE: the name is spelled `storge_device` on purpose; it matches the keyword
# used when the datasets are constructed further down in this notebook.
memory_saving = False
computing_device = DEVICE
storge_device = "cpu" if memory_saving else DEVICE
if memory_saving:
    # Only defined in memory-saving mode.
    VAL_STEPS = 500


In [3]:
# Both checkpoints hold whole pickled model objects (they are used directly via
# .to()/.eval()/.parameters() below), not state_dicts. PyTorch >= 2.6 defaults
# torch.load to weights_only=True, which refuses such files, so the full
# unpickling mode is requested explicitly.
# NOTE(security): full unpickling can execute arbitrary code. Only load
# checkpoints that come from a trusted source.
embedding = torch.load(
    "data/final_models/final_lstm_embedding0.pt",
    map_location=torch.device("cpu"),
    weights_only=False,
).to(computing_device)
decoder = torch.load(
    "data/final_models/final_lstm_decoder0.pt",
    map_location=torch.device("cpu"),
    weights_only=False,
).to(computing_device)

# Inference only: switch both modules to evaluation mode.
embedding.eval()
decoder.eval()

# The first parameter tensor of the embedding module is used as the embedding
# table (presumably one row per catchment -- TODO confirm); its second axis is
# the latent dimension.
catchment_embeddings = next(embedding.parameters()).data
LATENT_dim = catchment_embeddings.shape[1]


In [4]:
# Keyword arguments common to both splits. `storge_device` is the spelling of
# the keyword as this notebook passes it to Forcing_Data.
_shared_forcing_kwargs = dict(
    storge_device=storge_device,
    seq_length=SEQ_LENGTH,
    target_seq_length=TARGET_SEQ_LENGTH,
    base_length=BASE_LENGTH,
)

# Training + validation split.
dtrain_val = dataloader.Forcing_Data(
    "data/data/CAMELS/camels_train_val.csv",
    record_length=3652,
    **_shared_forcing_kwargs,
)

# Held-out test split (longer record than the training/validation file).
dtest = dataloader.Forcing_Data(
    "data/data/CAMELS/camels_test.csv",
    record_length=4383,
    **_shared_forcing_kwargs,
)

In [5]:
class Objective_builder_batch:
    """Score or simulate candidate latent vectors for one catchment.

    The forcing series ``x`` is stored split into ``x.shape[0]`` segments
    (years). For every candidate latent vector the decoder is run on all
    segments, the per-segment outputs are concatenated into one long series,
    and that series is compared with the flattened observations ``y``.

    Args:
        x: Forcing tensor whose first axis indexes the yearly segments. It must
            already live on the device the decoder runs on.
        y: Observation tensor; it is flattened with ``reshape(-1)`` before
            being compared with the simulation.
        eval_fun: Goodness-of-fit callable invoked as
            ``eval_fun(simulated_array=..., observed_array=...)`` with 1-D
            NumPy arrays.
        decoder_model: Optional object exposing ``decode(codes, x)``. When
            ``None`` the notebook-level ``decoder`` is looked up at call time.
        device: Optional device the candidate vectors are moved to. When
            ``None`` the notebook-level ``computing_device`` is used.
    """

    def __init__(self, x, y, eval_fun, decoder_model=None, device=None):
        self.eval_fun = eval_fun

        self.x = x.contiguous()
        self.year = x.shape[0]  # the long time series is split into x.shape[0] years

        self.y = y

        # Explicit dependencies are optional so existing three-argument calls
        # keep working and fall back to the notebook globals.
        self.decoder_model = decoder_model
        self.device = device

    def _simulate(self, solutions):
        """Run the decoder for every candidate and return ``(pred, ob)``.

        ``solutions`` is a NumPy array holding either one latent vector (1-D)
        or a batch of them (2-D, one per row). ``pred`` has one row per
        candidate containing the concatenated multi-year simulation; ``ob`` is
        the flattened observation series.
        """
        # A single candidate is promoted to a batch of one.
        if solutions.ndim == 1:
            solutions = np.expand_dims(solutions, axis=0)
        batch_size = solutions.shape[0]

        model = decoder if self.decoder_model is None else self.decoder_model
        target_device = computing_device if self.device is None else self.device

        # Tile the yearly segments once per candidate: [x0..xY, x0..xY, ...].
        x = self.x.repeat(batch_size, 1, 1).contiguous()
        # Combine the observations into one multi-year series.
        y = self.y.reshape(-1).contiguous()

        codes = torch.from_numpy(solutions).to(dtype=torch.float32).to(target_device)
        # Repeat every candidate once per year so that row i * year + j pairs
        # candidate i with segment j, matching the tiling of x above.
        codes = codes.repeat_interleave(self.year, dim=0)

        # Pure inference: do not record an autograd graph for each evaluation.
        with torch.no_grad():
            pred = model.decode(codes, x).reshape(batch_size, -1)

        return pred.detach().cpu().numpy(), y.detach().cpu().numpy()

    def eval(self, ga_instance, solutions, solution_idx):
        """PyGAD fitness function (batch mode).

        ``ga_instance`` and ``solution_idx`` belong to the fitness-function
        signature used by this notebook and are not read here.

        Returns:
            A list with one ``eval_fun`` score (Python float) per candidate.
            A 1-D ``solutions`` yields a one-element list.
        """
        pred, ob = self._simulate(solutions)
        return [
            float(self.eval_fun(simulated_array=row, observed_array=ob))
            for row in pred
        ]

    def predict_discharge(self, solutions):
        """Return ``(pred, ob)`` for the given candidate latent vector(s).

        ``pred`` is a 2-D NumPy array (one row per candidate) and ``ob`` the
        1-D flattened observation array.
        """
        return self._simulate(solutions)
      
      

In [6]:
# Pull one (inputs, targets) pair from each split via get_val_batch().
# Later cells index the inputs as [:, catchment, :, :] and the targets as
# [:, catchment, :], i.e. axis 1 selects the catchment.
x_batch_train_val, y_batch_train_val = dtrain_val.get_val_batch()
x_batch_test, y_batch_test = dtest.get_val_batch()

In [7]:
# Genetic-algorithm settings
num_generations = 500
num_parents_mating = 10

sol_per_pop = 200
num_genes = LATENT_dim  # one gene per latent dimension

# Column-wise extremes of the learned embedding table
min_vals, _ = catchment_embeddings.min(dim=0)
max_vals, _ = catchment_embeddings.max(dim=0)

# Widen every bound by 20 % away from the observed range. The factor depends on
# the sign so that the lower bound always moves down and the upper bound up.
min_scaled_values = [low * (1.2 if low < 0 else 0.8) for low in min_vals]
max_scaled_values = [high * (0.8 if high < 0 else 1.2) for high in max_vals]

# Plain Python floats, one per gene; these become the GA's initial range.
init_range_low = [float(low) for low in min_scaled_values]
init_range_high = [float(high) for high in max_scaled_values]

# GA operators
parent_selection_type = "sss"

crossover_type = "single_point"

mutation_type = "random"
mutation_probability = 0.25

In [8]:
def fitting_wrapper(selected_catchment, batch_size=50, random_seed=None):
    """Calibrate a latent vector for one catchment with a genetic algorithm.

    The GA maximises ``HydroErr.kge_2009`` on the training/validation batch of
    the selected catchment; the best solution found is then scored once on the
    test batch.

    Args:
        selected_catchment: Index along axis 1 of the batch tensors.
        batch_size: Number of candidate solutions PyGAD hands to the fitness
            function per call (``fitness_batch_size``).
        random_seed: Optional seed forwarded to ``pygad.GA`` so a run can be
            reproduced. ``None`` (default) keeps the previous unseeded
            behaviour.

    Returns:
        A ``(test_scores, solution)`` tuple. ``test_scores`` is the list
        returned by ``Objective_builder_batch.eval`` for the single best
        solution (one element); ``solution`` is the best latent vector as
        returned by ``ga_instance.best_solution()``.
    """
    # Subset the training/validation and test data of the selected catchment.
    x_train_val = x_batch_train_val[:, selected_catchment, :, :].to(computing_device)
    y_train_val = y_batch_train_val[:, selected_catchment, :].to(computing_device)

    x_test = x_batch_test[:, selected_catchment, :, :].to(computing_device)
    y_test = y_batch_test[:, selected_catchment, :].to(computing_device)

    # Objective used during the search and objective used for the final score.
    fn_train_val = Objective_builder_batch(x_train_val, y_train_val, HydroErr.kge_2009)
    fn_test = Objective_builder_batch(x_test, y_test, HydroErr.kge_2009)

    # stop_criteria="saturate_10" lets PyGAD end the run early once the best
    # fitness has stopped changing for 10 generations (see the PyGAD docs).
    ga_instance = pygad.GA(
        num_generations=num_generations,
        num_parents_mating=num_parents_mating,
        fitness_func=fn_train_val.eval,
        sol_per_pop=sol_per_pop,
        num_genes=num_genes,
        init_range_low=init_range_low,
        init_range_high=init_range_high,
        parent_selection_type=parent_selection_type,
        fitness_batch_size=batch_size,
        crossover_type=crossover_type,
        mutation_type=mutation_type,
        mutation_probability=mutation_probability,
        stop_criteria="saturate_10",
        random_seed=random_seed,
    )

    ga_instance.run()

    # best_solution() returns (solution, fitness, index); only the solution
    # itself is needed here.
    solution, _, _ = ga_instance.best_solution()

    # Hand cached GPU memory back between repeated calibrations.
    torch.cuda.empty_cache()

    return fn_test.eval(ga_instance, solution, 1), solution

In [9]:
REPEATS = 10
CATCHMENT_IDX = 0  # catchment (axis 1 of the batch tensors) calibrated here

# Length of the concatenated test series: one target window per yearly segment.
n_test_steps = y_batch_test.shape[0] * TARGET_SEQ_LENGTH

calibrated_KGES = np.ones(REPEATS)
camels_embeddings = np.ones([REPEATS, LATENT_dim])
preds = np.ones([REPEATS, n_test_steps])
# Placeholder only: `ob` is replaced by the 1-D observation array returned by
# predict_discharge on every iteration.
ob = np.ones([1, n_test_steps])

# The test data and its objective do not change between repeats, so they are
# built once instead of inside the loop.
x_test = x_batch_test[:, CATCHMENT_IDX, :, :].to(computing_device)
y_test = y_batch_test[:, CATCHMENT_IDX, :].to(computing_device)
fn_test = Objective_builder_batch(x_test, y_test, HydroErr.kge_2009)

for i in range(REPEATS):
    print(f'i={i} starts')
    # fitting_wrapper returns a one-element list of test scores plus the
    # calibrated latent vector.
    calibrated_KGE, camels_embedding = fitting_wrapper(CATCHMENT_IDX)
    calibrated_KGES[i] = calibrated_KGE[0]
    camels_embeddings[i, :] = camels_embedding

    # predict_discharge returns a (1, n_test_steps) prediction for a single
    # latent vector; NumPy drops the leading axis when storing it in preds[i].
    preds[i], ob = fn_test.predict_discharge(camels_embedding)

    print(f'fit={calibrated_KGES[i]}')


i=0 starts


 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314
 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328
 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342
 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356
 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370
 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384
 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398
 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412
 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426
 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440
 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454
 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468
 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482
 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496
 3497 

fit=0.8950190730008679
i=1 starts
fit=0.9237605152830838
i=2 starts
fit=0.7912154840168886
i=3 starts
fit=0.8090339093651917
i=4 starts
fit=0.897060520135855
i=5 starts
fit=0.8504251623964988
i=6 starts
fit=0.8350450744720426
i=7 starts
fit=0.9221166213648747
i=8 starts
fit=0.8832417447643821
i=9 starts
fit=0.928395282938269


In [11]:
RESULTS_DIR = "data/optimal_latent_variable_exp_results"
# np.savetxt does not create missing folders, so make sure the target exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Rescale every latent dimension to [0, 1] relative to the GA's initial range.
range_low = np.array(init_range_low)
range_high = np.array(init_range_high)
normalized_embeddings = (camels_embeddings - range_low) / (range_high - range_low)

# One CSV per result array; `ob` holds the observations from the last repeat.
for file_name, values in (
    ("KGEs.csv", calibrated_KGES),
    ("embeddings.csv", camels_embeddings),
    ("normalized_embeddings.csv", normalized_embeddings),
    ("ob.csv", ob),
    ("preds.csv", preds),
):
    np.savetxt(f"{RESULTS_DIR}/{file_name}", values, delimiter=",")