---
execute:
  cache: false
  eval: true
  echo: true
  warning: false
jupyter: python3
---

# Hyperparameter Tuning with PyTorch Lightning and User Data Sets  {#sec-light-user-data-1001}

In [None]:
#| echo: false
#| label: 1001_user_data_imports
import numpy as np
import os
from math import inf
import numpy as np
import warnings
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from spotpython.hyperdict.light_hyper_dict import LightHyperDict
from spotpython.fun.hyperlight import HyperLight
from spotpython.utils.init import (fun_control_init, surrogate_control_init, design_control_init)
from spotpython.utils.eda import print_res_table
from spotpython.hyperparameters.values import set_hyperparameter
from spotpython.spot import Spot
from math import inf
warnings.filterwarnings("ignore")

## Creating the Data

In [2]:
# epochs = 2000
# seeds = [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]
# Read the raw measurement table. The absolute path is machine-specific and
# must be adapted when running the notebook elsewhere.
data = pd.read_csv(
    "/Users/bartz/workspace/schu25a_netgen_gecco/data/data_man_tca88.csv"
)
# Tag every row with the name of the file it came from (scalar broadcast).
data["source_file"] = "tca88"

In [3]:

from spotpython.data.manydataset import ManyToManyDataset



In [None]:

def _as_column_list(columns):
    """Return *columns* as a new list, wrapping a single column name."""
    return [columns] if isinstance(columns, str) else list(columns)


def load_data(data,
              input_features=['V tot V red [m³/s]'],
              target='PI tot V [-]',
              drop=['Bereich u2red', 'source_file'],
              group_by='Bereich u2red',
              feature_scaling=None,
              target_scaling=None,
              create_dataset=True,
              dataset_type='many_to_many'):
    """Optionally scale a data frame and wrap its groups in a dataset.

    The function works on a copy of ``data``; the frame passed in by the
    caller is never modified.

    Args:
        data: pandas DataFrame holding the feature, target and grouping
            columns.
        input_features: Column name or list of column names that
            ``feature_scaling`` is applied to.
        target: Name of the target column. It is scaled by
            ``target_scaling`` and forwarded to the dataset class.
        drop: Column names forwarded unchanged to the dataset class.
        group_by: Column used to split ``data`` into one frame per group.
        feature_scaling: Object with a ``fit_transform`` method, or ``None``
            to leave the feature columns untouched.
        target_scaling: Object with a ``fit_transform`` method, or ``None``
            to leave the target column untouched.
        create_dataset: If false, only the (scaled) frame is returned.
        dataset_type: ``'many_to_many'`` or ``'many_to_one'``.

    Returns:
        The (scaled) DataFrame if ``create_dataset`` is false, otherwise a
        tuple ``(dataset, data)`` of the dataset object and that DataFrame.

    Raises:
        ValueError: If a dataset is requested with an unknown
            ``dataset_type``.
    """
    # Fail early instead of silently returning None for a misspelled type.
    if create_dataset and dataset_type not in ('many_to_many', 'many_to_one'):
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; "
            "expected 'many_to_many' or 'many_to_one'."
        )

    # Scaling assigns into columns, so work on a private copy.
    data = data.copy()

    if feature_scaling is not None:
        feature_cols = _as_column_list(input_features)
        data[feature_cols] = feature_scaling.fit_transform(data[feature_cols])

    if target_scaling is not None:
        # Select with a list so the scaler receives a 2-D, single-column
        # frame; scikit-learn scalers reject a 1-D Series.
        target_cols = _as_column_list(target)
        data[target_cols] = target_scaling.fit_transform(data[target_cols])

    if not create_dataset:
        return data

    groups = [group for _, group in data.groupby(group_by)]

    if dataset_type == 'many_to_many':
        return ManyToManyDataset(groups, target=target, drop=drop), data

    # Imported here because the file only imports ManyToManyDataset.
    # NOTE(review): assumes ManyToOneDataset lives in the same module as
    # ManyToManyDataset - confirm against the installed spotpython version.
    from spotpython.data.manydataset import ManyToOneDataset
    return ManyToOneDataset(groups, target=target, drop=drop), data
# Build one dataset per source file and step through a leave-one-out split of
# its sequences. Nothing is trained inside the loops: after they finish,
# `train_dataset` / `test_dataset` hold the split of the LAST iteration only,
# and `pred_dict` contains empty result lists per source file and seed.
kennlinienfelder = data.groupby("source_file")
pred_dict = {}

seeds = [42]
for data_name, kennlinienfeld_df in kennlinienfelder:
    print(f"kennlinienfeld: {data_name}")
    print(data_name)
    # Only the dataset is needed; the returned frame is discarded so the
    # module-level `data` is not rebound to a single group.
    ds, _ = load_data(kennlinienfeld_df,
                      input_features=['PI tot V [-]'],
                      target='V tot V red [m³/s]',
                      drop=['source_file', "Bereich u2red"],
                      group_by="Bereich u2red",
                      # feature_scaling=MinMaxScaler()
                      )

    pred_dict[data_name] = {}

    for seed in seeds:
        print(f"seed: {seed}")
        # seed_everything(seed)
        # NOTE(review): the seeded generator is created but not passed to
        # anything in this cell - confirm whether it is still needed.
        g = torch.Generator()
        g.manual_seed(seed)

        # One empty result list per recorded quantity.
        pred_dict[data_name][seed] = {
            key: [] for key in ('x', 'y_hat', 'y', 'mape', 'rmse')
        }

        # Leave-one-out: each dataset item is the test set exactly once.
        indices = list(range(len(ds)))
        for i in indices:
            test_indices = [i]
            train_indices = [index for index in indices if index != i]

            train_dataset = torch.utils.data.Subset(ds, train_indices)
            test_dataset = torch.utils.data.Subset(ds, test_indices)

## Preparing the spotpython Run

In [None]:
# Bounds applied to the core model's hyperparameters, in the order in which
# they are registered below.
hyperparameter_bounds = {
    # "optimizer": ["Adadelta", "Adam", "Adamax"],
    "rnn_units": [6, 12],
    "fc_units": [6, 12],
    "epochs": [6, 8],
    "batch_size": [6, 12],
    "dropout_prob": [0.0, 0.025],
    "patience": [2, 5],
    "lr_mult": [0.1, 20.0],
}

fun_control = fun_control_init(
    # Experiment bookkeeping and logging.
    PREFIX="1002",
    save_experiment=True,
    save_result=True,
    show_config=True,
    verbosity=1,
    log_level=50,
    TENSORBOARD_CLEAN=False,
    tensorboard_log=False,
    # Budget: no cap on the number of evaluations, only on run time.
    fun_evals=inf,
    fun_repeats=2,
    max_time=1,
    penalty_NA=200,
    ocba_delta=1,
    # Data: the same held-out subset serves as validation and test set.
    data_full_train=train_dataset,
    data_val=test_dataset,
    data_test=test_dataset,
    shuffle_train=False,
    shuffle_val=False,
    collate_fn_name="PadSequenceManyToMany",
    # Model.
    accelerator="cpu",
    core_model_name="light.regression.ManyToManyRNNRegressor",
    hyperdict=LightHyperDict,
    _L_in=1,
    _L_out=1,
)

for hyperparameter_name, bounds in hyperparameter_bounds.items():
    set_hyperparameter(fun_control, hyperparameter_name, bounds)

design_control = design_control_init(init_size=20, repeats=2)

surrogate_control = surrogate_control_init(log_level=50, noise=True)

fun = HyperLight().fun

spot_tuner = Spot(
    fun=fun,
    fun_control=fun_control,
    design_control=design_control,
    surrogate_control=surrogate_control,
)

In [None]:
from spotpython.utils.file import load_and_run_spot_python_experiment
# NOTE(review): ManyToManyDataset is not referenced in this cell; presumably
# it is imported so the stored experiment can be deserialized - confirm.
from spotpython.data.manydataset import ManyToManyDataset
# Run the experiment stored in "1002_exp.pkl".
# NOTE(review): presumably this is the file written by the setup cell
# (save_experiment=True, PREFIX="1002") - confirm. The ".pkl" suffix suggests
# a pickle, so only load files from a trusted source.
load_and_run_spot_python_experiment(filename="1002_exp.pkl")

In [None]:
#| label: 1001_user_data_run
# Start the tuning run on the configured Spot object and keep the returned
# value in `res`.
res = spot_tuner.run()

In [None]:
# Report the tuning results: a results table, then contour plots limited to
# max_imp=3 hyperparameters.
# NOTE(review): presumably max_imp selects the three most important
# hyperparameters - confirm against the spotpython documentation.
print_res_table(spot_tuner)
spot_tuner.plot_important_hyperparameter_contour(max_imp=3)

In [None]:
# Length of the tuner's `y` attribute.
# NOTE(review): presumably one entry per objective-function evaluation - confirm.
len(spot_tuner.y)