# Optimize hyperparameters of ResNet for each mutation matrix
We will optimize the hyperparameters of the Residual Neural Network (ResNet) model for each mutation matrix. The optimization is based on the MAE of the model over four validation seasons, from 2012NH to 2013SH.

## Imports

In [64]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from ast import literal_eval
import pickle
import gc

# self defined functions
import utilities

# for model development
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input, ReLU, Add, BatchNormalization
from sklearn.preprocessing import MinMaxScaler

# for hyperparameter optimization
import optuna

# for reproducibility, fix the seed of the random number generators
SEED = 100
tf.keras.utils.set_random_seed(SEED)

## Variables

In [65]:
# validation seasons, 2012NH through 2013SH
Valid_Seasons = [
    '2012NH',
    '2012SH',
    '2013NH',
    '2013SH',
]

# one column name per HA1 position: HA1_1 ... HA1_329
N_HA1_POSITIONS = 329
HA1_features = [f"HA1_{pos}" for pos in range(1, N_HA1_POSITIONS + 1)]

# metadata features that are one-hot encoded alongside the genetic difference
meta_features = ['virus',          # virus avidity
                 'serum',          # antiserum potency
                 'virusPassCat',
                 'serumPassCat']

# label recording which metadata is being used
metadata = 'a+p+vPC+sPC'

# identifier for the type of model to be used
model_name = 'ResNet'

## Paths and filenames

In [66]:
# directories
path_data   = "../data/"   # input datasets live here
path_result = "../results/SuppFig6_comparison/"   # all outputs are written below this directory

# files
mut_mat_fn  = path_data + "aaIndID_selected.txt"   # list of valid mutation matrices
optimize_fn = path_result+f"SuppFig6_optimize_{model_name}_mut_mat_optuna.csv"   # optimization results

# create the folder for the pickled optuna studies (no error if it already exists)
Path(path_result+f"/optuna_{model_name}/").mkdir(exist_ok=True, parents=True)

## Read valid mutation matrices used for encoding genetic difference

In [67]:
# mutation matrices to optimize for; hard-coded here to a single matrix
# (the list file `mut_mat_fn` defined above is not read in this cell)
mut_mat_List = ['AZAE970101']

## Indices of training and validation datasets for validation seasons

In [68]:
# The binary-encoded dataset is loaded only to derive the fold indices;
# it is discarded again at the end of this cell.
dummy = pd.read_csv(path_data+"nhts_ha1_binary.csv",
                    converters={"seq_diff": literal_eval})

# one (train indices, validation indices) pair per validation season
indices_folds = []

for valid_season in Valid_Seasons:
    # Train/test split based on the seasonal framework (per the original notes):
    #   - Train: past virus isolates paired with past sera
    #   - Test:  circulating virus isolates paired with past sera
    # The helper receives a copy of the frame.
    fold_train, fold_valid = utilities.seasonal_trainTestSplit(dummy.copy(), valid_season)
    indices_folds.append((fold_train, fold_valid))

# free the temporary dataset and the last fold's index objects
del dummy, fold_train, fold_valid

## Objective function for optuna
The objective is to minimize the average MAE over the validation seasons. For a single optuna trial, this function trains the model using the following hyperparameters (with values suggested by optuna) and returns the MAE computed over the pooled predictions of all validation seasons.
- learning_rate (float):  learning rate of the optimizer
- epochs (int): number of epochs
- n_units_linear (int): number of units/neurons in first linear layer
- n_resnet_blocks (int): number of ResNet blocks (recorded in the optuna study under the parameter name `n_layers`)
- units_rnb (int): number of units in ResNetBlock non-linear layer
- dropout_rnb (float): dropout ratio
- residual_dropout_rnb (float): residual dropout ratio after addition of input

> **Parameters**
> - trial object of optuna

> **Returns**
> - avg_mae (float): MAE averaged over validation seasons

In [69]:
def objective(trial):
    """Train and validate the ResNet for one optuna trial; return the pooled MAE.

    For every ``(train, valid)`` index pair in the module-level
    ``indices_folds`` the function builds the model inputs from the
    module-level ``data`` frame (genetic difference from ``seq_diff`` plus
    one-hot encoded ``meta_features``), min-max scales them, builds a ResNet
    from the hyperparameters suggested by ``trial``, fits it on the training
    rows and predicts the validation rows.

    Parameters
    ----------
    trial
        optuna trial object used to suggest the hyperparameters
        (``learning_rate``, ``epochs``, ``n_units_linear``, ``n_layers`` and,
        per block ``i``, ``n_units_rnb_i``, ``dropout_rnb_i``,
        ``res_dropout_rnb_i``).

    Returns
    -------
    avg_mae : float
        Mean absolute error over the concatenated predictions of all
        validation seasons (i.e. pooled over seasons, not a mean of
        per-season MAEs).
    """
    actual_all  = []   # to collect measured NHTs across validation seasons
    predict_all = []   # to collect predicted NHTs across validation seasons

    # loop through validation seasons
    for ind_train, ind_valid in indices_folds:
        # ---- Assign training and validation datasets ----
        # training dataset
        data_train = data.iloc[ind_train].copy()
        data_train.reset_index(drop=True, inplace=True)

        # validation dataset
        data_valid = data.iloc[ind_valid].copy()
        data_valid.reset_index(drop=True, inplace=True)

        # ---- Input features (genetic difference) ----
        # each seq_diff entry is expanded into one column per HA1 position
        X_train = pd.DataFrame(data_train.seq_diff.to_list(),
                               index=data_train.index,
                               columns=HA1_features)
        X_train.fillna(0, inplace=True)   # replace nan with 0

        X_valid = pd.DataFrame(data_valid.seq_diff.to_list(),
                               index=data_valid.index,
                               columns=HA1_features)
        X_valid.fillna(0, inplace=True)   # replace nan with 0

        # ---- Input features (metadata features) ----
        X_train_meta = data_train[meta_features].fillna('None').astype('str')
        X_valid_meta = data_valid[meta_features].fillna('None').astype('str')

        # one hot encoding, fitted on the training rows only; categories seen
        # only in the validation rows are encoded as all zeros
        ohe = OneHotEncoder(handle_unknown='ignore')
        X_train_meta = ohe.fit_transform(X_train_meta).toarray()
        X_valid_meta = ohe.transform(X_valid_meta).toarray()

        X_train = np.hstack((X_train.values, X_train_meta))
        X_valid = np.hstack((X_valid.values, X_valid_meta))

        del X_train_meta, X_valid_meta

        # ---- Scaling ----
        # input normalization, fitted on the training rows only
        normalizer = MinMaxScaler()
        X_train    = normalizer.fit_transform(X_train)
        X_valid    = normalizer.transform(X_valid)

        # target reshaping
        y_train = data_train.nht.values.reshape(-1, 1)
        y_valid = data_valid.nht.values.reshape(-1, 1)

        del data_train, data_valid
        gc.collect()

        # ---- Model ----
        # A new model is built for every fold of every trial. Reset the Keras
        # global state first so that state left behind by previous models does
        # not accumulate in memory over the whole optimization run.
        tf.keras.backend.clear_session()

        # hyperparameters for optimization
        # (optuna returns the same value when a name is suggested again
        # within the same trial, so all folds share one configuration)
        learning_rate  = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
        epochs         = trial.suggest_int("epochs", 10, 200, step=10)
        n_units_linear = trial.suggest_int("n_units_linear", 100, 5000, step=100)

        # input to the model
        input1 = Input(shape=(X_train.shape[1],))

        # initial Linear layer
        x1 = Dense(n_units_linear)(input1)

        # ResNetBlocks
        n_resnet_blocks = trial.suggest_int("n_layers", 1, 5)

        for resnet_block in range(1, n_resnet_blocks+1):
            # hyperparameters for optimization
            units_rnb            = trial.suggest_int(f"n_units_rnb_{resnet_block}", 100, 5000, step=100)
            dropout_rnb          = trial.suggest_float(f"dropout_rnb_{resnet_block}", 0.0, 0.5, step=0.1)
            residual_dropout_rnb = trial.suggest_float(f"res_dropout_rnb_{resnet_block}", 0.0, 0.5, step=0.1)

            x1 = BatchNormalization()(x1)
            x1 = Dense(units_rnb, activation='relu')(x1)
            x1 = Dropout(dropout_rnb)(x1)
            # project back to the input width so that the addition below is valid
            x1 = Dense(X_train.shape[1])(x1)
            x1 = Dropout(residual_dropout_rnb)(x1)
            # NOTE(review): the skip connection adds the network input
            # (input1) in every block rather than the block's own input;
            # kept as is -- confirm this is intended.
            x1 = Add()([x1, input1])

        # Prediction block
        x1 = BatchNormalization()(x1)
        x1 = ReLU()(x1)
        x1 = Dense(1)(x1)

        model = tf.keras.models.Model(inputs=input1, outputs = x1)
        model.compile(loss = tf.keras.losses.MeanSquaredError(),
                      optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
                     )

        # ---- Training and validation ----
        model.fit(X_train, y_train,
                  epochs = epochs,
                  batch_size = 1024,
                  shuffle = True,
                  verbose=0
                 )
        predict_valid = model.predict(X_valid, verbose=0).squeeze()

        # ---- Save actuals and predictions ----
        actual_all.append(y_valid.squeeze())
        predict_all.append(predict_valid)

        # drop the reference to the fitted model before the next fold
        del model

        ##################
        # End seasons loop
        ##################

    actuals     = np.concatenate(actual_all)
    predictions = np.concatenate(predict_all)

    # metric or loss (MAE) over the pooled validation predictions
    avg_mae = mean_absolute_error(actuals, predictions)

    return avg_mae

## Optimization

In [70]:
# Loop through mutation matrices. For each one: load its encoded dataset,
# run (or resume) the optuna study, record the best hyperparameters in the
# results CSV and pickle the study so that a later run can continue from it.
for mut_mat in mut_mat_List:

    print("Mutation matrix: ", mut_mat)

    # ---- Dataset ----
    # Genetic difference (seq_diff) encoded as per the mutation matrix
    # Converter is used to load the genetic difference saved as a list of floats
    # `data` is read as a module-level name by objective()
    data = pd.read_csv(path_data+f"nhts_ha1_{mut_mat}.csv",
                       converters={"seq_diff": literal_eval})

    # ---- Hyper-parameter optimization ----
    # file holding the pickled optuna study of this mutation matrix
    study_fn = path_result+f"optuna_{model_name}/study_{mut_mat}.optuna"

    try:
        # resume a previously saved optuna study
        # NOTE: unpickling can execute arbitrary code; only load study files
        # that were written by this notebook.
        with open(study_fn, "rb") as f:
            study = pickle.load(f)
    except FileNotFoundError:
        # no saved study yet: start a new one.
        # Only the "file does not exist" case is handled here, so that a
        # corrupt/incompatible study file or a user interrupt is not silently
        # replaced by a fresh study (which would overwrite the file below).
        # NOTE(review): no sampler seed is passed, so the search itself is not
        # reproducible run-to-run; consider a seeded sampler if that is wanted.
        study = optuna.create_study(direction="minimize")
        n_trials = 50
    else:
        # a saved study was loaded: only add a few more trials
        n_trials = 5

    # optuna minimization
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)

    # ---- Best hyperparameters ----
    hyperparams = {'model': model_name,
                   'metadata': metadata,
                   'mut_mat': mut_mat,
                   'mae': study.best_value
                  }
    hyperparams.update(study.best_params)
    print(hyperparams)

    # save to CSV
    utilities.saveDict2CSV([hyperparams], optimize_fn)

    # ---- Save the study object ----
    with open(study_fn, "wb") as f:
        pickle.dump(study, f)

Mutation matrix:  AZAE970101


[I 2023-10-08 01:27:08,242] A new study created in memory with name: no-name-a49db103-11fd-4318-af4e-f1d229269632
[I 2023-10-08 01:37:25,358] Trial 0 finished with value: 0.9863022943761699 and parameters: {'learning_rate': 0.0017359564480423334, 'epochs': 110, 'n_units_linear': 3900, 'n_layers': 4, 'n_units_rnb_1': 2600, 'dropout_rnb_1': 0.30000000000000004, 'res_dropout_rnb_1': 0.4, 'n_units_rnb_2': 2400, 'dropout_rnb_2': 0.2, 'res_dropout_rnb_2': 0.5, 'n_units_rnb_3': 1800, 'dropout_rnb_3': 0.5, 'res_dropout_rnb_3': 0.2, 'n_units_rnb_4': 2100, 'dropout_rnb_4': 0.1, 'res_dropout_rnb_4': 0.1}. Best is trial 0 with value: 0.9863022943761699.
[I 2023-10-08 01:47:50,074] Trial 1 finished with value: 0.9588899217906008 and parameters: {'learning_rate': 0.013587720872108043, 'epochs': 110, 'n_units_linear': 2500, 'n_layers': 3, 'n_units_rnb_1': 3900, 'dropout_rnb_1': 0.5, 'res_dropout_rnb_1': 0.1, 'n_units_rnb_2': 5000, 'dropout_rnb_2': 0.4, 'res_dropout_rnb_2': 0.1, 'n_units_rnb_3': 1900,

[I 2023-10-08 04:39:08,039] Trial 18 finished with value: 0.9203784602764838 and parameters: {'learning_rate': 0.04561531910519763, 'epochs': 200, 'n_units_linear': 5000, 'n_layers': 2, 'n_units_rnb_1': 4400, 'dropout_rnb_1': 0.2, 'res_dropout_rnb_1': 0.2, 'n_units_rnb_2': 3800, 'dropout_rnb_2': 0.5, 'res_dropout_rnb_2': 0.1}. Best is trial 17 with value: 0.8890298436097646.
[I 2023-10-08 04:48:50,833] Trial 19 finished with value: 1.2300378359490702 and parameters: {'learning_rate': 0.005343436741534966, 'epochs': 170, 'n_units_linear': 4400, 'n_layers': 1, 'n_units_rnb_1': 3100, 'dropout_rnb_1': 0.1, 'res_dropout_rnb_1': 0.30000000000000004}. Best is trial 17 with value: 0.8890298436097646.
[I 2023-10-08 05:05:49,980] Trial 20 finished with value: 0.864398197920666 and parameters: {'learning_rate': 0.029552605163529966, 'epochs': 170, 'n_units_linear': 4500, 'n_layers': 2, 'n_units_rnb_1': 4400, 'dropout_rnb_1': 0.0, 'res_dropout_rnb_1': 0.2, 'n_units_rnb_2': 3800, 'dropout_rnb_2': 0

[I 2023-10-08 08:32:52,765] Trial 43 finished with value: 0.8926524137774389 and parameters: {'learning_rate': 0.06353679518825933, 'epochs': 120, 'n_units_linear': 3500, 'n_layers': 1, 'n_units_rnb_1': 4600, 'dropout_rnb_1': 0.0, 'res_dropout_rnb_1': 0.0}. Best is trial 20 with value: 0.864398197920666.
[I 2023-10-08 08:40:34,220] Trial 44 finished with value: 1.1526099345515683 and parameters: {'learning_rate': 0.016292522987740934, 'epochs': 90, 'n_units_linear': 3800, 'n_layers': 2, 'n_units_rnb_1': 4200, 'dropout_rnb_1': 0.1, 'res_dropout_rnb_1': 0.0, 'n_units_rnb_2': 2700, 'dropout_rnb_2': 0.1, 'res_dropout_rnb_2': 0.4}. Best is trial 20 with value: 0.864398197920666.
[I 2023-10-08 08:51:29,930] Trial 45 finished with value: 1.2595668367128259 and parameters: {'learning_rate': 0.027886680891849026, 'epochs': 120, 'n_units_linear': 4100, 'n_layers': 3, 'n_units_rnb_1': 3000, 'dropout_rnb_1': 0.0, 'res_dropout_rnb_1': 0.1, 'n_units_rnb_2': 900, 'dropout_rnb_2': 0.30000000000000004,

{'model': 'ResNet', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'AZAE970101', 'mae': 0.864398197920666, 'learning_rate': 0.029552605163529966, 'epochs': 170, 'n_units_linear': 4500, 'n_layers': 2, 'n_units_rnb_1': 4400, 'dropout_rnb_1': 0.0, 'res_dropout_rnb_1': 0.2, 'n_units_rnb_2': 3800, 'dropout_rnb_2': 0.5, 'res_dropout_rnb_2': 0.0}
