# Optimize hyperparameters of MLP for each mutation matrix
We will optimize the hyperparameters of the Multilayer Perceptron (MLP) model for each mutation matrix. The optimization is based on the mean absolute error (MAE) of the model over four validation seasons, from 2012NH to 2013SH.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from ast import literal_eval
import pickle
import gc

# self defined functions
import utilities

# for model development
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input, LeakyReLU
from sklearn.preprocessing import MinMaxScaler

# for hyperparameter optimization
import optuna

# for reproducibility, fix the seed used for random number generation
SEED = 100
tf.keras.utils.set_random_seed(SEED)

## Variables

In [2]:
# validation seasons, from 2012NH to 2013SH
Valid_Seasons = [
    '2012NH',
    '2012SH',
    '2013NH',
    '2013SH',
]

# one feature name per HA1 position, positions 1 to 329 inclusive
HA1_features = [f"HA1_{position}" for position in range(1, 330)]

# metadata features
meta_features = [
    'virus',          # virus avidity
    'serum',          # antiserum potency
    'virusPassCat',
    'serumPassCat',
]

# label to record which metadata is being used
metadata = 'a+p+vPC+sPC'

# identifier for the type of model to be used
model_name = 'MLP'

## Paths and filenames

In [3]:
# paths
path_data   = "../data/"   # path of data
path_result = "../results/SuppFig6_comparison/"   # results will be saved in this directory
Path(path_result+f"/optuna_{model_name}/").mkdir(parents=True, exist_ok=True)   # make directory (and missing parents) if it does not exist already

# filenames
mut_mat_fn  = path_data + "aaIndID_selected.txt"   # filename of list of valid mutation matrices
optimize_fn = path_result+f"SuppFig6_optimize_{model_name}_mut_mat_optuna.csv"   # to save optimization results

## Read valid mutation matrices used for encoding genetic difference

In [4]:
# identifiers of the mutation matrices to optimize for
# NOTE: the list is hard-coded in this cell; it is not read from a file here
mut_mat_List = ['RUSR970101']

## Indices of training and validation datasets for validation seasons

In [5]:
# load the binary-encoded dataset temporarily; it is only needed to derive the split indices
dummy = pd.read_csv(
    path_data+"nhts_ha1_binary.csv",
    converters={"seq_diff": literal_eval},
)

# one (train indices, validation indices) pair per validation season
indices_folds = []

for valid_season in Valid_Seasons:
    # Train Test Split
    #   - based on seasonal framework
    #   - Train: past virus isolates paired with past sera
    #   - Test: circulating virus isolates paired with past sera
    season_split = utilities.seasonal_trainTestSplit(dummy.copy(), valid_season)
    ind_train, ind_valid = season_split
    indices_folds.append((ind_train, ind_valid))

# release the temporary dataset and the per-season temporaries
del dummy, season_split, ind_train, ind_valid

## Objective function for optuna
The objective is to minimize the average MAE over validation seasons. This function will train the MLP model with the provided hyperparameters and return the average MAE.

> **Parameters**
> - trial: object of optuna

> **Returns**
> - avg_mae (float): MAE computed over the predictions pooled from all validation seasons

In [6]:
def _suggest_hyperparams(trial):
    """Sample one MLP configuration from the optuna search space.

    Suggestions are requested in the order: learning rate, epochs,
    number of hidden layers, then (units, dropout) for each hidden layer.

    Parameters
        - trial: object of optuna

    Returns
        - learning_rate (float): learning rate passed to the Adam optimizer
        - epochs (int): number of training epochs
        - hidden_layers (list): one (n_units, dropout) tuple per hidden layer
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    epochs = trial.suggest_int("epochs", 10, 200, step=10)
    n_layers = trial.suggest_int("n_layers", 1, 5)

    hidden_layers = []
    for layer in range(1, n_layers+1):
        n_units = trial.suggest_int(f"n_units_l{layer}", 100, 5000, step=100)   # search variable
        dropout = trial.suggest_float(f"dropout_l{layer}", 0.0, 0.5, step=0.1)   # search variable
        hidden_layers.append((n_units, dropout))

    return learning_rate, epochs, hidden_layers


def _build_mlp(n_features, hidden_layers, learning_rate):
    """Build and compile the MLP: Dense -> LeakyReLU -> Dropout per hidden layer.

    Parameters
        - n_features (int): number of input columns
        - hidden_layers (list): one (n_units, dropout) tuple per hidden layer
        - learning_rate (float): learning rate of the Adam optimizer

    Returns
        - model: compiled keras model with a single linear output unit
    """
    input1 = Input(shape=(n_features,))

    # hidden layers, the first one is fed by the input layer
    x1 = input1
    for n_units, dropout in hidden_layers:
        x1 = Dense(n_units)(x1)
        x1 = LeakyReLU()(x1)
        x1 = Dropout(dropout)(x1)

    # output layer
    x1 = Dense(1)(x1)

    model = tf.keras.models.Model(inputs=input1, outputs=x1)
    model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer=tf.optimizers.Adam(learning_rate=learning_rate))
    return model


def _prepare_fold(ind_train, ind_valid):
    """Build scaled inputs and targets of one validation season.

    Reads the module-level `data`, `HA1_features` and `meta_features`.
    The encoder and the scaler are fitted on the training rows only.

    Parameters
        - ind_train: positional indices of training rows in `data`
        - ind_valid: positional indices of validation rows in `data`

    Returns
        - X_train, y_train, X_valid, y_valid (numpy arrays);
          targets are column vectors of shape (n, 1)
    """
    # assign training and validation datasets, re-indexed from 0
    data_train = data.iloc[ind_train].reset_index(drop=True)
    data_valid = data.iloc[ind_valid].reset_index(drop=True)

    # input features (genetic difference), nan replaced with 0
    X_train = pd.DataFrame(data_train.seq_diff.to_list(),
                           index=data_train.index,
                           columns=HA1_features).fillna(0)
    X_valid = pd.DataFrame(data_valid.seq_diff.to_list(),
                           index=data_valid.index,
                           columns=HA1_features).fillna(0)

    # input features (metadata features), missing values become the category 'None'
    X_train_meta = data_train[meta_features].fillna('None').astype('str')
    X_valid_meta = data_valid[meta_features].fillna('None').astype('str')

    # one hot encoding; categories unseen during fit are ignored (handle_unknown='ignore')
    ohe = OneHotEncoder(handle_unknown='ignore')
    X_train_meta = ohe.fit_transform(X_train_meta).toarray()
    X_valid_meta = ohe.transform(X_valid_meta).toarray()

    X_train = np.hstack((X_train.values, X_train_meta))
    X_valid = np.hstack((X_valid.values, X_valid_meta))

    # input normalization
    normalizer = MinMaxScaler()
    X_train = normalizer.fit_transform(X_train)
    X_valid = normalizer.transform(X_valid)

    # target reshaping
    y_train = data_train.nht.values.reshape(-1, 1)
    y_valid = data_valid.nht.values.reshape(-1, 1)

    return X_train, y_train, X_valid, y_valid


def objective(trial):
    """Objective function for optuna: train the MLP and return the validation MAE.

    For every (train, validation) index pair in the module-level
    `indices_folds`, a fresh MLP is trained with the hyperparameters
    suggested by `trial` and used to predict the validation rows.
    The MAE is computed once over the measured and predicted values pooled
    from all validation seasons (not as a mean of per-season MAEs).

    Parameters
        - trial: object of optuna

    Returns
        - avg_mae (float): MAE over the pooled validation predictions
    """
    # hyperparameters do not depend on the season, so they are sampled once per trial
    learning_rate, epochs, hidden_layers = _suggest_hyperparams(trial)

    actual_all  = []   # to collect measured NHTs across validation seasons
    predict_all = []   # to collect predicted NHTs across validation seasons

    # loop through validation seasons
    for ind_train, ind_valid in indices_folds:
        X_train, y_train, X_valid, y_valid = _prepare_fold(ind_train, ind_valid)
        gc.collect()   # free the intermediate data frames before building the model

        # the input width is taken per season because it depends on the fitted encoder
        model = _build_mlp(X_train.shape[1], hidden_layers, learning_rate)

        # training and validation
        model.fit(X_train, y_train,
                  epochs=epochs,
                  batch_size=1024,
                  shuffle=True,
                  verbose=0)

        # reshape(-1) keeps a 1-D array even for a single validation pair;
        # squeeze() would give a 0-d array that np.concatenate rejects
        predict_valid = model.predict(X_valid, verbose=0).reshape(-1)

        # save actuals and predictions
        actual_all.append(y_valid.reshape(-1))
        predict_all.append(predict_valid)

    actuals     = np.concatenate(actual_all)
    predictions = np.concatenate(predict_all)

    # metric or loss (MAE)
    avg_mae = mean_absolute_error(actuals, predictions)

    return avg_mae

## Optimization

In [None]:
# loop through mutation matrices
for mut_mat in mut_mat_List:

    print("Mutation matrix: ", mut_mat)

    # Dataset
    # Genetic difference (seq_diff) encoded as per the mutation matrix
    # Converter is used to load the genetic difference saved as a list of floats
    # NOTE: `data` is a module-level name that is read by objective()
    data = pd.read_csv(path_data+f"nhts_ha1_{mut_mat}.csv",
                       converters={"seq_diff": literal_eval})

    # file used to persist the optuna study of this mutation matrix between runs
    study_fn = path_result+f"optuna_{model_name}/study_{mut_mat}.optuna"

    # Hyper-parameter optimization
    try:
        # load the optuna study object saved by a previous run
        with open(study_fn, "rb") as f:
            study = pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # no usable saved study: initialize a new optuna study
        # the sampler is seeded so that the suggested hyperparameters are reproducible
        study = optuna.create_study(direction="minimize",
                                    sampler=optuna.samplers.TPESampler(seed=SEED))
        n_trials = 50
    else:
        # a saved study was loaded: continue it with a few more trials
        n_trials = 5

    # optuna minimization
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)

    # Best hyperparameters
    hyperparams = {'model': model_name,
                   'metadata': metadata,
                   'mut_mat': mut_mat,
                   'mae': study.best_value
                  }
    hyperparams.update(study.best_params)
    print(hyperparams)

    # save to CSV
    utilities.saveDict2CSV([hyperparams], optimize_fn)

    # save the study object so that a later run can continue from it
    with open(study_fn, "wb") as f:
        pickle.dump(study, f)

Mutation matrix:  RUSR970101


[I 2023-10-13 03:58:46,063] A new study created in memory with name: no-name-f79e0369-b6e7-4287-ae85-aafac12e4771
[I 2023-10-13 04:02:15,776] Trial 0 finished with value: 1.0251533836819375 and parameters: {'learning_rate': 0.0003596278592172085, 'epochs': 10, 'n_layers': 2, 'n_units_l1': 4200, 'dropout_l1': 0.4, 'n_units_l2': 4000, 'dropout_l2': 0.5}. Best is trial 0 with value: 1.0251533836819375.
[I 2023-10-13 04:12:07,865] Trial 1 finished with value: 0.9728140740767982 and parameters: {'learning_rate': 0.0029831942429423097, 'epochs': 110, 'n_layers': 1, 'n_units_l1': 4900, 'dropout_l1': 0.30000000000000004}. Best is trial 1 with value: 0.9728140740767982.
[I 2023-10-13 04:27:49,210] Trial 2 finished with value: 0.9284832231609706 and parameters: {'learning_rate': 0.0003024328698422773, 'epochs': 20, 'n_layers': 5, 'n_units_l1': 3300, 'dropout_l1': 0.0, 'n_units_l2': 5000, 'dropout_l2': 0.30000000000000004, 'n_units_l3': 2300, 'dropout_l3': 0.5, 'n_units_l4': 3300, 'dropout_l4': 0