# Optimize hyperparameters of AdaBoost for each mutation matrix
We optimize the hyperparameters of the AdaBoost model for each mutation matrix. The optimization is based on the model's MAE over four validation seasons, from 2012NH to 2013SH.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from ast import literal_eval
import pickle

# self defined functions
import utilities

# for model development
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# for hyperparameter optimization
from hyperopt import Trials, tpe, hp, fmin, space_eval
from hyperopt.pyll import scope

# for reproducibility, fix the randomly generated numbers:
# seed both Python's `random` module and NumPy's global random state with the same value
SEED = 100
random.seed(SEED)
np.random.seed(SEED)

## Variables

In [2]:
# validation seasons, in chronological order: 2012NH, 2012SH, 2013NH, 2013SH
Valid_Seasons = [f"{year}{hemisphere}" for year in (2012, 2013) for hemisphere in ("NH", "SH")]

# one feature name per HA1 position, HA1_1 ... HA1_329
HA1_features = ["HA1_%d" % position for position in range(1, 330)]

# metadata features
meta_features = ['virus',          # virus avidity
                 'serum',          # antiserum potency
                 'virusPassCat',
                 'serumPassCat']

# labels recorded alongside the optimization results
metadata   = 'a+p+vPC+sPC'   # which metadata features are being used
model_name = 'AdaBoost'      # identifier for the type of model
mut_mat    = 'one_hot'       # identifier for the mutation matrix / encoding

# the 20 valid amino acids (single-letter codes), in the order used for one-hot encoding
aa = list("ARNDCQEGHILKMFPSTWYV")

## Paths and filenames

In [3]:
# directories
path_data   = "../data/"                            # where the input data lives
path_result = "../results/SuppFig3_optimization/"   # where results are written

# create the sub-directory for the pickled hyperopt trials (and any missing parents);
# nothing happens if it already exists
(Path(path_result) / "hyperopt_trials").mkdir(parents=True, exist_ok=True)

# files
data_fn     = f"{path_data}nhts_ha1_binary.csv"                        # input data
optimize_fn = f"{path_result}SuppFig3c_optimize_mut_mat_hyperopt.csv"   # optimization results

## Read data
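- Binary encoded genetic difference (seq_diff) (not used in this notebook; the genetic features are instead built from the `virusSeq` and `serumSeq` columns)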

In [4]:
# Load the input table from data_fn. `objective` reads the columns virus, serum, virusDate,
# serumDate, virusSeq, serumSeq, virusPassCat, serumPassCat and nht (the regression target).
data = pd.read_csv(data_fn)

## Objective function for hyperopt
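The objective is to minimize the MAE over the validation seasons. This function trains the model with the provided hyperparameters on each validation season's training split, pools the validation predictions of all seasons, and returns the MAE over the pooled predictions (a single MAE across all validation pairs, not a mean of per-season MAEs).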

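> **Parameters**
> - params (dict): dictionary of hyperparameters and corresponding values
> - data (dataframe): dataset; not an argument of the function, it is read from the notebook-level variable `data`

> **Returns**
> - avg_mae (float): MAE computed over the pooled predictions of all validation seasons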

In [5]:
def _split_sequences(seqs):
    """Turn a series of sequence strings into a dataframe with one column per HA1 position.

    Each string is split into single characters; the columns are named after
    `HA1_features`, so every sequence is expected to have len(HA1_features) characters.
    """
    return pd.DataFrame(seqs.apply(list).tolist(),
                        index=seqs.index,
                        columns=HA1_features)


def _encode_pairs(pairs, ohe_virus, ohe_serum, fit=False):
    """One-hot encode the virus and serum sequences of `pairs` and combine them.

    Parameters
    ----------
    pairs : dataframe with the columns `virusSeq` and `serumSeq`
    ohe_virus, ohe_serum : OneHotEncoder used for the virus / serum sequences
    fit : bool, fit the encoders on `pairs` before transforming (training split)
          instead of only transforming (validation split)

    Returns
    -------
    2-D integer array of 0/1: element-wise OR of the two one-hot encodings
    """
    virus_sites = _split_sequences(pairs.virusSeq)
    serum_sites = _split_sequences(pairs.serumSeq)

    if fit:
        virus_onehot = ohe_virus.fit_transform(virus_sites).toarray()
        serum_onehot = ohe_serum.fit_transform(serum_sites).toarray()
    else:
        virus_onehot = ohe_virus.transform(virus_sites).toarray()
        serum_onehot = ohe_serum.transform(serum_sites).toarray()

    # element-wise OR operation; "* 1" converts the boolean result to 0/1 integers
    return np.logical_or(virus_onehot, serum_onehot) * 1


def objective(params):
    """Loss function for hyperopt: validation MAE of AdaBoost for one hyperparameter set.

    For every season in `Valid_Seasons` the notebook-level dataframe `data` is split
    with `utilities.seasonal_trainTestSplit`, the features are encoded, an
    AdaBoostRegressor is trained on the training split and used to predict the
    validation split. The measured and predicted NHTs of all seasons are pooled
    and a single MAE is computed over the pooled values.

    Parameters
    ----------
    params : dict with the keys 'n_estimators', 'learning_rate', 'max_features'
             and 'max_depth'. The dict is not modified.

    Returns
    -------
    avg_mae : float, MAE over the pooled validation predictions of all seasons
    """
    # guard against a proposed max depth of less than 1 (DecisionTreeRegressor needs >= 1);
    # a local variable is used so that the caller's `params` dict is left untouched
    max_depth = max(1, params['max_depth'])

    actual_all  = []   # to collect measured NHTs across validation seasons
    predict_all = []   # to collect predicted NHTs across validation seasons

    # loop through validation seasons
    for valid_season in Valid_Seasons:
        # Train/test split based on the seasonal framework
        #   - Train: past virus isolates paired with past sera
        #   - Test: circulating virus isolates paired with past sera
        ind_train, ind_valid = utilities.seasonal_trainTestSplit(data[['virus', 'serum', 'virusDate', 'serumDate']],
                                                                 valid_season)

        # training and validation datasets, re-indexed from 0
        data_train = data.iloc[ind_train].reset_index(drop=True)
        data_valid = data.iloc[ind_valid].reset_index(drop=True)

        # Input features (genetic difference)
        # fixed encoding: for each site, encode an amino acid into a binary vector of length 20;
        # amino acids outside `aa` are encoded as all zeros (handle_unknown='ignore')
        ohe_virus = OneHotEncoder(categories=[aa] * len(HA1_features), handle_unknown='ignore')
        ohe_serum = OneHotEncoder(categories=[aa] * len(HA1_features), handle_unknown='ignore')

        X_train = _encode_pairs(data_train, ohe_virus, ohe_serum, fit=True)
        X_valid = _encode_pairs(data_valid, ohe_virus, ohe_serum)

        # Input features (metadata features)
        # missing values become the category 'None'; categories are learned from the
        # training split only, unseen validation categories are encoded as all zeros
        X_train_meta = data_train[meta_features].fillna('None').astype('str')
        X_valid_meta = data_valid[meta_features].fillna('None').astype('str')

        ohe = OneHotEncoder(handle_unknown='ignore')
        X_train_meta = ohe.fit_transform(X_train_meta).toarray()
        X_valid_meta = ohe.transform(X_valid_meta).toarray()

        X_train = np.hstack((X_train, X_train_meta))
        X_valid = np.hstack((X_valid, X_valid_meta))

        del X_train_meta, X_valid_meta

        # Training and validation
        model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=max_depth, max_features=params['max_features']),
                                  n_estimators=params['n_estimators'],
                                  learning_rate=params['learning_rate'],
                                  random_state=SEED)
        model.fit(X_train, data_train.nht.values)
        predict_valid = model.predict(X_valid)

        # save actuals and predictions
        actual_all.append(data_valid.nht.values)
        predict_all.append(predict_valid)

    # pool all seasons, then compute the metric or loss (MAE)
    actuals     = np.concatenate(actual_all)
    predictions = np.concatenate(predict_all)

    avg_mae = mean_absolute_error(actuals, predictions)

    return avg_mae

## Optimization

In [6]:
# file in which the hyperopt trials object is persisted between runs
trials_fn   = path_result+f"hyperopt_trials/trials_{mut_mat}.hyperopt"
N_NEW_EVALS = 45   # number of evaluations to run on top of any previously saved trials

try:
    # resume from the trials object saved by a previous run
    # NOTE: pickle.load can execute arbitrary code; only load trials files written by this notebook
    with open(trials_fn, "rb") as f:
        trial = pickle.load(f)
    max_evals = len(trial) + N_NEW_EVALS
except FileNotFoundError:
    # first run: hyperopt initialize trials object
    # (any other error, e.g. a corrupt trials file, is raised instead of silently
    #  starting over and later overwriting the saved trials)
    trial = Trials()
    max_evals = N_NEW_EVALS

# hyperparameters search space
# hp.quniform returns multiples of q, so 'max_depth' can be proposed as 0;
# the objective function clamps it to 1
space={'n_estimators': scope.int(hp.quniform('n_estimators', 10, 1000, 10)),
       'learning_rate': scope.float(hp.uniform('learning_rate', 0.1, 1.5)),
       'max_features': scope.float(hp.uniform('max_features', 0.1, 1)),
       'max_depth': scope.int(hp.quniform('max_depth', 1, 10000, 10))
      }

# hyperopt minimization
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trial,
            rstate=np.random.default_rng(SEED))

# save the trials object right away, so that the (expensive) evaluations are not
# lost if one of the reporting steps below fails
with open(trials_fn, "wb") as f:
    pickle.dump(trial, f)

# Best hyperparameters
hyperparams = {'model': model_name,
               'metadata': metadata,
               'mut_mat': mut_mat,
               'mae': trial.best_trial['result']['loss']}

best_params = space_eval(space, best)
# to avoid max depth less than 1 as we actually used in objective function
best_params['max_depth'] = max(1, best_params['max_depth'])

hyperparams.update(best_params)
print(hyperparams)

# save results
utilities.saveDict2CSV([hyperparams], optimize_fn)


100%|████| 50/50 [51:58:27<00:00, 4157.93s/trial, best loss: 0.7527625599483879]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'one_hot', 'mae': 0.7527625599483879, 'learning_rate': 1.0760969037630008, 'max_depth': 6490, 'max_features': 0.24438375582247596, 'n_estimators': 780}
