# Optimize hyperparameters of AdaBoost for each mutation matrix
We will optimize the hyperparameters of the AdaBoost model for each mutation matrix. The optimization is based on the MAE of the model over the four validation seasons from 2012NH to 2013SH.

## Imports

In [9]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from ast import literal_eval
import pickle

# self defined functions
import utilities

# for model development
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# for hyperparameter optimization
from hyperopt import Trials, tpe, hp, fmin, space_eval
from hyperopt.pyll import scope

# for parallel computation
from functools import partial
from joblib import Parallel, delayed

# for reproducibility, seed the global random number generators of
# Python's `random` module and of NumPy with the same fixed value
# (SEED is also reused below as the model's random_state and to seed hyperopt)
SEED = 100
random.seed(SEED)
np.random.seed(SEED)

## Variables

In [11]:
# validation seasons, from 2012NH to 2013SH
Valid_Seasons = [
    '2012NH',
    '2012SH',
    '2013NH',
    '2013SH',
]

# column names of the genetic difference features: HA1_1, HA1_2, ..., HA1_329
HA1_features = ["HA1_" + str(position) for position in range(1, 330)]

# metadata features
meta_features = [
    'virus',          # virus avidity
    'serum',          # antiserum potency
    'virusPassCat',
    'serumPassCat',
]

# label recording which metadata is being used
metadata = 'a+p+vPC+sPC'
# identifier for the type of model to be used
model_name = 'AdaBoost'

## Paths and filenames

In [12]:
# directories (kept as plain strings ending in "/" because file names are
# appended to them with "+" elsewhere in this notebook)
path_data = "../../Dataset H3N2/Crick/Encoded_Data/"   # where the encoded datasets are read from
path_result = "../results/SuppFig3_optimization/"   # where the results are written

# create the sub-directory for the pickled hyperopt trials if it is missing
Path(path_result, "hyperopt_trials").mkdir(parents=True, exist_ok=True)

# files
mut_mat_fn = f"{path_data}aaIndID_selected.txt"   # list of valid mutation matrices
optimize_fn = f"{path_result}SuppFig3c_optimize_mut_mat_hyperopt.csv"   # optimization results

## Read valid mutation matrices used for encoding genetic difference

In [13]:
# the file has no header row; its first column is taken as the list of
# mutation matrix identifiers
mut_mat_List = pd.read_csv(mut_mat_fn, header=None)[0].tolist()

## Indices of training and validation datasets for validation seasons

In [6]:
# Load one encoded dataset temporarily; it is only used to derive the fold
# indices and is deleted afterwards.
# NOTE(review): the indices are later applied with .iloc to the dataset of
# every mutation matrix, which presumably share the same row order - confirm.
dummy = pd.read_csv(f"{path_data}nhts_ha1_binary.csv",
                    converters={"seq_diff": literal_eval})

# Train/validation split for each validation season, based on the seasonal
# framework:
#   - Train: past virus isolates paired with past sera
#   - Test: circulating virus isolates paired with past sera
# Every entry of indices_folds is a (train indices, validation indices) tuple.
indices_folds = []
for valid_season in Valid_Seasons:
    ind_train, ind_valid = utilities.seasonal_trainTestSplit(dummy.copy(), valid_season)
    indices_folds.append((ind_train, ind_valid))

# drop the temporary dataset and the leftovers of the loop
del dummy, ind_train, ind_valid

## Objective function for hyperopt
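The objective is to minimize the MAE over the validation seasons. This function trains the AdaBoost model with the provided hyperparameters on the training split of each validation season and returns the MAE computed over the predictions pooled across all validation seasons.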

> **Parameters**
> - params (dict): dictionary of hyperparameters and corresponding values
> - data (dataframe): dataset

> **Returns**
> - avg_mae (float): MAE computed over the pooled predictions of all validation seasons

In [10]:
def objective(params, data):
    '''
    Return the validation MAE of an AdaBoost model for one set of hyperparameters.

    For every (train, validation) pair in the module-level ``indices_folds``,
    an AdaBoostRegressor with DecisionTreeRegressor base learners is fitted on
    the training rows and used to predict the validation rows. Measured and
    predicted values of all folds are pooled and one MAE is computed over the
    pooled values (so larger folds weigh more than smaller ones).

    Parameters
    ----------
    params : dict
        Hyperparameters with the keys 'n_estimators', 'learning_rate',
        'max_features' and 'max_depth'. The dict is not modified.
    data : pandas.DataFrame
        Dataset providing the column 'seq_diff' (one list of values per row),
        the columns listed in ``meta_features`` and the target column 'nht'.

    Returns
    -------
    float
        MAE over the pooled predictions of all validation seasons.
    '''
    # A tree depth below 1 is not usable. The search space used with this
    # objective draws max_depth with hp.quniform(..., q=10), which can round
    # small draws down to 0. Clamp once, on a local value, so that the
    # caller's dict is left untouched.
    max_depth = max(1, params['max_depth'])

    actual_all  = []   # to collect measured NHTs across validation seasons
    predict_all = []   # to collect predicted NHTs across validation seasons

    # loop through validation seasons
    for ind_train, ind_valid in indices_folds:
        # training and validation datasets, re-indexed from 0
        data_train = data.iloc[ind_train].reset_index(drop=True)
        data_valid = data.iloc[ind_valid].reset_index(drop=True)

        # input features (genetic difference); missing entries become 0
        X_train = pd.DataFrame(data_train.seq_diff.to_list(),
                               index=data_train.index,
                               columns=HA1_features).fillna(0)
        X_valid = pd.DataFrame(data_valid.seq_diff.to_list(),
                               index=data_valid.index,
                               columns=HA1_features).fillna(0)

        # input features (metadata), treated as categorical strings
        X_train_meta = data_train[meta_features].fillna('None').astype('str')
        X_valid_meta = data_valid[meta_features].fillna('None').astype('str')

        # one hot encoding, fitted on the training rows only; categories that
        # appear only in the validation rows are encoded as all zeros
        ohe = OneHotEncoder(handle_unknown='ignore')
        X_train_meta = ohe.fit_transform(X_train_meta).toarray()
        X_valid_meta = ohe.transform(X_valid_meta).toarray()

        # genetic difference and metadata features side by side
        X_train = np.hstack((X_train.values, X_train_meta))
        X_valid = np.hstack((X_valid.values, X_valid_meta))

        # training and validation
        model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=max_depth,
                                                        max_features=params['max_features']),
                                  n_estimators=params['n_estimators'],
                                  learning_rate=params['learning_rate'],
                                  random_state=SEED)
        model.fit(X_train, data_train.nht.values)
        predict_valid = model.predict(X_valid)

        # save actuals and predictions
        actual_all.append(data_valid.nht.values)
        predict_all.append(predict_valid)

    # pool all validation seasons before computing the metric
    actuals     = np.concatenate(actual_all)
    predictions = np.concatenate(predict_all)

    # metric or loss (MAE)
    avg_mae = mean_absolute_error(actuals, predictions)

    return avg_mae

## Optimization function for a mutation matrix

In [15]:
'''
Objective function with data
'''
def objective_data(params, data):
    '''
    Evaluate ``objective`` for the hyperparameters ``params`` on ``data``.

    Thin wrapper around ``objective``; its ``data`` argument is bound with
    functools.partial so that hyperopt can call it with ``params`` alone.

    Returns whatever ``objective(params, data)`` returns.
    '''
    return objective(params, data)


'''
optimization function for a single mutation matrix
'''
def optimize_mutMat(mut_mat):
    '''
    Optimize the AdaBoost hyperparameters for a single mutation matrix.

    Reads the dataset encoded with ``mut_mat``, minimizes ``objective_data``
    over the search space with hyperopt (TPE), saves the best hyperparameters
    together with their loss through ``utilities.saveDict2CSV`` and pickles
    the hyperopt trials object.

    If a pickled trials object for ``mut_mat`` can be loaded, the search is
    resumed and 5 evaluations are run on top of those already recorded;
    otherwise a fresh search with 100 evaluations is started.

    Parameters
    ----------
    mut_mat : str
        Identifier of the mutation matrix; used in the dataset and trials
        file names.

    Returns
    -------
    None
    '''
    # Genetic difference (seq_diff) encoded as per the mutation matrix
    # Converter is used to load the genetic difference saved as a list of floats
    data = pd.read_csv(path_data+f"nhts_ha1_{mut_mat}.csv",
                       converters={"seq_diff": literal_eval})

    # objective function with the dataset bound, callable with params alone
    fmin_objective_data = partial(objective_data, data=data)

    # trials file of this mutation matrix (read here, written at the end)
    trials_fn = path_result+f"hyperopt_trials/trials_{mut_mat}.hyperopt"

    # Resume from saved trials when available. Only a missing, unreadable or
    # corrupt trials file triggers a fresh start; any other error (and
    # KeyboardInterrupt) is no longer silently swallowed.
    # NOTE(security): pickle.load can execute arbitrary code - only load
    # trials files that were written by this notebook.
    try:
        with open(trials_fn, "rb") as f:
            trial = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError):
        # hyperopt initialize trials object
        trial = Trials()
        max_evals = 100
    else:
        max_evals = len(trial) + 5

    # hyperparameters search space
    space={'n_estimators': scope.int(hp.quniform('n_estimators', 10, 1000, 10)),
           'learning_rate': scope.float(hp.uniform('learning_rate', 0.1, 1.5)),
           'max_features': scope.float(hp.uniform('max_features', 0.1, 1)),
           'max_depth': scope.int(hp.quniform('max_depth', 1, 10000, 10))
          }

    # hyperopt minimization
    best = fmin(fn=fmin_objective_data,
                space=space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trial,
                rstate=np.random.default_rng(SEED))

    # best hyperparameters and their loss
    hyperparams = {'model': model_name,
                   'metadata': metadata,
                   'mut_mat': mut_mat,
                   'mae': trial.best_trial['result']['loss']}

    best_params = space_eval(space, best)
    # report the max depth that was actually used in the objective function,
    # which never goes below 1
    best_params['max_depth'] = max(1, best_params['max_depth'])

    hyperparams.update(best_params)
    print(hyperparams)

    # save results
    # NOTE(review): several worker processes may write to optimize_fn at the
    # same time - confirm that utilities.saveDict2CSV tolerates this.
    utilities.saveDict2CSV([hyperparams], optimize_fn)

    # save the trials object so that the search can be resumed later
    with open(trials_fn, "wb") as f:
        pickle.dump(trial, f)
    

## Parallel execution of hyperparameter optimization for each mutation matrix

In [16]:
# one optimization task per mutation matrix, with n_jobs=-1
Parallel(n_jobs=-1)(map(delayed(optimize_mutMat), mut_mat_List))

100%|██████████| 100/100 [19:02:41<00:00, 685.62s/trial, best loss: 0.7580750797209777]]8]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'MOHR870101', 'mae': 0.7580750797209777, 'learning_rate': 1.013395674902753, 'max_depth': 4310, 'max_features': 0.5848392370959125, 'n_estimators': 190}
100%|██████████| 100/100 [21:16:59<00:00, 766.19s/trial, best loss: 0.754419473746152]    
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'KOSJ950113', 'mae': 0.754419473746152, 'learning_rate': 0.9792050853020604, 'max_depth': 5230, 'max_features': 0.39280082597161403, 'n_estimators': 340}
100%|██████████| 100/100 [21:30:28<00:00, 774.29s/trial, best loss: 0.7583696847753996]] ]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'MIYS930101', 'mae': 0.7583696847753996, 'learning_rate': 1.251187458639534, 'max_depth': 7300, 'max_features': 0.27289592234603993, 'n_estimators': 410}
100%|██████████| 100/100 [21:34:01<00:00, 776.42s/trial, best loss: 0.7613514147869057

100%|██████████| 100/100 [25:52:26<00:00, 931.46s/trial, best loss: 0.7622724527436685]] ]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'NIEK910102', 'mae': 0.7622724527436685, 'learning_rate': 0.35978918137811877, 'max_depth': 8750, 'max_features': 0.4460957763834802, 'n_estimators': 540}
 75%|███████▌  | 75/100 [19:36:34<5:23:34, 776.58s/trial, best loss: 0.7538864255835724]1]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'JOHM930101', 'mae': 0.7577311218467401, 'learning_rate': 0.4603476262447509, 'max_depth': 8440, 'max_features': 0.854744363576084, 'n_estimators': 80}
100%|██████████| 100/100 [19:16:30<00:00, 693.91s/trial, best loss: 0.7591741577691978]] ]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'HENS920104', 'mae': 0.7591741577691978, 'learning_rate': 0.8420988671148246, 'max_depth': 6160, 'max_features': 0.936196109084406, 'n_estimators': 250}
 66%|██████▌   | 66/100 [17:10:35<13:10:13, 1394.53s/trial, best loss: 0.75362029323712

100%|██████████| 100/100 [23:59:12<00:00, 863.52s/trial, best loss: 0.7559365190373785]8]]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'KOSJ950109', 'mae': 0.7559365190373785, 'learning_rate': 0.7938027170163487, 'max_depth': 740, 'max_features': 0.7161498752715146, 'n_estimators': 170}
100%|██████████| 100/100 [25:42:02<00:00, 925.22s/trial, best loss: 0.7597367671469509] ] 
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'GONG920101', 'mae': 0.7597367671469509, 'learning_rate': 0.8130718243915634, 'max_depth': 4610, 'max_features': 0.8684099207316397, 'n_estimators': 910}
 89%|████████▉ | 89/100 [24:06:39<4:35:07, 1500.68s/trial, best loss: 0.753903305585497]] 
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'DOSZ010103', 'mae': 0.7541834991869458, 'learning_rate': 0.7249797798045267, 'max_depth': 1720, 'max_features': 0.2964981477828437, 'n_estimators': 380}
100%|██████████| 100/100 [33:37:47<00:00, 1210.67s/trial, best loss: 0.760514441908708

 94%|█████████▍| 94/100 [20:47:49<1:00:17, 602.98s/trial, best loss: 0.7599305973591503]]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'OGAK980101', 'mae': 0.761262069956679, 'learning_rate': 0.830120839844891, 'max_depth': 5270, 'max_features': 0.7704404513294377, 'n_estimators': 610}
 82%|████████▏ | 82/100 [21:15:18<4:15:59, 853.30s/trial, best loss: 0.7592134703414103] 
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'MUET020102', 'mae': 0.7605512664796038, 'learning_rate': 0.9283232001829835, 'max_depth': 5180, 'max_features': 0.5543423642264698, 'n_estimators': 660}
 93%|█████████▎| 93/100 [23:15:38<1:12:03, 617.67s/trial, best loss: 0.7592134703414103]]
{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'LUTR910105', 'mae': 0.7599305973591503, 'learning_rate': 0.19671166618561486, 'max_depth': 5290, 'max_features': 0.46144891135868826, 'n_estimators': 20}
100%|██████████| 100/100 [25:53:05<00:00, 931.86s/trial, best loss: 0.7563426151853693]1]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]


{'model': 'AdaBoost', 'metadata': 'a+p+vPC+sPC', 'mut_mat': 'NGPC000101', 'mae': 0.7597523919515171, 'learning_rate': 0.8074218158831638, 'max_depth': 5010, 'max_features': 0.5299895594656931, 'n_estimators': 560}
