# Comparison of multiple models
We will evaluate the performance of multiple optimized models over 14 test seasons from 2014NH to 2020SH.
- AdaBoost (NextFlu matched params)
    - mut_mat = 'binary'
    - meta_features = ['virus', 'serum']
    - metadata = 'a+p'
    - model_name = 'AdaBoost_binary'
- Random forest (RF)
    - mut_mat = 'AZAE970101'
    - meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']
    - metadata = 'a+p+vPC+sPC'
    - model_name = 'RF'
- eXtreme Gradient Boosting (XGBoost)
    - mut_mat = 'GIAG010101'
    - meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']
    - metadata = 'a+p+vPC+sPC'
    - model_name = 'XGBoost'
- Multilayer perceptron (MLP)
    - mut_mat = 'WEIL970102'
    - meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']
    - metadata = 'a+p+vPC+sPC'
    - model_name = 'MLP'
- Residual neural network (ResNet)
    - mut_mat = 'MUET010101'
    - meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']
    - metadata = 'a+p+vPC+sPC'
    - model_name = 'ResNet'

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from ast import literal_eval
import random

# self defined functions and models
import utilities
import model_utilities

# for encoding of metadata information
from sklearn.preprocessing import OneHotEncoder

# for parallel computation
from joblib import Parallel, delayed

# For reproducibility: fix the seeds of both NumPy's and Python's global
# random number generators so that repeated runs draw the same numbers.
SEED = 100
np.random.seed(SEED)
random.seed(SEED)

## Variables

In [2]:
# --- configuration of this run ---
mut_mat    = 'WEIL970102'    # mutation matrix
model_name = 'MLP'           # the type of model to be used
metadata   = 'a+p+vPC+sPC'   # label to record which metadata is being used

# test seasons from 2014NH to 2020SH (NH then SH for every year)
Test_Seasons = [f"{year}{hemisphere}"
                for year in range(2014, 2021)
                for hemisphere in ("NH", "SH")]

# one feature name per HA1 position, numbered 1 to 329
HA1_features = ["HA1_" + str(position) for position in range(1, 330)]

# metadata features: 'virus' is the virus avidity, 'serum' the antiserum potency
meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']

## Paths and filenames

In [3]:
# directories
path_data   = "../data/"                          # where the input data is read from
path_result = "../results/SuppFig6_comparison/"   # where the results are written to
# create the results directory (and any missing parents); no error if it already exists
Path(path_result).mkdir(parents=True, exist_ok=True)

# files
data_fn    = f"{path_data}nhts_ha1_{mut_mat}.csv"         # input data for the chosen mutation matrix
results_fn = f"{path_result}SuppFig6_testScores.csv"      # to save performance scores for test dataset

## Read data
- Genetic difference (seq_diff) encoded as per the mutation matrix
- Converter is used to load the genetic difference saved as a list of floats

In [4]:
# seq_diff is stored in the CSV as the text of a list of floats;
# literal_eval turns each cell back into a Python list while loading
data = pd.read_csv(data_fn, converters=dict(seq_diff=literal_eval))

## Function to compute performance of model for a given season
- Split the data into training and test datasets
- Prepare encoded inputs (genetic difference and metadata features)
- Train and test the model
- Compute predictive performance

> **Parameters**
> - season (str): identifier for the Northern or Southern Hemisphere season such as "2015NH"

> **Returns**
> - (str): the season identifier that was passed in
> - (numpy array): actual NHTs for the given season
> - (numpy array): predicted NHTs for the given season

In [5]:
def train_test_season(season):
    """Train the configured model for one test season and score its predictions.

    The rows of the module-level ``data`` are split with
    ``utilities.seasonal_trainTestSplit``. The inputs are the genetic
    difference columns followed by the one-hot encoded metadata features.
    The model function ``model_<model_name>`` from ``model_utilities`` is
    called on them, and the test targets and predictions are passed to
    ``utilities.compute_scores`` together with ``results_fn``.

    Parameters:
        season (str): identifier of the test season, such as "2015NH"

    Returns:
        tuple: ``(season, y_test, pred_test)`` where ``y_test`` is the numpy
        array of actual NHTs of the test rows and ``pred_test`` is the
        ``'pred_test'`` entry of the result returned by the model function
    """
    print(season)

    def genetic_inputs(frame):
        # expand the per-row seq_diff lists into one column per HA1 feature,
        # replacing nan with 0
        table = pd.DataFrame(frame.seq_diff.to_list(),
                             index=frame.index,
                             columns=HA1_features)
        return table.fillna(0).values

    def metadata_inputs(frame):
        # metadata as strings, with missing entries labelled 'None'
        return frame[meta_features].fillna('None').astype('str')

    # ---- Train Test Split (seasonal framework) ----
    #   Train: past virus isolates paired with past sera
    #   Test:  circulating virus isolates paired with past sera
    split_columns = ['virus', 'serum', 'virusDate', 'serumDate']
    ind_train, ind_test = utilities.seasonal_trainTestSplit(data[split_columns], season)

    train_rows = data.iloc[ind_train].copy().reset_index(drop=True)
    test_rows  = data.iloc[ind_test].copy().reset_index(drop=True)

    # ---- Input features (genetic difference) ----
    genetic_train = genetic_inputs(train_rows)
    genetic_test  = genetic_inputs(test_rows)

    # ---- Input features (metadata features) ----
    # The encoder is fitted on the training rows only; categories that occur
    # only in the test rows are ignored (handle_unknown='ignore').
    encoder = OneHotEncoder(handle_unknown='ignore')
    meta_train = encoder.fit_transform(metadata_inputs(train_rows)).toarray()
    meta_test  = encoder.transform(metadata_inputs(test_rows)).toarray()

    # genetic columns first, metadata columns after them
    X_train = np.hstack((genetic_train, meta_train))
    X_test  = np.hstack((genetic_test, meta_test))
    del genetic_train, genetic_test, meta_train, meta_test

    # ---- Target (NHT) ----
    y_train, y_test = train_rows.nht.values, test_rows.nht.values
    del train_rows, test_rows

    # ---- Training and testing ----
    # look up the model function selected by model_name
    fit_and_predict = getattr(model_utilities, f"model_{model_name}")
    results = fit_and_predict(X_train, y_train, X_test=X_test)
    pred_test = results['pred_test']

    # ---- Test Scores ----
    # label columns that identify this run and season alongside the scores
    labels = (('mut_mat', mut_mat),
              ('model', model_name),
              ('metadata', metadata),
              ('season', season))
    cols, col_values = (list(part) for part in zip(*labels))
    utilities.compute_scores(y_test, pred_test, results_fn, col=cols, col_val=col_values)

    return season, y_test, pred_test

## Performance of the model
- For each test season from 2014NH to 2020SH
    - Split the data into training and test datasets
    - Prepare encoded inputs (genetic difference and metadata features)
    - Train and evaluate the model
    - Compute predictive performance
    - Save the predictions
- Compute average predictive performance

In [6]:
# Train and test every season in Test_Seasons through joblib.
# n_jobs=1 means the seasons are processed one after another.
result = Parallel(n_jobs=1, verbose=1)(map(delayed(train_test_season), Test_Seasons))

# each entry of result is (season, actual NHTs, predicted NHTs)
_, actual_all, predict_all = zip(*result)

# Micro-average over seasons: pool the values of all seasons before scoring
actual, predict = (np.concatenate(parts) for parts in (actual_all, predict_all))

# Test scores of the pooled values, recorded under the season label 'Average'
cols       = ['mut_mat', 'model', 'metadata', 'season']
col_values = [mut_mat, model_name, metadata, 'Average']
utilities.compute_scores(actual, predict, results_fn, col=cols, col_val=col_values)


print("Training and testing completed")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


2014NH
Time for training: 1268.380601644516
Training and testing completed


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 21.3min finished
