# Optimize metadata features
We will analyze the effect of metadata features on the MAE performance of the baseline model. We will use the baseline model (AdaBoost with default/unoptimized hyper-parameters) and binary-encoded genetic difference. The selection of features will be based on the performance of the model over four validation seasons from 2012NH to 2013SH.

## Imports

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from ast import literal_eval

# self defined functions and models
import utilities
import model_utilities

# for encoding of metadata information
from sklearn.preprocessing import OneHotEncoder

# for parallel computation
from functools import partial
from joblib import Parallel, delayed

# For reproducibility, seed both the NumPy and the stdlib random number
# generators with the same fixed value
SEED = 100
np.random.seed(SEED)
random.seed(SEED)

## Variables

In [3]:
# Mutation matrix used to encode the genetic difference
mut_mat = "binary"

# Validation seasons, from 2012NH to 2013SH
Valid_Seasons = ['2012NH', '2012SH', '2013NH', '2013SH']

# One column name per HA1 position: "HA1_1" ... "HA1_329"
HA1_features = [f"HA1_{position}" for position in range(1, 330)]

# Metadata features to include as model inputs.
# Leave the list empty for the run without metadata; otherwise choose from:
#   'virus'          virus avidity
#   'serum'          antiserum potency
#   'virusPassCat'
#   'serumPassCat'
meta_features = []

# Label to record which metadata is being used
metadata = 'no metadata'

# Type of model to be used (looked up as model_utilities.model_<model_name>)
model_name = 'baseline'

## Paths and filenames

In [4]:
# directories
path_data   = "../data/"   # input data are read from here
path_result = "../results/SuppFig3_optimization/"   # results are written here

# create the results directory (and any missing parents) unless it already exists
Path(path_result).mkdir(exist_ok=True, parents=True)

# files
data_fn    = f"{path_data}nhts_ha1_{mut_mat}.csv"   # input data for the chosen mutation matrix
results_fn = f"{path_result}SuppFig3a_optimize_metadata_validScores.csv"   # performance scores for the validation dataset

## Read data
- Genetic difference (seq_diff) encoded as per the mutation matrix
- Converter is used to load the genetic difference saved as a list of floats

In [None]:
# load the input data; every "seq_diff" cell is converted from its text form
# with literal_eval while the file is being read
data = pd.read_csv(
    data_fn,
    converters={"seq_diff": literal_eval},
)

## Function to compute performance of model for a given season
- Split the data into training and validation datasets
- Prepare encoded inputs (genetic difference and metadata features)
- Train and validate the model
- Compute predictive performance

> **Parameters**
> - season (str): identifier for the Northern or Southern Hemisphere season such as "2015NH"

> **Returns**
> - (str): the season identifier that was passed in
> - (numpy array): actual NHTs for the given season
> - (numpy array): predicted NHTs for the given season

In [None]:
def train_test_season(season):
    '''
    Train the model for one season, predict its test rows and record the scores.

    Reads the notebook-level variables data, HA1_features, meta_features,
    mut_mat, model_name, metadata and results_fn.

    Parameters
        - season (str): season identifier passed to the seasonal train/test split

    Returns
        - (str): the season that was passed in
        - (numpy array): actual NHTs of the test rows
        - (numpy array): predicted NHTs of the test rows
    '''

    def genetic_matrix(frame):
        # expand the per-row "seq_diff" lists into one column per HA1 feature
        matrix = pd.DataFrame(frame.seq_diff.to_list(),
                              index=frame.index,
                              columns=HA1_features)
        matrix.fillna(0, inplace=True)   # replace nan with 0
        return matrix.values

    # ---- Train Test Split (based on seasonal framework) ----
    #   Train: past virus isolates paired with past sera
    #   Test: circulating virus isolates paired with past sera
    split_columns = ['virus', 'serum', 'virusDate', 'serumDate']
    ind_train, ind_test = utilities.seasonal_trainTestSplit(data[split_columns], season)

    data_train = data.iloc[ind_train].copy()
    data_train.reset_index(drop=True, inplace=True)

    data_test = data.iloc[ind_test].copy()
    data_test.reset_index(drop=True, inplace=True)

    # ---- Input features (genetic difference) ----
    genetic_train = genetic_matrix(data_train)
    genetic_test  = genetic_matrix(data_test)

    # ---- Input features (metadata features) ----
    # missing entries become the category 'None'; everything is treated as text
    meta_train = data_train[meta_features].fillna('None').astype('str')
    meta_test  = data_test[meta_features].fillna('None').astype('str')

    # one hot encoding: categories are learned from the training rows only,
    # categories seen only in the test rows are ignored
    encoder = OneHotEncoder(handle_unknown='ignore')
    meta_train = encoder.fit_transform(meta_train).toarray()
    meta_test  = encoder.transform(meta_test).toarray()

    # genetic difference columns first, encoded metadata columns after
    X_train = np.hstack((genetic_train, meta_train))
    X_test  = np.hstack((genetic_test, meta_test))

    del meta_train, meta_test, genetic_train, genetic_test

    # ---- Training and testing ----
    # the model function is selected by name from model_utilities
    fit_and_predict = getattr(model_utilities, f"model_{model_name}")
    results = fit_and_predict(X_train,
                              data_train.nht.values,
                              X_test = X_test)

    # ---- Test Scores ----
    actual_test  = data_test.nht.values
    predict_test = results['pred_test']

    # NOTE(review): this function is run through joblib for several seasons at
    # once, all passing the same results_fn -- confirm that compute_scores
    # tolerates concurrent calls on one file
    utilities.compute_scores(actual_test,
                             predict_test,
                             results_fn,
                             col=['mut_mat', 'model', 'metadata', 'season'],
                             col_val=[mut_mat, model_name, metadata, season])

    return season, actual_test, predict_test

## Performance of the model over validation seasons
- For each validation season from 2012NH to 2013SH
    - Split the data into training and validation datasets
    - Prepare encoded inputs (genetic difference and metadata features)
    - Train and validate the model
    - Compute predictive performance
    - Save the predictions
- Compute average predictive performance

In [None]:
# run train_test_season once per validation season, in parallel (n_jobs=-1)
result = Parallel(n_jobs=-1, verbose=1)(
    delayed(train_test_season)(valid_season)
    for valid_season in Valid_Seasons
)

# split the per-season (season, actual, predicted) triples into three tuples
seasons, actual_valid_all, predict_valid_all = zip(*result)

# Micro-average over seasons: pool the values of all seasons before scoring
actual_valid  = np.concatenate(actual_valid_all)
predict_valid = np.concatenate(predict_valid_all)

# Validation scores of the pooled values, recorded under the season label 'Average'
cols       = ['mut_mat', 'model', 'metadata', 'season']
col_values = [mut_mat, model_name, metadata, 'Average']
utilities.compute_scores(
    actual_valid,
    predict_valid,
    results_fn,
    col=cols,
    col_val=col_values,
)

print("Training and testing completed")

## Repeat for different metadata features
Repeat the code cells, first under the heading "Variables" and then under "Performance of the model over validation seasons", for the following values of the variables "meta_features" and "metadata":
- **meta_features**=['virus'], **metadata**="virus avidity (a)"
- **meta_features**=['serum'], **metadata**="antiserum potency (p)"
- **meta_features**=['virusPassCat'], **metadata**="virus passage category (vPC)"
- **meta_features**=['serumPassCat'], **metadata**="antiserum PC (sPC)"
- **meta_features**=['virus', 'serum'], **metadata**="a+p"
- **meta_features**=['virusPassCat', 'serumPassCat'], **metadata**="vPC+sPC"
- **meta_features**=['virus', 'virusPassCat'], **metadata**="a+vPC"
- **meta_features**=['serum', 'serumPassCat'], **metadata**="p+sPC"
- **meta_features**=['virus', 'serum', 'virusPassCat', 'serumPassCat'], **metadata**="a+p+vPC+sPC"