# Evaluate RF (NextFlu-matched-params) model
We will evaluate the RF (NextFlu-matched-params) model over 14 test seasons from 2014NH to 2020SH. The model will be based on the baseline model with unoptimized hyperparameters and binary-encoded genetic difference, and will include virus avidity and antiserum potency parameters similar to the NextFlu substitution model.

## Imports

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import utilities   # self defined functions
import model_utilities   # self defined models
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder

# Reproducibility: seed NumPy's global random number generator with a fixed
# value so that repeated runs draw the same random numbers.
SEED = 100
np.random.seed(seed=SEED)

## Variables

In [None]:
# mutation matrix used to encode the genetic difference
mut_mat = "binary"

# test seasons in chronological order: 2014NH, 2014SH, ..., 2020NH, 2020SH
Test_Seasons = [
    f"{year}{hemisphere}"
    for year in range(2014, 2021)
    for hemisphere in ("NH", "SH")
]

# one feature name per HA1 position, numbered 1 to 329 inclusive
HA1_features = ["HA1_" + str(position) for position in range(1, 330)]

# metadata features fed to the model; the other candidate columns
# ('virusPassCat', 'serumPassCat') are not used here
meta_features = [
    'virus',   # virus avidity
    'serum',   # antiserum potency
]

# label recorded with the scores to show which metadata is being used
metadata = 'a+p'

# type of model to be used
model_name = 'baseline'

## Paths and filenames

In [None]:
# directories: where the input data lives and where the results are written
path_data = "../data/"
path_result = "../results/SuppFig6_comparison/"

# create the results directory (and any missing parents); no error if present
Path(path_result).mkdir(exist_ok=True, parents=True)

# input data file, selected by the mutation matrix in use
data_fn = "".join((path_data, f"nhts_ha1_{mut_mat}.csv"))

# file in which the performance scores for the test dataset are saved
test_fn = "".join((path_result, "SuppFig6b_testScores_RF_NextFlu_matched_params.csv"))

## Read data
- Genetic difference (seq_diff) encoded as per the mutation matrix
- Converter is used to load the genetic difference saved as a list of floats

In [None]:
# The seq_diff column holds the genetic difference written out as a list
# literal; evaluate it back into a Python list while the file is loaded.
seq_diff_converters = {"seq_diff": literal_eval}
data = pd.read_csv(data_fn, converters=seq_diff_converters)

## Performance of the model
- For each test season from 2014NH to 2020SH
    - Split the data into training and test datasets
    - Prepare encoded inputs (genetic difference and metadata features)
    - Train and evaluate the model
    - Compute predictive performance
    - Save the predictions
- Compute average predictive performance

In [None]:
# Evaluate the model season by season, write the per-season test scores, and
# finally write the scores micro-averaged over all test seasons.


def _genetic_difference_matrix(frame):
    """Expand the ``seq_diff`` lists of ``frame`` into a 2-D feature array.

    Each row's ``seq_diff`` list becomes one row with a column per entry of
    ``HA1_features``; missing values are replaced with 0.
    """
    expanded = pd.DataFrame(frame.seq_diff.to_list(),
                            index=frame.index,
                            columns=HA1_features)
    return expanded.fillna(0).values


# to collect actuals and predictions for micro-averaged scores over all test seasons
actual_all  = {}
predict_all = {}

# Labels of the identifying columns stored with every score row. They are the
# same for every season and for the final 'Average' row, so define them once
# here instead of relying on a variable leaked from the loop body.
cols = ['mut_mat', 'model', 'metadata', 'season']

# Model function selected by name from model_utilities; the lookup does not
# depend on the season, so it is done once.
model = getattr(model_utilities, f"model_{model_name}")

# loop through test seasons
for test_season in Test_Seasons:
    print("Test Season: ", test_season)

    # ----------------------------------------------------------------------
    # Train/test split, based on the seasonal framework
    #   - Train: past virus isolates paired with past sera
    #   - Test:  circulating virus isolates paired with past sera
    # ----------------------------------------------------------------------
    ind_train, ind_test = utilities.seasonal_trainTestSplit(data.copy(), test_season)

    data_train = data.iloc[ind_train].reset_index(drop=True)   # training dataset
    data_test  = data.iloc[ind_test].reset_index(drop=True)    # test dataset

    # ----------------------------------------------------------------------
    # Input features (metadata features)
    # ----------------------------------------------------------------------
    # missing metadata becomes the literal category 'None'
    meta_train = data_train[meta_features].fillna('None').astype('str')
    meta_test  = data_test[meta_features].fillna('None').astype('str')

    # One-hot encoding fitted on the training data only. With
    # handle_unknown='ignore', categories that appear only in the test data
    # are encoded as all zeros instead of raising an error.
    ohe = OneHotEncoder(handle_unknown='ignore')
    meta_train = ohe.fit_transform(meta_train).toarray()
    meta_test  = ohe.transform(meta_test).toarray()

    # ----------------------------------------------------------------------
    # Input features (genetic difference) followed by the encoded metadata
    # ----------------------------------------------------------------------
    X_train = np.hstack((_genetic_difference_matrix(data_train), meta_train))
    X_test  = np.hstack((_genetic_difference_matrix(data_test), meta_test))

    # the dense one-hot arrays are no longer needed once stacked
    del meta_train, meta_test

    # ----------------------------------------------------------------------
    # Training and evaluation
    # ----------------------------------------------------------------------
    results = model(X_train,
                    data_train.nht.values,
                    X_test = X_test)

    # ----------------------------------------------------------------------
    # Test scores for this season
    # ----------------------------------------------------------------------
    col_values = [mut_mat, model_name, metadata, test_season]
    utilities.compute_scores(data_test.nht.values, results['pred_test'], test_fn, col=cols, col_val=col_values)

    # ----------------------------------------------------------------------
    # Save actuals and predictions, keyed by season
    # ----------------------------------------------------------------------
    actual_all[test_season]  = data_test.nht.values
    predict_all[test_season] = results['pred_test']

    #################
    # End season loop
    #################


# Micro-average over seasons: pool the actuals and predictions of every test
# season and score them together.
actual  = np.concatenate(list(actual_all.values()))
predict = np.concatenate(list(predict_all.values()))

# Test scores
col_values = [mut_mat, model_name, metadata, 'Average']
utilities.compute_scores(actual, predict, test_fn, col=cols, col_val=col_values)


print("Training and testing completed")